ceremonyclient/node/datarpc/data_worker_ipc_server.go
Cassandra Heart 53f7c2b5c9
v2.1.0.2 (#442)
* v2.1.0.2

* restore tweaks to simlibp2p

* fix: nil ref on size calc

* fix: panic should induce shutdown from event_distributor

* fix: friendlier initialization that requires less manual kickstarting for test/devnets

* fix: fewer available shards than provers should choose shard length

* fix: update stored worker registry, improve logging for debug mode

* fix: shut the fuck up, peer log

* qol: log value should be snake cased

* fix: non-archive snap sync issues

* fix: separate X448/Decaf448 signed keys, add onion key to registry

* fix: overflow arithmetic on frame number comparison
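
  a minimal sketch of the overflow-safe comparison idea, in the style of RFC 1982 serial-number arithmetic (the helper name is illustrative, not the repo's actual code):

  // frameAfter reports whether frame a is newer than frame b even when the
  // counter wraps: casting the unsigned difference to a signed integer makes
  // wraparound cases (a near 0, b near the maximum) compare correctly.
  func frameAfter(a, b uint64) bool {
  	return int64(a-b) > 0
  }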

* fix: worker registration should be idempotent if inputs are same, otherwise permit updated records
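
  a minimal sketch of the intended semantics, with hypothetical registry types (workerRegistry, WorkerRecord, and Equal are illustrative, not the repo's):

  // Register is a no-op when the same inputs are submitted again, and an
  // update when the record for that key has changed.
  func (r *workerRegistry) Register(rec WorkerRecord) {
  	if existing, ok := r.byKey[rec.Key]; ok && existing.Equal(rec) {
  		return // identical inputs: registration stays idempotent
  	}
  	r.byKey[rec.Key] = rec // new or changed inputs: permit the update
  }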

* fix: remove global prover state from size calculation

* fix: divide by zero case

* fix: eager prover

* fix: broadcast listener default

* qol: diagnostic data for peer authenticator

* fix: master/worker connectivity issue in sparse networks

tight coupling of the peer and its workers can sometimes interfere when the mesh is sparse, so give workers a pseudo-identity but publish messages with the proper peer key (see the sketch below)
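
  a minimal sketch of the idea using upstream libp2p calls; Ed25519 stands in for the fork's Ed448, and every name here is illustrative rather than the repo's actual wiring:

  package sketch // illustrative only

  import (
  	"crypto/rand"

  	pcrypto "github.com/libp2p/go-libp2p/core/crypto"
  	"github.com/libp2p/go-libp2p/core/peer"
  )

  // publishAs gives a worker a throwaway mesh identity while the payload is
  // still signed with the node's real peer key, so receivers attribute the
  // message to the proper peer rather than the worker.
  func publishAs(peerPriv pcrypto.PrivKey, payload []byte) (peer.ID, []byte, error) {
  	workerPriv, _, err := pcrypto.GenerateEd25519Key(rand.Reader)
  	if err != nil {
  		return "", nil, err
  	}
  	// The worker joins the mesh under its pseudo-identity...
  	workerID, err := peer.IDFromPrivateKey(workerPriv)
  	if err != nil {
  		return "", nil, err
  	}
  	// ...but the published message carries a signature from the peer key.
  	sig, err := peerPriv.Sign(payload)
  	return workerID, sig, err
  }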

* fix: reorder steps of join creation

* fix: join verify frame source + ensure domain is properly padded (unnecessary but good for consistency)

* fix: add delegate to protobuf <-> reified join conversion

* fix: preempt prover from planning with no workers

* fix: use the unallocated workers to generate a proof

* qol: underflow causes join failure in the first ten frames on test/devnets

* qol: small logging tweaks for easier log correlation in debug mode

* qol: use fisher-yates shuffle to ensure prover allocations are evenly distributed when scores are equal
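
  a minimal sketch of the shuffle over equal-scored provers (Prover is an illustrative type; assumes math/rand):

  // shuffleTied randomizes provers that tied on score with a Fisher-Yates
  // pass, so allocation no longer favors insertion order when scores match.
  func shuffleTied(tied []Prover, rng *rand.Rand) {
  	for i := len(tied) - 1; i > 0; i-- {
  		j := rng.Intn(i + 1)
  		tied[i], tied[j] = tied[j], tied[i]
  	}
  }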

* qol: separate decisional logic on post-enrollment confirmation into consensus engine, proposer, and worker manager where relevant, refactor out scoring

* reuse shard descriptors for both join planning and confirm/reject decisions

* fix: add missing interface method and amend test blossomsub to use new peer id basis

* fix: only check allocations if they exist

* fix: pomw mint proof data needs to be hierarchically under global intrinsic domain

* staging temporary state under diagnostics

* fix: first phase of distributed lock refactoring

* fix: compute intrinsic locking

* fix: hypergraph intrinsic locking

* fix: token intrinsic locking

* fix: update execution engines to support new locking model

* fix: adjust tests with new execution shape

* fix: weave in lock/unlock semantics to liveness provider

* fix: lock fallthrough, add missing allocation update

* qol: additional logging for diagnostics, also testnet/devnet handling for confirmations

* fix: establish grace period on halt scenario to permit recovery

* fix: support test/devnet defaults for coverage scenarios

* fix: nil ref on consensus halts for non-archive nodes

* fix: remove unnecessary prefix from prover ref

* add test coverage for fork choice behaviors and replay – once passing, blocker (2) is resolved

* fix: no fork replay on repeat for non-archive nodes, snap now behaves correctly

* rollup of pre-liveness check lock interactions

* ahead of tests, get the protobuf/metrics-related changes out so teams can prepare

* add test coverage for distributed lock behaviors – once passing, blocker (3) is resolved

* fix: blocker (3)

* Dev docs improvements (#445)

* Make install deps script more robust

* Improve testing instructions

* Worker node should stop upon OS SIGINT/SIGTERM signal (#447)

* move pebble close to Stop()

* move deferred Stop() to Start()

* add core id to worker stop log message

* create done os signal channel and stop worker upon message to it

---------

Co-authored-by: Cassandra Heart <7929478+CassOnMars@users.noreply.github.com>

---------

Co-authored-by: Daz <daz_the_corgi@proton.me>
Co-authored-by: Black Swan <3999712+blacks1ne@users.noreply.github.com>
2025-10-23 01:03:06 -05:00


package datarpc

import (
	"context"
	"encoding/hex"

	pcrypto "github.com/libp2p/go-libp2p/core/crypto"
	"github.com/multiformats/go-multiaddr"
	mn "github.com/multiformats/go-multiaddr/net"
	"github.com/pkg/errors"
	"go.uber.org/zap"
	"google.golang.org/grpc"

	"source.quilibrium.com/quilibrium/monorepo/config"
	"source.quilibrium.com/quilibrium/monorepo/node/consensus/app"
	qgrpc "source.quilibrium.com/quilibrium/monorepo/node/internal/grpc"
	"source.quilibrium.com/quilibrium/monorepo/node/keys"
	"source.quilibrium.com/quilibrium/monorepo/node/p2p"
	"source.quilibrium.com/quilibrium/monorepo/protobufs"
	"source.quilibrium.com/quilibrium/monorepo/types/channel"
	"source.quilibrium.com/quilibrium/monorepo/types/consensus"
	"source.quilibrium.com/quilibrium/monorepo/types/crypto"
)
// DataWorkerIPCServer exposes a data worker's IPC surface over gRPC and
// owns the per-core app consensus engine.
type DataWorkerIPCServer struct {
	protobufs.UnimplementedDataIPCServiceServer

	listenAddrGRPC            string
	config                    *config.Config
	logger                    *zap.Logger
	coreId                    uint32
	parentProcessId           int
	signer                    crypto.Signer
	signerRegistry            consensus.SignerRegistry
	proverRegistry            consensus.ProverRegistry
	peerInfoManager           p2p.PeerInfoManager
	authProvider              channel.AuthenticationProvider
	appConsensusEngineFactory *app.AppConsensusEngineFactory
	appConsensusEngine        *app.AppConsensusEngine
	server                    *grpc.Server
	frameProver               crypto.FrameProver
	quit                      chan struct{}
}
// NewDataWorkerIPCServer constructs a data worker IPC server, deriving the
// worker's signer from the configured Ed448 peer key.
func NewDataWorkerIPCServer(
	listenAddrGRPC string,
	config *config.Config,
	signerRegistry consensus.SignerRegistry,
	proverRegistry consensus.ProverRegistry,
	peerInfoManager p2p.PeerInfoManager,
	frameProver crypto.FrameProver,
	appConsensusEngineFactory *app.AppConsensusEngineFactory,
	logger *zap.Logger,
	coreId uint32,
	parentProcessId int,
) (*DataWorkerIPCServer, error) {
	peerPrivKey, err := hex.DecodeString(config.P2P.PeerPrivKey)
	if err != nil {
		logger.Panic("error decoding peerkey", zap.Error(err))
	}

	privKey, err := pcrypto.UnmarshalEd448PrivateKey(peerPrivKey)
	if err != nil {
		logger.Panic("error unmarshaling peerkey", zap.Error(err))
	}

	rawPriv, err := privKey.Raw()
	if err != nil {
		logger.Panic("error getting private key", zap.Error(err))
	}

	rawPub, err := privKey.GetPublic().Raw()
	if err != nil {
		logger.Panic("error getting public key", zap.Error(err))
	}

	signer, err := keys.Ed448KeyFromBytes(rawPriv, rawPub)
	if err != nil {
		logger.Panic("error creating signer", zap.Error(err))
	}

	return &DataWorkerIPCServer{
		listenAddrGRPC:            listenAddrGRPC,
		config:                    config,
		logger:                    logger,
		coreId:                    coreId,
		parentProcessId:           parentProcessId,
		signer:                    signer,
		appConsensusEngineFactory: appConsensusEngineFactory,
		signerRegistry:            signerRegistry,
		proverRegistry:            proverRegistry,
		frameProver:               frameProver,
		peerInfoManager:           peerInfoManager,
	}, nil
}
// Start spawns the data worker server and blocks until Stop is called.
func (r *DataWorkerIPCServer) Start() error {
	r.quit = make(chan struct{})

	if err := r.RespawnServer(nil); err != nil {
		return err
	}

	<-r.quit
	return nil
}
// Stop gracefully drains the gRPC server, then unblocks Start. The quit
// send happens on a goroutine so Stop returns without waiting on the
// channel receive.
func (r *DataWorkerIPCServer) Stop() error {
	r.logger.Info("stopping server gracefully")
	r.server.GracefulStop()
	go func() {
		r.quit <- struct{}{}
	}()
	return nil
}
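
// Sketch (not part of the original file): PR #447 above wires OS signals to
// the Stop method. A hypothetical caller would do roughly:
//
//	done := make(chan os.Signal, 1)
//	signal.Notify(done, syscall.SIGINT, syscall.SIGTERM)
//	go func() {
//		<-done
//		_ = srv.Stop()
//	}()
//	_ = srv.Start() // blocks until Stop drains the server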
// Respawn implements protobufs.DataIPCServiceServer, rebuilding the server
// with the filter supplied in the request.
func (r *DataWorkerIPCServer) Respawn(
	ctx context.Context,
	req *protobufs.RespawnRequest,
) (*protobufs.RespawnResponse, error) {
	if err := r.RespawnServer(req.Filter); err != nil {
		return nil, err
	}

	return &protobufs.RespawnResponse{}, nil
}
// RespawnServer tears down any existing server and consensus engine, then
// rebuilds the auth provider, the gRPC server, and (when a filter is
// present) the app consensus engine.
func (r *DataWorkerIPCServer) RespawnServer(filter []byte) error {
	if r.server != nil {
		r.logger.Info("stopping server for respawn")
		r.server.GracefulStop()
		r.server = nil
	}

	if r.appConsensusEngine != nil {
		<-r.appConsensusEngine.Stop(false)
		r.appConsensusEngine = nil
	}

	// Establish an auth provider with per-service and per-method peer
	// policies.
	r.authProvider = p2p.NewPeerAuthenticator(
		r.logger,
		r.config.P2P,
		r.peerInfoManager,
		r.proverRegistry,
		r.signerRegistry,
		filter,
		nil,
		// Service-level policies:
		map[string]channel.AllowedPeerPolicyType{
			"quilibrium.node.application.pb.HypergraphComparisonService": channel.AnyProverPeer,
			"quilibrium.node.node.pb.DataIPCService":                     channel.OnlySelfPeer,
			"quilibrium.node.global.pb.GlobalService":                    channel.OnlyGlobalProverPeer,
			"quilibrium.node.global.pb.AppShardService":                  channel.OnlyShardProverPeer,
			"quilibrium.node.global.pb.OnionService":                     channel.AnyPeer,
			"quilibrium.node.global.pb.KeyRegistryService":               channel.OnlySelfPeer,
		},
		// Method-level overrides:
		map[string]channel.AllowedPeerPolicyType{
			"/quilibrium.node.application.pb.HypergraphComparisonService/HyperStream": channel.OnlyShardProverPeer,
			"/quilibrium.node.global.pb.MixnetService/GetTag":                         channel.AnyPeer,
			"/quilibrium.node.global.pb.MixnetService/PutTag":                         channel.AnyPeer,
			"/quilibrium.node.global.pb.MixnetService/PutMessage":                     channel.AnyPeer,
			"/quilibrium.node.global.pb.MixnetService/RoundStream":                    channel.OnlyGlobalProverPeer,
			"/quilibrium.node.global.pb.DispatchService/PutInboxMessage":              channel.OnlySelfPeer,
			"/quilibrium.node.global.pb.DispatchService/GetInboxMessages":             channel.OnlySelfPeer,
			"/quilibrium.node.global.pb.DispatchService/PutHub":                       channel.OnlySelfPeer,
			"/quilibrium.node.global.pb.DispatchService/GetHub":                       channel.OnlySelfPeer,
			"/quilibrium.node.global.pb.DispatchService/Sync":                         channel.AnyProverPeer,
			"/quilibrium.node.ferretproxy.pb.FerretProxy/AliceProxy":                  channel.OnlySelfPeer,
			"/quilibrium.node.ferretproxy.pb.FerretProxy/BobProxy":                    channel.AnyPeer,
		},
	)

	tlsCreds, err := r.authProvider.CreateServerTLSCredentials()
	if err != nil {
		return errors.Wrap(err, "respawn server")
	}

	r.server = qgrpc.NewServer(
		grpc.Creds(tlsCreds),
		grpc.ChainUnaryInterceptor(r.authProvider.UnaryInterceptor),
		grpc.ChainStreamInterceptor(r.authProvider.StreamInterceptor),
		grpc.MaxRecvMsgSize(10*1024*1024),
		grpc.MaxSendMsgSize(10*1024*1024),
	)
	mg, err := multiaddr.NewMultiaddr(r.listenAddrGRPC)
	if err != nil {
		return errors.Wrap(err, "respawn server")
	}

	lis, err := mn.Listen(mg)
	if err != nil {
		return errors.Wrap(err, "respawn server")
	}

	r.logger.Info(
		"data worker listening",
		zap.String("address", r.listenAddrGRPC),
		zap.String("resolved", lis.Addr().String()),
	)
	if len(filter) != 0 {
		globalTimeReel, err := r.appConsensusEngineFactory.CreateGlobalTimeReel()
		if err != nil {
			return errors.Wrap(err, "respawn server")
		}

		r.appConsensusEngine, err = r.appConsensusEngineFactory.CreateAppConsensusEngine(
			filter,
			uint(r.coreId),
			globalTimeReel,
			r.server,
		)
		if err != nil {
			return errors.Wrap(err, "respawn server")
		}
	}
	go func() {
		protobufs.RegisterDataIPCServiceServer(r.server, r)
		if err := r.server.Serve(mn.NetListener(lis)); err != nil {
			r.logger.Info("terminating server", zap.Error(err))
		}
	}()

	return nil
}
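
// Sketch (not part of the original file): DataIPCService is pinned to
// OnlySelfPeer above, so only the owning node reaches this service. A
// hypothetical parent-process caller, assuming standard protoc-gen-go-grpc
// client naming:
//
//	client := protobufs.NewDataIPCServiceClient(conn)
//	_, err := client.Respawn(ctx, &protobufs.RespawnRequest{Filter: filter})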
// CreateJoinProof implements protobufs.DataIPCServiceServer.
func (r *DataWorkerIPCServer) CreateJoinProof(
	ctx context.Context,
	req *protobufs.CreateJoinProofRequest,
) (*protobufs.CreateJoinProofResponse, error) {
	r.logger.Debug("received request to create join proof")

	proof := r.frameProver.CalculateMultiProof(
		[32]byte(req.Challenge),
		req.Difficulty,
		req.Ids,
		req.ProverIndex,
	)

	return &protobufs.CreateJoinProofResponse{
		Response: proof[:],
	}, nil
}
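
// Sketch (not part of the original file): invoking CreateJoinProof through
// the same hypothetical client; the request fields mirror the handler above.
//
//	resp, err := client.CreateJoinProof(ctx, &protobufs.CreateJoinProofRequest{
//		Challenge:   challenge[:],
//		Difficulty:  difficulty,
//		Ids:         ids,
//		ProverIndex: proverIndex,
//	})
//	_ = resp.Response // the multiproof bytes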