diff --git a/lifecycle/supervisor.go b/lifecycle/supervisor.go
index 3e462fb..f4e920d 100644
--- a/lifecycle/supervisor.go
+++ b/lifecycle/supervisor.go
@@ -74,7 +74,7 @@ func NewSupervisor(nodes []*Node) (*Supervisor, error) {
 	return s, nil
 }
 
-func (s *Supervisor) Run(ctx context.Context) error {
+func (s *Supervisor) Start(ctx context.Context) error {
 	ctx, stopSignals := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM)
 	defer stopSignals()
 
diff --git a/lifecycle/supervisor_test.go b/lifecycle/supervisor_test.go
index c017588..3c6ba8b 100644
--- a/lifecycle/supervisor_test.go
+++ b/lifecycle/supervisor_test.go
@@ -349,7 +349,7 @@ func TestSupervisor_Stop_StopsDescendantsOnly(t *testing.T) {
 	defer cancel()
 
 	go func() {
-		_ = s.Run(ctx)
+		_ = s.Start(ctx)
 	}()
 
 	// Wait for all to be Ready.
@@ -394,7 +394,7 @@ func TestSupervisor_StopParents_StopsAncestorsAndDesc(t *testing.T) {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
 
-	go func() { _ = s.Run(ctx) }()
+	go func() { _ = s.Start(ctx) }()
 
 	time.Sleep(150 * time.Millisecond)
 
@@ -433,7 +433,7 @@ func TestSupervisor_ShutdownAll(t *testing.T) {
 	done := make(chan struct{})
 	go func() {
-		_ = s.Run(ctx)
+		_ = s.Start(ctx) // Start should return after Shutdown cascade completes
 		close(done)
 	}()
 
diff --git a/node/consensus/app/app_consensus_engine.go b/node/consensus/app/app_consensus_engine.go
index 97bd346..00ea3b9 100644
--- a/node/consensus/app/app_consensus_engine.go
+++ b/node/consensus/app/app_consensus_engine.go
@@ -54,6 +54,7 @@ import (
 
 // AppConsensusEngine uses the generic state machine for consensus
 type AppConsensusEngine struct {
+	*lifecycle.ComponentManager
 	protobufs.AppShardServiceServer
 
 	logger *zap.Logger
@@ -105,7 +106,6 @@ type AppConsensusEngine struct {
 	ctx              lifecycle.SignalerContext
 	cancel           context.CancelFunc
 	quit             chan struct{}
-	wg               sync.WaitGroup
 	canRunStandalone bool
 	blacklistMap     map[string]bool
 	alertPublicKey   []byte
@@ -352,12 +352,6 @@ func NewAppConsensusEngine(
 		return nil, errors.Wrap(err, "failed to initialize execution engines")
 	}
 
-	// Register all execution engines with the consensus engine
-	err = engine.executionManager.RegisterAllEngines(engine.RegisterExecutor)
-	if err != nil {
-		return nil, errors.Wrap(err, "failed to register execution engines")
-	}
-
 	engine.syncProvider = &AppSyncProvider{engine: engine}
 	engine.votingProvider = &AppVotingProvider{engine: engine}
 	engine.leaderProvider = &AppLeaderProvider{engine: engine}
@@ -419,6 +413,162 @@ func NewAppConsensusEngine(
 	executorsRegistered.WithLabelValues(engine.appAddressHex).Set(0)
 	pendingMessagesCount.WithLabelValues(engine.appAddressHex).Set(0)
 
+	engine.ctx, engine.cancel, _ = lifecycle.WithSignallerAndCancel(
+		context.Background(),
+	)
+	componentBuilder := lifecycle.NewComponentManagerBuilder()
+	// Add execution engines
+	componentBuilder.AddWorker(engine.executionManager.Start)
+	componentBuilder.AddWorker(engine.eventDistributor.Start)
+	componentBuilder.AddWorker(engine.appTimeReel.Start)
+	frame, _, err := engine.clockStore.GetLatestShardClockFrame(engine.appAddress)
+	if err != nil {
+		engine.logger.Warn(
+			"invalid frame retrieved, will resync",
+			zap.Error(err),
+		)
+	}
+
+	var initialState **protobufs.AppShardFrame = nil
+	if frame != nil {
+		initialState = &frame
+	}
+
+	componentBuilder.AddWorker(func(
+		ctx lifecycle.SignalerContext,
+		ready lifecycle.ReadyFunc,
+	) {
+		if err := engine.startConsensus(initialState, ctx, ready); err != nil {
+			ctx.Throw(err)
+			return
+		}
+
+		<-ctx.Done()
+	})
+
+	err = engine.subscribeToConsensusMessages()
+	if err != nil {
+		engine.ctx.Throw(errors.Wrap(err, "start"))
+		return nil, err
+	}
+
+	err = engine.subscribeToProverMessages()
+	if err != nil {
+		engine.ctx.Throw(errors.Wrap(err, "start"))
+		return nil, err
+	}
+
+	err = engine.subscribeToFrameMessages()
+	if err != nil {
+		engine.ctx.Throw(errors.Wrap(err, "start"))
+		return nil, err
+	}
+
+	err = engine.subscribeToGlobalFrameMessages()
+	if err != nil {
+		engine.ctx.Throw(errors.Wrap(err, "start"))
+		return nil, err
+	}
+
+	err = engine.subscribeToGlobalProverMessages()
+	if err != nil {
+		engine.ctx.Throw(errors.Wrap(err, "start"))
+		return nil, err
+	}
+
+	err = engine.subscribeToGlobalAlertMessages()
+	if err != nil {
+		engine.ctx.Throw(errors.Wrap(err, "start"))
+		return nil, err
+	}
+
+	err = engine.subscribeToPeerInfoMessages()
+	if err != nil {
+		engine.ctx.Throw(errors.Wrap(err, "start"))
+		return nil, err
+	}
+
+	err = engine.subscribeToDispatchMessages()
+	if err != nil {
+		engine.ctx.Throw(errors.Wrap(err, "start"))
+		return nil, err
+	}
+
+	// Start message queue processors
+	componentBuilder.AddWorker(func(
+		ctx lifecycle.SignalerContext,
+		ready lifecycle.ReadyFunc,
+	) {
+		ready()
+		engine.processConsensusMessageQueue(ctx)
+	})
+
+	componentBuilder.AddWorker(func(
+		ctx lifecycle.SignalerContext,
+		ready lifecycle.ReadyFunc,
+	) {
+		ready()
+		engine.processProverMessageQueue(ctx)
+	})
+
+	componentBuilder.AddWorker(func(
+		ctx lifecycle.SignalerContext,
+		ready lifecycle.ReadyFunc,
+	) {
+		ready()
+		engine.processFrameMessageQueue(ctx)
+	})
+
+	componentBuilder.AddWorker(func(
+		ctx lifecycle.SignalerContext,
+		ready lifecycle.ReadyFunc,
+	) {
+		ready()
+		engine.processGlobalFrameMessageQueue(ctx)
+	})
+
+	componentBuilder.AddWorker(func(
+		ctx lifecycle.SignalerContext,
+		ready lifecycle.ReadyFunc,
+	) {
+		ready()
+		engine.processAlertMessageQueue(ctx)
+	})
+
+	componentBuilder.AddWorker(func(
+		ctx lifecycle.SignalerContext,
+		ready lifecycle.ReadyFunc,
+	) {
+		ready()
+		engine.processPeerInfoMessageQueue(ctx)
+	})
+
+	componentBuilder.AddWorker(func(
+		ctx lifecycle.SignalerContext,
+		ready lifecycle.ReadyFunc,
+	) {
+		ready()
+		engine.processDispatchMessageQueue(ctx)
+	})
+
+	// Start event distributor event loop
+	componentBuilder.AddWorker(func(
+		ctx lifecycle.SignalerContext,
+		ready lifecycle.ReadyFunc,
+	) {
+		ready()
+		engine.eventDistributorLoop(ctx)
+	})
+
+	// Start metrics update goroutine
+	componentBuilder.AddWorker(func(
+		ctx lifecycle.SignalerContext,
+		ready lifecycle.ReadyFunc,
+	) {
+		ready()
+		engine.updateMetricsLoop(ctx)
+	})
+
 	return engine, nil
 }
 
@@ -426,134 +576,6 @@ func (e *AppConsensusEngine) Start(quit chan struct{}) <-chan error {
 	errChan := make(chan error, 1)
 	e.quit = quit
 
-	e.ctx, e.cancel, _ = lifecycle.WithSignallerAndCancel(context.Background())
-
-	// Start execution engines
-	if err := e.executionManager.StartAll(e.quit); err != nil {
-		errChan <- errors.Wrap(err, "start execution engines")
-		close(errChan)
-		return errChan
-	}
-
-	if err := e.eventDistributor.Start(e.ctx); err != nil {
-		errChan <- errors.Wrap(err, "start event distributor")
-		close(errChan)
-		return errChan
-	}
-
-	err := e.appTimeReel.Start()
-	if err != nil {
-		errChan <- errors.Wrap(err, "start")
-		close(errChan)
-		return errChan
-	}
-
-	frame, _, err := e.clockStore.GetLatestShardClockFrame(e.appAddress)
-	if err != nil {
-		e.logger.Warn(
-			"invalid frame retrieved, will resync",
-			zap.Error(err),
-		)
-	}
-
-	e.ensureGlobalClient()
-
-	var initialState **protobufs.AppShardFrame = nil
-	if frame != nil {
-		initialState = &frame
-	}
-
-	if err := e.startConsensus(initialState); err != nil {
-		errChan <- errors.Wrap(err, "start state machine")
-		close(errChan)
-		return errChan
-	}
-
-	err = e.subscribeToConsensusMessages()
-	if err != nil {
-		errChan <- errors.Wrap(err, "start")
-		close(errChan)
-		return errChan
-	}
-
-	err = e.subscribeToProverMessages()
-	if err != nil {
-		errChan <- errors.Wrap(err, "start")
-		close(errChan)
-		return errChan
-	}
-
-	err = e.subscribeToFrameMessages()
-	if err != nil {
-		errChan <- errors.Wrap(err, "start")
-		close(errChan)
-		return errChan
-	}
-
-	err = e.subscribeToGlobalFrameMessages()
-	if err != nil {
-		errChan <- errors.Wrap(err, "start")
-		close(errChan)
-		return errChan
-	}
-
-	err = e.subscribeToGlobalProverMessages()
-	if err != nil {
-		errChan <- errors.Wrap(err, "start")
-		close(errChan)
-		return errChan
-	}
-
-	err = e.subscribeToGlobalAlertMessages()
-	if err != nil {
-		errChan <- errors.Wrap(err, "start")
-		close(errChan)
-		return errChan
-	}
-
-	err = e.subscribeToPeerInfoMessages()
-	if err != nil {
-		errChan <- errors.Wrap(err, "start")
-		close(errChan)
-		return errChan
-	}
-
-	err = e.subscribeToDispatchMessages()
-	if err != nil {
-		errChan <- errors.Wrap(err, "start")
-		close(errChan)
-		return errChan
-	}
-
-	// Start message queue processors
-	e.wg.Add(1)
-	go e.processConsensusMessageQueue()
-
-	e.wg.Add(1)
-	go e.processProverMessageQueue()
-
-	e.wg.Add(1)
-	go e.processFrameMessageQueue()
-
-	e.wg.Add(1)
-	go e.processGlobalFrameMessageQueue()
-
-	e.wg.Add(1)
-	go e.processAlertMessageQueue()
-
-	e.wg.Add(1)
-	go e.processPeerInfoMessageQueue()
-
-	e.wg.Add(1)
-	go e.processDispatchMessageQueue()
-
-	// Start event distributor event loop
-	e.wg.Add(1)
-	go e.eventDistributorLoop()
-
-	// Start metrics update goroutine
-	e.wg.Add(1)
-	go e.updateMetricsLoop()
 
 	e.logger.Info(
 		"app consensus engine started",
@@ -576,57 +598,6 @@ func (e *AppConsensusEngine) Stop(force bool) <-chan error {
 		e.cancel()
 	}
 
-	// Stop event distributor
-	if e.eventDistributor != nil {
-		if err := e.eventDistributor.Stop(); err != nil && !force {
-			e.logger.Warn("error stopping event distributor", zap.Error(err))
-			select {
-			case errChan <- errors.Wrap(err, "stop event distributor"):
-			default:
-			}
-		}
-	}
-
-	// Stop execution engines
-	if e.executionManager != nil {
-		if err := e.executionManager.StopAll(force); err != nil && !force {
-			e.logger.Warn("error stopping execution engines", zap.Error(err))
-			select {
-			case errChan <- errors.Wrap(err, "stop execution engines"):
-			default:
-			}
-		}
-	}
-
-	// Wait for goroutines to finish with shorter timeout for tests
-	done := make(chan struct{})
-	go func() {
-		e.wg.Wait()
-		close(done)
-	}()
-
-	// Use shorter timeout in test environments
-	timeout := 30 * time.Second
-	if e.config.P2P.Network == 99 {
-		timeout = 5 * time.Second
-	}
-
-	select {
-	case <-done:
-		// Clean shutdown
-		e.logger.Info("app consensus engine stopped cleanly")
-	case <-time.After(timeout):
-		if !force {
-			e.logger.Error("timeout waiting for graceful shutdown")
-			select {
-			case errChan <- errors.New("timeout waiting for graceful shutdown"):
-			default:
-			}
-		} else {
-			e.logger.Warn("forced shutdown after timeout")
-		}
-	}
-
 	// Unsubscribe from pubsub to stop new messages from arriving
 	e.pubsub.Unsubscribe(e.getConsensusMessageBitmask(), false)
 	e.pubsub.UnregisterValidator(e.getConsensusMessageBitmask())
@@ -680,81 +651,6 @@ func (e *AppConsensusEngine) GetState() typesconsensus.EngineState {
 	}
 }
 
-func (e *AppConsensusEngine) RegisterExecutor(
-	exec execution.ShardExecutionEngine,
-	frame uint64,
-) <-chan error {
-	errChan := make(chan error, 1)
-
-	e.executorsMu.Lock()
-	defer e.executorsMu.Unlock()
-
-	name := exec.GetName()
-	if _, exists := e.executors[name]; exists {
-		errChan <- errors.New("executor already registered")
-		close(errChan)
-		return errChan
-	}
-
-	e.executors[name] = exec
-
-	// Update metrics
-	executorRegistrationTotal.WithLabelValues(e.appAddressHex, "register").Inc()
-	executorsRegistered.WithLabelValues(
-		e.appAddressHex,
-	).Set(float64(len(e.executors)))
-
-	close(errChan)
-	return errChan
-}
-
-func (e *AppConsensusEngine) UnregisterExecutor(
-	name string,
-	frame uint64,
-	force bool,
-) <-chan error {
-	errChan := make(chan error, 1)
-
-	e.executorsMu.Lock()
-	defer e.executorsMu.Unlock()
-
-	if _, exists := e.executors[name]; !exists {
-		errChan <- errors.New("executor not registered")
-		close(errChan)
-		return errChan
-	}
-
-	// Stop the executor
-	if exec, ok := e.executors[name]; ok {
-		stopErrChan := exec.Stop(force)
-		select {
-		case err := <-stopErrChan:
-			if err != nil && !force {
-				errChan <- errors.Wrap(err, "stop executor")
-				close(errChan)
-				return errChan
-			}
-		case <-time.After(5 * time.Second):
-			if !force {
-				errChan <- errors.New("timeout stopping executor")
-				close(errChan)
-				return errChan
-			}
-		}
-	}
-
-	delete(e.executors, name)
-
-	// Update metrics
-	executorRegistrationTotal.WithLabelValues(e.appAddressHex, "unregister").Inc()
-	executorsRegistered.WithLabelValues(
-		e.appAddressHex,
-	).Set(float64(len(e.executors)))
-
-	close(errChan)
-	return errChan
-}
-
 func (e *AppConsensusEngine) GetProvingKey(
 	engineConfig *config.EngineConfig,
 ) (crypto.Signer, crypto.KeyType, []byte, []byte) {
@@ -1130,28 +1026,22 @@ func (e *AppConsensusEngine) cleanupFrameStore() {
 	)
 }
 
-func (e *AppConsensusEngine) updateMetricsLoop() {
+func (e *AppConsensusEngine) updateMetricsLoop(
+	ctx lifecycle.SignalerContext,
+) {
 	defer func() {
 		if r := recover(); r != nil {
 			e.logger.Error("fatal error encountered", zap.Any("panic", r))
-			if e.cancel != nil {
-				e.cancel()
-			}
-			// Avoid blocking on quit channel during panic recovery
-			select {
-			case e.quit <- struct{}{}:
-			default:
-			}
+			ctx.Throw(errors.Errorf("fatal unhandled error encountered: %v", r))
 		}
 	}()
-	defer e.wg.Done()
 
 	ticker := time.NewTicker(10 * time.Second)
 	defer ticker.Stop()
 
 	for {
 		select {
-		case <-e.ctx.Done():
+		case <-ctx.Done():
 			return
 		case <-e.quit:
 			return
@@ -1388,7 +1278,7 @@ func (e *AppConsensusEngine) internalProveFrame(
 	timestamp := time.Now().UnixMilli()
 
 	difficulty := e.difficultyAdjuster.GetNextDifficulty(
-		previousFrame.Rank()+1,
+		previousFrame.GetRank()+1,
 		timestamp,
 	)
 
@@ -1525,6 +1415,8 @@ func (e *AppConsensusEngine) ensureGlobalClient() error {
 
 func (e *AppConsensusEngine) startConsensus(
 	initialFrame **protobufs.AppShardFrame,
+	ctx lifecycle.SignalerContext,
+	ready lifecycle.ReadyFunc,
 ) error {
 	var err error
 	e.consensusParticipant, err = participant.NewParticipant[
@@ -1555,7 +1447,8 @@ func (e *AppConsensusEngine) startConsensus(
 		return err
 	}
 
-	e.consensusParticipant.Start(e.ctx)
+	ready()
+	e.consensusParticipant.Start(ctx)
 
 	return nil
 }
diff --git a/node/consensus/app/consensus_liveness_provider.go b/node/consensus/app/consensus_liveness_provider.go
index 549b4d4..0420c3c 100644
--- a/node/consensus/app/consensus_liveness_provider.go
+++ b/node/consensus/app/consensus_liveness_provider.go
@@ -77,7 +77,7 @@ func (p *AppLivenessProvider) Collect(
 		zap.Int("valid_message_count", len(finalizedMessages)),
 		zap.Uint64(
 			"current_frame",
-			p.engine.GetFrame().Rank(),
+			p.engine.GetFrame().GetRank(),
 		),
 	)
 	transactionsCollectedTotal.WithLabelValues(p.engine.appAddressHex).Add(
diff --git a/node/consensus/app/consensus_transition_listener.go b/node/consensus/app/consensus_transition_listener.go
deleted file mode 100644
index 3a98780..0000000
--- a/node/consensus/app/consensus_transition_listener.go
+++ /dev/null
@@ -1,52 +0,0 @@
-package app
-
-import (
-	"go.uber.org/zap"
-	"source.quilibrium.com/quilibrium/monorepo/consensus"
-)
-
-type AppTracer struct {
-	logger *zap.Logger
-}
-
-func (t *AppTracer) Trace(message string) {
-	t.logger.Debug(message)
-}
-
-func (t *AppTracer) Error(message string, err error) {
-	t.logger.Error(message, zap.Error(err))
-}
-
-// AppTransitionListener handles state transitions
-type AppTransitionListener struct {
-	engine *AppConsensusEngine
-	logger *zap.Logger
-}
-
-func (l *AppTransitionListener) OnTransition(
-	from consensus.State,
-	to consensus.State,
-	event consensus.Event,
-) {
-	var stateValue float64
-	switch to {
-	case consensus.StateStopped:
-		stateValue = 0
-	case consensus.StateStarting:
-		stateValue = 1
-	case consensus.StateLoading:
-		stateValue = 2
-	case consensus.StateCollecting:
-		stateValue = 3
-	case consensus.StateProving:
-		stateValue = 4
-	case consensus.StatePublishing:
-		stateValue = 5
-	case consensus.StateVerifying:
-		stateValue = 6
-	case consensus.StateStopping:
-		stateValue = 7
-	}
-
-	engineState.WithLabelValues(l.engine.appAddressHex).Set(stateValue)
-}
diff --git a/node/consensus/app/consensus_voting_provider.go b/node/consensus/app/consensus_voting_provider.go
index 834ae32..55c654a 100644
--- a/node/consensus/app/consensus_voting_provider.go
+++ b/node/consensus/app/consensus_voting_provider.go
@@ -1,20 +1,12 @@
 package app
 
 import (
-	"bytes"
 	"context"
-	"encoding/hex"
 	"slices"
 	"sync"
-	"time"
 
-	"github.com/iden3/go-iden3-crypto/poseidon"
-	"github.com/pkg/errors"
-	"github.com/prometheus/client_golang/prometheus"
-	"go.uber.org/zap"
-	"golang.org/x/crypto/sha3"
-	"google.golang.org/protobuf/proto"
 	"source.quilibrium.com/quilibrium/monorepo/consensus"
+	"source.quilibrium.com/quilibrium/monorepo/consensus/models"
 	"source.quilibrium.com/quilibrium/monorepo/protobufs"
 	"source.quilibrium.com/quilibrium/monorepo/types/tries"
 	up2p "source.quilibrium.com/quilibrium/monorepo/utils/p2p"
@@ -23,671 +15,28 @@ import (
 
 // AppVotingProvider implements VotingProvider
 type AppVotingProvider struct {
 	engine        *AppConsensusEngine
-	proposalVotes map[consensus.Identity]map[consensus.Identity]**protobufs.FrameVote
+	proposalVotes map[models.Identity]map[models.Identity]**protobufs.ProposalVote
 	mu            sync.Mutex
 }
 
-func (p *AppVotingProvider) SendProposal(
-	proposal **protobufs.AppShardFrame,
-	ctx context.Context,
-) error {
-	timer := prometheus.NewTimer(framePublishingDuration.WithLabelValues(
-		p.engine.appAddressHex,
-	))
-	defer timer.ObserveDuration()
-
-	if proposal == nil || (*proposal).Header == nil {
-		framePublishingTotal.WithLabelValues(p.engine.appAddressHex, "error").Inc()
-		return errors.Wrap(
-			errors.New("invalid proposal"),
-			"send proposal",
-		)
-	}
-
-	p.engine.logger.Info(
-		"sending proposal",
-		zap.Uint64("frame_number", (*proposal).Header.FrameNumber),
-		zap.String("prover", hex.EncodeToString((*proposal).Header.Prover)),
-	)
-
-	// Serialize the frame using canonical bytes
-	frameData, err := (*proposal).ToCanonicalBytes()
-	if err != nil {
-		framePublishingTotal.WithLabelValues(p.engine.appAddressHex, "error").Inc()
-		return errors.Wrap(err, "serialize proposal")
-	}
-
-	// Publish to the network
-	if err := p.engine.pubsub.PublishToBitmask(
-		p.engine.getConsensusMessageBitmask(),
-		frameData,
-	); err != nil {
-		framePublishingTotal.WithLabelValues(p.engine.appAddressHex, "error").Inc()
-		return errors.Wrap(err, "send proposal")
-	}
-
-	// Store the frame
-	frameIDBI, _ := poseidon.HashBytes((*proposal).Header.Output)
-	frameID := frameIDBI.FillBytes(make([]byte, 32))
-	p.engine.frameStoreMu.Lock()
-	p.engine.frameStore[string(frameID)] =
-		(*proposal).Clone().(*protobufs.AppShardFrame)
-	p.engine.frameStoreMu.Unlock()
-
-	framePublishingTotal.WithLabelValues(p.engine.appAddressHex, "success").Inc()
-	return nil
+// FinalizeQuorumCertificate implements consensus.VotingProvider.
+func (p *AppVotingProvider) FinalizeQuorumCertificate(ctx context.Context, state *models.State[*protobufs.AppShardFrame], aggregatedSignature models.AggregatedSignature) (models.QuorumCertificate, error) {
+	panic("unimplemented")
 }
 
-func (p *AppVotingProvider) DecideAndSendVote(
-	proposals map[consensus.Identity]**protobufs.AppShardFrame,
-	ctx context.Context,
-) (PeerID, **protobufs.FrameVote, error) {
-	var chosenProposal *protobufs.AppShardFrame
-	var chosenID consensus.Identity
-	parentFrame := p.engine.GetFrame()
-	if parentFrame == nil {
-		return PeerID{}, nil, errors.Wrap(
-			errors.New("no frame: no valid proposals to vote on"),
-			"decide and send vote",
-		)
-	}
-
-	parentSelector := p.engine.calculateFrameSelector(parentFrame.Header)
-	provers, err := p.engine.proverRegistry.GetOrderedProvers(
-		[32]byte(parentSelector),
-		p.engine.appAddress,
-	)
-	if err != nil {
-		return PeerID{}, nil, errors.Wrap(err, "decide and send vote")
-	}
-
-	for _, id := range provers {
-		prop := proposals[PeerID{ID: id}.Identity()]
-		if prop == nil {
-			p.engine.logger.Debug(
-				"proposer not found for prover",
-				zap.String("prover", PeerID{ID: id}.Identity()),
-			)
-			continue
-		}
-		// Validate the proposal
-		valid, err := p.engine.frameValidator.Validate((*prop))
-		if err != nil {
-			p.engine.logger.Debug("proposal validation error", zap.Error(err))
-			continue
-		}
-
-		p.engine.frameStoreMu.RLock()
-		_, hasParent := p.engine.frameStore[string(
-			(*prop).Header.ParentSelector,
-		)]
-		p.engine.frameStoreMu.RUnlock()
-		// Do we have continuity?
-		if !hasParent {
-			p.engine.logger.Debug(
-				"proposed frame out of sequence",
-				zap.String(
-					"proposed_parent_selector",
-					hex.EncodeToString((*prop).Header.ParentSelector),
-				),
-				zap.String(
-					"target_parent_selector",
-					hex.EncodeToString(parentSelector),
-				),
-				zap.Uint64("proposed_frame_number", (*prop).Header.FrameNumber),
-				zap.Uint64("target_frame_number", parentFrame.Header.FrameNumber+1),
-			)
-			continue
-		} else {
-			p.engine.logger.Debug(
-				"proposed frame in sequence",
-				zap.String(
-					"proposed_parent_selector",
-					hex.EncodeToString((*prop).Header.ParentSelector),
-				),
-				zap.String(
-					"target_parent_selector",
-					hex.EncodeToString(parentSelector),
-				),
-				zap.Uint64("proposed_frame_number", (*prop).Header.FrameNumber),
-				zap.Uint64("target_frame_number", parentFrame.Header.FrameNumber+1),
-			)
-		}
-
-		if valid {
-			// Validate fee multiplier is within acceptable bounds (+/-10% of base)
-			baseFeeMultiplier, err := p.engine.dynamicFeeManager.GetNextFeeMultiplier(
-				p.engine.appAddress,
-			)
-			if err != nil {
-				p.engine.logger.Debug(
-					"could not get base fee multiplier for validation",
-					zap.Error(err),
-				)
-				continue
-			}
-
-			// Calculate the maximum allowed deviation (10%)
-			maxIncrease := baseFeeMultiplier + (baseFeeMultiplier / 10)
-			minDecrease := baseFeeMultiplier - (baseFeeMultiplier / 10)
-			if minDecrease < 1 {
-				minDecrease = 1
-			}
-
-			proposedFee := (*prop).Header.FeeMultiplierVote
-
-			// Reject if fee is outside acceptable bounds
-			if proposedFee > maxIncrease || proposedFee < minDecrease {
-				p.engine.logger.Debug(
-					"rejecting proposal with excessive fee change",
-					zap.Uint64("base_fee", baseFeeMultiplier),
-					zap.Uint64("proposed_fee", proposedFee),
-					zap.Uint64("max_allowed", maxIncrease),
-					zap.Uint64("min_allowed", minDecrease),
-				)
-				continue
-			}
-
-			chosenProposal = (*prop)
-			chosenID = PeerID{ID: id}.Identity()
-			break
-		}
-	}
-
-	if chosenProposal == nil {
-		return PeerID{}, nil, errors.Wrap(
-			errors.New("no valid proposals to vote on"),
-			"decide and send vote",
-		)
-	}
-
-	// Get signing key
-	signer, _, publicKey, _ := p.engine.GetProvingKey(p.engine.config.Engine)
-	if publicKey == nil {
-		return PeerID{}, nil, errors.Wrap(
-			errors.New("no proving key available for voting"),
-			"decide and send vote",
-		)
-	}
-
-	// Create vote (signature)
-	signatureData, err := p.engine.frameProver.GetFrameSignaturePayload(
-		chosenProposal.Header,
-	)
-	if err != nil {
-		return PeerID{}, nil, errors.Wrap(err, "decide and send vote")
-	}
-
-	sig, err := signer.SignWithDomain(
-		signatureData,
-		append([]byte("shard"), p.engine.appAddress...),
-	)
-	if err != nil {
-		return PeerID{}, nil, errors.Wrap(err, "decide and send vote")
-	}
-
-	// Get our voter address
-	voterAddress := p.engine.getAddressFromPublicKey(publicKey)
-
-	// Create vote message
-	vote := &protobufs.FrameVote{
-		Filter:      p.engine.appAddress,
-		FrameNumber: chosenProposal.Header.FrameNumber,
-		Proposer:    chosenProposal.Header.Prover,
-		Approve:     true,
-		Timestamp:   time.Now().UnixMilli(),
-		PublicKeySignatureBls48581: &protobufs.BLS48581AddressedSignature{
-			Address:   voterAddress,
-			Signature: sig,
-		},
-	}
-
-	// Serialize and publish vote
-	data, err := vote.ToCanonicalBytes()
-	if err != nil {
-		return PeerID{}, nil, errors.Wrap(err, "serialize vote")
-	}
-
-	if err := p.engine.pubsub.PublishToBitmask(
-		p.engine.getConsensusMessageBitmask(),
-		data,
-	); err != nil {
-		p.engine.logger.Error("failed to publish vote", zap.Error(err))
-	}
-
-	// Store our vote
-	p.mu.Lock()
-	if _, ok := p.proposalVotes[chosenID]; !ok {
-		p.proposalVotes[chosenID] = map[consensus.Identity]**protobufs.FrameVote{}
-	}
-	p.proposalVotes[chosenID][p.engine.getPeerID().Identity()] = &vote
-	p.mu.Unlock()
-
-	p.engine.logger.Info(
-		"decided and sent vote",
-		zap.Uint64("frame_number", chosenProposal.Header.FrameNumber),
-		zap.String("for_proposal", chosenID),
-	)
-
-	// Return the peer ID from the chosen proposal's prover
-	return PeerID{ID: chosenProposal.Header.Prover}, &vote, nil
+// FinalizeTimeout implements consensus.VotingProvider.
+func (p *AppVotingProvider) FinalizeTimeout(ctx context.Context, rank uint64, latestQuorumCertificate models.QuorumCertificate, latestQuorumCertificateRanks []uint64, aggregatedSignature models.AggregatedSignature) (models.TimeoutCertificate, error) {
+	panic("unimplemented")
 }
 
-func (p *AppVotingProvider) SendVote(
-	vote **protobufs.FrameVote,
-	ctx context.Context,
-) (PeerID, error) {
-	if vote == nil || *vote == nil {
-		return PeerID{}, errors.Wrap(
-			errors.New("no vote provided"),
-			"send vote",
-		)
-	}
-
-	bumpVote := &protobufs.FrameVote{
-		Filter:                     p.engine.appAddress,
-		FrameNumber:                (*vote).FrameNumber,
-		Proposer:                   (*vote).Proposer,
-		Approve:                    true,
-		Timestamp:                  time.Now().UnixMilli(),
-		PublicKeySignatureBls48581: (*vote).PublicKeySignatureBls48581,
-	}
-
-	data, err := (*bumpVote).ToCanonicalBytes()
-	if err != nil {
-		return PeerID{}, errors.Wrap(err, "serialize vote")
-	}
-
-	if err := p.engine.pubsub.PublishToBitmask(
-		p.engine.getConsensusMessageBitmask(),
-		data,
-	); err != nil {
-		p.engine.logger.Error("failed to publish vote", zap.Error(err))
-	}
-
-	return PeerID{ID: (*vote).Proposer}, nil
+// SignTimeoutVote implements consensus.VotingProvider.
+func (p *AppVotingProvider) SignTimeoutVote(ctx context.Context, filter []byte, currentRank uint64, newestQuorumCertificateRank uint64) (**protobufs.ProposalVote, error) {
+	panic("unimplemented")
 }
 
-func (p *AppVotingProvider) IsQuorum(
-	proposalVotes map[consensus.Identity]**protobufs.FrameVote,
-	ctx context.Context,
-) (bool, error) {
-	// Get active prover count for quorum calculation
-	activeProvers, err := p.engine.proverRegistry.GetActiveProvers(
-		p.engine.appAddress,
-	)
-	if err != nil {
-		return false, errors.Wrap(err, "is quorum")
-	}
-
-	minVotes := len(activeProvers) * 2 / 3
-	if minVotes < int(p.engine.minimumProvers()) {
-		minVotes = int(p.engine.minimumProvers())
-	}
-
-	totalVotes := len(proposalVotes)
-
-	if totalVotes >= minVotes {
-		return true, nil
-	}
-
-	return false, nil
-}
-
-func (p *AppVotingProvider) FinalizeVotes(
-	proposals map[consensus.Identity]**protobufs.AppShardFrame,
-	proposalVotes map[consensus.Identity]**protobufs.FrameVote,
-	ctx context.Context,
-) (**protobufs.AppShardFrame, PeerID, error) {
-	// Count approvals and collect signatures
-	var signatures [][]byte
-	var publicKeys [][]byte
-	var chosenProposal **protobufs.AppShardFrame
-	var chosenProposerID PeerID
-	winnerCount := 0
-	parentFrame := p.engine.GetFrame()
-	voteCount := map[string]int{}
-	for _, vote := range proposalVotes {
-		count, ok := voteCount[string((*vote).Proposer)]
-		if !ok {
-			voteCount[string((*vote).Proposer)] = 1
-		} else {
-			voteCount[string((*vote).Proposer)] = count + 1
-		}
-	}
-	for _, proposal := range proposals {
-		if proposal == nil {
-			continue
-		}
-
-		p.engine.frameStoreMu.RLock()
-		_, hasParent := p.engine.frameStore[string(
-			(*proposal).Header.ParentSelector,
-		)]
-		p.engine.frameStoreMu.RUnlock()
-
-		count := 0
-		if hasParent {
-			count = voteCount[string((*proposal).Header.Prover)]
-		}
-		if count > winnerCount {
-			winnerCount = count
-			chosenProposal = proposal
-			chosenProposerID = PeerID{ID: (*proposal).Header.Prover}
-		}
-	}
-
-	if chosenProposal == nil && len(proposals) > 0 {
-		// No specific votes, just pick first proposal
-		for _, proposal := range proposals {
-			if proposal == nil {
-				continue
-			}
-			p.engine.frameStoreMu.RLock()
-			parent, hasParent := p.engine.frameStore[string(
-				(*proposal).Header.ParentSelector,
-			)]
-			p.engine.frameStoreMu.RUnlock()
-			if hasParent && (parentFrame == nil ||
-				parent.Header.FrameNumber == parentFrame.Header.FrameNumber) {
-				chosenProposal = proposal
-				chosenProposerID = PeerID{ID: (*proposal).Header.Prover}
-				break
-			}
-		}
-	}
-
-	if chosenProposal == nil {
-		return &parentFrame, PeerID{}, errors.Wrap(
-			errors.New("no proposals to finalize"),
-			"finalize votes",
-		)
-	}
-
-	err := p.engine.ensureGlobalClient()
-	if err != nil {
-		return &parentFrame, PeerID{}, errors.Wrap(
-			errors.New("cannot confirm cross-shard locks"),
-			"finalize votes",
-		)
-	}
-
-	res, err := p.engine.globalClient.GetLockedAddresses(
-		ctx,
-		&protobufs.GetLockedAddressesRequest{
-			ShardAddress: p.engine.appAddress,
-			FrameNumber:  (*chosenProposal).Header.FrameNumber,
-		},
-	)
-	if err != nil {
-		p.engine.globalClient = nil
-		return &parentFrame, PeerID{}, errors.Wrap(
-			errors.New("cannot confirm cross-shard locks"),
-			"finalize votes",
-		)
-	}
-
-	// Build a map of transaction hashes to their committed status
-	txMap := map[string]bool{}
-	for _, req := range (*chosenProposal).Requests {
-		tx, err := req.ToCanonicalBytes()
-		if err != nil {
-			return &parentFrame, PeerID{}, errors.Wrap(
-				err,
-				"finalize votes",
-			)
-		}
-
-		txHash := sha3.Sum256(tx)
-		p.engine.logger.Debug(
-			"adding transaction in frame to commit check",
-			zap.String("tx_hash", hex.EncodeToString(txHash[:])),
-		)
-		txMap[string(txHash[:])] = false
-	}
-
-	// Check that transactions are committed in our shard and collect shard
-	// addresses
-	shardAddressesSet := make(map[string]bool)
-	for _, tx := range res.Transactions {
-		p.engine.logger.Debug(
-			"checking transaction from global map",
-			zap.String("tx_hash", hex.EncodeToString(tx.TransactionHash)),
-		)
-		if _, ok := txMap[string(tx.TransactionHash)]; ok {
-			txMap[string(tx.TransactionHash)] = tx.Committed
-
-			// Extract shard addresses from each locked transaction's shard addresses
-			for _, shardAddr := range tx.ShardAddresses {
-				// Extract the applicable shard address (can be shorter than the full
-				// address)
-				extractedShards := p.extractShardAddresses(shardAddr)
-				for _, extractedShard := range extractedShards {
-					shardAddrStr := string(extractedShard)
-					shardAddressesSet[shardAddrStr] = true
-				}
-			}
-		}
-	}
-
-	// Check that all transactions are committed in our shard
-	for _, committed := range txMap {
-		if !committed {
-			return &parentFrame, PeerID{}, errors.Wrap(
-				errors.New("tx not committed in our shard"),
-				"finalize votes",
-			)
-		}
-	}
-
-	// Check cross-shard locks for each unique shard address
-	for shardAddrStr := range shardAddressesSet {
-		shardAddr := []byte(shardAddrStr)
-
-		// Skip our own shard since we already checked it
-		if bytes.Equal(shardAddr, p.engine.appAddress) {
-			continue
-		}
-
-		// Query the global client for locked addresses in this shard
-		shardRes, err := p.engine.globalClient.GetLockedAddresses(
-			ctx,
-			&protobufs.GetLockedAddressesRequest{
-				ShardAddress: shardAddr,
-				FrameNumber:  (*chosenProposal).Header.FrameNumber,
-			},
-		)
-		if err != nil {
-			p.engine.logger.Debug(
-				"failed to get locked addresses for shard",
-				zap.String("shard_addr", hex.EncodeToString(shardAddr)),
-				zap.Error(err),
-			)
-			continue
-		}
-
-		// Check that all our transactions are committed in this shard
-		for txHashStr := range txMap {
-			committedInShard := false
-			for _, tx := range shardRes.Transactions {
-				if string(tx.TransactionHash) == txHashStr {
-					committedInShard = tx.Committed
-					break
-				}
-			}
-
-			if !committedInShard {
-				return &parentFrame, PeerID{}, errors.Wrap(
-					errors.New("tx cross-shard lock unconfirmed"),
-					"finalize votes",
-				)
-			}
-		}
-	}
-
-	proverSet, err := p.engine.proverRegistry.GetActiveProvers(
-		p.engine.appAddress,
-	)
-	if err != nil {
-		return &parentFrame, PeerID{}, errors.Wrap(err, "finalize votes")
-	}
-
-	proverMap := map[string][]byte{}
-	for _, prover := range proverSet {
-		proverMap[string(prover.Address)] = prover.PublicKey
-	}
-
-	voterMap := map[string]**protobufs.FrameVote{}
-
-	// Collect all signatures for aggregation
-	for _, vote := range proposalVotes {
-		if vote == nil {
-			continue
-		}
-
-		if (*vote).FrameNumber != (*chosenProposal).Header.FrameNumber ||
-			!bytes.Equal((*vote).Proposer, (*chosenProposal).Header.Prover) {
-			continue
-		}
-
-		if (*vote).PublicKeySignatureBls48581.Signature != nil &&
-			(*vote).PublicKeySignatureBls48581.Address != nil {
-			signatures = append(
-				signatures,
-				(*vote).PublicKeySignatureBls48581.Signature,
-			)
-
-			pub := proverMap[string((*vote).PublicKeySignatureBls48581.Address)]
-			publicKeys = append(publicKeys, pub)
-			voterMap[string((*vote).PublicKeySignatureBls48581.Address)] = vote
-		}
-	}
-
-	if len(signatures) == 0 {
-		return &parentFrame, PeerID{}, errors.Wrap(
-			errors.New("no signatures to aggregate"),
-			"finalize votes",
-		)
-	}
-
-	// Aggregate signatures
-	aggregateOutput, err := p.engine.keyManager.Aggregate(publicKeys, signatures)
-	if err != nil {
-		return &parentFrame, PeerID{}, errors.Wrap(err, "finalize votes")
-	}
-	aggregatedSignature := aggregateOutput.GetAggregateSignature()
-
-	// Create participant bitmap
-	provers, err := p.engine.proverRegistry.GetActiveProvers(p.engine.appAddress)
-	if err != nil {
-		return &parentFrame, PeerID{}, errors.Wrap(err, "finalize votes")
-	}
-
-	bitmask := make([]byte, (len(provers)+7)/8)
-
-	for i := 0; i < len(provers); i++ {
-		activeProver := provers[i]
-		if _, ok := voterMap[string(activeProver.Address)]; !ok {
-			continue
-		}
-		if !bytes.Equal(
-			(*voterMap[string(activeProver.Address)]).Proposer,
-			chosenProposerID.ID,
-		) {
-			continue
-		}
-
-		byteIndex := i / 8
-		bitIndex := i % 8
-		bitmask[byteIndex] |= (1 << bitIndex)
-	}
-
-	// Update the frame with aggregated signature
-	finalizedFrame := &protobufs.AppShardFrame{
-		Header: &protobufs.FrameHeader{
-			Address:           (*chosenProposal).Header.Address,
-			FrameNumber:       (*chosenProposal).Header.FrameNumber,
-			ParentSelector:    (*chosenProposal).Header.ParentSelector,
-			Timestamp:         (*chosenProposal).Header.Timestamp,
-			Difficulty:        (*chosenProposal).Header.Difficulty,
-			RequestsRoot:      (*chosenProposal).Header.RequestsRoot,
-			StateRoots:        (*chosenProposal).Header.StateRoots,
-			Output:            (*chosenProposal).Header.Output,
-			Prover:            (*chosenProposal).Header.Prover,
-			FeeMultiplierVote: (*chosenProposal).Header.FeeMultiplierVote,
-			PublicKeySignatureBls48581: &protobufs.BLS48581AggregateSignature{
-				Signature: aggregatedSignature,
-				PublicKey: &protobufs.BLS48581G2PublicKey{
-					KeyValue: aggregateOutput.GetAggregatePublicKey(),
-				},
-				Bitmask: bitmask,
-			},
-		},
-		Requests: (*chosenProposal).Requests,
-	}
-
-	p.engine.logger.Info(
-		"finalized votes",
-		zap.Uint64("frame_number", finalizedFrame.Header.FrameNumber),
-		zap.Int("signatures", len(signatures)),
-	)
-
-	return &finalizedFrame, chosenProposerID, nil
-}
-
-func (p *AppVotingProvider) SendConfirmation(
-	finalized **protobufs.AppShardFrame,
-	ctx context.Context,
-) error {
-	if finalized == nil || (*finalized).Header == nil {
-		return errors.New("invalid finalized frame")
-	}
-
-	copiedFinalized := proto.Clone(*finalized).(*protobufs.AppShardFrame)
-
-	// Create frame confirmation
-	confirmation := &protobufs.FrameConfirmation{
-		Filter:             p.engine.appAddress,
-		FrameNumber:        copiedFinalized.Header.FrameNumber,
-		Selector:           p.engine.calculateFrameSelector((*finalized).Header),
-		Timestamp:          time.Now().UnixMilli(),
-		AggregateSignature: copiedFinalized.Header.PublicKeySignatureBls48581,
-	}
-
-	// Serialize using canonical bytes
-	data, err := confirmation.ToCanonicalBytes()
-	if err != nil {
-		return errors.Wrap(err, "serialize confirmation")
-	}
-
-	if err := p.engine.pubsub.PublishToBitmask(
-		p.engine.getConsensusMessageBitmask(),
-		data,
-	); err != nil {
-		return errors.Wrap(err, "publish confirmation")
-	}
-
-	// Insert into time reel
-	if err := p.engine.appTimeReel.Insert(
-		p.engine.ctx,
-		copiedFinalized,
-	); err != nil {
-		p.engine.logger.Error("failed to add frame to time reel", zap.Error(err))
-		// Clean up on error
-
-		frameIDBI, _ := poseidon.HashBytes(copiedFinalized.Header.Output)
-		frameID := frameIDBI.FillBytes(make([]byte, 32))
-		p.engine.frameStoreMu.Lock()
-		delete(p.engine.frameStore, string(frameID))
-		p.engine.frameStoreMu.Unlock()
-	}
-
-	p.engine.logger.Info(
-		"sent confirmation",
-		zap.Uint64("frame_number", copiedFinalized.Header.FrameNumber),
-	)
-
-	return nil
+// SignVote implements consensus.VotingProvider.
+func (p *AppVotingProvider) SignVote(ctx context.Context, state *models.State[*protobufs.AppShardFrame]) (**protobufs.ProposalVote, error) {
+	panic("unimplemented")
 }
 
 // GetFullPath converts a key to its path representation using 6-bit nibbles
@@ -797,3 +146,5 @@ func uint32ToBytes(path []uint32) []byte {
 	}
 	return bytes
 }
+
+var _ consensus.VotingProvider[*protobufs.AppShardFrame, *protobufs.ProposalVote, PeerID] = (*AppVotingProvider)(nil)
diff --git a/node/consensus/app/event_distributor.go b/node/consensus/app/event_distributor.go
index d5138ed..88dd4d7 100644
--- a/node/consensus/app/event_distributor.go
+++ b/node/consensus/app/event_distributor.go
@@ -6,6 +6,7 @@ import (
 
 	"github.com/pkg/errors"
 	"go.uber.org/zap"
+	"source.quilibrium.com/quilibrium/monorepo/lifecycle"
 	"source.quilibrium.com/quilibrium/monorepo/node/consensus/global"
 	consensustime "source.quilibrium.com/quilibrium/monorepo/node/consensus/time"
 	globalintrinsics "source.quilibrium.com/quilibrium/monorepo/node/execution/intrinsics/global"
@@ -13,21 +14,15 @@ import (
 	"source.quilibrium.com/quilibrium/monorepo/types/schema"
 )
 
-func (e *AppConsensusEngine) eventDistributorLoop() {
+func (e *AppConsensusEngine) eventDistributorLoop(
+	ctx lifecycle.SignalerContext,
+) {
 	defer func() {
 		if r := recover(); r != nil {
 			e.logger.Error("fatal error encountered", zap.Any("panic", r))
-			if e.cancel != nil {
-				e.cancel()
-			}
-			// Avoid blocking on quit channel during panic recovery
-			select {
-			case e.quit <- struct{}{}:
-			default:
-			}
+			ctx.Throw(errors.Errorf("fatal unhandled error encountered: %v", r))
 		}
 	}()
-	defer e.wg.Done()
 
 	// Subscribe to events from the event distributor
 	eventCh := e.eventDistributor.Subscribe(hex.EncodeToString(e.appAddress))
@@ -35,7 +30,7 @@ func (e *AppConsensusEngine) eventDistributorLoop() {
 
 	for {
 		select {
-		case <-e.ctx.Done():
+		case <-ctx.Done():
 			return
 		case <-e.quit:
 			return
@@ -172,16 +167,10 @@ func (e *AppConsensusEngine) eventDistributorLoop() {
 			if ok && data.Message != "" {
 				e.logger.Error(data.Message)
 				e.halt()
-				if err := e.stateMachine.Stop(); err != nil {
-					e.logger.Error(
-						"error occurred while halting consensus",
-						zap.Error(err),
-					)
-				}
 				go func() {
 					for {
 						select {
-						case <-e.ctx.Done():
+						case <-ctx.Done():
 							return
 						case <-time.After(10 * time.Second):
 							e.logger.Error(
@@ -200,16 +189,10 @@ func (e *AppConsensusEngine) eventDistributorLoop() {
 				zap.Error(data.Error),
 			)
 			e.halt()
-			if err := e.stateMachine.Stop(); err != nil {
-				e.logger.Error(
-					"error occurred while halting consensus",
-					zap.Error(err),
-				)
-			}
 			go func() {
 				for {
 					select {
-					case <-e.ctx.Done():
+					case <-ctx.Done():
 						return
 					case <-time.After(10 * time.Second):
 						e.logger.Error(
diff --git a/node/consensus/app/message_processors.go b/node/consensus/app/message_processors.go
index b8d468b..5057532 100644
--- a/node/consensus/app/message_processors.go
+++ b/node/consensus/app/message_processors.go
@@ -3,7 +3,6 @@ package app
 import (
 	"bytes"
 	"encoding/binary"
-	"encoding/hex"
 
 	"github.com/iden3/go-iden3-crypto/poseidon"
 	"github.com/libp2p/go-libp2p/core/peer"
@@ -11,16 +10,17 @@ import (
 	"go.uber.org/zap"
 	"golang.org/x/crypto/sha3"
 	"source.quilibrium.com/quilibrium/monorepo/go-libp2p-blossomsub/pb"
+	"source.quilibrium.com/quilibrium/monorepo/lifecycle"
 	"source.quilibrium.com/quilibrium/monorepo/protobufs"
 	"source.quilibrium.com/quilibrium/monorepo/types/crypto"
 )
 
-func (e *AppConsensusEngine) processConsensusMessageQueue() {
-	defer e.wg.Done()
-
+func (e *AppConsensusEngine) processConsensusMessageQueue(
+	ctx lifecycle.SignalerContext,
+) {
 	for {
 		select {
-		case <-e.ctx.Done():
+		case <-ctx.Done():
 			return
 		case <-e.quit:
 			return
@@ -30,14 +30,14 @@ func (e *AppConsensusEngine) processConsensusMessageQueue() {
 	}
 }
 
-func (e *AppConsensusEngine) processProverMessageQueue() {
-	defer e.wg.Done()
-
+func (e *AppConsensusEngine) processProverMessageQueue(
+	ctx lifecycle.SignalerContext,
+) {
 	for {
 		select {
 		case <-e.haltCtx.Done():
 			return
-		case <-e.ctx.Done():
+		case <-ctx.Done():
 			return
 		case message := <-e.proverMessageQueue:
 			e.handleProverMessage(message)
@@ -45,14 +45,14 @@ func (e *AppConsensusEngine) processProverMessageQueue() {
 	}
 }
 
-func (e *AppConsensusEngine) processFrameMessageQueue() {
-	defer e.wg.Done()
-
+func (e *AppConsensusEngine) processFrameMessageQueue(
+	ctx lifecycle.SignalerContext,
+) {
 	for {
 		select {
 		case <-e.haltCtx.Done():
 			return
-		case <-e.ctx.Done():
+		case <-ctx.Done():
 			return
 		case <-e.quit:
 			return
@@ -62,14 +62,14 @@ func (e *AppConsensusEngine) processFrameMessageQueue() {
 	}
 }
 
-func (e *AppConsensusEngine) processGlobalFrameMessageQueue() {
-	defer e.wg.Done()
-
+func (e *AppConsensusEngine) processGlobalFrameMessageQueue(
+	ctx lifecycle.SignalerContext,
+) {
 	for {
 		select {
 		case <-e.haltCtx.Done():
 			return
-		case <-e.ctx.Done():
+		case <-ctx.Done():
 			return
 		case <-e.quit:
 			return
@@ -79,12 +79,12 @@ func (e *AppConsensusEngine) processGlobalFrameMessageQueue() {
 	}
 }
 
-func (e *AppConsensusEngine) processAlertMessageQueue() {
-	defer e.wg.Done()
-
+func (e *AppConsensusEngine) processAlertMessageQueue(
+	ctx lifecycle.SignalerContext,
+) {
 	for {
 		select {
-		case <-e.ctx.Done():
+		case <-ctx.Done():
 			return
 		case <-e.quit:
 			return
@@ -94,14 +94,14 @@ func (e *AppConsensusEngine) processAlertMessageQueue() {
 	}
 }
 
-func (e *AppConsensusEngine) processPeerInfoMessageQueue() {
-	defer e.wg.Done()
-
+func (e *AppConsensusEngine) processPeerInfoMessageQueue(
+	ctx lifecycle.SignalerContext,
+) {
 	for {
 		select {
 		case <-e.haltCtx.Done():
 			return
-		case <-e.ctx.Done():
+		case <-ctx.Done():
 			return
 		case <-e.quit:
 			return
@@ -111,12 +111,12 @@ func (e *AppConsensusEngine) processPeerInfoMessageQueue() {
 	}
 }
 
-func (e *AppConsensusEngine) processDispatchMessageQueue() {
-	defer e.wg.Done()
-
+func (e *AppConsensusEngine) processDispatchMessageQueue(
+	ctx lifecycle.SignalerContext,
+) {
 	for {
 		select {
-		case <-e.ctx.Done():
+		case <-ctx.Done():
 			return
 		case <-e.quit:
 			return
@@ -142,14 +142,11 @@ func (e *AppConsensusEngine) handleConsensusMessage(message *pb.Message) {
 	case protobufs.AppShardFrameType:
 		e.handleProposal(message)
 
-	case protobufs.ProverLivenessCheckType:
-		e.handleLivenessCheck(message)
-
-	case protobufs.FrameVoteType:
+	case protobufs.ProposalVoteType:
 		e.handleVote(message)
 
-	case protobufs.FrameConfirmationType:
-		e.handleConfirmation(message)
+	case protobufs.TimeoutStateType:
+		e.handleTimeoutState(message)
 
 	default:
 		e.logger.Debug(
@@ -196,7 +193,7 @@ func (e *AppConsensusEngine) handleFrameMessage(message *pb.Message) {
 	e.frameStore[string(frameID)] = frame
 	e.frameStoreMu.Unlock()
 
-	if err := e.appTimeReel.Insert(e.ctx, frame); err != nil {
+	if err := e.appTimeReel.Insert(ctx, frame); err != nil {
 		// Success metric recorded at the end of processing
 		framesProcessedTotal.WithLabelValues("error").Inc()
 		return
@@ -276,7 +273,7 @@ func (e *AppConsensusEngine) handleGlobalFrameMessage(message *pb.Message) {
 		return
 	}
 
-	if err := e.globalTimeReel.Insert(e.ctx, frame); err != nil {
+	if err := e.globalTimeReel.Insert(ctx, frame); err != nil {
 		// Success metric recorded at the end of processing
 		globalFramesProcessedTotal.WithLabelValues("error").Inc()
 		return
@@ -382,7 +379,7 @@ func (e *AppConsensusEngine) handleDispatchMessage(message *pb.Message) {
 		}
 
 		if err := e.dispatchService.AddInboxMessage(
-			e.ctx,
+			ctx,
 			envelope,
 		); err != nil {
 			e.logger.Debug("failed to add inbox message", zap.Error(err))
@@ -395,7 +392,7 @@ func (e *AppConsensusEngine) handleDispatchMessage(message *pb.Message) {
 		}
 
 		if err := e.dispatchService.AddHubInboxAssociation(
-			e.ctx,
+			ctx,
 			envelope,
 		); err != nil {
 			e.logger.Debug("failed to add inbox message", zap.Error(err))
@@ -408,7 +405,7 @@ func (e *AppConsensusEngine) handleDispatchMessage(message *pb.Message) {
 		}
 
 		if err := e.dispatchService.DeleteHubInboxAssociation(
-			e.ctx,
+			ctx,
 			envelope,
 		); err != nil {
 			e.logger.Debug("failed to add inbox message", zap.Error(err))
@@ -460,116 +457,13 @@ func (e *AppConsensusEngine) handleProposal(message *pb.Message) {
 	}
 }
 
-func (e *AppConsensusEngine) handleLivenessCheck(message *pb.Message) {
-	timer := prometheus.NewTimer(
-		livenessCheckProcessingDuration.WithLabelValues(e.appAddressHex),
-	)
-	defer timer.ObserveDuration()
-
-	livenessCheck := &protobufs.ProverLivenessCheck{}
-	if err := livenessCheck.FromCanonicalBytes(message.Data); err != nil {
-		e.logger.Debug("failed to unmarshal liveness check", zap.Error(err))
-		livenessCheckProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc()
-		return
-	}
-
-	if !bytes.Equal(livenessCheck.Filter, e.appAddress) {
-		return
-	}
-
-	// Validate the liveness check structure
-	if err := livenessCheck.Validate(); err != nil {
-		e.logger.Debug("invalid liveness check", zap.Error(err))
-		livenessCheckProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc()
-		return
-	}
-
-	proverSet, err := e.proverRegistry.GetActiveProvers(e.appAddress)
-	if err != nil {
-		e.logger.Error("could not receive liveness check", zap.Error(err))
-		livenessCheckProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc()
-		return
-	}
-
-	lcBytes, err := livenessCheck.ConstructSignaturePayload()
-	if err != nil {
-		e.logger.Error(
-			"could not construct signature message for liveness check",
-			zap.Error(err),
-		)
-		livenessCheckProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc()
-		return
-	}
-
-	var found []byte = nil
-	for _, prover := range proverSet {
-		if bytes.Equal(
-			prover.Address,
-			livenessCheck.PublicKeySignatureBls48581.Address,
-		) {
-			valid, err := e.keyManager.ValidateSignature(
-				crypto.KeyTypeBLS48581G1,
-				prover.PublicKey,
-				lcBytes,
-				livenessCheck.PublicKeySignatureBls48581.Signature,
-				livenessCheck.GetSignatureDomain(),
-			)
-			if err != nil || !valid {
-				e.logger.Error(
-					"could not validate signature for liveness check",
-					zap.Error(err),
-				)
-				break
-			}
-			found = prover.PublicKey
-
-			break
-		}
-	}
-
-	if found == nil {
-		e.logger.Warn(
-			"invalid liveness check",
-			zap.String(
-				"prover",
-				hex.EncodeToString(
-					livenessCheck.PublicKeySignatureBls48581.Address,
-				),
-			),
-		)
-		livenessCheckProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc()
-		return
-	}
-
-	if livenessCheck.PublicKeySignatureBls48581 == nil {
-		e.logger.Error("no signature on liveness check")
-		livenessCheckProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc()
-	}
-
-	commitment := CollectedCommitments{
-		commitmentHash: livenessCheck.CommitmentHash,
-		frameNumber:    livenessCheck.FrameNumber,
-		prover:         livenessCheck.PublicKeySignatureBls48581.Address,
-	}
-	if err := e.stateMachine.ReceiveLivenessCheck(
-		PeerID{ID: livenessCheck.PublicKeySignatureBls48581.Address},
-		commitment,
-	); err != nil {
-		e.logger.Error("could not receive liveness check", zap.Error(err))
-		livenessCheckProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc()
-		return
-	}
-
-	livenessCheckProcessedTotal.WithLabelValues(e.appAddressHex, "success").Inc()
-}
-
 func (e *AppConsensusEngine) handleVote(message *pb.Message) {
 	timer := prometheus.NewTimer(
 		voteProcessingDuration.WithLabelValues(e.appAddressHex),
 	)
 	defer timer.ObserveDuration()
 
-	vote := &protobufs.FrameVote{}
+	vote := &protobufs.ProposalVote{}
 	if err := vote.FromCanonicalBytes(message.Data); err != nil {
 		e.logger.Debug("failed to unmarshal vote", zap.Error(err))
 		voteProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc()
@@ -599,20 +493,20 @@ func (e *AppConsensusEngine) handleVote(message *pb.Message) {
 	voteProcessedTotal.WithLabelValues(e.appAddressHex, "success").Inc()
 }
 
-func (e *AppConsensusEngine) handleConfirmation(message *pb.Message) {
+func (e *AppConsensusEngine) handleTimeoutState(message *pb.Message) {
 	timer := prometheus.NewTimer(
-		confirmationProcessingDuration.WithLabelValues(e.appAddressHex),
+		timeoutStateProcessingDuration.WithLabelValues(e.appAddressHex),
 	)
 	defer timer.ObserveDuration()
 
-	confirmation := &protobufs.FrameConfirmation{}
-	if err := confirmation.FromCanonicalBytes(message.Data); err != nil {
-		e.logger.Debug("failed to unmarshal confirmation", zap.Error(err))
-		confirmationProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc()
+	timeoutState := &protobufs.TimeoutState{}
+	if err := timeoutState.FromCanonicalBytes(message.Data); err != nil {
+		e.logger.Debug("failed to unmarshal timeout state", zap.Error(err))
+		timeoutStateProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc()
 		return
 	}
 
-	if !bytes.Equal(confirmation.Filter, e.appAddress) {
+	if !bytes.Equal(timeoutState.Filter, e.appAddress) {
 		return
 	}
 
@@ -620,9 +514,9 @@ func (e *AppConsensusEngine) handleTimeoutState(message *pb.Message) {
 	var matchingFrame *protobufs.AppShardFrame
 	for _, frame := range e.frameStore {
 		if frame.Header != nil &&
-			frame.Header.FrameNumber == confirmation.FrameNumber {
+			frame.Header.FrameNumber == timeoutState.FrameNumber {
 			frameSelector := e.calculateFrameSelector(frame.Header)
-			if bytes.Equal(frameSelector, confirmation.Selector) {
+			if bytes.Equal(frameSelector, timeoutState.Selector) {
 				matchingFrame = frame
 				break
 			}
@@ -639,39 +533,39 @@ func (e *AppConsensusEngine) handleTimeoutState(message *pb.Message) {
 	defer e.frameStoreMu.Unlock()
 
 	matchingFrame.Header.PublicKeySignatureBls48581 =
-		confirmation.AggregateSignature
+		timeoutState.AggregateSignature
 
 	valid, err := e.frameValidator.Validate(matchingFrame)
 	if !valid || err != nil {
-		e.logger.Error("received invalid confirmation", zap.Error(err))
-		confirmationProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc()
+		e.logger.Error("received invalid timeout state", zap.Error(err))
+		timeoutStateProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc()
 		return
 	}
 
 	if matchingFrame.Header.Prover == nil {
-		e.logger.Error("confirmation with no matched prover")
-		confirmationProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc()
+		e.logger.Error("timeout state with no matched prover")
+		timeoutStateProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc()
 		return
 	}
 
-	if err := e.stateMachine.ReceiveConfirmation(
+	if err := e.stateMachine.ReceiveTimeoutState(
 		PeerID{ID: matchingFrame.Header.Prover},
 		&matchingFrame,
 	); err != nil {
e.logger.Error("could not receive confirmation", zap.Error(err)) - confirmationProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc() + e.logger.Error("could not receive timeoutState", zap.Error(err)) + timeoutStateProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc() return } - if err := e.appTimeReel.Insert(e.ctx, matchingFrame); err != nil { + if err := e.appTimeReel.Insert(ctx, matchingFrame); err != nil { e.logger.Error( "could not insert into time reel", zap.Error(err), ) - confirmationProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc() + timeoutStateProcessedTotal.WithLabelValues(e.appAddressHex, "error").Inc() return } - confirmationProcessedTotal.WithLabelValues(e.appAddressHex, "success").Inc() + timeoutStateProcessedTotal.WithLabelValues(e.appAddressHex, "success").Inc() } func (e *AppConsensusEngine) peekMessageType(message *pb.Message) uint32 { diff --git a/node/consensus/app/message_validation.go b/node/consensus/app/message_validation.go index a5735e7..a39a206 100644 --- a/node/consensus/app/message_validation.go +++ b/node/consensus/app/message_validation.go @@ -83,60 +83,20 @@ func (e *AppConsensusEngine) validateConsensusMessage( proposalValidationTotal.WithLabelValues(e.appAddressHex, "accept").Inc() - case protobufs.ProverLivenessCheckType: - timer := prometheus.NewTimer( - livenessCheckValidationDuration.WithLabelValues(e.appAddressHex), - ) - defer timer.ObserveDuration() - - livenessCheck := &protobufs.ProverLivenessCheck{} - if err := livenessCheck.FromCanonicalBytes(message.Data); err != nil { - e.logger.Debug("failed to unmarshal liveness check", zap.Error(err)) - livenessCheckValidationTotal.WithLabelValues( - e.appAddressHex, - "reject", - ).Inc() - return p2p.ValidationResultReject - } - - now := time.Now().UnixMilli() - if livenessCheck.Timestamp > now+500 || - livenessCheck.Timestamp < now-1000 { - livenessCheckValidationTotal.WithLabelValues( - e.appAddressHex, - "ignore", - ).Inc() - return p2p.ValidationResultIgnore - } - - if err := livenessCheck.Validate(); err != nil { - e.logger.Debug("failed to validate liveness check", zap.Error(err)) - livenessCheckValidationTotal.WithLabelValues( - e.appAddressHex, - "reject", - ).Inc() - return p2p.ValidationResultReject - } - - livenessCheckValidationTotal.WithLabelValues( - e.appAddressHex, - "accept", - ).Inc() - - case protobufs.FrameVoteType: + case protobufs.ProposalVoteType: timer := prometheus.NewTimer( voteValidationDuration.WithLabelValues(e.appAddressHex), ) defer timer.ObserveDuration() - vote := &protobufs.FrameVote{} + vote := &protobufs.ProposalVote{} if err := vote.FromCanonicalBytes(message.Data); err != nil { e.logger.Debug("failed to unmarshal vote", zap.Error(err)) voteValidationTotal.WithLabelValues(e.appAddressHex, "reject").Inc() return p2p.ValidationResultReject } - now := time.Now().UnixMilli() + now := uint64(time.Now().UnixMilli()) if vote.Timestamp > now+5000 || vote.Timestamp < now-5000 { voteValidationTotal.WithLabelValues(e.appAddressHex, "ignore").Inc() return p2p.ValidationResultIgnore @@ -150,41 +110,41 @@ func (e *AppConsensusEngine) validateConsensusMessage( voteValidationTotal.WithLabelValues(e.appAddressHex, "accept").Inc() - case protobufs.FrameConfirmationType: + case protobufs.TimeoutStateType: timer := prometheus.NewTimer( - confirmationValidationDuration.WithLabelValues(e.appAddressHex), + timeoutStateValidationDuration.WithLabelValues(e.appAddressHex), ) defer timer.ObserveDuration() - confirmation := &protobufs.FrameConfirmation{} - if err := 
-		if err := confirmation.FromCanonicalBytes(message.Data); err != nil {
-			e.logger.Debug("failed to unmarshal confirmation", zap.Error(err))
-			confirmationValidationTotal.WithLabelValues(
+		timeoutState := &protobufs.TimeoutState{}
+		if err := timeoutState.FromCanonicalBytes(message.Data); err != nil {
+			e.logger.Debug("failed to unmarshal timeout state", zap.Error(err))
+			timeoutStateValidationTotal.WithLabelValues(
 				e.appAddressHex,
 				"reject",
 			).Inc()
 			return p2p.ValidationResultReject
 		}
 
-		now := time.Now().UnixMilli()
-		if confirmation.Timestamp > now+5000 || confirmation.Timestamp < now-5000 {
-			confirmationValidationTotal.WithLabelValues(
+		now := uint64(time.Now().UnixMilli())
+		if timeoutState.Timestamp > now+5000 || timeoutState.Timestamp < now-5000 {
+			timeoutStateValidationTotal.WithLabelValues(
 				e.appAddressHex,
 				"ignore",
 			).Inc()
 			return p2p.ValidationResultIgnore
 		}
 
-		if err := confirmation.Validate(); err != nil {
-			e.logger.Debug("failed to validate confirmation", zap.Error(err))
-			confirmationValidationTotal.WithLabelValues(
+		if err := timeoutState.Validate(); err != nil {
+			e.logger.Debug("failed to validate timeout state", zap.Error(err))
+			timeoutStateValidationTotal.WithLabelValues(
 				e.appAddressHex,
 				"reject",
 			).Inc()
 			return p2p.ValidationResultReject
 		}
 
-		confirmationValidationTotal.WithLabelValues(e.appAddressHex, "accept").Inc()
+		timeoutStateValidationTotal.WithLabelValues(e.appAddressHex, "accept").Inc()
 
 	default:
 		return p2p.ValidationResultReject
diff --git a/node/consensus/app/metrics.go b/node/consensus/app/metrics.go
index 7ad82be..e19fcae 100644
--- a/node/consensus/app/metrics.go
+++ b/node/consensus/app/metrics.go
@@ -99,50 +99,6 @@ var (
 		[]string{"app_address"},
 	)
 
-	// Shard liveness check processing metrics
-	livenessCheckProcessedTotal = promauto.NewCounterVec(
-		prometheus.CounterOpts{
-			Namespace: metricsNamespace,
-			Subsystem: subsystem,
-			Name:      "liveness_check_processed_total",
-			Help:      "Total number of shard liveness checks processed by the app consensus engine",
-		},
-		[]string{"app_address", "status"}, // status: "success", "error", "invalid"
-	)
-
-	livenessCheckProcessingDuration = promauto.NewHistogramVec(
-		prometheus.HistogramOpts{
-			Namespace: metricsNamespace,
-			Subsystem: subsystem,
-			Name:      "liveness_check_processing_duration_seconds",
-			Help:      "Time taken to process a shard liveness check",
-			Buckets:   prometheus.DefBuckets,
-		},
-		[]string{"app_address"},
-	)
-
-	// Shard liveness check validation metrics
-	livenessCheckValidationTotal = promauto.NewCounterVec(
-		prometheus.CounterOpts{
-			Namespace: metricsNamespace,
-			Subsystem: subsystem,
-			Name:      "liveness_check_validation_total",
-			Help:      "Total number of shard liveness check validations",
-		},
-		[]string{"app_address", "result"}, // result: "accept", "reject", "ignore"
-	)
-
-	livenessCheckValidationDuration = promauto.NewHistogramVec(
-		prometheus.HistogramOpts{
-			Namespace: metricsNamespace,
-			Subsystem: subsystem,
-			Name:      "liveness_check_validation_duration_seconds",
-			Help:      "Time taken to validate a shard liveness check",
-			Buckets:   prometheus.DefBuckets,
-		},
-		[]string{"app_address"},
-	)
-
 	// Shard vote processing metrics
 	voteProcessedTotal = promauto.NewCounterVec(
 		prometheus.CounterOpts{
@@ -187,45 +143,45 @@ var (
 		[]string{"app_address"},
 	)
 
-	// Shard confirmation processing metrics
-	confirmationProcessedTotal = promauto.NewCounterVec(
+	// Shard timeout state processing metrics
+	timeoutStateProcessedTotal = promauto.NewCounterVec(
 		prometheus.CounterOpts{
 			Namespace: metricsNamespace,
subsystem, - Name: "confirmation_processed_total", - Help: "Total number of shard confirmations processed by the app consensus engine", + Name: "timeout_state_processed_total", + Help: "Total number of shard timeout states processed by the app consensus engine", }, []string{"app_address", "status"}, // status: "success", "error", "invalid" ) - confirmationProcessingDuration = promauto.NewHistogramVec( + timeoutStateProcessingDuration = promauto.NewHistogramVec( prometheus.HistogramOpts{ Namespace: metricsNamespace, Subsystem: subsystem, - Name: "confirmation_processing_duration_seconds", - Help: "Time taken to process a shard confirmation", + Name: "timeout_state_processing_duration_seconds", + Help: "Time taken to process a shard timeout state", Buckets: prometheus.DefBuckets, }, []string{"app_address"}, ) - // Shard confirmation validation metrics - confirmationValidationTotal = promauto.NewCounterVec( + // Shard timeout statevalidation metrics + timeoutStateValidationTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Namespace: metricsNamespace, Subsystem: subsystem, - Name: "confirmation_validation_total", - Help: "Total number of shard confirmation validations", + Name: "timeout_state_validation_total", + Help: "Total number of shard timeout statevalidations", }, []string{"app_address", "result"}, // result: "accept", "reject", "ignore" ) - confirmationValidationDuration = promauto.NewHistogramVec( + timeoutStateValidationDuration = promauto.NewHistogramVec( prometheus.HistogramOpts{ Namespace: metricsNamespace, Subsystem: subsystem, - Name: "confirmation_validation_duration_seconds", - Help: "Time taken to validate a shard confirmation", + Name: "timeout_state_validation_duration_seconds", + Help: "Time taken to validate a shard timeout state", Buckets: prometheus.DefBuckets, }, []string{"app_address"}, diff --git a/node/consensus/events/app_event_distributor.go b/node/consensus/events/app_event_distributor.go index bacb502..e1b5070 100644 --- a/node/consensus/events/app_event_distributor.go +++ b/node/consensus/events/app_event_distributor.go @@ -6,6 +6,7 @@ import ( "time" "github.com/prometheus/client_golang/prometheus" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" consensustime "source.quilibrium.com/quilibrium/monorepo/node/consensus/time" "source.quilibrium.com/quilibrium/monorepo/types/consensus" ) @@ -36,52 +37,32 @@ func NewAppEventDistributor( } // Start begins the event processing loop -func (a *AppEventDistributor) Start(ctx context.Context) error { - a.mu.Lock() - defer a.mu.Unlock() - - if a.running { - return nil - } - - a.ctx, a.cancel = context.WithCancel(ctx) - a.running = true - a.startTime = time.Now() +func (g *AppEventDistributor) Start( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, +) { + g.mu.Lock() + g.ctx = ctx + g.running = true + g.startTime = time.Now() distributorStartsTotal.WithLabelValues("app").Inc() + g.mu.Unlock() + ready() + g.wg.Add(2) + go g.processEvents() + go g.trackUptime() - a.wg.Add(1) - go a.processEvents() - - go a.trackUptime() - - return nil -} - -// Stop gracefully shuts down the distributor -func (a *AppEventDistributor) Stop() error { - a.mu.Lock() - if !a.running { - a.mu.Unlock() - return nil - } - a.running = false - a.mu.Unlock() - - a.cancel() - a.wg.Wait() - - a.mu.Lock() - for _, ch := range a.subscribers { + <-ctx.Done() + g.mu.Lock() + g.running = false + for _, ch := range g.subscribers { close(ch) } - a.subscribers = make(map[string]chan consensus.ControlEvent) - a.mu.Unlock() - + 
g.subscribers = make(map[string]chan consensus.ControlEvent) distributorStopsTotal.WithLabelValues("app").Inc() distributorUptime.WithLabelValues("app").Set(0) - - return nil + g.mu.Unlock() } // Subscribe registers a new subscriber diff --git a/node/consensus/events/distributor_test.go b/node/consensus/events/distributor_test.go index 108cc0d..0e4e88c 100644 --- a/node/consensus/events/distributor_test.go +++ b/node/consensus/events/distributor_test.go @@ -9,6 +9,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" consensustime "source.quilibrium.com/quilibrium/monorepo/node/consensus/time" "source.quilibrium.com/quilibrium/monorepo/protobufs" "source.quilibrium.com/quilibrium/monorepo/types/consensus" @@ -81,25 +82,21 @@ func TestGlobalEventDistributor_StartStop(t *testing.T) { globalEventCh := make(chan consensustime.GlobalEvent, 10) distributor := NewGlobalEventDistributor(globalEventCh) - ctx := context.Background() + ctx, cancel, errCh := lifecycle.WithSignallerAndCancel(context.Background()) // Test starting - err := distributor.Start(ctx) - require.NoError(t, err) - - // Test starting again (should be idempotent) - err = distributor.Start(ctx) - require.NoError(t, err) + go distributor.Start(ctx, func() {}) // Test stopping - err = distributor.Stop() - require.NoError(t, err) - - // Test stopping again (should be idempotent) - err = distributor.Stop() - require.NoError(t, err) + cancel() + select { + case <-ctx.Done(): + case err, _ := <-errCh: + require.NoError(t, err) + } close(globalEventCh) + } func TestGlobalEventDistributor_Subscribe(t *testing.T) { @@ -116,9 +113,8 @@ func TestGlobalEventDistributor_Subscribe(t *testing.T) { assert.NotNil(t, sub3Ch) // Start the distributor - ctx := context.Background() - err := distributor.Start(ctx) - require.NoError(t, err) + ctx, cancel, errCh := lifecycle.WithSignallerAndCancel(context.Background()) + go distributor.Start(ctx, func() {}) // Send a test event testEvent := createTestGlobalEvent(consensustime.TimeReelEventNewHead, 100) @@ -146,8 +142,12 @@ func TestGlobalEventDistributor_Subscribe(t *testing.T) { } // Stop the distributor - err = distributor.Stop() - require.NoError(t, err) + cancel() + select { + case <-ctx.Done(): + case err, _ := <-errCh: + require.NoError(t, err) + } close(globalEventCh) } @@ -160,9 +160,8 @@ func TestGlobalEventDistributor_Unsubscribe(t *testing.T) { sub2Ch := distributor.Subscribe("subscriber2") // Start the distributor - ctx := context.Background() - err := distributor.Start(ctx) - require.NoError(t, err) + ctx, cancel, errCh := lifecycle.WithSignallerAndCancel(context.Background()) + go distributor.Start(ctx, func() {}) // Unsubscribe subscriber1 distributor.Unsubscribe("subscriber1") @@ -198,8 +197,12 @@ func TestGlobalEventDistributor_Unsubscribe(t *testing.T) { assert.False(t, ok, "Unsubscribed channel should be closed") // Stop the distributor - err = distributor.Stop() - require.NoError(t, err) + cancel() + select { + case <-ctx.Done(): + case err, _ := <-errCh: + require.NoError(t, err) + } close(globalEventCh) } @@ -211,9 +214,8 @@ func TestGlobalEventDistributor_EventTypes(t *testing.T) { subCh := distributor.Subscribe("test-subscriber") // Start the distributor - ctx := context.Background() - err := distributor.Start(ctx) - require.NoError(t, err) + ctx, cancel, errCh := lifecycle.WithSignallerAndCancel(context.Background()) + go distributor.Start(ctx, func() {}) // Test NewHead event newHeadEvent 
:= createTestGlobalEvent(consensustime.TimeReelEventNewHead, 100) @@ -243,8 +245,12 @@ func TestGlobalEventDistributor_EventTypes(t *testing.T) { assert.Equal(t, equivocationEvent, *eventData) // Stop the distributor - err = distributor.Stop() - require.NoError(t, err) + cancel() + select { + case <-ctx.Done(): + case err, _ := <-errCh: + require.NoError(t, err) + } close(globalEventCh) } @@ -253,11 +259,10 @@ func TestGlobalEventDistributor_ContextCancellation(t *testing.T) { distributor := NewGlobalEventDistributor(globalEventCh) // Create a cancellable context - ctx, cancel := context.WithCancel(context.Background()) + ctx, cancel, errCh := lifecycle.WithSignallerAndCancel(context.Background()) // Start the distributor - err := distributor.Start(ctx) - require.NoError(t, err) + go distributor.Start(ctx, func() {}) // Subscribe subCh := distributor.Subscribe("test-subscriber") @@ -269,8 +274,12 @@ func TestGlobalEventDistributor_ContextCancellation(t *testing.T) { time.Sleep(100 * time.Millisecond) // Stop should work gracefully - err = distributor.Stop() - require.NoError(t, err) + cancel() + select { + case <-ctx.Done(): + case err, _ := <-errCh: + require.NoError(t, err) + } // Channel should be closed _, ok := <-subCh @@ -284,23 +293,18 @@ func TestAppEventDistributor_StartStop(t *testing.T) { appEventCh := make(chan consensustime.AppEvent, 10) distributor := NewAppEventDistributor(globalEventCh, appEventCh) - ctx := context.Background() + ctx, cancel, errCh := lifecycle.WithSignallerAndCancel(context.Background()) // Test starting - err := distributor.Start(ctx) - require.NoError(t, err) - - // Test starting again (should be idempotent) - err = distributor.Start(ctx) - require.NoError(t, err) + go distributor.Start(ctx, func() {}) // Test stopping - err = distributor.Stop() - require.NoError(t, err) - - // Test stopping again (should be idempotent) - err = distributor.Stop() - require.NoError(t, err) + cancel() + select { + case <-ctx.Done(): + case err, _ := <-errCh: + require.NoError(t, err) + } close(globalEventCh) close(appEventCh) @@ -315,9 +319,8 @@ func TestAppEventDistributor_GlobalAndAppEvents(t *testing.T) { subCh := distributor.Subscribe("test-subscriber") // Start the distributor - ctx := context.Background() - err := distributor.Start(ctx) - require.NoError(t, err) + ctx, cancel, errCh := lifecycle.WithSignallerAndCancel(context.Background()) + go distributor.Start(ctx, func() {}) // Test Global event globalEvent := createTestGlobalEvent(consensustime.TimeReelEventNewHead, 100) @@ -338,8 +341,12 @@ func TestAppEventDistributor_GlobalAndAppEvents(t *testing.T) { assert.Equal(t, appEvent, *appEventData) // Stop the distributor - err = distributor.Stop() - require.NoError(t, err) + cancel() + select { + case <-ctx.Done(): + case err, _ := <-errCh: + require.NoError(t, err) + } close(globalEventCh) close(appEventCh) } @@ -353,9 +360,8 @@ func TestAppEventDistributor_AllEventTypes(t *testing.T) { subCh := distributor.Subscribe("test-subscriber") // Start the distributor - ctx := context.Background() - err := distributor.Start(ctx) - require.NoError(t, err) + ctx, cancel, errCh := lifecycle.WithSignallerAndCancel(context.Background()) + go distributor.Start(ctx, func() {}) // Test all global event types globalNewHead := createTestGlobalEvent(consensustime.TimeReelEventNewHead, 100) @@ -390,8 +396,12 @@ func TestAppEventDistributor_AllEventTypes(t *testing.T) { assert.Equal(t, consensus.ControlEventAppEquivocation, event.Type) // Stop the distributor - err = distributor.Stop() 
- require.NoError(t, err) + cancel() + select { + case <-ctx.Done(): + case err, _ := <-errCh: + require.NoError(t, err) + } close(globalEventCh) close(appEventCh) } @@ -406,9 +416,8 @@ func TestAppEventDistributor_MultipleSubscribers(t *testing.T) { sub2Ch := distributor.Subscribe("subscriber2") // Start the distributor - ctx := context.Background() - err := distributor.Start(ctx) - require.NoError(t, err) + ctx, cancel, errCh := lifecycle.WithSignallerAndCancel(context.Background()) + go distributor.Start(ctx, func() {}) // Send events globalEvent := createTestGlobalEvent(consensustime.TimeReelEventNewHead, 100) @@ -445,8 +454,12 @@ func TestAppEventDistributor_MultipleSubscribers(t *testing.T) { assert.Equal(t, 2, receivedApp) // Stop the distributor - err = distributor.Stop() - require.NoError(t, err) + cancel() + select { + case <-ctx.Done(): + case err, _ := <-errCh: + require.NoError(t, err) + } close(globalEventCh) close(appEventCh) } @@ -460,9 +473,8 @@ func TestAppEventDistributor_ChannelClosure(t *testing.T) { subCh := distributor.Subscribe("test-subscriber") // Start the distributor - ctx := context.Background() - err := distributor.Start(ctx) - require.NoError(t, err) + ctx, cancel, errCh := lifecycle.WithSignallerAndCancel(context.Background()) + go distributor.Start(ctx, func() {}) // Close the input channels close(globalEventCh) @@ -471,8 +483,12 @@ func TestAppEventDistributor_ChannelClosure(t *testing.T) { time.Sleep(100 * time.Millisecond) // Stop should work gracefully - err = distributor.Stop() - require.NoError(t, err) + cancel() + select { + case <-ctx.Done(): + case err, _ := <-errCh: + require.NoError(t, err) + } // Subscriber channel should be closed _, ok := <-subCh @@ -495,9 +511,8 @@ func TestConcurrentSubscribeUnsubscribe(t *testing.T) { globalEventCh := make(chan consensustime.GlobalEvent, 10) distributor := NewGlobalEventDistributor(globalEventCh) - ctx := context.Background() - err := distributor.Start(ctx) - require.NoError(t, err) + ctx, cancel, errCh := lifecycle.WithSignallerAndCancel(context.Background()) + go distributor.Start(ctx, func() {}) // Concurrently subscribe and unsubscribe done := make(chan bool) @@ -537,8 +552,12 @@ func TestConcurrentSubscribeUnsubscribe(t *testing.T) { } // Stop the distributor - err = distributor.Stop() - require.NoError(t, err) + cancel() + select { + case <-ctx.Done(): + case err, _ := <-errCh: + require.NoError(t, err) + } wg.Wait() close(globalEventCh) @@ -568,9 +587,8 @@ func BenchmarkGlobalEventDistributor_Broadcast(b *testing.B) { }(ch) } - ctx := context.Background() - err := distributor.Start(ctx) - require.NoError(b, err) + ctx, cancel, errCh := lifecycle.WithSignallerAndCancel(context.Background()) + go distributor.Start(ctx, func() {}) b.ResetTimer() @@ -588,8 +606,12 @@ func BenchmarkGlobalEventDistributor_Broadcast(b *testing.B) { // Signal consumers to stop close(done) - err = distributor.Stop() - require.NoError(b, err) + cancel() + select { + case <-ctx.Done(): + case err, _ := <-errCh: + require.NoError(b, err) + } close(globalEventCh) // Wait for all consumers to finish @@ -620,9 +642,8 @@ func BenchmarkAppEventDistributor_MixedEvents(b *testing.B) { }(ch) } - ctx := context.Background() - err := distributor.Start(ctx) - require.NoError(b, err) + ctx, cancel, errCh := lifecycle.WithSignallerAndCancel(context.Background()) + go distributor.Start(ctx, func() {}) b.ResetTimer() @@ -642,8 +663,12 @@ func BenchmarkAppEventDistributor_MixedEvents(b *testing.B) { // Signal consumers to stop close(done) - err 
= distributor.Stop() - require.NoError(b, err) + cancel() + select { + case <-ctx.Done(): + case err, _ := <-errCh: + require.NoError(b, err) + } close(globalEventCh) close(appEventCh) diff --git a/node/consensus/events/global_event_distributor.go b/node/consensus/events/global_event_distributor.go index 5c0bb32..cbed453 100644 --- a/node/consensus/events/global_event_distributor.go +++ b/node/consensus/events/global_event_distributor.go @@ -6,6 +6,7 @@ import ( "time" "github.com/prometheus/client_golang/prometheus" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" consensustime "source.quilibrium.com/quilibrium/monorepo/node/consensus/time" "source.quilibrium.com/quilibrium/monorepo/types/consensus" ) @@ -34,52 +35,32 @@ func NewGlobalEventDistributor( } // Start begins the event processing loop -func (g *GlobalEventDistributor) Start(ctx context.Context) error { +func (g *GlobalEventDistributor) Start( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, +) { g.mu.Lock() - defer g.mu.Unlock() - - if g.running { - return nil - } - - g.ctx, g.cancel = context.WithCancel(ctx) + g.ctx = ctx g.running = true g.startTime = time.Now() distributorStartsTotal.WithLabelValues("global").Inc() - - g.wg.Add(1) + g.mu.Unlock() + ready() + g.wg.Add(2) go g.processEvents() - go g.trackUptime() - return nil -} - -// Stop gracefully shuts down the distributor -func (g *GlobalEventDistributor) Stop() error { + <-ctx.Done() g.mu.Lock() - if !g.running { - g.mu.Unlock() - return nil - } g.running = false - g.mu.Unlock() - - g.cancel() - g.wg.Wait() - - g.mu.Lock() for _, ch := range g.subscribers { close(ch) } g.subscribers = make(map[string]chan consensus.ControlEvent) - g.mu.Unlock() - distributorStopsTotal.WithLabelValues("global").Inc() distributorUptime.WithLabelValues("global").Set(0) - - return nil + g.mu.Unlock() } // Subscribe registers a new subscriber @@ -194,6 +175,7 @@ func (g *GlobalEventDistributor) broadcast(event consensus.ControlEvent) { // trackUptime periodically updates the uptime metric func (g *GlobalEventDistributor) trackUptime() { + defer g.wg.Done() ticker := time.NewTicker(10 * time.Second) defer ticker.Stop() diff --git a/node/consensus/global/event_distributor.go b/node/consensus/global/event_distributor.go index 45fe318..6585540 100644 --- a/node/consensus/global/event_distributor.go +++ b/node/consensus/global/event_distributor.go @@ -14,6 +14,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" "source.quilibrium.com/quilibrium/monorepo/config" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/node/consensus/provers" consensustime "source.quilibrium.com/quilibrium/monorepo/node/consensus/time" globalintrinsics "source.quilibrium.com/quilibrium/monorepo/node/execution/intrinsics/global" @@ -24,19 +25,15 @@ import ( "source.quilibrium.com/quilibrium/monorepo/types/schema" ) -func (e *GlobalConsensusEngine) eventDistributorLoop() { +func (e *GlobalConsensusEngine) eventDistributorLoop( + ctx lifecycle.SignalerContext, +) { defer func() { if r := recover(); r != nil { e.logger.Error("fatal error encountered", zap.Any("panic", r)) - if e.cancel != nil { - e.cancel() - } - go func() { - e.Stop(false) - }() + ctx.Throw(errors.Errorf("fatal unhandled error encountered: %v", r)) } }() - defer e.wg.Done() // Subscribe to events from the event distributor eventCh := e.eventDistributor.Subscribe("global") @@ -44,7 +41,7 @@ func (e *GlobalConsensusEngine) eventDistributorLoop() { for { select { - 
case <-e.ctx.Done(): + case <-ctx.Done(): return case <-e.quit: return diff --git a/node/consensus/global/global_consensus_engine.go b/node/consensus/global/global_consensus_engine.go index da7cd73..600693c 100644 --- a/node/consensus/global/global_consensus_engine.go +++ b/node/consensus/global/global_consensus_engine.go @@ -53,7 +53,6 @@ import ( typesconsensus "source.quilibrium.com/quilibrium/monorepo/types/consensus" "source.quilibrium.com/quilibrium/monorepo/types/crypto" typesdispatch "source.quilibrium.com/quilibrium/monorepo/types/dispatch" - "source.quilibrium.com/quilibrium/monorepo/types/execution" "source.quilibrium.com/quilibrium/monorepo/types/execution/intrinsics" "source.quilibrium.com/quilibrium/monorepo/types/execution/state" "source.quilibrium.com/quilibrium/monorepo/types/hypergraph" @@ -89,6 +88,7 @@ type LockedTransaction struct { // GlobalConsensusEngine uses the generic state machine for consensus type GlobalConsensusEngine struct { + *lifecycle.ComponentManager protobufs.GlobalServiceServer logger *zap.Logger @@ -113,8 +113,6 @@ type GlobalConsensusEngine struct { dispatchService typesdispatch.DispatchService globalTimeReel *consensustime.GlobalTimeReel blsConstructor crypto.BlsConstructor - executors map[string]execution.ShardExecutionEngine - executorsMu sync.RWMutex executionManager *manager.ExecutionEngineManager mixnet typesconsensus.Mixnet peerInfoManager tp2p.PeerInfoManager @@ -257,7 +255,6 @@ func NewGlobalConsensusEngine( eventDistributor: eventDistributor, globalTimeReel: globalTimeReel, peerInfoManager: peerInfoManager, - executors: make(map[string]execution.ShardExecutionEngine), frameStore: make(map[string]*protobufs.GlobalFrame), appFrameStore: make(map[string]*protobufs.AppShardFrame), globalConsensusMessageQueue: make(chan *pb.Message, 1000), @@ -425,12 +422,6 @@ func NewGlobalConsensusEngine( return nil, errors.Wrap(err, "new global consensus engine") } - // Register all execution engines with the consensus engine - err = engine.executionManager.RegisterAllEngines(engine.RegisterExecutor) - if err != nil { - return nil, errors.Wrap(err, "new global consensus engine") - } - // Initialize metrics engineState.Set(0) // EngineStateStopped currentDifficulty.Set(float64(config.Engine.Difficulty)) @@ -463,54 +454,34 @@ func NewGlobalConsensusEngine( // Set up gRPC server with TLS credentials if err := engine.setupGRPCServer(); err != nil { - panic(errors.Wrap(err, "failed to setup gRPC server")) + return nil, errors.Wrap(err, "failed to setup gRPC server") } - return engine, nil -} + componentBuilder := lifecycle.NewComponentManagerBuilder() -func (e *GlobalConsensusEngine) Start(quit chan struct{}) <-chan error { - errChan := make(chan error, 1) - - e.quit = quit - e.ctx, e.cancel, _ = lifecycle.WithSignallerAndCancel(context.Background()) - - // Start worker manager background process (if applicable) - if !e.config.Engine.ArchiveMode { - if err := e.workerManager.Start(e.ctx); err != nil { - errChan <- errors.Wrap(err, "start") - close(errChan) - return errChan - } + // Add worker manager background process (if applicable) + if !engine.config.Engine.ArchiveMode { + componentBuilder.AddWorker(func( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, + ) { + if err := engine.workerManager.Start(ctx); err != nil { + ctx.Throw(err) + return + } + ready() + <-ctx.Done() + }) } - // Start execution engines - if err := e.executionManager.StartAll(e.quit); err != nil { - errChan <- errors.Wrap(err, "start") - close(errChan) - return errChan - } + // 
Add execution engines + componentBuilder.AddWorker(engine.executionManager.Start) + componentBuilder.AddWorker(engine.eventDistributor.Start) + componentBuilder.AddWorker(engine.globalTimeReel.Start) - // Start the event distributor - if err := e.eventDistributor.Start(e.ctx); err != nil { - errChan <- errors.Wrap(err, "start") - close(errChan) - return errChan - } - - err := e.globalTimeReel.Start() + frame, err := engine.clockStore.GetLatestGlobalClockFrame() if err != nil { - errChan <- errors.Wrap(err, "start") - close(errChan) - return errChan - } - - frame, err := e.clockStore.GetLatestGlobalClockFrame() - if err != nil { - e.logger.Warn( - "invalid frame retrieved, will resync", - zap.Error(err), - ) + frame = engine.initializeGenesis() } var initialState **protobufs.GlobalFrame = nil @@ -518,132 +489,183 @@ func (e *GlobalConsensusEngine) Start(quit chan struct{}) <-chan error { initialState = &frame } - if e.config.P2P.Network == 99 || e.config.Engine.ArchiveMode { - if err := e.startConsensus(initialState); err != nil { - errChan <- errors.Wrap(err, "start state machine") - close(errChan) - return errChan - } + if engine.config.P2P.Network == 99 || engine.config.Engine.ArchiveMode { + componentBuilder.AddWorker(func( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, + ) { + if err := engine.startConsensus(initialState, ctx, ready); err != nil { + ctx.Throw(err) + return + } + + <-ctx.Done() + }) } - // Confirm initial state - if !e.config.Engine.ArchiveMode { - latest, err := e.clockStore.GetLatestGlobalClockFrame() - if err != nil || latest == nil { - e.logger.Info("initializing genesis") - e.initializeGenesis() - } - } + componentBuilder.AddWorker(engine.peerInfoManager.Start) // Subscribe to global consensus if participating - err = e.subscribeToGlobalConsensus() + err = engine.subscribeToGlobalConsensus() if err != nil { - errChan <- errors.Wrap(err, "start") - close(errChan) - return errChan + engine.ctx.Throw(errors.Wrap(err, "start")) + return nil, err } // Subscribe to shard consensus messages to broker lock agreement - err = e.subscribeToShardConsensusMessages() + err = engine.subscribeToShardConsensusMessages() if err != nil { - errChan <- errors.Wrap(err, "start") - close(errChan) - return errChan + engine.ctx.Throw(err) + return nil, errors.Wrap(err, "start") } // Subscribe to frames - err = e.subscribeToFrameMessages() + err = engine.subscribeToFrameMessages() if err != nil { - errChan <- errors.Wrap(err, "start") - close(errChan) - return errChan + engine.ctx.Throw(err) + return nil, errors.Wrap(err, "start") } // Subscribe to prover messages - err = e.subscribeToProverMessages() + err = engine.subscribeToProverMessages() if err != nil { - errChan <- errors.Wrap(err, "start") - close(errChan) - return errChan + engine.ctx.Throw(err) + return nil, errors.Wrap(err, "start") } // Subscribe to peer info messages - err = e.subscribeToPeerInfoMessages() + err = engine.subscribeToPeerInfoMessages() if err != nil { - errChan <- errors.Wrap(err, "start") - close(errChan) - return errChan + engine.ctx.Throw(err) + return nil, errors.Wrap(err, "start") } // Subscribe to alert messages - err = e.subscribeToAlertMessages() + err = engine.subscribeToAlertMessages() if err != nil { - errChan <- errors.Wrap(err, "start") - close(errChan) - return errChan + engine.ctx.Throw(err) + return nil, errors.Wrap(err, "start") } - e.peerInfoManager.Start() - // Start consensus message queue processor - e.wg.Add(1) - go e.processGlobalConsensusMessageQueue() + 
componentBuilder.AddWorker(func( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, + ) { + ready() + engine.processGlobalConsensusMessageQueue(ctx) + }) // Start shard consensus message queue processor - e.wg.Add(1) - go e.processShardConsensusMessageQueue() + componentBuilder.AddWorker(func( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, + ) { + ready() + engine.processShardConsensusMessageQueue(ctx) + }) // Start frame message queue processor - e.wg.Add(1) - go e.processFrameMessageQueue() + componentBuilder.AddWorker(func( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, + ) { + ready() + engine.processFrameMessageQueue(ctx) + }) // Start prover message queue processor - e.wg.Add(1) - go e.processProverMessageQueue() + componentBuilder.AddWorker(func( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, + ) { + ready() + engine.processProverMessageQueue(ctx) + }) // Start peer info message queue processor - e.wg.Add(1) - go e.processPeerInfoMessageQueue() + componentBuilder.AddWorker(func( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, + ) { + ready() + engine.processPeerInfoMessageQueue(ctx) + }) // Start alert message queue processor - e.wg.Add(1) - go e.processAlertMessageQueue() + componentBuilder.AddWorker(func( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, + ) { + ready() + engine.processAlertMessageQueue(ctx) + }) // Start periodic peer info reporting - e.wg.Add(1) - go e.reportPeerInfoPeriodically() + componentBuilder.AddWorker(func( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, + ) { + ready() + engine.reportPeerInfoPeriodically(ctx) + }) // Start event distributor event loop - e.wg.Add(1) - go e.eventDistributorLoop() + componentBuilder.AddWorker(func( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, + ) { + ready() + engine.eventDistributorLoop(ctx) + }) // Start periodic metrics update - e.wg.Add(1) - go e.updateMetrics() + componentBuilder.AddWorker(func( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, + ) { + ready() + engine.updateMetrics(ctx) + }) // Start periodic tx lock pruning - e.wg.Add(1) - go e.pruneTxLocksPeriodically() + componentBuilder.AddWorker(func( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, + ) { + ready() + engine.pruneTxLocksPeriodically(ctx) + }) - if e.grpcServer != nil { + if engine.grpcServer != nil { // Register all services with the gRPC server - e.RegisterServices(e.grpcServer) + engine.RegisterServices(engine.grpcServer) // Start serving the gRPC server - go func() { - if err := e.grpcServer.Serve(e.grpcListener); err != nil { - e.logger.Error("gRPC server error", zap.Error(err)) + componentBuilder.AddWorker(func( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, + ) { + go func() { + if err := engine.grpcServer.Serve(engine.grpcListener); err != nil { + engine.logger.Error("gRPC server error", zap.Error(err)) + ctx.Throw(err) + } + }() + ready() + engine.logger.Info("started gRPC server", + zap.String("address", engine.grpcListener.Addr().String())) + <-ctx.Done() + engine.logger.Info("stopping gRPC server") + engine.grpcServer.GracefulStop() + if engine.grpcListener != nil { + engine.grpcListener.Close() } - }() - - e.logger.Info("started gRPC server", - zap.String("address", e.grpcListener.Addr().String())) + }) } - e.logger.Info("global consensus engine started") - - close(errChan) - return errChan + engine.ComponentManager = componentBuilder.Build() + return engine, nil } func (e 
*GlobalConsensusEngine) setupGRPCServer() error { @@ -747,40 +769,11 @@ func (e *GlobalConsensusEngine) getAddressFromPublicKey( func (e *GlobalConsensusEngine) Stop(force bool) <-chan error { errChan := make(chan error, 1) - // Stop worker manager background process (if applicable) - if !e.config.Engine.ArchiveMode { - if err := e.workerManager.Stop(); err != nil { - errChan <- errors.Wrap(err, "stop") - close(errChan) - return errChan - } - } - - if e.grpcServer != nil { - e.logger.Info("stopping gRPC server") - e.grpcServer.GracefulStop() - if e.grpcListener != nil { - e.grpcListener.Close() - } - } - - // Stop execution engines - if e.executionManager != nil { - if err := e.executionManager.StopAll(force); err != nil && !force { - errChan <- errors.Wrap(err, "stop") - } - } - // Cancel context if e.cancel != nil { e.cancel() } - // Stop event distributor - if err := e.eventDistributor.Stop(); err != nil && !force { - errChan <- errors.Wrap(err, "stop") - } - // Unsubscribe from pubsub if e.config.Engine.ArchiveMode || e.config.P2P.Network == 99 { e.pubsub.Unsubscribe(GLOBAL_CONSENSUS_BITMASK, false) @@ -806,17 +799,8 @@ func (e *GlobalConsensusEngine) Stop(force bool) <-chan error { e.pubsub.Unsubscribe(GLOBAL_ALERT_BITMASK, false) e.pubsub.UnregisterValidator(GLOBAL_ALERT_BITMASK) - e.peerInfoManager.Stop() - - // Wait for goroutines to finish - done := make(chan struct{}) - go func() { - e.wg.Wait() - close(done) - }() - select { - case <-done: + case <-e.ctx.Done(): // Clean shutdown case <-time.After(30 * time.Second): if !force { @@ -861,77 +845,6 @@ func (e *GlobalConsensusEngine) GetState() typesconsensus.EngineState { } } -func (e *GlobalConsensusEngine) RegisterExecutor( - exec execution.ShardExecutionEngine, - frame uint64, -) <-chan error { - errChan := make(chan error, 1) - - e.executorsMu.Lock() - defer e.executorsMu.Unlock() - - name := exec.GetName() - if _, exists := e.executors[name]; exists { - errChan <- errors.New("executor already registered") - close(errChan) - return errChan - } - - e.executors[name] = exec - - // Update metrics - executorRegistrationTotal.WithLabelValues("register").Inc() - executorsRegistered.Set(float64(len(e.executors))) - - close(errChan) - return errChan -} - -func (e *GlobalConsensusEngine) UnregisterExecutor( - name string, - frame uint64, - force bool, -) <-chan error { - errChan := make(chan error, 1) - - e.executorsMu.Lock() - defer e.executorsMu.Unlock() - - if _, exists := e.executors[name]; !exists { - errChan <- errors.New("executor not registered") - close(errChan) - return errChan - } - - // Stop the executor - if exec, ok := e.executors[name]; ok { - stopErrChan := exec.Stop(force) - select { - case err := <-stopErrChan: - if err != nil && !force { - errChan <- errors.Wrap(err, "stop executor") - close(errChan) - return errChan - } - case <-time.After(5 * time.Second): - if !force { - errChan <- errors.New("timeout stopping executor") - close(errChan) - return errChan - } - } - } - - delete(e.executors, name) - - // Update metrics - executorRegistrationTotal.WithLabelValues("unregister").Inc() - executorsRegistered.Set(float64(len(e.executors))) - - close(errChan) - return errChan -} - func (e *GlobalConsensusEngine) GetProvingKey( engineConfig *config.EngineConfig, ) (crypto.Signer, crypto.KeyType, []byte, []byte) { @@ -1325,24 +1238,22 @@ func (e *GlobalConsensusEngine) getProverAddress() []byte { return addressBI.FillBytes(make([]byte, 32)) } -func (e *GlobalConsensusEngine) updateMetrics() { +func (e *GlobalConsensusEngine) 
updateMetrics( + ctx lifecycle.SignalerContext, +) { defer func() { if r := recover(); r != nil { e.logger.Error("fatal error encountered", zap.Any("panic", r)) - if e.cancel != nil { - e.cancel() - } - e.quit <- struct{}{} + ctx.Throw(errors.Errorf("fatal unhandled error encountered: %v", r)) } }() - defer e.wg.Done() ticker := time.NewTicker(10 * time.Second) defer ticker.Stop() for { select { - case <-e.ctx.Done(): + case <-ctx.Done(): return case <-e.quit: return @@ -1353,12 +1264,6 @@ func (e *GlobalConsensusEngine) updateMetrics() { e.lastProvenFrameTimeMu.RUnlock() timeSinceLastProvenFrame.Set(timeSince) - // Update executor count - e.executorsMu.RLock() - execCount := len(e.executors) - e.executorsMu.RUnlock() - executorsRegistered.Set(float64(execCount)) - // Update current frame number if frame := e.GetFrame(); frame != nil && frame.Header != nil { currentFrameNumber.Set(float64(frame.Header.FrameNumber)) @@ -1828,16 +1733,16 @@ func (e *GlobalConsensusEngine) signPeerInfo( // reportPeerInfoPeriodically sends peer info over the peer info bitmask every // 5 minutes -func (e *GlobalConsensusEngine) reportPeerInfoPeriodically() { - defer e.wg.Done() - +func (e *GlobalConsensusEngine) reportPeerInfoPeriodically( + ctx lifecycle.SignalerContext, +) { e.logger.Info("starting periodic peer info reporting") ticker := time.NewTicker(5 * time.Minute) defer ticker.Stop() for { select { - case <-e.ctx.Done(): + case <-ctx.Done(): e.logger.Info("stopping periodic peer info reporting") return case <-ticker.C: @@ -1867,9 +1772,9 @@ func (e *GlobalConsensusEngine) reportPeerInfoPeriodically() { } } -func (e *GlobalConsensusEngine) pruneTxLocksPeriodically() { - defer e.wg.Done() - +func (e *GlobalConsensusEngine) pruneTxLocksPeriodically( + ctx lifecycle.SignalerContext, +) { ticker := time.NewTicker(5 * time.Second) defer ticker.Stop() @@ -1877,7 +1782,7 @@ func (e *GlobalConsensusEngine) pruneTxLocksPeriodically() { for { select { - case <-e.ctx.Done(): + case <-ctx.Done(): return case <-ticker.C: e.pruneTxLocks() @@ -2400,6 +2305,8 @@ func (e *GlobalConsensusEngine) DecideWorkerJoins( func (e *GlobalConsensusEngine) startConsensus( initialFrame **protobufs.GlobalFrame, + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, ) error { var err error e.consensusParticipant, err = participant.NewParticipant[ diff --git a/node/consensus/global/message_processors.go b/node/consensus/global/message_processors.go index e1a66d3..73de5b8 100644 --- a/node/consensus/global/message_processors.go +++ b/node/consensus/global/message_processors.go @@ -5,7 +5,6 @@ import ( "encoding/binary" "encoding/hex" "fmt" - "math/bits" "slices" "github.com/iden3/go-iden3-crypto/poseidon" @@ -14,6 +13,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" "source.quilibrium.com/quilibrium/monorepo/go-libp2p-blossomsub/pb" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/protobufs" "source.quilibrium.com/quilibrium/monorepo/types/crypto" "source.quilibrium.com/quilibrium/monorepo/types/tries" @@ -21,10 +21,11 @@ import ( var keyRegistryDomain = []byte("KEY_REGISTRY") -func (e *GlobalConsensusEngine) processGlobalConsensusMessageQueue() { - defer e.wg.Done() - +func (e *GlobalConsensusEngine) processGlobalConsensusMessageQueue( + ctx lifecycle.SignalerContext, +) { if e.config.P2P.Network != 99 && !e.config.Engine.ArchiveMode { + <-ctx.Done() return } @@ -32,7 +33,7 @@ func (e *GlobalConsensusEngine) processGlobalConsensusMessageQueue() { 
select { case <-e.haltCtx.Done(): return - case <-e.ctx.Done(): + case <-ctx.Done(): return case message := <-e.globalConsensusMessageQueue: e.handleGlobalConsensusMessage(message) @@ -42,14 +43,14 @@ func (e *GlobalConsensusEngine) processGlobalConsensusMessageQueue() { } } -func (e *GlobalConsensusEngine) processShardConsensusMessageQueue() { - defer e.wg.Done() - +func (e *GlobalConsensusEngine) processShardConsensusMessageQueue( + ctx lifecycle.SignalerContext, +) { for { select { case <-e.haltCtx.Done(): return - case <-e.ctx.Done(): + case <-ctx.Done(): return case message := <-e.shardConsensusMessageQueue: e.handleShardConsensusMessage(message) @@ -57,9 +58,9 @@ func (e *GlobalConsensusEngine) processShardConsensusMessageQueue() { } } -func (e *GlobalConsensusEngine) processProverMessageQueue() { - defer e.wg.Done() - +func (e *GlobalConsensusEngine) processProverMessageQueue( + ctx lifecycle.SignalerContext, +) { if e.config.P2P.Network != 99 && !e.config.Engine.ArchiveMode { return } @@ -68,7 +69,7 @@ func (e *GlobalConsensusEngine) processProverMessageQueue() { select { case <-e.haltCtx.Done(): return - case <-e.ctx.Done(): + case <-ctx.Done(): return case message := <-e.globalProverMessageQueue: e.handleProverMessage(message) @@ -76,14 +77,14 @@ func (e *GlobalConsensusEngine) processProverMessageQueue() { } } -func (e *GlobalConsensusEngine) processFrameMessageQueue() { - defer e.wg.Done() - +func (e *GlobalConsensusEngine) processFrameMessageQueue( + ctx lifecycle.SignalerContext, +) { for { select { case <-e.haltCtx.Done(): return - case <-e.ctx.Done(): + case <-ctx.Done(): return case message := <-e.globalFrameMessageQueue: e.handleFrameMessage(message) @@ -91,14 +92,14 @@ func (e *GlobalConsensusEngine) processFrameMessageQueue() { } } -func (e *GlobalConsensusEngine) processPeerInfoMessageQueue() { - defer e.wg.Done() - +func (e *GlobalConsensusEngine) processPeerInfoMessageQueue( + ctx lifecycle.SignalerContext, +) { for { select { case <-e.haltCtx.Done(): return - case <-e.ctx.Done(): + case <-ctx.Done(): return case message := <-e.globalPeerInfoMessageQueue: e.handlePeerInfoMessage(message) @@ -106,12 +107,12 @@ func (e *GlobalConsensusEngine) processPeerInfoMessageQueue() { } } -func (e *GlobalConsensusEngine) processAlertMessageQueue() { - defer e.wg.Done() - +func (e *GlobalConsensusEngine) processAlertMessageQueue( + ctx lifecycle.SignalerContext, +) { for { select { - case <-e.ctx.Done(): + case <-ctx.Done(): return case message := <-e.globalAlertMessageQueue: e.handleAlertMessage(message) @@ -138,14 +139,11 @@ func (e *GlobalConsensusEngine) handleGlobalConsensusMessage( case protobufs.GlobalFrameType: e.handleProposal(message) - case protobufs.ProverLivenessCheckType: - e.handleLivenessCheck(message) - - case protobufs.FrameVoteType: + case protobufs.ProposalVoteType: e.handleVote(message) - case protobufs.FrameConfirmationType: - e.handleConfirmation(message) + case protobufs.TimeoutStateType: + e.handleTimeoutState(message) case protobufs.MessageBundleType: e.handleMessageBundle(message) @@ -174,17 +172,14 @@ func (e *GlobalConsensusEngine) handleShardConsensusMessage( typePrefix := e.peekMessageType(message) switch typePrefix { - case protobufs.GlobalFrameType: + case protobufs.AppShardFrameType: e.handleShardProposal(message) case protobufs.ProverLivenessCheckType: e.handleShardLivenessCheck(message) - case protobufs.FrameVoteType: + case protobufs.ProposalVoteType: e.handleShardVote(message) - - case protobufs.FrameConfirmationType: - 
e.handleShardConfirmation(message) } } @@ -843,122 +838,11 @@ func (e *GlobalConsensusEngine) handleProposal(message *pb.Message) { proposalProcessedTotal.WithLabelValues("success").Inc() } -func (e *GlobalConsensusEngine) handleLivenessCheck(message *pb.Message) { - timer := prometheus.NewTimer(livenessCheckProcessingDuration) - defer timer.ObserveDuration() - - livenessCheck := &protobufs.ProverLivenessCheck{} - if err := livenessCheck.FromCanonicalBytes(message.Data); err != nil { - e.logger.Debug("failed to unmarshal liveness check", zap.Error(err)) - livenessCheckProcessedTotal.WithLabelValues("error").Inc() - return - } - - // Validate the liveness check structure - if err := livenessCheck.Validate(); err != nil { - e.logger.Debug("invalid liveness check", zap.Error(err)) - livenessCheckProcessedTotal.WithLabelValues("error").Inc() - return - } - - proverSet, err := e.proverRegistry.GetActiveProvers(nil) - if err != nil { - e.logger.Error("could not receive liveness check", zap.Error(err)) - livenessCheckProcessedTotal.WithLabelValues("error").Inc() - return - } - - var found []byte = nil - for _, prover := range proverSet { - if bytes.Equal( - prover.Address, - livenessCheck.PublicKeySignatureBls48581.Address, - ) { - lcBytes, err := livenessCheck.ConstructSignaturePayload() - if err != nil { - e.logger.Error( - "could not construct signature message for liveness check", - zap.Error(err), - ) - break - } - valid, err := e.keyManager.ValidateSignature( - crypto.KeyTypeBLS48581G1, - prover.PublicKey, - lcBytes, - livenessCheck.PublicKeySignatureBls48581.Signature, - livenessCheck.GetSignatureDomain(), - ) - if err != nil || !valid { - e.logger.Error( - "could not validate signature for liveness check", - zap.Error(err), - ) - break - } - found = prover.PublicKey - - break - } - } - - if found == nil { - e.logger.Warn( - "invalid liveness check", - zap.String( - "prover", - hex.EncodeToString( - livenessCheck.PublicKeySignatureBls48581.Address, - ), - ), - ) - livenessCheckProcessedTotal.WithLabelValues("error").Inc() - return - } - - signatureData, err := livenessCheck.ConstructSignaturePayload() - if err != nil { - e.logger.Error("invalid signature payload", zap.Error(err)) - livenessCheckProcessedTotal.WithLabelValues("error").Inc() - return - } - - valid, err := e.keyManager.ValidateSignature( - crypto.KeyTypeBLS48581G1, - found, - signatureData, - livenessCheck.PublicKeySignatureBls48581.Signature, - livenessCheck.GetSignatureDomain(), - ) - - if err != nil || !valid { - e.logger.Error("invalid liveness check signature", zap.Error(err)) - livenessCheckProcessedTotal.WithLabelValues("error").Inc() - return - } - - commitment := GlobalCollectedCommitments{ - frameNumber: livenessCheck.FrameNumber, - commitmentHash: livenessCheck.CommitmentHash, - prover: livenessCheck.PublicKeySignatureBls48581.Address, - } - if err := e.stateMachine.ReceiveLivenessCheck( - GlobalPeerID{ID: livenessCheck.PublicKeySignatureBls48581.Address}, - commitment, - ); err != nil { - e.logger.Error("could not receive liveness check", zap.Error(err)) - livenessCheckProcessedTotal.WithLabelValues("error").Inc() - return - } - - livenessCheckProcessedTotal.WithLabelValues("success").Inc() -} - func (e *GlobalConsensusEngine) handleVote(message *pb.Message) { timer := prometheus.NewTimer(voteProcessingDuration) defer timer.ObserveDuration() - vote := &protobufs.FrameVote{} + vote := &protobufs.ProposalVote{} if err := vote.FromCanonicalBytes(message.Data); err != nil { e.logger.Debug("failed to unmarshal vote", 
zap.Error(err)) voteProcessedTotal.WithLabelValues("error").Inc() @@ -1078,6 +962,64 @@ func (e *GlobalConsensusEngine) handleVote(message *pb.Message) { voteProcessedTotal.WithLabelValues("success").Inc() } +func (e *GlobalConsensusEngine) handleTimeoutState(message *pb.Message) { + timer := prometheus.NewTimer(voteProcessingDuration) + defer timer.ObserveDuration() + + state := &protobufs.TimeoutState{} + if err := state.FromCanonicalBytes(message.Data); err != nil { + e.logger.Debug("failed to unmarshal timeout", zap.Error(err)) + voteProcessedTotal.WithLabelValues("error").Inc() + return + } + + // Validate the vote structure + if err := state.Validate(); err != nil { + e.logger.Debug("invalid timeout", zap.Error(err)) + voteProcessedTotal.WithLabelValues("error").Inc() + return + } + + // Validate the voter's signature + proverSet, err := e.proverRegistry.GetActiveProvers(nil) + if err != nil { + e.logger.Error("could not get active provers", zap.Error(err)) + voteProcessedTotal.WithLabelValues("error").Inc() + return + } + + // Find the voter's public key + var voterPublicKey []byte = nil + for _, prover := range proverSet { + if bytes.Equal( + prover.Address, + state.Vote.PublicKeySignatureBls48581.Address, + ) { + voterPublicKey = prover.PublicKey + break + } + } + + if voterPublicKey == nil { + e.logger.Warn( + "invalid vote - voter not found", + zap.String( + "voter", + hex.EncodeToString( + state.Vote.PublicKeySignatureBls48581.Address, + ), + ), + ) + voteProcessedTotal.WithLabelValues("error").Inc() + return + } + + // Signature is valid, process the vote + if err := e.timeoutCollectorDistributor.OnTimeoutProcessed(state) + + voteProcessedTotal.WithLabelValues("success").Inc() +} + func (e *GlobalConsensusEngine) handleMessageBundle(message *pb.Message) { // MessageBundle messages need to be collected for execution // Store them in pendingMessages to be processed during Collect @@ -1088,130 +1030,6 @@ func (e *GlobalConsensusEngine) handleMessageBundle(message *pb.Message) { e.logger.Debug("collected global request for execution") } -func (e *GlobalConsensusEngine) handleConfirmation(message *pb.Message) { - timer := prometheus.NewTimer(confirmationProcessingDuration) - defer timer.ObserveDuration() - - confirmation := &protobufs.FrameConfirmation{} - if err := confirmation.FromCanonicalBytes(message.Data); err != nil { - e.logger.Debug("failed to unmarshal confirmation", zap.Error(err)) - confirmationProcessedTotal.WithLabelValues("error").Inc() - return - } - - // Validate the confirmation structure - if err := confirmation.Validate(); err != nil { - e.logger.Debug("invalid confirmation", zap.Error(err)) - confirmationProcessedTotal.WithLabelValues("error").Inc() - return - } - - // Find the frame with matching selector - e.frameStoreMu.RLock() - var matchingFrame *protobufs.GlobalFrame - for frameID, frame := range e.frameStore { - if frame.Header != nil && - frame.Header.FrameNumber == confirmation.FrameNumber && - frameID == string(confirmation.Selector) { - matchingFrame = frame - break - } - } - - if matchingFrame == nil { - e.frameStoreMu.RUnlock() - return - } - - e.frameStoreMu.RUnlock() - e.frameStoreMu.Lock() - defer e.frameStoreMu.Unlock() - matchingFrame.Header.PublicKeySignatureBls48581 = - confirmation.AggregateSignature - valid, err := e.frameValidator.Validate(matchingFrame) - if !valid || err != nil { - e.logger.Error("received invalid confirmation", zap.Error(err)) - confirmationProcessedTotal.WithLabelValues("error").Inc() - return - } - - // Check if we 
already have a confirmation stowed - exceeds := false - set := 0 - for _, b := range matchingFrame.Header.PublicKeySignatureBls48581.Bitmask { - set += bits.OnesCount8(b) - if set > 1 { - exceeds = true - break - } - } - if exceeds { - // Skip the remaining operations - return - } - - // Extract proposer address from the original frame - var proposerAddress []byte - frameSignature := matchingFrame.Header.PublicKeySignatureBls48581 - if frameSignature != nil && frameSignature.PublicKey != nil && - len(frameSignature.PublicKey.KeyValue) > 0 { - proposerAddress = e.getAddressFromPublicKey( - frameSignature.PublicKey.KeyValue, - ) - } else if frameSignature != nil && - frameSignature.Bitmask != nil { - // Extract from bitmask if no public key - provers, err := e.proverRegistry.GetActiveProvers(nil) - if err == nil { - for i := 0; i < len(provers); i++ { - byteIndex := i / 8 - bitIndex := i % 8 - if byteIndex < len(frameSignature.Bitmask) && - (frameSignature.Bitmask[byteIndex]&(1< 0 { - if err := e.stateMachine.ReceiveConfirmation( - GlobalPeerID{ID: proposerAddress}, - &matchingFrame, - ); err != nil { - e.logger.Error("could not receive confirmation", zap.Error(err)) - confirmationProcessedTotal.WithLabelValues("error").Inc() - return - } - } - err = e.globalTimeReel.Insert(e.ctx, matchingFrame) - if err != nil { - e.logger.Error( - "could not insert into time reel", - zap.Error(err), - ) - confirmationProcessedTotal.WithLabelValues("error").Inc() - return - } - - confirmationProcessedTotal.WithLabelValues("success").Inc() -} - func (e *GlobalConsensusEngine) handleShardProposal(message *pb.Message) { timer := prometheus.NewTimer(shardProposalProcessingDuration) defer timer.ObserveDuration() @@ -1494,83 +1312,6 @@ func (e *GlobalConsensusEngine) handleShardVote(message *pb.Message) { shardVoteProcessedTotal.WithLabelValues("success").Inc() } -func (e *GlobalConsensusEngine) handleShardConfirmation(message *pb.Message) { - timer := prometheus.NewTimer(shardConfirmationProcessingDuration) - defer timer.ObserveDuration() - - confirmation := &protobufs.FrameConfirmation{} - if err := confirmation.FromCanonicalBytes(message.Data); err != nil { - e.logger.Debug("failed to unmarshal confirmation", zap.Error(err)) - shardConfirmationProcessedTotal.WithLabelValues("error").Inc() - return - } - - // Validate the confirmation structure - if err := confirmation.Validate(); err != nil { - e.logger.Debug("invalid confirmation", zap.Error(err)) - shardConfirmationProcessedTotal.WithLabelValues("error").Inc() - return - } - - e.appFrameStoreMu.Lock() - matchingFrame := e.appFrameStore[string(confirmation.Selector)] - e.appFrameStoreMu.Unlock() - - if matchingFrame == nil { - e.logger.Error("could not find matching frame") - shardConfirmationProcessedTotal.WithLabelValues("error").Inc() - return - } - - matchingFrame.Header.PublicKeySignatureBls48581 = - confirmation.AggregateSignature - valid, err := e.appFrameValidator.Validate(matchingFrame) - if !valid || err != nil { - e.logger.Error("received invalid confirmation", zap.Error(err)) - shardConfirmationProcessedTotal.WithLabelValues("error").Inc() - return - } - - // Check if we already have a confirmation stowed - exceeds := false - set := 0 - for _, b := range matchingFrame.Header.PublicKeySignatureBls48581.Bitmask { - set += bits.OnesCount8(b) - if set > 1 { - exceeds = true - break - } - } - if exceeds { - // Skip the remaining operations - return - } - - e.txLockMu.Lock() - if _, ok := e.txLockMap[confirmation.FrameNumber]; !ok { - 
e.txLockMap[confirmation.FrameNumber] = make( - map[string]map[string]*LockedTransaction, - ) - } - _, ok := e.txLockMap[confirmation.FrameNumber][string(confirmation.Filter)] - if !ok { - e.txLockMap[confirmation.FrameNumber][string(confirmation.Filter)] = - make(map[string]*LockedTransaction) - } - txSet := e.txLockMap[confirmation.FrameNumber][string(confirmation.Filter)] - for _, l := range txSet { - for _, p := range slices.Collect(slices.Chunk(l.Prover, 32)) { - if bytes.Equal(p, matchingFrame.Header.Prover) { - l.Committed = true - l.Filled = true - } - } - } - e.txLockMu.Unlock() - - shardConfirmationProcessedTotal.WithLabelValues("success").Inc() -} - func (e *GlobalConsensusEngine) peekMessageType(message *pb.Message) uint32 { // Check if data is long enough to contain type prefix if len(message.Data) < 4 { diff --git a/node/consensus/global/message_validation.go b/node/consensus/global/message_validation.go index 5c6f719..c2293d7 100644 --- a/node/consensus/global/message_validation.go +++ b/node/consensus/global/message_validation.go @@ -71,48 +71,20 @@ func (e *GlobalConsensusEngine) validateGlobalConsensusMessage( proposalValidationTotal.WithLabelValues("accept").Inc() - case protobufs.ProverLivenessCheckType: - start := time.Now() - defer func() { - livenessCheckValidationDuration.Observe(time.Since(start).Seconds()) - }() - - livenessCheck := &protobufs.ProverLivenessCheck{} - if err := livenessCheck.FromCanonicalBytes(message.Data); err != nil { - e.logger.Debug("failed to unmarshal liveness check", zap.Error(err)) - livenessCheckValidationTotal.WithLabelValues("reject").Inc() - return tp2p.ValidationResultReject - } - - now := time.Now().UnixMilli() - if livenessCheck.Timestamp > now+5000 || - livenessCheck.Timestamp < now-5000 { - return tp2p.ValidationResultIgnore - } - - // Validate the liveness check - if err := livenessCheck.Validate(); err != nil { - e.logger.Debug("invalid liveness check", zap.Error(err)) - livenessCheckValidationTotal.WithLabelValues("reject").Inc() - return tp2p.ValidationResultReject - } - - livenessCheckValidationTotal.WithLabelValues("accept").Inc() - - case protobufs.FrameVoteType: + case protobufs.ProposalVoteType: start := time.Now() defer func() { voteValidationDuration.Observe(time.Since(start).Seconds()) }() - vote := &protobufs.FrameVote{} + vote := &protobufs.ProposalVote{} if err := vote.FromCanonicalBytes(message.Data); err != nil { e.logger.Debug("failed to unmarshal vote", zap.Error(err)) voteValidationTotal.WithLabelValues("reject").Inc() return tp2p.ValidationResultReject } - now := time.Now().UnixMilli() + now := uint64(time.Now().UnixMilli()) if vote.Timestamp > now+5000 || vote.Timestamp < now-5000 { return tp2p.ValidationResultIgnore } @@ -126,33 +98,33 @@ func (e *GlobalConsensusEngine) validateGlobalConsensusMessage( voteValidationTotal.WithLabelValues("accept").Inc() - case protobufs.FrameConfirmationType: + case protobufs.TimeoutStateType: start := time.Now() defer func() { - confirmationValidationDuration.Observe(time.Since(start).Seconds()) + timeoutStateValidationDuration.Observe(time.Since(start).Seconds()) }() - confirmation := &protobufs.FrameConfirmation{} - if err := confirmation.FromCanonicalBytes(message.Data); err != nil { - e.logger.Debug("failed to unmarshal confirmation", zap.Error(err)) - confirmationValidationTotal.WithLabelValues("reject").Inc() + timeoutState := &protobufs.QuorumCertificate{} + if err := timeoutState.FromCanonicalBytes(message.Data); err != nil { + e.logger.Debug("failed to unmarshal 
timeoutState", zap.Error(err)) + timeoutStateValidationTotal.WithLabelValues("reject").Inc() return tp2p.ValidationResultReject } - now := time.Now().UnixMilli() - if confirmation.Timestamp > now+5000 || - confirmation.Timestamp < now-5000 { + now := uint64(time.Now().UnixMilli()) + if timeoutState.Timestamp > now+5000 || + timeoutState.Timestamp < now-5000 { return tp2p.ValidationResultIgnore } - // Validate the confirmation - if err := confirmation.Validate(); err != nil { - e.logger.Debug("invalid confirmation", zap.Error(err)) - confirmationValidationTotal.WithLabelValues("reject").Inc() + // Validate the timeoutState + if err := timeoutState.Validate(); err != nil { + e.logger.Debug("invalid timeoutState", zap.Error(err)) + timeoutStateValidationTotal.WithLabelValues("reject").Inc() return tp2p.ValidationResultReject } - confirmationValidationTotal.WithLabelValues("accept").Inc() + timeoutStateValidationTotal.WithLabelValues("accept").Inc() default: e.logger.Debug("received unknown type", zap.Uint32("type", typePrefix)) @@ -224,48 +196,20 @@ func (e *GlobalConsensusEngine) validateShardConsensusMessage( shardProposalValidationTotal.WithLabelValues("accept").Inc() - case protobufs.ProverLivenessCheckType: - start := time.Now() - defer func() { - shardLivenessCheckValidationDuration.Observe(time.Since(start).Seconds()) - }() - - livenessCheck := &protobufs.ProverLivenessCheck{} - if err := livenessCheck.FromCanonicalBytes(message.Data); err != nil { - e.logger.Debug("failed to unmarshal liveness check", zap.Error(err)) - shardLivenessCheckValidationTotal.WithLabelValues("reject").Inc() - return tp2p.ValidationResultReject - } - - now := time.Now().UnixMilli() - if livenessCheck.Timestamp > now+500 || - livenessCheck.Timestamp < now-1000 { - shardLivenessCheckValidationTotal.WithLabelValues("ignore").Inc() - return tp2p.ValidationResultIgnore - } - - if err := livenessCheck.Validate(); err != nil { - e.logger.Debug("failed to validate liveness check", zap.Error(err)) - shardLivenessCheckValidationTotal.WithLabelValues("reject").Inc() - return tp2p.ValidationResultReject - } - - shardLivenessCheckValidationTotal.WithLabelValues("accept").Inc() - - case protobufs.FrameVoteType: + case protobufs.ProposalVoteType: start := time.Now() defer func() { shardVoteValidationDuration.Observe(time.Since(start).Seconds()) }() - vote := &protobufs.FrameVote{} + vote := &protobufs.ProposalVote{} if err := vote.FromCanonicalBytes(message.Data); err != nil { e.logger.Debug("failed to unmarshal vote", zap.Error(err)) shardVoteValidationTotal.WithLabelValues("reject").Inc() return tp2p.ValidationResultReject } - now := time.Now().UnixMilli() + now := uint64(time.Now().UnixMilli()) if vote.Timestamp > now+5000 || vote.Timestamp < now-5000 { shardVoteValidationTotal.WithLabelValues("ignore").Inc() return tp2p.ValidationResultIgnore @@ -279,32 +223,32 @@ func (e *GlobalConsensusEngine) validateShardConsensusMessage( shardVoteValidationTotal.WithLabelValues("accept").Inc() - case protobufs.FrameConfirmationType: + case protobufs.TimeoutStateType: start := time.Now() defer func() { - shardConfirmationValidationDuration.Observe(time.Since(start).Seconds()) + shardTimeoutStateValidationDuration.Observe(time.Since(start).Seconds()) }() - confirmation := &protobufs.FrameConfirmation{} - if err := confirmation.FromCanonicalBytes(message.Data); err != nil { - e.logger.Debug("failed to unmarshal confirmation", zap.Error(err)) - shardConfirmationValidationTotal.WithLabelValues("reject").Inc() + timeoutState := 
&protobufs.TimeoutState{} + if err := timeoutState.FromCanonicalBytes(message.Data); err != nil { + e.logger.Debug("failed to unmarshal timeoutState", zap.Error(err)) + shardTimeoutStateValidationTotal.WithLabelValues("reject").Inc() return tp2p.ValidationResultReject } - now := time.Now().UnixMilli() - if confirmation.Timestamp > now+5000 || confirmation.Timestamp < now-5000 { - shardConfirmationValidationTotal.WithLabelValues("ignore").Inc() + now := uint64(time.Now().UnixMilli()) + if timeoutState.Timestamp > now+5000 || timeoutState.Timestamp < now-5000 { + shardTimeoutStateValidationTotal.WithLabelValues("ignore").Inc() return tp2p.ValidationResultIgnore } - if err := confirmation.Validate(); err != nil { - e.logger.Debug("failed to validate confirmation", zap.Error(err)) - shardConfirmationValidationTotal.WithLabelValues("reject").Inc() + if err := timeoutState.Validate(); err != nil { + e.logger.Debug("failed to validate timeoutState", zap.Error(err)) + shardTimeoutStateValidationTotal.WithLabelValues("reject").Inc() return tp2p.ValidationResultReject } - shardConfirmationValidationTotal.WithLabelValues("accept").Inc() + shardTimeoutStateValidationTotal.WithLabelValues("accept").Inc() default: return tp2p.ValidationResultReject diff --git a/node/consensus/global/metrics.go b/node/consensus/global/metrics.go index 0d18789..a8fabc6 100644 --- a/node/consensus/global/metrics.go +++ b/node/consensus/global/metrics.go @@ -179,86 +179,86 @@ var ( }, ) - // Global confirmation processing metrics - confirmationProcessedTotal = promauto.NewCounterVec( + // Global timeout state processing metrics + timeoutStateProcessedTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Namespace: metricsNamespace, Subsystem: subsystem, - Name: "confirmation_processed_total", - Help: "Total number of global confirmations processed by the global consensus engine", + Name: "timeout_state_processed_total", + Help: "Total number of global timeouts processed by the global consensus engine", }, []string{"status"}, // status: "success", "error", "invalid" ) - confirmationProcessingDuration = promauto.NewHistogram( + timeoutStateProcessingDuration = promauto.NewHistogram( prometheus.HistogramOpts{ Namespace: metricsNamespace, Subsystem: subsystem, - Name: "confirmation_processing_duration_seconds", - Help: "Time taken to process a global confirmation", + Name: "timeout_state_processing_duration_seconds", + Help: "Time taken to process a global timeout", Buckets: prometheus.DefBuckets, }, ) - // Global confirmation validation metrics - confirmationValidationTotal = promauto.NewCounterVec( + // Global timeout state validation metrics + timeoutStateValidationTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Namespace: metricsNamespace, Subsystem: subsystem, - Name: "confirmation_validation_total", - Help: "Total number of global confirmation validations", + Name: "timeout_state_validation_total", + Help: "Total number of global timeout state validations", }, []string{"result"}, // result: "accept", "reject", "ignore" ) - confirmationValidationDuration = promauto.NewHistogram( + timeoutStateValidationDuration = promauto.NewHistogram( prometheus.HistogramOpts{ Namespace: metricsNamespace, Subsystem: subsystem, - Name: "confirmation_validation_duration_seconds", - Help: "Time taken to validate a global confirmation", + Name: "timeout_state_validation_duration_seconds", + Help: "Time taken to validate a global timeout", Buckets: prometheus.DefBuckets, }, ) - // Shard confirmation processing metrics - 
shardConfirmationProcessedTotal = promauto.NewCounterVec( + // Shard timeout state processing metrics + shardTimeoutStateProcessedTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Namespace: metricsNamespace, Subsystem: subsystem, - Name: "shard_confirmation_processed_total", - Help: "Total number of shard confirmations processed by the global consensus engine", + Name: "shard_timeout_state_processed_total", + Help: "Total number of shard timeouts processed by the global consensus engine", }, []string{"status"}, // status: "success", "error", "invalid" ) - shardConfirmationProcessingDuration = promauto.NewHistogram( + shardTimeoutStateProcessingDuration = promauto.NewHistogram( prometheus.HistogramOpts{ Namespace: metricsNamespace, Subsystem: subsystem, - Name: "shard_confirmation_processing_duration_seconds", - Help: "Time taken to process a shard confirmation", + Name: "shard_timeout_state_processing_duration_seconds", + Help: "Time taken to process a shard timeout", Buckets: prometheus.DefBuckets, }, ) - // Shard confirmation validation metrics - shardConfirmationValidationTotal = promauto.NewCounterVec( + // Shard timeout state validation metrics + shardTimeoutStateValidationTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Namespace: metricsNamespace, Subsystem: subsystem, - Name: "shard_confirmation_validation_total", - Help: "Total number of shard confirmation validations", + Name: "shard_timeout_state_validation_total", + Help: "Total number of shard timeout state validations", }, []string{"result"}, // result: "accept", "reject", "ignore" ) - shardConfirmationValidationDuration = promauto.NewHistogram( + shardTimeoutStateValidationDuration = promauto.NewHistogram( prometheus.HistogramOpts{ Namespace: metricsNamespace, Subsystem: subsystem, - Name: "shard_confirmation_validation_duration_seconds", - Help: "Time taken to validate a shard confirmation", + Name: "shard_timeout_state_validation_duration_seconds", + Help: "Time taken to validate a shard timeout", Buckets: prometheus.DefBuckets, }, ) diff --git a/node/consensus/time/app_time_reel.go b/node/consensus/time/app_time_reel.go index 8c10112..a427c68 100644 --- a/node/consensus/time/app_time_reel.go +++ b/node/consensus/time/app_time_reel.go @@ -15,6 +15,7 @@ import ( "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/protobufs" "source.quilibrium.com/quilibrium/monorepo/types/consensus" "source.quilibrium.com/quilibrium/monorepo/types/store" @@ -105,9 +106,7 @@ type AppTimeReel struct { ) error // Control - ctx context.Context - cancel context.CancelFunc - wg sync.WaitGroup + ctx context.Context // Archive mode: whether to hold historic frame data archiveMode bool @@ -126,8 +125,6 @@ func NewAppTimeReel( return nil, errors.Wrap(err, "failed to create LRU cache") } - ctx, cancel := context.WithCancel(context.Background()) - return &AppTimeReel{ logger: logger, address: address, @@ -153,8 +150,6 @@ func NewAppTimeReel( return nil }, store: clockStore, - ctx: ctx, - cancel: cancel, archiveMode: archiveMode, }, nil } @@ -180,27 +175,28 @@ func (a *AppTimeReel) SetRevertFunc( } // Start starts the app time reel -func (a *AppTimeReel) Start() error { +func (a *AppTimeReel) Start( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, +) { + a.ctx = ctx a.logger.Info( "starting app time reel", zap.String("address", fmt.Sprintf("%x", a.address)), ) if err := 
a.bootstrapFromStore(); err != nil { - return errors.Wrap(err, "start app time reel") + ctx.Throw(errors.Wrap(err, "start app time reel")) + return } - return nil -} + ready() + <-ctx.Done() -// Stop stops the app time reel -func (a *AppTimeReel) Stop() { a.logger.Info( "stopping app time reel", zap.String("address", fmt.Sprintf("%x", a.address)), ) - a.cancel() - a.wg.Wait() close(a.eventCh) close(a.eventDone) } diff --git a/node/consensus/time/app_time_reel_test.go b/node/consensus/time/app_time_reel_test.go index 5812f00..a2483cc 100644 --- a/node/consensus/time/app_time_reel_test.go +++ b/node/consensus/time/app_time_reel_test.go @@ -12,6 +12,7 @@ import ( "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" "go.uber.org/zap" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/protobufs" "source.quilibrium.com/quilibrium/monorepo/types/mocks" ) @@ -51,12 +52,10 @@ func TestAppTimeReel_BasicOperations(t *testing.T) { atr, err := NewAppTimeReel(logger, address, createTestProverRegistry(true), s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() - + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Test address getter assert.Equal(t, address, atr.GetAddress()) @@ -135,11 +134,10 @@ func TestAppTimeReel_WrongAddress(t *testing.T) { atr, err := NewAppTimeReel(logger, address, createTestProverRegistry(true), s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Try to insert frame with wrong address wrongFrame := &protobufs.AppShardFrame{ @@ -166,11 +164,10 @@ func TestAppTimeReel_Equivocation(t *testing.T) { atr, err := NewAppTimeReel(logger, address, createTestProverRegistry(true), s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Subscribe to events eventCh := atr.GetEventCh() @@ -268,11 +265,10 @@ func TestAppTimeReel_Fork(t *testing.T) { atr, err := NewAppTimeReel(logger, address, createTestProverRegistry(true), s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Insert genesis genesis := &protobufs.AppShardFrame{ @@ -342,11 +338,10 @@ func TestAppTimeReel_ParentValidation(t *testing.T) { atr, err := NewAppTimeReel(logger, address, createTestProverRegistry(true), s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Insert genesis genesis := &protobufs.AppShardFrame{ @@ -409,11 +404,10 @@ func TestAppTimeReel_ForkDetection(t *testing.T) { atr, err := NewAppTimeReel(logger, address, 
createTestProverRegistry(true), s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Collect events @@ -492,11 +486,10 @@ func TestAppTimeReel_ForkChoice_MoreSignatures(t *testing.T) { atr, err := NewAppTimeReel(logger, address, createTestProverRegistry(true), s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Drain any existing events @@ -603,11 +596,10 @@ func TestAppTimeReel_ForkChoice_NoReplacement(t *testing.T) { atr, err := NewAppTimeReel(logger, address, createTestProverRegistry(true), s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Drain any existing events @@ -714,11 +706,10 @@ func TestAppTimeReel_DeepForkChoice_ReverseInsertion(t *testing.T) { atr, err := NewAppTimeReel(logger, address, reg, s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Drain any existing events @@ -1047,11 +1038,10 @@ func TestAppTimeReel_MultipleProvers(t *testing.T) { atr, err := NewAppTimeReel(logger, address, createTestProverRegistry(true), s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Different provers create frames provers := [][]byte{ @@ -1175,11 +1165,10 @@ func TestAppTimeReel_ComplexForkWithOutOfOrderInsertion(t *testing.T) { atr, err := NewAppTimeReel(logger, address, proverRegistry, s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Collect all events @@ -1392,11 +1381,10 @@ func TestAppTimeReel_TreePruning(t *testing.T) { atr, err := NewAppTimeReel(logger, address, createTestProverRegistry(true), s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Insert genesis genesis := &protobufs.AppShardFrame{ @@ -1481,11 +1469,10 @@ func TestAppTimeReel_TreePruningWithForks(t *testing.T) { atr, err := NewAppTimeReel(logger, address, createTestProverRegistry(true), s, true) require.NoError(t, err) - 
err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Insert genesis genesis := &protobufs.AppShardFrame{ @@ -1601,11 +1588,10 @@ func TestAppTimeReel_ForkChoiceInsertionOrder(t *testing.T) { atr, err := NewAppTimeReel(logger, address, reg, s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Drain any existing events @@ -1817,11 +1803,10 @@ func TestAppTimeReel_ForkEventsWithReplay(t *testing.T) { atr, err := NewAppTimeReel(logger, address, createTestProverRegistry(true), s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Collect all events @@ -2000,11 +1985,10 @@ func TestAppTimeReel_ComprehensiveEquivocation(t *testing.T) { atr, err := NewAppTimeReel(logger, address, createTestProverRegistry(true), s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Collect equivocation events @@ -2162,11 +2146,10 @@ func TestAppTimeReel_ProverRegistryForkChoice(t *testing.T) { atr, err := NewAppTimeReel(logger, address, proverRegistry, s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Create genesis frame @@ -2293,11 +2276,10 @@ func TestAppTimeReel_ProverRegistryWithOrderedProvers(t *testing.T) { atr, err := NewAppTimeReel(logger, address, proverRegistry, s, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Create genesis frame genesis := &protobufs.AppShardFrame{ diff --git a/node/consensus/time/global_time_reel.go b/node/consensus/time/global_time_reel.go index d0cf073..99f2707 100644 --- a/node/consensus/time/global_time_reel.go +++ b/node/consensus/time/global_time_reel.go @@ -14,6 +14,7 @@ import ( "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/protobufs" "source.quilibrium.com/quilibrium/monorepo/types/consensus" "source.quilibrium.com/quilibrium/monorepo/types/store" @@ -110,8 +111,7 @@ type GlobalTimeReel struct { ) error // Control - ctx context.Context - cancel context.CancelFunc + ctx context.Context // Network-specific consensus toggles genesisFrameNumber 
uint64 @@ -135,8 +135,6 @@ func NewGlobalTimeReel( return nil, errors.Wrap(err, "new global time reel") } - ctx, cancel := context.WithCancel(context.Background()) - genesisFrameNumber := uint64(0) if network == 0 { @@ -169,8 +167,6 @@ func NewGlobalTimeReel( ) error { return nil }, - ctx: ctx, - cancel: cancel, genesisFrameNumber: genesisFrameNumber, archiveMode: archiveMode, }, nil @@ -199,21 +195,23 @@ func (g *GlobalTimeReel) SetRevertFunc( } // Start starts the global time reel -func (g *GlobalTimeReel) Start() error { +func (g *GlobalTimeReel) Start( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, +) { + g.ctx = ctx g.logger.Info("starting global time reel") // Warm the in-memory tree/cache from store. if err := g.bootstrapFromStore(); err != nil { - return errors.Wrap(err, "start") + ctx.Throw(err) + return } - return nil -} + ready() + <-ctx.Done() -// Stop stops the global time reel -func (g *GlobalTimeReel) Stop() { g.logger.Info("stopping global time reel") - g.cancel() close(g.eventCh) close(g.eventDone) } diff --git a/node/consensus/time/global_time_reel_equivocation_test.go b/node/consensus/time/global_time_reel_equivocation_test.go index 5061f73..10dfee2 100644 --- a/node/consensus/time/global_time_reel_equivocation_test.go +++ b/node/consensus/time/global_time_reel_equivocation_test.go @@ -10,6 +10,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/zap" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/protobufs" ) @@ -24,11 +25,10 @@ func TestGlobalTimeReel_MassiveEquivocationForkChoice(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Collect events @@ -152,11 +152,10 @@ func TestGlobalTimeReel_EquivocationWithForkChoice(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Drain initial events @@ -252,11 +251,10 @@ func TestGlobalTimeReel_NonOverlappingForks(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Insert genesis genesis := &protobufs.GlobalFrame{ diff --git a/node/consensus/time/global_time_reel_test.go b/node/consensus/time/global_time_reel_test.go index 8b4402f..7c318fc 100644 --- a/node/consensus/time/global_time_reel_test.go +++ b/node/consensus/time/global_time_reel_test.go @@ -11,6 +11,7 @@ import ( "github.com/stretchr/testify/require" "go.uber.org/zap" "source.quilibrium.com/quilibrium/monorepo/config" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/node/store" 
"source.quilibrium.com/quilibrium/monorepo/protobufs" ) @@ -27,11 +28,10 @@ func TestGlobalTimeReel_BasicOperations(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Test inserting genesis frame genesis := &protobufs.GlobalFrame{ @@ -108,11 +108,10 @@ func TestGlobalTimeReel_Equivocation(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Subscribe to events eventCh := atr.GetEventCh() @@ -208,11 +207,10 @@ func TestGlobalTimeReel_Fork(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Insert genesis genesis := &protobufs.GlobalFrame{ @@ -280,11 +278,10 @@ func TestGlobalTimeReel_ParentValidation(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Insert genesis genesis := &protobufs.GlobalFrame{ @@ -342,11 +339,10 @@ func TestGlobalTimeReel_ForkDetection(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Collect events @@ -423,11 +419,10 @@ func TestGlobalTimeReel_ForkChoice_MoreSignatures(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Drain any existing events @@ -536,11 +531,10 @@ func TestGlobalTimeReel_ForkChoice_NoReplacement(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Drain any existing events @@ -641,11 +635,10 
@@ func TestGlobalTimeReel_DeepForkChoice_ReverseInsertion(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Drain any existing events @@ -872,11 +865,10 @@ func TestGlobalTimeReel_TreePruning(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Insert genesis genesis := &protobufs.GlobalFrame{ @@ -956,11 +948,10 @@ func TestGlobalTimeReel_TreePruningWithForks(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Insert genesis genesis := &protobufs.GlobalFrame{ @@ -1066,11 +1057,10 @@ func TestGlobalTimeReel_ForkChoiceInsertionOrder(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Drain any existing events @@ -1227,11 +1217,10 @@ func TestGlobalTimeReel_ForkEventsWithReplay(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Collect all events @@ -1395,11 +1384,10 @@ func TestGlobalTimeReel_ComprehensiveEquivocation(t *testing.T) { atr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = atr.Start() - require.NoError(t, err) - defer atr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go atr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() eventCh := atr.GetEventCh() // Collect equivocation events @@ -1529,8 +1517,10 @@ func TestGlobalTimeReel_NonArchive_BootstrapLoadsWindowOf360(t *testing.T) { // Start a new reel in non-archive mode; it should bootstrap only last 360. 
tr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, false) require.NoError(t, err) - require.NoError(t, tr.Start()) - defer tr.Stop() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go tr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() head, err := tr.GetHead() require.NoError(t, err) @@ -1555,8 +1545,10 @@ func TestGlobalTimeReel_NonArchive_SnapForward_WhenGapExceeds360(t *testing.T) { tr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, false) require.NoError(t, err) - require.NoError(t, tr.Start()) - defer tr.Stop() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go tr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() head, err := tr.GetHead() require.NoError(t, err) @@ -1586,8 +1578,10 @@ func TestGlobalTimeReel_NonArchive_PrunesStore_AsHeadAdvances(t *testing.T) { tr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, false) require.NoError(t, err) - require.NoError(t, tr.Start()) - defer tr.Stop() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go tr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Insert a contiguous chain via Insert so persistCanonicalFrames runs and // prunes store. @@ -1622,10 +1616,10 @@ func TestGlobalTimeReel_NonArchive_PendingResolves_WhenParentArrives(t *testing. tr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, false) require.NoError(t, err) - require.NoError(t, tr.Start()) - defer tr.Stop() - - ctx := context.Background() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go tr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() var prev *protobufs.GlobalFrame for n := uint64(90); n <= 99; n++ { @@ -1686,10 +1680,11 @@ func TestGlobalTimeReel_NonArchive_SnapThenAppend_NoSpuriousForks(t *testing.T) tr, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, false) require.NoError(t, err) - require.NoError(t, tr.Start()) - defer tr.Stop() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go tr.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() - ctx := context.Background() eventCh := tr.GetEventCh() // Drain any startup/new head events. 
@@ -1823,8 +1818,10 @@ func buildAndPersistChain(t *testing.T, s *store.PebbleClockStore, start, end ui // note: needs to be non-archive otherwise insert will only set as pending reel, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, false) require.NoError(t, err) - require.NoError(t, reel.Start()) - defer reel.Stop() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go reel.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() var prev *protobufs.GlobalFrame for n := start; n <= end; n++ { diff --git a/node/consensus/time/simple_equivocation_test.go b/node/consensus/time/simple_equivocation_test.go index 833bbb6..f7442a8 100644 --- a/node/consensus/time/simple_equivocation_test.go +++ b/node/consensus/time/simple_equivocation_test.go @@ -3,10 +3,12 @@ package time import ( "context" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/zap" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/protobufs" ) @@ -17,9 +19,10 @@ func TestGlobalTimeReel_SimpleEquivocation(t *testing.T) { globalReel, err := NewGlobalTimeReel(logger, createTestProverRegistry(true), s, 99, true) require.NoError(t, err) - err = globalReel.Start() - require.NoError(t, err) - defer globalReel.Stop() + ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background()) + go globalReel.Start(ctx, func() {}) + time.Sleep(100 * time.Millisecond) + defer cancel() // Insert genesis genesis := &protobufs.GlobalFrame{ diff --git a/node/execution/engines/compute_execution_engine.go b/node/execution/engines/compute_execution_engine.go index ad9d2cb..a273915 100644 --- a/node/execution/engines/compute_execution_engine.go +++ b/node/execution/engines/compute_execution_engine.go @@ -11,6 +11,7 @@ import ( "github.com/pkg/errors" "go.uber.org/zap" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/node/execution/fees" "source.quilibrium.com/quilibrium/monorepo/node/execution/intrinsics/compute" hgstate "source.quilibrium.com/quilibrium/monorepo/node/execution/state/hypergraph" @@ -41,7 +42,7 @@ type ComputeExecutionEngine struct { intrinsicsMutex sync.RWMutex mode ExecutionMode mu sync.RWMutex - stopChan chan struct{} + ctx lifecycle.SignalerContext } func NewComputeExecutionEngine( @@ -136,45 +137,16 @@ func (e *ComputeExecutionEngine) GetCapabilities() []*protobufs.Capability { return capabilities } -func (e *ComputeExecutionEngine) Start() <-chan error { - errChan := make(chan error, 1) +func (e *ComputeExecutionEngine) Start( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, +) { + e.logger.Info("starting compute execution engine") + e.ctx = ctx + ready() - e.mu.Lock() - e.stopChan = make(chan struct{}, 1) - e.mu.Unlock() - - go func() { - e.logger.Info("starting compute execution engine") - - <-e.stopChan - e.logger.Info("stopping compute execution engine") - }() - - return errChan -} - -func (e *ComputeExecutionEngine) Stop(force bool) <-chan error { - errChan := make(chan error) - - go func() { - e.logger.Info("stopping compute execution engine", zap.Bool("force", force)) - - // Signal stop if we have a stopChan - e.mu.RLock() - if e.stopChan != nil { - select { - case <-e.stopChan: - // Already closed - default: - close(e.stopChan) - } - } - e.mu.RUnlock() - - close(errChan) - }() - - return errChan + <-ctx.Done() + e.logger.Info("stopping compute execution engine") } func (e 
*ComputeExecutionEngine) Prove( @@ -478,37 +450,42 @@ func (e *ComputeExecutionEngine) validateBundle( // Validate each operation in the bundle sequentially for i, op := range bundle.Requests { - e.logger.Debug( - "validating bundled operation", - zap.Int("operation", i), - zap.String("address", hex.EncodeToString(address)), - ) - - // Check if this is a compute operation type - isComputeOp := op.GetComputeDeploy() != nil || - op.GetComputeUpdate() != nil || - op.GetCodeDeploy() != nil || - op.GetCodeExecute() != nil || - op.GetCodeFinalize() != nil - - if !isComputeOp { - // Skip non-compute operations + select { + case <-e.ctx.Done(): + return errors.Wrap(errors.New("context canceled"), "validate bundle") + default: e.logger.Debug( - "skipping non-compute operation in bundle", + "validating bundled operation", zap.Int("operation", i), + zap.String("address", hex.EncodeToString(address)), ) - continue - } - // Validate this operation individually - err := e.validateIndividualMessage( - frameNumber, - address, - op, - true, - ) - if err != nil { - return errors.Wrap(err, "validate bundle") + // Check if this is a compute operation type + isComputeOp := op.GetComputeDeploy() != nil || + op.GetComputeUpdate() != nil || + op.GetCodeDeploy() != nil || + op.GetCodeExecute() != nil || + op.GetCodeFinalize() != nil + + if !isComputeOp { + // Skip non-compute operations + e.logger.Debug( + "skipping non-compute operation in bundle", + zap.Int("operation", i), + ) + continue + } + + // Validate this operation individually + err := e.validateIndividualMessage( + frameNumber, + address, + op, + true, + ) + if err != nil { + return errors.Wrap(err, "validate bundle") + } } } diff --git a/node/execution/engines/compute_execution_engine_test.go b/node/execution/engines/compute_execution_engine_test.go index 9791d56..736a2ed 100644 --- a/node/execution/engines/compute_execution_engine_test.go +++ b/node/execution/engines/compute_execution_engine_test.go @@ -1487,8 +1487,6 @@ req:A a rdfs:Property; assert.NoError(t, err) assertCodeExecutionResult(t, mode, msgs, err, false) } - - <-engine.Stop(false) }) }) @@ -1569,8 +1567,6 @@ req:A a rdfs:Property; if engineMode == engines.GlobalMode && err == nil { assert.NotNil(t, msgs, "Bundled operations should produce responses in GlobalMode") } - - <-engine.Stop(false) }) }) @@ -1660,7 +1656,6 @@ req:A a rdfs:Property; msgs, err := engine.ProcessMessage(1, big.NewInt(0), msg.Address, msg.Payload, state) assertCodeExecutionResult(t, mode, msgs, err, false) - <-engine.Stop(false) }) }) @@ -1753,7 +1748,6 @@ req:A a rdfs:Property; msgs, err := engine.ProcessMessage(1, big.NewInt(0), msg.Address, msg.Payload, state) assertCodeExecutionResult(t, mode, msgs, err, false) - <-engine.Stop(false) }) }) @@ -1855,7 +1849,6 @@ req:A a rdfs:Property; msgs, err := engine.ProcessMessage(1, big.NewInt(0), msg.Address, msg.Payload, state) assertCodeExecutionResult(t, mode, msgs, err, false) - <-engine.Stop(false) }) }) @@ -1943,7 +1936,6 @@ req:A a rdfs:Property; assert.Error(t, err) assert.Nil(t, msgs) - <-engine.Stop(false) }) }) @@ -2015,7 +2007,6 @@ req:A a rdfs:Property; assert.Error(t, err) assert.Nil(t, msgs) - <-engine.Stop(false) }) }) @@ -2124,7 +2115,6 @@ req:A a rdfs:Property; } assert.Nil(t, msgs) - <-engine.Stop(false) }) }) @@ -2401,7 +2391,6 @@ req:A a rdfs:Property; } assert.Nil(t, msgs) - <-engine.Stop(false) }) }) @@ -2485,7 +2474,6 @@ req:A a rdfs:Property; assert.Error(t, err) assert.Nil(t, msgs) - <-engine.Stop(false) }) }) @@ -2570,7 +2558,6 @@ req:A a 
rdfs:Property; msgs, err := engine.ProcessMessage(1, big.NewInt(0), msg.Address, msg.Payload, state) assertCodeExecutionResult(t, mode, msgs, err, false) - <-engine.Stop(false) }) }) @@ -2648,7 +2635,6 @@ req:A a rdfs:Property; msgs, err := engine.ProcessMessage(1, big.NewInt(0), msg.Address, msg.Payload, state) assertCodeExecutionResult(t, mode, msgs, err, false) - <-engine.Stop(false) }) }) @@ -2728,7 +2714,6 @@ req:A a rdfs:Property; msgs, err := engine.ProcessMessage(1, big.NewInt(0), msg.Address, msg.Payload, state) assertCodeExecutionResult(t, mode, msgs, err, false) - <-engine.Stop(false) }) }) @@ -2814,7 +2799,6 @@ req:A a rdfs:Property; msgs, err := engine.ProcessMessage(1, big.NewInt(0), msg.Address, msg.Payload, state) assertCodeExecutionResult(t, mode, msgs, err, false) - <-engine.Stop(false) }) }) @@ -2901,7 +2885,6 @@ req:A a rdfs:Property; msgs, err := engine.ProcessMessage(1, big.NewInt(0), msg.Address, msg.Payload, state) assertCodeExecutionResult(t, mode, msgs, err, false) - <-engine.Stop(false) }) }) @@ -2997,7 +2980,6 @@ req:A a rdfs:Property; msgs, err := engine.ProcessMessage(1, big.NewInt(0), msg.Address, msg.Payload, state) assertCodeExecutionResult(t, mode, msgs, err, false) - <-engine.Stop(false) }) }) @@ -3093,7 +3075,6 @@ req:A a rdfs:Property; // All operations should be in the same stage since there are no conflicts - <-engine.Stop(false) }) }) @@ -3197,7 +3178,6 @@ req:A a rdfs:Property; // Should produce stages: [op1], [op2, op3], [op4] - <-engine.Stop(false) }) }) @@ -3304,7 +3284,6 @@ req:A a rdfs:Property; // Stage 1: op3 (conflicts with op1) // Stage 2: op4 (depends on op1 and op2) - <-engine.Stop(false) }) }) @@ -3396,7 +3375,6 @@ req:A a rdfs:Property; assert.Equal(t, []byte("op2"), ce.ExecuteOperations[1].Identifier) assert.Equal(t, [][]byte{[]byte("op1")}, ce.ExecuteOperations[1].Dependencies) - <-engine.Stop(false) }) }) @@ -3492,7 +3470,6 @@ req:A a rdfs:Property; // The execution stages should be computed and stored - <-engine.Stop(false) }) }) @@ -3572,7 +3549,6 @@ req:A a rdfs:Property; // Verify rendezvous is stored correctly assert.NotNil(t, ce.Rendezvous) - <-engine.Stop(false) }) }) @@ -3659,7 +3635,6 @@ req:A a rdfs:Property; msgs, err := engine.ProcessMessage(1, big.NewInt(0), msg.Address, msg.Payload, state) assertCodeExecutionResult(t, mode, msgs, err, false) - <-engine.Stop(false) }) }) @@ -3791,7 +3766,6 @@ req:A a rdfs:Property; // Since we can't easily verify the exact calls, we trust the test passes } - <-engine.Stop(false) }) }) @@ -3916,7 +3890,6 @@ req:A a rdfs:Property; // State changes should not be committed for failed operations - <-engine.Stop(false) }) }) @@ -4096,7 +4069,6 @@ req:A a rdfs:Property; msgs, err := engine.ProcessMessage(1, big.NewInt(0), msg.Address, msg.Payload, state) assertCodeExecutionResult(t, mode, msgs, err, false) - <-engine.Stop(false) }) }) @@ -4229,7 +4201,6 @@ req:A a rdfs:Property; msgs, err := engine.ProcessMessage(1, big.NewInt(0), msg.Address, msg.Payload, state) assertCodeExecutionResult(t, mode, msgs, err, false) - <-engine.Stop(false) }) }) @@ -4353,7 +4324,6 @@ req:A a rdfs:Property; assert.Contains(t, err.Error(), "empty") } - <-engine.Stop(false) }) }) @@ -4491,7 +4461,6 @@ rdfs:range req:Request. 
assert.Contains(t, err.Error(), "limit") } - <-engine.Stop(false) }) }) @@ -4610,7 +4579,6 @@ req:A a rdfs:Property; assertCodeExecutionResult(t, mode, msgs, err, true) assert.Len(t, cf.Results, 2) - <-engine.Stop(false) }) }) @@ -4705,7 +4673,6 @@ req:A a rdfs:Property; msgs, err := engine.ProcessMessage(1, big.NewInt(0), msg.Address, msg.Payload, state) assertCodeExecutionResult(t, mode, msgs, err, false) - <-engine.Stop(false) }) }) @@ -4831,7 +4798,6 @@ req:A a rdfs:Property; } assert.Nil(t, msgs) - <-engine.Stop(false) }) }) @@ -4954,7 +4920,6 @@ req:A a rdfs:Property; msgs, err := engine.ProcessMessage(1, big.NewInt(0), msg.Address, msg.Payload, state) assertCodeExecutionResult(t, mode, msgs, err, false) - <-engine.Stop(false) }) }) @@ -5077,7 +5042,6 @@ req:A a rdfs:Property; msgs, err := engine.ProcessMessage(1, big.NewInt(0), msg.Address, msg.Payload, state) assertCodeExecutionResult(t, mode, msgs, err, false) - <-engine.Stop(false) }) }) @@ -5196,7 +5160,6 @@ req:A a rdfs:Property; assertCodeExecutionResult(t, mode, msgs, err, false) } - <-engine.Stop(false) }) }) @@ -5335,7 +5298,6 @@ req:A a rdfs:Property; } } - <-engine.Stop(false) }) }) } diff --git a/node/execution/engines/global_execution_engine.go b/node/execution/engines/global_execution_engine.go index aaec4df..96430f7 100644 --- a/node/execution/engines/global_execution_engine.go +++ b/node/execution/engines/global_execution_engine.go @@ -12,6 +12,7 @@ import ( "github.com/pkg/errors" "go.uber.org/zap" "source.quilibrium.com/quilibrium/monorepo/config" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/node/execution/intrinsics/global" "source.quilibrium.com/quilibrium/monorepo/node/execution/intrinsics/token" "source.quilibrium.com/quilibrium/monorepo/protobufs" @@ -46,7 +47,7 @@ type GlobalExecutionEngine struct { intrinsics map[string]intrinsics.Intrinsic intrinsicsMutex sync.RWMutex mu sync.RWMutex - stopChan chan struct{} + ctx lifecycle.SignalerContext } func NewGlobalExecutionEngine( @@ -119,45 +120,15 @@ func (e *GlobalExecutionEngine) GetCapabilities() []*protobufs.Capability { } } -func (e *GlobalExecutionEngine) Start() <-chan error { - errChan := make(chan error, 1) - - e.mu.Lock() - e.stopChan = make(chan struct{}, 1) - e.mu.Unlock() - - go func() { - e.logger.Info("starting global execution engine") - - <-e.stopChan - e.logger.Info("stopping global execution engine") - }() - - return errChan -} - -func (e *GlobalExecutionEngine) Stop(force bool) <-chan error { - errChan := make(chan error, 1) - - go func() { - e.logger.Info("stopping global execution engine", zap.Bool("force", force)) - - // Signal stop if we have a stopChan - e.mu.RLock() - if e.stopChan != nil { - select { - case <-e.stopChan: - // Already closed - default: - close(e.stopChan) - } - } - e.mu.RUnlock() - - close(errChan) - }() - - return errChan +func (e *GlobalExecutionEngine) Start( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, +) { + e.ctx = ctx + e.logger.Info("starting global execution engine") + ready() + <-e.ctx.Done() + e.logger.Info("stopping global execution engine") } func (e *GlobalExecutionEngine) ValidateMessage( @@ -217,49 +188,54 @@ func (e *GlobalExecutionEngine) validateBundle( // Validate each operation in the bundle sequentially for i, op := range bundle.Requests { - e.logger.Debug( - "validating bundled operation", - zap.Int("operation", i), - zap.String("address", hex.EncodeToString(address)), - ) - - // Check if this is a global operation type - 
isGlobalOp := op.GetJoin() != nil || - op.GetLeave() != nil || - op.GetPause() != nil || - op.GetResume() != nil || - op.GetConfirm() != nil || - op.GetReject() != nil || - op.GetKick() != nil || - op.GetUpdate() != nil || - op.GetShard() != nil - - if !isGlobalOp { - if e.config.Network == 0 && - frameNumber <= token.FRAME_2_1_EXTENDED_ENROLL_CONFIRM_END { - return errors.Wrap( - errors.New("enrollment period has not ended"), - "validate bundle", - ) - } - // Skip non-global operations (e.g., token payments, compute ops) - // They are retained in the bundle for reference but not validated here + select { + case <-e.ctx.Done(): + return errors.Wrap(errors.New("context canceled"), "validate bundle") + default: e.logger.Debug( - "skipping non-global operation in bundle", + "validating bundled operation", zap.Int("operation", i), + zap.String("address", hex.EncodeToString(address)), ) - continue - } - // Validate this operation individually - err := e.validateIndividualMessage( - frameNumber, - address, - op, - true, - ) - if err != nil { - return errors.Wrap(err, "validate bundle") + // Check if this is a global operation type + isGlobalOp := op.GetJoin() != nil || + op.GetLeave() != nil || + op.GetPause() != nil || + op.GetResume() != nil || + op.GetConfirm() != nil || + op.GetReject() != nil || + op.GetKick() != nil || + op.GetUpdate() != nil || + op.GetShard() != nil + + if !isGlobalOp { + if e.config.Network == 0 && + frameNumber <= token.FRAME_2_1_EXTENDED_ENROLL_CONFIRM_END { + return errors.Wrap( + errors.New("enrollment period has not ended"), + "validate bundle", + ) + } + // Skip non-global operations (e.g., token payments, compute ops) + // They are retained in the bundle for reference but not validated here + e.logger.Debug( + "skipping non-global operation in bundle", + zap.Int("operation", i), + ) + continue + } + + // Validate this operation individually + err := e.validateIndividualMessage( + frameNumber, + address, + op, + true, + ) + if err != nil { + return errors.Wrap(err, "validate bundle") + } } } diff --git a/node/execution/engines/global_execution_engine_test.go b/node/execution/engines/global_execution_engine_test.go index 39882e4..6a15e4a 100644 --- a/node/execution/engines/global_execution_engine_test.go +++ b/node/execution/engines/global_execution_engine_test.go @@ -2,6 +2,7 @@ package engines_test import ( "bytes" + "context" "crypto/rand" "math/big" "slices" @@ -14,6 +15,7 @@ import ( "github.com/stretchr/testify/require" "go.uber.org/zap" "source.quilibrium.com/quilibrium/monorepo/config" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/node/execution/engines" "source.quilibrium.com/quilibrium/monorepo/node/execution/intrinsics/global" hgstate "source.quilibrium.com/quilibrium/monorepo/node/execution/state/hypergraph" @@ -56,7 +58,8 @@ func TestGlobalExecutionEngine_Start(t *testing.T) { require.NoError(t, err) // Test starting and stopping the engine - errChan := engine.Start() + ctx, cancel, errChan := lifecycle.WithSignallerAndCancel(context.Background()) + engine.Start(ctx, func() {}) // Engine should start without errors select { @@ -67,7 +70,8 @@ func TestGlobalExecutionEngine_Start(t *testing.T) { } // Stop the engine - <-engine.Stop(false) + cancel() + <-ctx.Done() } func TestGlobalExecutionEngine_ProcessMessage(t *testing.T) { diff --git a/node/execution/engines/hypergraph_execution_engine.go b/node/execution/engines/hypergraph_execution_engine.go index 38d7a59..c46ded4 100644 --- 
a/node/execution/engines/hypergraph_execution_engine.go +++ b/node/execution/engines/hypergraph_execution_engine.go @@ -11,6 +11,7 @@ import ( "github.com/pkg/errors" "go.uber.org/zap" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/node/execution/fees" hypergraphintrinsic "source.quilibrium.com/quilibrium/monorepo/node/execution/intrinsics/hypergraph" "source.quilibrium.com/quilibrium/monorepo/protobufs" @@ -38,7 +39,7 @@ type HypergraphExecutionEngine struct { intrinsicsMutex sync.RWMutex mode ExecutionMode mu sync.RWMutex - stopChan chan struct{} + ctx lifecycle.SignalerContext } func NewHypergraphExecutionEngine( @@ -204,48 +205,15 @@ func (e *HypergraphExecutionEngine) GetCapabilities() []*protobufs.Capability { } } -func (e *HypergraphExecutionEngine) Start() <-chan error { - errChan := make(chan error, 1) - - e.mu.Lock() - e.stopChan = make(chan struct{}, 1) - e.mu.Unlock() - - go func() { - e.logger.Info("starting hypergraph execution engine") - - <-e.stopChan - e.logger.Info("stopping hypergraph execution engine") - }() - - return errChan -} - -func (e *HypergraphExecutionEngine) Stop(force bool) <-chan error { - errChan := make(chan error, 1) - - go func() { - e.logger.Info( - "stopping hypergraph execution engine", - zap.Bool("force", force), - ) - - // Signal stop if we have a stopChan - e.mu.RLock() - if e.stopChan != nil { - select { - case <-e.stopChan: - // Already closed - default: - close(e.stopChan) - } - } - e.mu.RUnlock() - - close(errChan) - }() - - return errChan +func (e *HypergraphExecutionEngine) Start( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, +) { + e.ctx = ctx + e.logger.Info("starting hypergraph execution engine") + ready() + <-e.ctx.Done() + e.logger.Info("stopping hypergraph execution engine") } func (e *HypergraphExecutionEngine) ValidateMessage( @@ -310,38 +278,43 @@ func (e *HypergraphExecutionEngine) validateBundle( // Validate each operation in the bundle sequentially for i, op := range bundle.Requests { - e.logger.Debug( - "validating bundled operation", - zap.Int("operation", i), - zap.String("address", hex.EncodeToString(address)), - ) - - // Check if this is a hypergraph operation type - isHypergraphOp := op.GetHypergraphDeploy() != nil || - op.GetHypergraphUpdate() != nil || - op.GetVertexAdd() != nil || - op.GetVertexRemove() != nil || - op.GetHyperedgeAdd() != nil || - op.GetHyperedgeRemove() != nil - - if !isHypergraphOp { - // Skip non-hypergraph operations + select { + case <-e.ctx.Done(): + return errors.Wrap(errors.New("context canceled"), "validate bundle") + default: e.logger.Debug( - "skipping non-hypergraph operation in bundle", + "validating bundled operation", zap.Int("operation", i), + zap.String("address", hex.EncodeToString(address)), ) - continue - } - // Validate this operation individually - err := e.validateIndividualMessage( - frameNumber, - address, - op, - true, - ) - if err != nil { - return errors.Wrap(err, "validate bundle") + // Check if this is a hypergraph operation type + isHypergraphOp := op.GetHypergraphDeploy() != nil || + op.GetHypergraphUpdate() != nil || + op.GetVertexAdd() != nil || + op.GetVertexRemove() != nil || + op.GetHyperedgeAdd() != nil || + op.GetHyperedgeRemove() != nil + + if !isHypergraphOp { + // Skip non-hypergraph operations + e.logger.Debug( + "skipping non-hypergraph operation in bundle", + zap.Int("operation", i), + ) + continue + } + + // Validate this operation individually + err := e.validateIndividualMessage( + 
frameNumber, + address, + op, + true, + ) + if err != nil { + return errors.Wrap(err, "validate bundle") + } } } diff --git a/node/execution/engines/hypergraph_execution_engine_test.go b/node/execution/engines/hypergraph_execution_engine_test.go index ee6dd0a..40ffa3a 100644 --- a/node/execution/engines/hypergraph_execution_engine_test.go +++ b/node/execution/engines/hypergraph_execution_engine_test.go @@ -1,6 +1,7 @@ package engines_test import ( + "context" "crypto/rand" "math/big" "testing" @@ -12,6 +13,7 @@ import ( "go.uber.org/zap" "golang.org/x/crypto/sha3" hgcrdt "source.quilibrium.com/quilibrium/monorepo/hypergraph" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/node/execution/engines" "source.quilibrium.com/quilibrium/monorepo/node/execution/intrinsics/hypergraph" hgstate "source.quilibrium.com/quilibrium/monorepo/node/execution/state/hypergraph" @@ -48,7 +50,8 @@ func TestHypergraphExecutionEngine_Start(t *testing.T) { require.NoError(t, err) // Test starting and stopping the engine - errChan := engine.Start() + ctx, cancel, errChan := lifecycle.WithSignallerAndCancel(context.Background()) + engine.Start(ctx, func() {}) // Engine should start without errors select { @@ -59,7 +62,8 @@ func TestHypergraphExecutionEngine_Start(t *testing.T) { } // Stop the engine - <-engine.Stop(false) + cancel() + <-ctx.Done() } func TestHypergraphExecutionEngine_ProcessMessage_Deploy(t *testing.T) { diff --git a/node/execution/engines/token_execution_engine.go b/node/execution/engines/token_execution_engine.go index ce01fcf..7099578 100644 --- a/node/execution/engines/token_execution_engine.go +++ b/node/execution/engines/token_execution_engine.go @@ -11,6 +11,7 @@ import ( "github.com/pkg/errors" "go.uber.org/zap" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/node/execution/fees" "source.quilibrium.com/quilibrium/monorepo/node/execution/intrinsics/token" "source.quilibrium.com/quilibrium/monorepo/protobufs" @@ -49,7 +50,7 @@ type TokenExecutionEngine struct { intrinsicsMutex sync.RWMutex mode ExecutionMode mu sync.RWMutex - stopChan chan struct{} + ctx lifecycle.SignalerContext } func NewTokenExecutionEngine( @@ -254,45 +255,15 @@ func (e *TokenExecutionEngine) GetCapabilities() []*protobufs.Capability { } } -func (e *TokenExecutionEngine) Start() <-chan error { - errChan := make(chan error, 1) - - e.mu.Lock() - e.stopChan = make(chan struct{}, 1) - e.mu.Unlock() - - go func() { - e.logger.Info("starting token execution engine") - - <-e.stopChan - e.logger.Info("stopping token execution engine") - }() - - return errChan -} - -func (e *TokenExecutionEngine) Stop(force bool) <-chan error { - errChan := make(chan error, 1) - - go func() { - e.logger.Info("stopping token execution engine", zap.Bool("force", force)) - - // Signal stop if we have a stopChan - e.mu.RLock() - if e.stopChan != nil { - select { - case <-e.stopChan: - // Already closed - default: - close(e.stopChan) - } - } - e.mu.RUnlock() - - close(errChan) - }() - - return errChan +func (e *TokenExecutionEngine) Start( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, +) { + e.ctx = ctx + e.logger.Info("starting token execution engine") + ready() + <-e.ctx.Done() + e.logger.Info("stopping token execution engine") } func (e *TokenExecutionEngine) ValidateMessage( @@ -357,37 +328,42 @@ func (e *TokenExecutionEngine) validateBundle( // Validate each operation in the bundle sequentially for i, op := range bundle.Requests { - 
e.logger.Debug( - "validating bundled operation", - zap.Int("operation", i), - zap.String("address", hex.EncodeToString(address)), - ) - - // Check if this is a hypergraph operation type - isHypergraphOp := op.GetTokenDeploy() != nil || - op.GetTokenUpdate() != nil || - op.GetTransaction() != nil || - op.GetMintTransaction() != nil || - op.GetPendingTransaction() != nil - - if !isHypergraphOp { - // Skip non-token operations + select { + case <-e.ctx.Done(): + return errors.Wrap(errors.New("context canceled"), "validate bundle") + default: e.logger.Debug( - "skipping non-token operation in bundle", + "validating bundled operation", zap.Int("operation", i), + zap.String("address", hex.EncodeToString(address)), ) - continue - } - // Validate this operation individually - err := e.validateIndividualMessage( - frameNumber, - address, - op, - true, - ) - if err != nil { - return errors.Wrap(err, "validate bundle") + // Check if this is a hypergraph operation type + isHypergraphOp := op.GetTokenDeploy() != nil || + op.GetTokenUpdate() != nil || + op.GetTransaction() != nil || + op.GetMintTransaction() != nil || + op.GetPendingTransaction() != nil + + if !isHypergraphOp { + // Skip non-token operations + e.logger.Debug( + "skipping non-token operation in bundle", + zap.Int("operation", i), + ) + continue + } + + // Validate this operation individually + err := e.validateIndividualMessage( + frameNumber, + address, + op, + true, + ) + if err != nil { + return errors.Wrap(err, "validate bundle") + } } } diff --git a/node/execution/engines/token_execution_engine_test.go b/node/execution/engines/token_execution_engine_test.go index 25cc9b9..ed4f35a 100644 --- a/node/execution/engines/token_execution_engine_test.go +++ b/node/execution/engines/token_execution_engine_test.go @@ -2,6 +2,7 @@ package engines_test import ( "bytes" + "context" "crypto/rand" "math/big" "slices" @@ -14,6 +15,7 @@ import ( "github.com/stretchr/testify/require" "go.uber.org/zap" "golang.org/x/crypto/sha3" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/node/execution/engines" "source.quilibrium.com/quilibrium/monorepo/node/execution/intrinsics/token" hgstate "source.quilibrium.com/quilibrium/monorepo/node/execution/state/hypergraph" @@ -86,7 +88,8 @@ func TestTokenExecutionEngine_Start(t *testing.T) { require.NoError(t, err) // Test starting and stopping the engine - errChan := engine.Start() + ctx, cancel, errChan := lifecycle.WithSignallerAndCancel(context.Background()) + engine.Start(ctx, func() {}) // Engine should start without errors select { @@ -97,7 +100,8 @@ func TestTokenExecutionEngine_Start(t *testing.T) { } // Stop the engine - <-engine.Stop(false) + cancel() + <-ctx.Done() } func TestTokenExecutionEngine_ProcessMessage_DeployEdgeCases(t *testing.T) { diff --git a/node/execution/manager/execution_manager.go b/node/execution/manager/execution_manager.go index 13f89a5..862f82c 100644 --- a/node/execution/manager/execution_manager.go +++ b/node/execution/manager/execution_manager.go @@ -8,12 +8,12 @@ import ( "slices" "strings" "sync" - "time" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" "source.quilibrium.com/quilibrium/monorepo/config" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/node/execution/engines" "source.quilibrium.com/quilibrium/monorepo/node/execution/intrinsics/compute" hypergraphintrinsic 
"source.quilibrium.com/quilibrium/monorepo/node/execution/intrinsics/hypergraph" @@ -33,6 +33,7 @@ import ( // ExecutionEngineManager manages the lifecycle and coordination of execution // engines type ExecutionEngineManager struct { + builder lifecycle.ComponentManagerBuilder logger *zap.Logger config *config.Config engines map[string]execution.ShardExecutionEngine @@ -51,8 +52,6 @@ type ExecutionEngineManager struct { proverRegistry consensus.ProverRegistry blsConstructor crypto.BlsConstructor includeGlobal bool - quit chan struct{} - wg sync.WaitGroup } // NewExecutionEngineManager creates a new execution engine manager @@ -74,7 +73,7 @@ func NewExecutionEngineManager( blsConstructor crypto.BlsConstructor, includeGlobal bool, ) (*ExecutionEngineManager, error) { - return &ExecutionEngineManager{ + em := &ExecutionEngineManager{ logger: logger.With( zap.String("component", "execution_manager"), ), @@ -94,8 +93,20 @@ func NewExecutionEngineManager( proverRegistry: proverRegistry, blsConstructor: blsConstructor, includeGlobal: includeGlobal, - quit: make(chan struct{}), - }, nil + } + + err := em.InitializeEngines() + if err != nil { + return nil, err + } + + em.builder = lifecycle.NewComponentManagerBuilder() + + for _, engine := range em.engines { + em.builder.AddWorker(engine.Start) + } + + return em, nil } // InitializeEngines creates and registers all execution engines @@ -146,109 +157,15 @@ func (m *ExecutionEngineManager) InitializeEngines() error { } // StartAll starts all registered execution engines -func (m *ExecutionEngineManager) StartAll(quit chan struct{}) error { - m.enginesMu.RLock() - defer m.enginesMu.RUnlock() - +func (m *ExecutionEngineManager) Start( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, +) { m.logger.Info("starting all execution engines") - - for name, engine := range m.engines { - m.wg.Add(1) - go func(name string, engine execution.ShardExecutionEngine) { - defer m.wg.Done() - - m.logger.Info("starting execution engine", zap.String("engine", name)) - - // Start the engine - errChan := engine.Start() - - // Wait for any startup errors - select { - case err := <-errChan: - if err != nil { - m.logger.Error( - "execution engine failed to start", - zap.String("engine", name), - zap.Error(err), - ) - } - case <-time.After(5 * time.Second): - // Give engines time to report startup errors - m.logger.Info( - "execution engine started successfully", - zap.String("engine", name), - ) - } - }(name, engine) - } - - return nil -} - -// StopAll stops all execution engines -func (m *ExecutionEngineManager) StopAll(force bool) error { - m.enginesMu.RLock() - defer m.enginesMu.RUnlock() - - m.logger.Info("stopping all execution engines") - - var stopErrors []error - stopWg := sync.WaitGroup{} - - for name, engine := range m.engines { - stopWg.Add(1) - go func(name string, engine execution.ShardExecutionEngine) { - defer stopWg.Done() - - m.logger.Info("stopping execution engine", zap.String("engine", name)) - - errChan := engine.Stop(force) - select { - case err := <-errChan: - if err != nil && !force { - m.logger.Error( - "error stopping execution engine", - zap.String("engine", name), - zap.Error(err), - ) - stopErrors = append(stopErrors, err) - } - case <-time.After(10 * time.Second): - if !force { - err := errors.Errorf("timeout stopping engine: %s", name) - m.logger.Error( - "timeout stopping execution engine", - zap.String("engine", name), - ) - stopErrors = append(stopErrors, err) - } - } - }(name, engine) - } - - stopWg.Wait() - - if len(stopErrors) > 
0 && !force { - return errors.Errorf("failed to stop %d engines", len(stopErrors)) - } - - // Wait for all goroutines to finish - done := make(chan struct{}) - go func() { - m.wg.Wait() - close(done) - }() - - select { - case <-done: - m.logger.Info("all execution engines stopped") - case <-time.After(30 * time.Second): - if !force { - return errors.New("timeout waiting for execution engines to stop") - } - } - - return nil + m.builder.Build().Start(ctx) + ready() + <-ctx.Done() + m.logger.Info("all execution engines stopped") } // GetEngine returns a specific execution engine by name @@ -737,30 +654,3 @@ func (m *ExecutionEngineManager) selectEngine( return nil } - -// RegisterAllEngines registers all engines from the manager with a consensus -// engine -func (m *ExecutionEngineManager) RegisterAllEngines( - registerFunc func(execution.ShardExecutionEngine, uint64) <-chan error, -) error { - m.enginesMu.RLock() - defer m.enginesMu.RUnlock() - - for name, engine := range m.engines { - errChan := registerFunc(engine, 0) // frame 0 for initial registration - select { - case err := <-errChan: - if err != nil { - return errors.Wrapf(err, "failed to register engine: %s", name) - } - m.logger.Info( - "registered engine with consensus", - zap.String("engine", name), - ) - default: - // Non-blocking, registration initiated - } - } - - return nil -} diff --git a/node/go.mod b/node/go.mod index 2f1030b..e4fd6c1 100644 --- a/node/go.mod +++ b/node/go.mod @@ -87,6 +87,7 @@ require ( github.com/deiu/gon3 v0.0.0-20241212124032-93153c038193 // indirect github.com/deiu/rdf2go v0.0.0-20241212211204-b661ba0dfd25 // indirect github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect + github.com/gammazero/deque v0.2.0 // indirect github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 // indirect github.com/libp2p/go-libp2p-routing-helpers v0.7.2 // indirect github.com/libp2p/go-yamux/v5 v5.0.1 // indirect @@ -167,6 +168,7 @@ require ( github.com/decred/dcrd/dcrec/secp256k1/v4 v4.4.0 // indirect github.com/flynn/noise v1.1.0 // indirect github.com/francoispqt/gojay v1.2.13 // indirect + github.com/gammazero/workerpool v1.1.3 github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/gogo/protobuf v1.3.2 // indirect diff --git a/node/p2p/peer_info_manager.go b/node/p2p/peer_info_manager.go index 13571eb..c8f3047 100644 --- a/node/p2p/peer_info_manager.go +++ b/node/p2p/peer_info_manager.go @@ -6,6 +6,7 @@ import ( "time" "go.uber.org/zap" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/protobufs" "source.quilibrium.com/quilibrium/monorepo/types/p2p" ) @@ -13,11 +14,11 @@ import ( type InMemoryPeerInfoManager struct { logger *zap.Logger peerInfoCh chan *protobufs.PeerInfo - quitCh chan struct{} peerInfoMx sync.RWMutex peerMap map[string]*p2p.PeerInfo fastestPeers []*p2p.PeerInfo + ctx lifecycle.SignalerContext } var _ p2p.PeerInfoManager = (*InMemoryPeerInfoManager)(nil) @@ -31,62 +32,59 @@ func NewInMemoryPeerInfoManager(logger *zap.Logger) *InMemoryPeerInfoManager { } } -func (m *InMemoryPeerInfoManager) Start() { - go func() { - for { - select { - case info := <-m.peerInfoCh: - m.peerInfoMx.Lock() - reachability := []p2p.Reachability{} - for _, r := range info.Reachability { - reachability = append(reachability, p2p.Reachability{ - Filter: r.Filter, - PubsubMultiaddrs: r.PubsubMultiaddrs, - StreamMultiaddrs: r.StreamMultiaddrs, - }) - } - capabilities := []p2p.Capability{} - for _, c := range 
info.Capabilities { - capabilities = append(capabilities, p2p.Capability{ - ProtocolIdentifier: c.ProtocolIdentifier, - AdditionalMetadata: c.AdditionalMetadata, - }) - } - seen := time.Now().UnixMilli() - m.peerMap[string(info.PeerId)] = &p2p.PeerInfo{ - PeerId: info.PeerId, - Bandwidth: 100, - Capabilities: capabilities, - Reachability: reachability, - Cores: uint32(len(reachability)), - LastSeen: seen, - } - m.searchAndInsertPeer(&p2p.PeerInfo{ - PeerId: info.PeerId, - Bandwidth: 100, - Capabilities: capabilities, - Reachability: reachability, - Cores: uint32(len(reachability)), - LastSeen: seen, +func (m *InMemoryPeerInfoManager) Start( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, +) { + ready() + for { + select { + case info := <-m.peerInfoCh: + m.peerInfoMx.Lock() + reachability := []p2p.Reachability{} + for _, r := range info.Reachability { + reachability = append(reachability, p2p.Reachability{ + Filter: r.Filter, + PubsubMultiaddrs: r.PubsubMultiaddrs, + StreamMultiaddrs: r.StreamMultiaddrs, }) - m.peerInfoMx.Unlock() - case <-m.quitCh: - return } + capabilities := []p2p.Capability{} + for _, c := range info.Capabilities { + capabilities = append(capabilities, p2p.Capability{ + ProtocolIdentifier: c.ProtocolIdentifier, + AdditionalMetadata: c.AdditionalMetadata, + }) + } + seen := time.Now().UnixMilli() + m.peerMap[string(info.PeerId)] = &p2p.PeerInfo{ + PeerId: info.PeerId, + Bandwidth: 100, + Capabilities: capabilities, + Reachability: reachability, + Cores: uint32(len(reachability)), + LastSeen: seen, + } + m.searchAndInsertPeer(&p2p.PeerInfo{ + PeerId: info.PeerId, + Bandwidth: 100, + Capabilities: capabilities, + Reachability: reachability, + Cores: uint32(len(reachability)), + LastSeen: seen, + }) + m.peerInfoMx.Unlock() + case <-m.ctx.Done(): + return } - }() -} - -func (m *InMemoryPeerInfoManager) Stop() { - go func() { - m.quitCh <- struct{}{} - }() + } } func (m *InMemoryPeerInfoManager) AddPeerInfo(info *protobufs.PeerInfo) { - go func() { - m.peerInfoCh <- info - }() + select { + case <-m.ctx.Done(): + case m.peerInfoCh <- info: + } } func (m *InMemoryPeerInfoManager) GetPeerInfo(peerId []byte) *p2p.PeerInfo { diff --git a/types/consensus/distributor.go b/types/consensus/distributor.go index 075ebe3..a5e51c5 100644 --- a/types/consensus/distributor.go +++ b/types/consensus/distributor.go @@ -1,7 +1,7 @@ package consensus import ( - "context" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" ) type ControlEventType int @@ -126,10 +126,7 @@ func (s *ShardSplitEventData) ControlEventData() {} // EventDistributor defines the interface for event distribution systems type EventDistributor interface { // Start begins the event processing loop with a cancelable context - Start(ctx context.Context) error - - // Stop gracefully shuts down the event distributor - Stop() error + Start(ctx lifecycle.SignalerContext, ready lifecycle.ReadyFunc) // Subscribe registers a new subscriber with a unique ID and returns their // control event channel diff --git a/types/execution/execution_engine.go b/types/execution/execution_engine.go index fed1234..103d5bb 100644 --- a/types/execution/execution_engine.go +++ b/types/execution/execution_engine.go @@ -3,6 +3,7 @@ package execution import ( "math/big" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/protobufs" "source.quilibrium.com/quilibrium/monorepo/types/execution/state" ) @@ -14,8 +15,7 @@ type ProcessMessageResult struct { type ShardExecutionEngine interface { 
GetName() string - Start() <-chan error - Stop(force bool) <-chan error + Start(ctx lifecycle.SignalerContext, ready lifecycle.ReadyFunc) ValidateMessage(frameNumber uint64, address []byte, message []byte) error ProcessMessage( frameNumber uint64, diff --git a/types/go.mod b/types/go.mod index 1cd647a..22936ea 100644 --- a/types/go.mod +++ b/types/go.mod @@ -12,6 +12,8 @@ replace source.quilibrium.com/quilibrium/monorepo/config => ../config replace source.quilibrium.com/quilibrium/monorepo/utils => ../utils +replace source.quilibrium.com/quilibrium/monorepo/lifecycle => ../lifecycle + replace github.com/multiformats/go-multiaddr => ../go-multiaddr replace github.com/multiformats/go-multiaddr-dns => ../go-multiaddr-dns diff --git a/types/mocks/event_distributor.go b/types/mocks/event_distributor.go index 3b85b9d..69c0c8b 100644 --- a/types/mocks/event_distributor.go +++ b/types/mocks/event_distributor.go @@ -1,9 +1,8 @@ package mocks import ( - "context" - "github.com/stretchr/testify/mock" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/types/consensus" ) @@ -11,14 +10,11 @@ type MockEventDistributor struct { mock.Mock } -func (m *MockEventDistributor) Start(ctx context.Context) error { - args := m.Called(ctx) - return args.Error(0) -} - -func (m *MockEventDistributor) Stop() error { - args := m.Called() - return args.Error(0) +func (m *MockEventDistributor) Start( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, +) { + m.Called(ctx, ready) } func (m *MockEventDistributor) Subscribe( @@ -35,3 +31,5 @@ func (m *MockEventDistributor) Publish(event consensus.ControlEvent) { func (m *MockEventDistributor) Unsubscribe(id string) { m.Called(id) } + +var _ consensus.EventDistributor = (*MockEventDistributor)(nil) diff --git a/types/mocks/peer_info_manager.go b/types/mocks/peer_info_manager.go index a12fda5..67cef21 100644 --- a/types/mocks/peer_info_manager.go +++ b/types/mocks/peer_info_manager.go @@ -2,6 +2,7 @@ package mocks import ( "github.com/stretchr/testify/mock" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/protobufs" "source.quilibrium.com/quilibrium/monorepo/types/p2p" ) @@ -36,11 +37,9 @@ func (m *MockPeerInfoManager) GetPeersBySpeed() [][]byte { } // Start implements p2p.PeerInfoManager. -func (m *MockPeerInfoManager) Start() { - m.Called() -} - -// Stop implements p2p.PeerInfoManager. -func (m *MockPeerInfoManager) Stop() { - m.Called() +func (m *MockPeerInfoManager) Start( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, +) { + m.Called(ctx, ready) } diff --git a/types/mocks/shard_execution.go b/types/mocks/shard_execution.go index 40421fc..f6b40ce 100644 --- a/types/mocks/shard_execution.go +++ b/types/mocks/shard_execution.go @@ -4,6 +4,7 @@ import ( "math/big" "github.com/stretchr/testify/mock" + "source.quilibrium.com/quilibrium/monorepo/lifecycle" "source.quilibrium.com/quilibrium/monorepo/protobufs" "source.quilibrium.com/quilibrium/monorepo/types/crypto" "source.quilibrium.com/quilibrium/monorepo/types/execution" @@ -117,15 +118,11 @@ func (m *MockShardExecutionEngine) ProcessMessage( } // Start implements execution.ShardExecutionEngine. -func (m *MockShardExecutionEngine) Start() <-chan error { - args := m.Called() - return args.Get(0).(chan error) -} - -// Stop implements execution.ShardExecutionEngine. 
-func (m *MockShardExecutionEngine) Stop(force bool) <-chan error { - args := m.Called(force) - return args.Get(0).(chan error) +func (m *MockShardExecutionEngine) Start( + ctx lifecycle.SignalerContext, + ready lifecycle.ReadyFunc, +) { + m.Called(ctx, ready) } var _ execution.ShardExecutionEngine = (*MockShardExecutionEngine)(nil) diff --git a/types/p2p/peer_info_manager.go b/types/p2p/peer_info_manager.go index bdeeb7d..aaf617a 100644 --- a/types/p2p/peer_info_manager.go +++ b/types/p2p/peer_info_manager.go @@ -1,10 +1,12 @@ package p2p -import "source.quilibrium.com/quilibrium/monorepo/protobufs" +import ( + "source.quilibrium.com/quilibrium/monorepo/lifecycle" + "source.quilibrium.com/quilibrium/monorepo/protobufs" +) type PeerInfoManager interface { - Start() - Stop() + Start(context lifecycle.SignalerContext, ready lifecycle.ReadyFunc) AddPeerInfo(info *protobufs.PeerInfo) GetPeerInfo(peerId []byte) *PeerInfo GetPeerMap() map[string]*PeerInfo
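
Note on the shared contract: every component touched in this change now starts through the same worker signature, Start(ctx lifecycle.SignalerContext, ready lifecycle.ReadyFunc), in place of the old Start() <-chan error / Stop(force bool) <-chan error pair. The sketch below shows the shape of a conforming worker and how a component manager composes one. It relies only on the lifecycle names visible in this diff (SignalerContext, ReadyFunc, Throw, NewComponentManagerBuilder, AddWorker, Build, Start, WithSignallerAndCancel); tickWorker, fallibleSetup, and the timings are illustrative, not part of the change.

package main

import (
	"context"
	"log"
	"time"

	"source.quilibrium.com/quilibrium/monorepo/lifecycle"
)

// fallibleSetup stands in for any initialization that can fail (hypothetical).
func fallibleSetup() error { return nil }

// tickWorker follows the Start(ctx, ready) contract: perform setup, report
// fatal errors through ctx.Throw, signal ready() exactly once, then block
// until the context is done. Returning on ctx.Done() replaces Stop(force).
func tickWorker(ctx lifecycle.SignalerContext, ready lifecycle.ReadyFunc) {
	if err := fallibleSetup(); err != nil {
		ctx.Throw(err) // fatal: propagated instead of an error channel
		return
	}

	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()

	ready() // setup complete; dependents may now start

	for {
		select {
		case <-ticker.C:
			log.Println("tick")
		case <-ctx.Done():
			return // cooperative shutdown
		}
	}
}

func main() {
	// Mirrors the constructor usage above; the third return value is
	// ignored there as well.
	ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background())

	builder := lifecycle.NewComponentManagerBuilder()
	builder.AddWorker(tickWorker)
	builder.Build().Start(ctx)

	// Run briefly, then cancel; the worker observes ctx.Done() and returns.
	// (Waiting for full teardown would use the component manager's
	// completion API, which this diff does not show.)
	time.Sleep(3 * time.Second)
	cancel()
}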
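
The deletions above show what the contract buys: per-component quit channels, WaitGroups, and the 5s/10s/30s start/stop timeout bookkeeping are gone. Readiness ordering is carried by ready(), shutdown by context cancellation, and fatal errors by ctx.Throw, so a supervisor can bring a whole tree of components up and down through a single signature.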
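
On the test side, the reworked mocks keep the same shape. A sketch of driving MockShardExecutionEngine under the new signature — testify's On/Run/Arguments are existing API, while the test body itself is hypothetical:

package mocks_test

import (
	"context"
	"testing"

	"github.com/stretchr/testify/mock"
	"source.quilibrium.com/quilibrium/monorepo/lifecycle"
	"source.quilibrium.com/quilibrium/monorepo/types/mocks"
)

func TestMockShardExecutionEngine_Start(t *testing.T) {
	ctx, cancel, _ := lifecycle.WithSignallerAndCancel(context.Background())
	defer cancel()

	engine := new(mocks.MockShardExecutionEngine)
	engine.On("Start", mock.Anything, mock.Anything).
		Run(func(args mock.Arguments) {
			// Invoke the captured ReadyFunc, as a conforming worker must.
			args.Get(1).(lifecycle.ReadyFunc)()
		})

	readyCh := make(chan struct{})
	engine.Start(ctx, func() { close(readyCh) })

	<-readyCh // the Run hook fired ready() synchronously
	engine.AssertExpectations(t)
}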