ceremonyclient/node/consensus/app/event_distributor.go
Cassandra Heart 53f7c2b5c9
v2.1.0.2 (#442)
* v2.1.0.2

* restore tweaks to simlibp2p

* fix: nil ref on size calc

* fix: panic should induce shutdown from event_distributor

* fix: friendlier initialization that requires less manual kickstarting for test/devnets

* fix: fewer available shards than provers should choose shard length

* fix: update stored worker registry, improve logging for debug mode

* fix: shut the fuck up, peer log

* qol: log value should be snake cased

* fix: non-archive snap sync issues

* fix: separate X448/Decaf448 signed keys, add onion key to registry

* fix: overflow arithmetic on frame number comparison (see the frame-number sketch after this changelog)

* fix: worker registration should be idempotent if inputs are same, otherwise permit updated records

* fix: remove global prover state from size calculation

* fix: divide by zero case

* fix: eager prover

* fix: broadcast listener default

* qol: diagnostic data for peer authenticator

* fix: master/worker connectivity issue in sparse networks

tight coupling of peer and workers can sometimes interfere if the mesh is sparse, so give workers a pseudoidentity but publish messages with the proper peer key (see the identity sketch after this changelog)

* fix: reorder steps of join creation

* fix: join verify frame source + ensure domain is properly padded (unnecessary but good for consistency)

* fix: add delegate to protobuf <-> reified join conversion

* fix: preempt prover from planning with no workers

* fix: use the unallocated workers to generate a proof

* qol: underflow causes join fail in first ten frames on test/devnets

* qol: small logging tweaks for easier log correlation in debug mode

* qol: use Fisher-Yates shuffle to ensure prover allocations are evenly distributed when scores are equal (see the shuffle sketch after this changelog)

* qol: separate decisional logic on post-enrollment confirmation into consensus engine, proposer, and worker manager where relevant, refactor out scoring

* reuse shard descriptors for both join planning and confirm/reject decisions

* fix: add missing interface method and amend test blossomsub to use new peer id basis

* fix: only check allocations if they exist

* fix: pomw mint proof data needs to be hierarchically under global intrinsic domain

* staging temporary state under diagnostics

* fix: first phase of distributed lock refactoring

* fix: compute intrinsic locking

* fix: hypergraph intrinsic locking

* fix: token intrinsic locking

* fix: update execution engines to support new locking model

* fix: adjust tests with new execution shape

* fix: weave in lock/unlock semantics to liveness provider

* fix: lock fallthrough, add missing allocation update

* qol: additional logging for diagnostics, also testnet/devnet handling for confirmations

* fix: establish grace period on halt scenario to permit recovery

* fix: support test/devnet defaults for coverage scenarios

* fix: nil ref on consensus halts for non-archive nodes

* fix: remove unnecessary prefix from prover ref

* add test coverage for fork choice behaviors and replay – once passing, blocker (2) is resolved

* fix: no fork replay on repeat for non-archive nodes, snap now behaves correctly

* rollup of pre-liveness check lock interactions

* ahead of tests, get the protobuf/metrics-related changes out so teams can prepare

* add test coverage for distributed lock behaviors – once passing, blocker (3) is resolved

* fix: blocker (3)

* Dev docs improvements (#445)

* Make install deps script more robust

* Improve testing instructions

* Worker node should stop upon OS SIGINT/SIGTERM signal (#447) (see the signal-handling sketch after this changelog)

* move pebble close to Stop()

* move deferred Stop() to Start()

* add core id to worker stop log message

* create done os signal channel and stop worker upon message to it

---------

Co-authored-by: Cassandra Heart <7929478+CassOnMars@users.noreply.github.com>

---------

Co-authored-by: Daz <daz_the_corgi@proton.me>
Co-authored-by: Black Swan <3999712+blacks1ne@users.noreply.github.com>
2025-10-23 01:03:06 -05:00
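The overflow fix referenced in the changelog above is not reproduced here; the following standalone Go sketch only illustrates the underlying hazard, assuming frame numbers are unsigned 64-bit counters. frameDistance and isNewer are hypothetical helpers, not repository APIs.

// A minimal sketch of underflow/overflow-safe frame-number arithmetic.
// Subtracting uint64 frame numbers wraps around instead of going negative,
// so distances and "is newer" checks must compare first and subtract second.
package main

import "fmt"

// frameDistance returns |a - b| without wrapping on unsigned subtraction.
func frameDistance(a, b uint64) uint64 {
	if a >= b {
		return a - b
	}
	return b - a
}

// isNewer reports whether candidate is strictly ahead of head.
func isNewer(candidate, head uint64) bool {
	return candidate > head
}

func main() {
	head := uint64(100)
	candidate := uint64(90)
	// Naive "candidate - head" wraps to a huge number here.
	fmt.Println(candidate-head, frameDistance(candidate, head), isNewer(candidate, head))
}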
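A loose sketch of the pseudoidentity idea from the sparse-mesh fix above, with standard-library ed25519 keys standing in for the node's actual key types (an assumption): the worker gets its own ephemeral keypair for mesh identity, while outbound messages are still signed with the long-lived peer key so receivers attribute them to the proper peer. workerIdentity and newWorkerIdentity are illustrative names, not repository types.

// Illustrative only: ephemeral worker identity plus signing under the peer key.
package main

import (
	"crypto/ed25519"
	"crypto/rand"
	"fmt"
)

type workerIdentity struct {
	pub  ed25519.PublicKey
	priv ed25519.PrivateKey
}

func newWorkerIdentity() (*workerIdentity, error) {
	pub, priv, err := ed25519.GenerateKey(rand.Reader)
	if err != nil {
		return nil, err
	}
	return &workerIdentity{pub: pub, priv: priv}, nil
}

func main() {
	peerPub, peerPriv, _ := ed25519.GenerateKey(rand.Reader) // long-lived peer key
	w, err := newWorkerIdentity()                            // ephemeral worker identity
	if err != nil {
		panic(err)
	}

	msg := []byte("frame announcement")
	sig := ed25519.Sign(peerPriv, msg) // publish under the proper peer key

	fmt.Printf("worker mesh id: %x\n", w.pub[:4])
	fmt.Println("valid peer signature:", ed25519.Verify(peerPub, msg, sig))
}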
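A minimal sketch of the tie-breaking approach named in the shuffle item above: provers with equal scores are shuffled with Fisher-Yates before slots are assigned, so ordering among ties is uniform rather than insertion-ordered. The Prover type, Score field, and seeded rng are placeholders, not the repository's actual types.

// Fisher-Yates shuffle applied within runs of equal scores.
package main

import (
	"fmt"
	"math/rand"
	"sort"
)

type Prover struct {
	ID    string
	Score int
}

// shuffleEqualScores sorts provers by descending score, then applies a
// Fisher-Yates shuffle within each run of equal scores.
func shuffleEqualScores(provers []Prover, rng *rand.Rand) {
	sort.SliceStable(provers, func(i, j int) bool {
		return provers[i].Score > provers[j].Score
	})
	start := 0
	for start < len(provers) {
		end := start
		for end < len(provers) && provers[end].Score == provers[start].Score {
			end++
		}
		// Fisher-Yates over provers[start:end]
		for i := end - start - 1; i > 0; i-- {
			j := rng.Intn(i + 1)
			provers[start+i], provers[start+j] = provers[start+j], provers[start+i]
		}
		start = end
	}
}

func main() {
	rng := rand.New(rand.NewSource(42))
	provers := []Prover{{"a", 3}, {"b", 3}, {"c", 3}, {"d", 1}}
	shuffleEqualScores(provers, rng)
	fmt.Println(provers)
}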
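A sketch of the shutdown pattern described in #447, assuming the worker exposes Start and Stop methods (the worker type here is a placeholder): an OS signal channel is created with signal.Notify, and receipt of SIGINT or SIGTERM triggers Stop.

// Stop the worker when the process receives SIGINT or SIGTERM.
package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

type worker struct{ coreID int }

func (w *worker) Start() { fmt.Printf("worker %d started\n", w.coreID) }
func (w *worker) Stop()  { fmt.Printf("worker %d stopped\n", w.coreID) }

func main() {
	w := &worker{coreID: 1}
	w.Start()

	// Buffered channel so a signal delivered before we block is not lost.
	done := make(chan os.Signal, 1)
	signal.Notify(done, syscall.SIGINT, syscall.SIGTERM)

	<-done // block until SIGINT or SIGTERM arrives
	w.Stop()
}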

270 lines · 7.1 KiB · Go

package app

import (
	"encoding/hex"
	"time"

	"github.com/pkg/errors"
	"go.uber.org/zap"

	"source.quilibrium.com/quilibrium/monorepo/node/consensus/global"
	consensustime "source.quilibrium.com/quilibrium/monorepo/node/consensus/time"
	globalintrinsics "source.quilibrium.com/quilibrium/monorepo/node/execution/intrinsics/global"
	typesconsensus "source.quilibrium.com/quilibrium/monorepo/types/consensus"
	"source.quilibrium.com/quilibrium/monorepo/types/schema"
)

// eventDistributorLoop subscribes to control events for this app address and
// handles new heads, equivocations, coverage halts, full halts, and forks
// until the engine context is cancelled or the quit channel fires.
func (e *AppConsensusEngine) eventDistributorLoop() {
	defer func() {
		if r := recover(); r != nil {
			e.logger.Error("fatal error encountered", zap.Any("panic", r))
			if e.cancel != nil {
				e.cancel()
			}
			// Avoid blocking on quit channel during panic recovery
			select {
			case e.quit <- struct{}{}:
			default:
			}
		}
	}()
	defer e.wg.Done()

	// Subscribe to events from the event distributor
	eventCh := e.eventDistributor.Subscribe(hex.EncodeToString(e.appAddress))
	defer e.eventDistributor.Unsubscribe(hex.EncodeToString(e.appAddress))

	for {
		select {
		case <-e.ctx.Done():
			return
		case <-e.quit:
			return
		case event, ok := <-eventCh:
			if !ok {
				e.logger.Error("event channel closed unexpectedly")
				return
			}

			switch event.Type {
			case typesconsensus.ControlEventAppNewHead:
				if data, ok := event.Data.(*consensustime.AppEvent); ok &&
					data.Frame != nil {
					e.logger.Debug(
						"received new app head event",
						zap.Uint64("frame_number", data.Frame.Header.FrameNumber),
					)

					// Record the fee vote from the accepted frame
					if err := e.dynamicFeeManager.AddFrameFeeVote(
						e.appAddress,
						data.Frame.Header.FrameNumber,
						data.Frame.Header.FeeMultiplierVote,
					); err != nil {
						e.logger.Error(
							"failed to add frame fee vote",
							zap.Uint64("frame_number", data.Frame.Header.FrameNumber),
							zap.Uint64("fee_vote", data.Frame.Header.FeeMultiplierVote),
							zap.Error(err),
						)
					}
				}
			case typesconsensus.ControlEventAppEquivocation:
				// Handle equivocation by constructing and publishing a ProverKick
				// message
				if data, ok := event.Data.(*consensustime.AppEvent); ok &&
					data.Frame != nil && data.OldHead != nil {
					e.logger.Warn(
						"received equivocating frame",
						zap.Uint64("frame_number", data.Frame.Header.FrameNumber),
					)

					// The equivocating prover is the one who signed the new frame
					if data.Frame.Header != nil &&
						data.Frame.Header.PublicKeySignatureBls48581 != nil &&
						data.Frame.Header.PublicKeySignatureBls48581.PublicKey != nil {
						kickedProverPublicKey :=
							data.Frame.Header.PublicKeySignatureBls48581.PublicKey.KeyValue

						// Serialize both conflicting frame headers
						conflictingFrame1, err := data.OldHead.Header.ToCanonicalBytes()
						if err != nil {
							e.logger.Error(
								"failed to marshal old frame header",
								zap.Error(err),
							)
							continue
						}

						conflictingFrame2, err := data.Frame.Header.ToCanonicalBytes()
						if err != nil {
							e.logger.Error(
								"failed to marshal new frame header",
								zap.Error(err),
							)
							continue
						}

						// Create the ProverKick message using the intrinsic struct
						proverKick, err := globalintrinsics.NewProverKick(
							data.Frame.Header.FrameNumber,
							kickedProverPublicKey,
							conflictingFrame1,
							conflictingFrame2,
							e.blsConstructor,
							e.frameProver,
							e.hypergraph,
							schema.NewRDFMultiprover(
								&schema.TurtleRDFParser{},
								e.inclusionProver,
							),
							e.proverRegistry,
							e.clockStore,
						)
						if err != nil {
							e.logger.Error(
								"failed to construct prover kick",
								zap.Error(err),
							)
							continue
						}

						err = proverKick.Prove(data.Frame.Header.FrameNumber)
						if err != nil {
							e.logger.Error(
								"failed to prove prover kick",
								zap.Error(err),
							)
							continue
						}

						// Serialize the ProverKick to the request form
						kickBytes, err := proverKick.ToRequestBytes()
						if err != nil {
							e.logger.Error(
								"failed to serialize prover kick",
								zap.Error(err),
							)
							continue
						}

						// Publish the kick message
						if err := e.pubsub.PublishToBitmask(
							global.GLOBAL_PROVER_BITMASK,
							kickBytes,
						); err != nil {
							e.logger.Error("failed to publish prover kick", zap.Error(err))
						} else {
							e.logger.Info(
								"published prover kick for equivocation",
								zap.Uint64("frame_number", data.Frame.Header.FrameNumber),
								zap.String(
									"kicked_prover",
									hex.EncodeToString(kickedProverPublicKey),
								),
							)
						}
					}
				}
			case typesconsensus.ControlEventCoverageHalt:
				data, ok := event.Data.(*typesconsensus.CoverageEventData)
				if ok && data.Message != "" {
					e.logger.Error(data.Message)
					e.halt()
					if err := e.stateMachine.Stop(); err != nil {
						e.logger.Error(
							"error occurred while halting consensus",
							zap.Error(err),
						)
					}
					go func() {
						for {
							select {
							case <-e.ctx.Done():
								return
							case <-time.After(10 * time.Second):
								e.logger.Error(
									"full halt detected, leaving system in halted state until recovery",
								)
							}
						}
					}()
				}
			case typesconsensus.ControlEventHalt:
				data, ok := event.Data.(*typesconsensus.ErrorEventData)
				if ok && data.Error != nil {
					e.logger.Error(
						"full halt detected, leaving system in halted state",
						zap.Error(data.Error),
					)
					e.halt()
					if err := e.stateMachine.Stop(); err != nil {
						e.logger.Error(
							"error occurred while halting consensus",
							zap.Error(err),
						)
					}
					go func() {
						for {
							select {
							case <-e.ctx.Done():
								return
							case <-time.After(10 * time.Second):
								e.logger.Error(
									"full halt detected, leaving system in halted state",
									zap.Error(data.Error),
								)
							}
						}
					}()
				}
			case typesconsensus.ControlEventAppFork:
				if data, ok := event.Data.(*consensustime.AppEvent); ok &&
					data.Frame != nil {
					e.logger.Debug(
						"received new app fork event",
						zap.Uint64("frame_number", data.Frame.Header.FrameNumber),
					)

					// Remove the forked fee votes
					removed, err := e.dynamicFeeManager.RewindToFrame(
						e.appAddress,
						data.Frame.Header.FrameNumber,
					)
					if err != nil {
						e.logger.Error(
							"failed to rewind frame fee vote",
							zap.Uint64("frame_number", data.Frame.Header.FrameNumber),
							zap.Error(err),
						)
					}
					e.logger.Info("rewound fee votes", zap.Int("removed_votes", removed))
				}
			default:
				e.logger.Debug(
					"received unhandled event type",
					zap.Int("event_type", int(event.Type)),
				)
			}
		}
	}
}

// emitAlertEvent wraps the alert message in a halt control event and publishes
// it asynchronously through the event distributor.
func (e *AppConsensusEngine) emitAlertEvent(alertMessage string) {
	event := typesconsensus.ControlEvent{
		Type: typesconsensus.ControlEventHalt,
		Data: &typesconsensus.ErrorEventData{
			Error: errors.New(alertMessage),
		},
	}

	go e.eventDistributor.Publish(event)
	e.logger.Info("emitted alert message")
}