ceremonyclient/node/consensus/global/message_validation.go
Cassandra Heart ce4f77b140
v2.1.0.19 (#515)
* v2.1.0.19

* enhanced error logging, fix seniority marker join blocker, fix sync message size limit defaults

* resolve signature failure

* additional error logging for merge-related signatures

* fix: one-shot sync message size, app shard TC signature size, collector/hotstuff race condition, expired joins blocking new joins due to pruning disable

* remove compat with old 2.0.0 blossomsub

* fix: resolve abandoned prover joins

* reload prover registry

* fix stale worker proposal edge

* add full sanity check on join before submitting to identify bug

* resolve non-fallthrough condition that should be fallthrough

* fix: resolve rare SIGFPE, fix orphan expired joins blocking workers from reallocating

* add reconnect fallback if no peers are found with variable reconnect time (#511)

Co-authored-by: Tyler Sturos <55340199+tjsturos@users.noreply.github.com>

* update base peer count to 1 (#513)

* fix: expired prover join frames, starting port ranges, proposer getting stuck, and seniority on joins

* fix: panic on shutdown, libp2p discovery picking inaccessible peers, coverage event check not in shutdown logic, amend app shard worker behavior to mirror global for prover root reconciliation

* fix: shutdown scenario quirks, reload hanging

* fix: do not bail out early on shutdown of coverage check

* fix: force registry refresh on worker waiting for registration

* add more logging to wait for prover

* fix: worker manager refreshes the filter on allocation, snapshots blocking close on shutdown

* tweak: force shutdown after five seconds for app worker

* fix: don't loop when shutting down

* fix: slight reordering, also added named workers to trace hanging shutdowns

* use deterministic key for peer id of workers to stop flagging workers as sybil attacks

* fix: remove pubsub stop from app consensus engine as it shouldn't manage pubsub lifecycle, integrate shutdown context to PerformSync to prevent stuck syncs from halting respawn

* fix: blossomsub pubsub interface does not properly track subscription status

* fix: subscribe order to avoid nil panic

* switch from dnsaddr to dns4

* add missing quic-v1

* additional logging to isolate respawn quirks

* fix: dnsaddr -> dns4 for blossomsub

* sort-of fix: apply sledgehammer to restart logic

* fix: restore proper respawn logic, fix frozen hypergraph post respawn, unsubscribe from bitmask previously missing

---------

Co-authored-by: winged-pegasus <55340199+winged-pegasus@users.noreply.github.com>
Co-authored-by: Tyler Sturos <55340199+tjsturos@users.noreply.github.com>
2026-02-26 04:20:13 -06:00

666 lines
19 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package global
import (
"encoding/binary"
"encoding/hex"
"time"
"github.com/libp2p/go-libp2p/core/peer"
"go.uber.org/zap"
"source.quilibrium.com/quilibrium/monorepo/go-libp2p-blossomsub/pb"
"source.quilibrium.com/quilibrium/monorepo/node/internal/frametime"
"source.quilibrium.com/quilibrium/monorepo/protobufs"
"source.quilibrium.com/quilibrium/monorepo/types/crypto"
tp2p "source.quilibrium.com/quilibrium/monorepo/types/p2p"
)
// validateGlobalConsensusMessage is the pubsub validator for global consensus
// messages. The first four bytes of the payload are read as a big-endian type
// prefix that selects one of three decoders: global proposal, proposal vote,
// or timeout state. Each branch observes its validation duration and bumps an
// outcome counter labelled "accept", "reject" or "ignore".
//
// Results:
//   - Reject: payload shorter than four bytes, decode failure, vote/timeout
//     Validate failure, or frame validator error/false for a proposal.
//   - Ignore: proposal Validate failure, a proposal older than the current
//     rank, a vote or timeout more than one rank behind the current rank, or
//     an unknown type prefix.
//   - Accept: everything else.
//
// The peer ID argument is unused.
func (e *GlobalConsensusEngine) validateGlobalConsensusMessage(
	_ peer.ID,
	message *pb.Message,
) tp2p.ValidationResult {
	// Check if data is long enough to contain type prefix
	if len(message.Data) < 4 {
		e.logger.Debug(
			"message too short",
			zap.Int("data_length", len(message.Data)),
		)
		return tp2p.ValidationResultReject
	}

	// Read type prefix from first 4 bytes
	typePrefix := binary.BigEndian.Uint32(message.Data[:4])

	switch typePrefix {
	case protobufs.GlobalProposalType:
		start := time.Now()
		defer func() {
			proposalValidationDuration.Observe(time.Since(start).Seconds())
		}()

		proposal := &protobufs.GlobalProposal{}
		if err := proposal.FromCanonicalBytes(message.Data); err != nil {
			e.logger.Debug("failed to unmarshal proposal", zap.Error(err))
			proposalValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultReject
		}

		// NOTE(review): this path is counted as "reject" yet returns Ignore.
		// Left untouched because it is unclear which of the two is intended;
		// confirm and align the label with the returned result.
		if err := proposal.Validate(); err != nil {
			e.logger.Debug("invalid proposal", zap.Error(err))
			proposalValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultIgnore
		}

		if e.currentRank > proposal.GetRank() {
			e.logger.Debug(
				"proposal is stale",
				zap.Uint64("current_rank", e.currentRank),
				zap.Uint64("proposal_rank", proposal.GetRank()),
			)
			// Stale proposals are ignored, not rejected; count them as such.
			proposalValidationTotal.WithLabelValues("ignore").Inc()
			return tp2p.ValidationResultIgnore
		}

		valid, err := e.frameValidator.Validate(proposal.State)
		if err != nil {
			e.logger.Debug("global frame validation error", zap.Error(err))
			proposalValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultReject
		}

		if !valid {
			e.logger.Debug(
				"invalid global frame",
				zap.String("reason", "frame validator returned false"),
			)
			proposalValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultReject
		}

		proposalValidationTotal.WithLabelValues("accept").Inc()

	case protobufs.ProposalVoteType:
		start := time.Now()
		defer func() {
			voteValidationDuration.Observe(time.Since(start).Seconds())
		}()

		vote := &protobufs.ProposalVote{}
		if err := vote.FromCanonicalBytes(message.Data); err != nil {
			e.logger.Debug("failed to unmarshal vote", zap.Error(err))
			voteValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultReject
		}

		// We should still accept votes for the past rank either because a peer
		// needs it, or because we need it to trump a TC
		if e.currentRank > vote.Rank+1 {
			e.logger.Debug(
				"vote is stale",
				zap.Uint64("current_rank", e.currentRank),
				zap.Uint64("vote_rank", vote.Rank),
			)
			// Record the outcome so ignored votes are visible in metrics.
			voteValidationTotal.WithLabelValues("ignore").Inc()
			return tp2p.ValidationResultIgnore
		}

		// Validate the vote
		if err := vote.Validate(); err != nil {
			e.logger.Debug("invalid vote", zap.Error(err))
			voteValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultReject
		}

		voteValidationTotal.WithLabelValues("accept").Inc()

	case protobufs.TimeoutStateType:
		start := time.Now()
		defer func() {
			timeoutStateValidationDuration.Observe(time.Since(start).Seconds())
		}()

		timeoutState := &protobufs.TimeoutState{}
		if err := timeoutState.FromCanonicalBytes(message.Data); err != nil {
			e.logger.Debug("failed to unmarshal timeoutState", zap.Error(err))
			timeoutStateValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultReject
		}

		// Validate the timeoutState
		if err := timeoutState.Validate(); err != nil {
			e.logger.Debug("invalid timeoutState", zap.Error(err))
			timeoutStateValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultReject
		}

		e.logger.Debug(
			"received timeout",
			zap.Uint64("rank", timeoutState.Vote.Rank),
			zap.String(
				"voter",
				hex.EncodeToString([]byte(timeoutState.Vote.Identity())),
			),
		)

		// We should still accept votes for the past rank in case a peer needs it
		if e.currentRank > timeoutState.Vote.Rank+1 {
			e.logger.Debug(
				"timeout is stale",
				zap.Uint64("current_rank", e.currentRank),
				zap.Uint64("timeout_rank", timeoutState.Vote.Rank),
			)
			// Record the outcome so ignored timeouts are visible in metrics.
			timeoutStateValidationTotal.WithLabelValues("ignore").Inc()
			return tp2p.ValidationResultIgnore
		}

		timeoutStateValidationTotal.WithLabelValues("accept").Inc()

	default:
		e.logger.Debug("received unknown type", zap.Uint32("type", typePrefix))
		return tp2p.ValidationResultIgnore
	}

	return tp2p.ValidationResultAccept
}
// validateShardConsensusMessage is the pubsub validator for app shard
// consensus messages. It dispatches on the big-endian uint32 held in the first
// four payload bytes and handles app shard frames (proposals), proposal votes,
// timeout states and prover liveness checks.
//
// Frames, votes and timeouts record a duration observation and an outcome
// counter ("accept", "reject" or "ignore"); liveness checks record neither.
// Payloads that are too short, fail to decode, fail validation, or carry an
// unknown type prefix are rejected. Frames older than 20 seconds and votes or
// timeouts whose timestamp is more than 5000 ms from the local clock are
// ignored. The peer ID argument is unused.
func (e *GlobalConsensusEngine) validateShardConsensusMessage(
	_ peer.ID,
	message *pb.Message,
) tp2p.ValidationResult {
	payload := message.Data

	// Anything shorter than the four byte type prefix cannot be dispatched.
	if len(payload) < 4 {
		e.logger.Debug(
			"message too short",
			zap.Int("data_length", len(payload)),
		)
		return tp2p.ValidationResultReject
	}

	switch kind := binary.BigEndian.Uint32(payload[:4]); kind {
	case protobufs.AppShardFrameType:
		began := time.Now()
		defer func() {
			shardProposalValidationDuration.Observe(time.Since(began).Seconds())
		}()
		tally := func(outcome string) {
			shardProposalValidationTotal.WithLabelValues(outcome).Inc()
		}

		frame := &protobufs.AppShardFrame{}
		err := frame.FromCanonicalBytes(payload)
		if err != nil {
			e.logger.Debug("failed to unmarshal frame", zap.Error(err))
			tally("reject")
			return tp2p.ValidationResultReject
		}

		if frame.Header == nil {
			e.logger.Debug("frame has no header")
			tally("reject")
			return tp2p.ValidationResultReject
		}

		if age := frametime.AppFrameSince(frame); age > 20*time.Second {
			e.logger.Debug(
				"ignoring shard proposal",
				zap.String("reason", "frame too old"),
			)
			tally("ignore")
			return tp2p.ValidationResultIgnore
		}

		// On this topic a frame that already carries a signature is rejected.
		if frame.Header.PublicKeySignatureBls48581 != nil {
			e.logger.Debug("frame validation has signature")
			tally("reject")
			return tp2p.ValidationResultReject
		}

		ok, err := e.appFrameValidator.Validate(frame)
		if err != nil {
			e.logger.Debug("frame validation error", zap.Error(err))
			tally("reject")
			return tp2p.ValidationResultReject
		}

		if !ok {
			e.logger.Debug(
				"invalid app frame",
				zap.String("reason", "frame validator returned false"),
			)
			tally("reject")
			return tp2p.ValidationResultReject
		}

		tally("accept")

	case protobufs.ProposalVoteType:
		began := time.Now()
		defer func() {
			shardVoteValidationDuration.Observe(time.Since(began).Seconds())
		}()
		tally := func(outcome string) {
			shardVoteValidationTotal.WithLabelValues(outcome).Inc()
		}

		vote := &protobufs.ProposalVote{}
		err := vote.FromCanonicalBytes(payload)
		if err != nil {
			e.logger.Debug("failed to unmarshal vote", zap.Error(err))
			tally("reject")
			return tp2p.ValidationResultReject
		}

		// Only votes stamped within five seconds of the local clock proceed.
		now := uint64(time.Now().UnixMilli())
		earliest, latest := now-5000, now+5000
		if ts := vote.Timestamp; ts > latest || ts < earliest {
			e.logger.Debug(
				"ignoring shard vote",
				zap.String("reason", "timestamp out of window"),
				zap.Uint64("timestamp", ts),
				zap.Uint64("now", now),
			)
			tally("ignore")
			return tp2p.ValidationResultIgnore
		}

		err = vote.Validate()
		if err != nil {
			e.logger.Debug("failed to validate vote", zap.Error(err))
			tally("reject")
			return tp2p.ValidationResultReject
		}

		tally("accept")

	case protobufs.TimeoutStateType:
		began := time.Now()
		defer func() {
			shardTimeoutStateValidationDuration.Observe(time.Since(began).Seconds())
		}()
		tally := func(outcome string) {
			shardTimeoutStateValidationTotal.WithLabelValues(outcome).Inc()
		}

		timeout := &protobufs.TimeoutState{}
		err := timeout.FromCanonicalBytes(payload)
		if err != nil {
			e.logger.Debug("failed to unmarshal timeoutState", zap.Error(err))
			tally("reject")
			return tp2p.ValidationResultReject
		}

		// Same five second window as for votes.
		now := uint64(time.Now().UnixMilli())
		earliest, latest := now-5000, now+5000
		if ts := timeout.Timestamp; ts > latest || ts < earliest {
			e.logger.Debug(
				"ignoring shard timeout",
				zap.String("reason", "timestamp out of window"),
				zap.Uint64("timestamp", ts),
				zap.Uint64("now", now),
			)
			tally("ignore")
			return tp2p.ValidationResultIgnore
		}

		err = timeout.Validate()
		if err != nil {
			e.logger.Debug("failed to validate timeoutState", zap.Error(err))
			tally("reject")
			return tp2p.ValidationResultReject
		}

		tally("accept")

	case protobufs.ProverLivenessCheckType:
		// Liveness checks are only decoded and validated; no metrics here.
		liveness := &protobufs.ProverLivenessCheck{}
		err := liveness.FromCanonicalBytes(payload)
		if err != nil {
			e.logger.Debug("failed to unmarshal liveness check", zap.Error(err))
			return tp2p.ValidationResultReject
		}

		err = liveness.Validate()
		if err != nil {
			e.logger.Debug("invalid liveness check", zap.Error(err))
			return tp2p.ValidationResultReject
		}

	default:
		e.logger.Debug(
			"rejecting shard consensus message",
			zap.String("reason", "unknown type prefix"),
			zap.Uint32("type", kind),
		)
		return tp2p.ValidationResultReject
	}

	return tp2p.ValidationResultAccept
}
// validateProverMessage is the pubsub validator for prover messages. Only
// payloads whose four byte big-endian type prefix is MessageBundleType are
// inspected; any other prefix is ignored.
//
// A bundle is rejected if it cannot be decoded or fails Validate, ignored if
// any of its requests is a kick or if its timestamp is more than 5000 ms away
// from the local clock, and accepted otherwise. The sender logged is taken
// from message.From; the peerID argument is not consulted.
func (e *GlobalConsensusEngine) validateProverMessage(
	peerID peer.ID,
	message *pb.Message,
) tp2p.ValidationResult {
	sender := peer.ID(message.From).String()
	e.logger.Debug(
		"validating prover message from peer",
		zap.String("peer_id", sender),
	)

	payload := message.Data

	// Anything shorter than the four byte type prefix cannot be dispatched.
	if len(payload) < 4 {
		e.logger.Debug(
			"message too short",
			zap.Int("data_length", len(payload)),
		)
		return tp2p.ValidationResultReject
	}

	switch kind := binary.BigEndian.Uint32(payload[:4]); kind {
	case protobufs.MessageBundleType:
		e.logger.Debug(
			"validating message bundle from peer",
			zap.String("peer_id", sender),
		)

		// Prover messages arrive wrapped in a MessageBundle.
		bundle := &protobufs.MessageBundle{}
		err := bundle.FromCanonicalBytes(payload)
		if err != nil {
			e.logger.Debug("failed to unmarshal message bundle", zap.Error(err))
			return tp2p.ValidationResultReject
		}

		// A single kick request causes the whole bundle to be ignored.
		for _, request := range bundle.Requests {
			if request.GetKick() == nil {
				continue
			}
			e.logger.Debug(
				"ignoring prover message",
				zap.String("reason", "bundle contains kick request"),
			)
			return tp2p.ValidationResultIgnore
		}

		err = bundle.Validate()
		if err != nil {
			e.logger.Debug("invalid request", zap.Error(err))
			return tp2p.ValidationResultReject
		}

		now := time.Now().UnixMilli()
		earliest, latest := now-5000, now+5000
		if ts := bundle.Timestamp; ts > latest || ts < earliest {
			e.logger.Debug("message too late or too early")
			return tp2p.ValidationResultIgnore
		}

	default:
		e.logger.Debug("received unknown type", zap.Uint32("type", kind))
		return tp2p.ValidationResultIgnore
	}

	return tp2p.ValidationResultAccept
}
// validateAppFrameMessage is the pubsub validator for finalized app shard
// frames. Only payloads whose four byte big-endian type prefix is
// AppShardFrameType are accepted on this topic; any other prefix is rejected.
//
// A frame is rejected when it cannot be decoded, has no header, lacks a
// BLS48-581 signature with a public key value, or fails the app frame
// validator. A frame that passes those checks but is older than 120 seconds
// is ignored. Each outcome is counted in shardFrameValidationTotal and the
// elapsed time is observed in shardFrameValidationDuration. The peer ID
// argument is unused.
func (e *GlobalConsensusEngine) validateAppFrameMessage(
	_ peer.ID,
	message *pb.Message,
) tp2p.ValidationResult {
	// Check if data is long enough to contain type prefix
	if len(message.Data) < 4 {
		e.logger.Debug(
			"message too short",
			zap.Int("data_length", len(message.Data)),
		)
		return tp2p.ValidationResultReject
	}

	// Read type prefix from first 4 bytes
	typePrefix := binary.BigEndian.Uint32(message.Data[:4])

	switch typePrefix {
	case protobufs.AppShardFrameType:
		start := time.Now()
		defer func() {
			shardFrameValidationDuration.Observe(time.Since(start).Seconds())
		}()

		frame := &protobufs.AppShardFrame{}
		if err := frame.FromCanonicalBytes(message.Data); err != nil {
			e.logger.Debug("failed to unmarshal frame", zap.Error(err))
			shardFrameValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultReject
		}

		// The header is dereferenced below; reject a frame without one
		// instead of panicking on input received from the network.
		if frame.Header == nil {
			e.logger.Debug("frame has no header")
			shardFrameValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultReject
		}

		if frame.Header.PublicKeySignatureBls48581 == nil ||
			frame.Header.PublicKeySignatureBls48581.PublicKey == nil ||
			frame.Header.PublicKeySignatureBls48581.PublicKey.KeyValue == nil {
			e.logger.Debug("frame validation missing signature")
			shardFrameValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultReject
		}

		valid, err := e.appFrameValidator.Validate(frame)
		if err != nil {
			e.logger.Debug("frame validation error", zap.Error(err))
			shardFrameValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultReject
		}

		if !valid {
			e.logger.Debug(
				"invalid app frame",
				zap.String("reason", "frame validator returned false"),
			)
			shardFrameValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultReject
		}

		// The age check runs after validation, so an old but invalid frame is
		// rejected rather than ignored.
		if frametime.AppFrameSince(frame) > 120*time.Second {
			e.logger.Debug(
				"ignoring app frame",
				zap.String("reason", "frame too old"),
			)
			shardFrameValidationTotal.WithLabelValues("ignore").Inc()
			return tp2p.ValidationResultIgnore
		}

		shardFrameValidationTotal.WithLabelValues("accept").Inc()

	default:
		e.logger.Debug(
			"rejecting app frame message",
			zap.String("reason", "unknown type prefix"),
			zap.Uint32("type", typePrefix),
		)
		return tp2p.ValidationResultReject
	}

	return tp2p.ValidationResultAccept
}
// validateFrameMessage is the pubsub validator for global frames. Only
// payloads whose four byte big-endian type prefix is GlobalFrameType are
// inspected; any other prefix is ignored.
//
// A frame is rejected when it cannot be decoded, has no header, or lacks a
// BLS48-581 signature with a public key value. It is ignored when its rank is
// more than two behind e.currentRank or when it is older than 120 seconds.
// Each outcome is counted in frameValidationTotal and the elapsed time is
// observed in frameValidationDuration. The peer ID argument is unused.
func (e *GlobalConsensusEngine) validateFrameMessage(
	_ peer.ID,
	message *pb.Message,
) tp2p.ValidationResult {
	// Check if data is long enough to contain type prefix
	if len(message.Data) < 4 {
		e.logger.Debug(
			"message too short",
			zap.Int("data_length", len(message.Data)),
		)
		return tp2p.ValidationResultReject
	}

	// Read type prefix from first 4 bytes
	typePrefix := binary.BigEndian.Uint32(message.Data[:4])

	switch typePrefix {
	case protobufs.GlobalFrameType:
		start := time.Now()
		defer func() {
			frameValidationDuration.Observe(time.Since(start).Seconds())
		}()

		frame := &protobufs.GlobalFrame{}
		if err := frame.FromCanonicalBytes(message.Data); err != nil {
			e.logger.Debug("failed to unmarshal frame", zap.Error(err))
			frameValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultReject
		}

		// The header is dereferenced below; reject a frame without one
		// instead of panicking on input received from the network.
		if frame.Header == nil {
			e.logger.Debug("global frame has no header")
			frameValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultReject
		}

		if frame.Header.PublicKeySignatureBls48581 == nil ||
			frame.Header.PublicKeySignatureBls48581.PublicKey == nil ||
			frame.Header.PublicKeySignatureBls48581.PublicKey.KeyValue == nil {
			e.logger.Debug("global frame validation missing signature")
			frameValidationTotal.WithLabelValues("reject").Inc()
			return tp2p.ValidationResultReject
		}

		// Frames up to two ranks behind the current rank are still let through.
		if e.currentRank > frame.GetRank()+2 {
			e.logger.Debug(
				"ignoring global frame",
				zap.String("reason", "rank too old"),
				zap.Uint64("current_rank", e.currentRank),
				zap.Uint64("frame_rank", frame.GetRank()),
			)
			frameValidationTotal.WithLabelValues("ignore").Inc()
			return tp2p.ValidationResultIgnore
		}

		if frametime.GlobalFrameSince(frame) > 120*time.Second {
			e.logger.Debug(
				"ignoring global frame",
				zap.String("reason", "frame too old"),
			)
			frameValidationTotal.WithLabelValues("ignore").Inc()
			return tp2p.ValidationResultIgnore
		}

		frameValidationTotal.WithLabelValues("accept").Inc()

	default:
		e.logger.Debug("received unknown type", zap.Uint32("type", typePrefix))
		return tp2p.ValidationResultIgnore
	}

	return tp2p.ValidationResultAccept
}
// validatePeerInfoMessage is the pubsub validator for peer info traffic. It
// dispatches on the big-endian uint32 in the first four payload bytes and
// handles PeerInfo and KeyRegistry messages; any other prefix is ignored.
//
// Both message kinds are rejected when they cannot be decoded or fail
// Validate, and are then screened by timestamp against the local clock in
// milliseconds: more than 60000 ms old is rejected, more than 1000 ms old is
// ignored, and more than 5000 ms in the future is ignored. The peer ID
// argument is unused.
func (e *GlobalConsensusEngine) validatePeerInfoMessage(
	_ peer.ID,
	message *pb.Message,
) tp2p.ValidationResult {
	payload := message.Data

	// Anything shorter than the four byte type prefix cannot be dispatched.
	if len(payload) < 4 {
		e.logger.Debug(
			"message too short",
			zap.Int("data_length", len(payload)),
		)
		return tp2p.ValidationResultReject
	}

	switch kind := binary.BigEndian.Uint32(payload[:4]); kind {
	case protobufs.PeerInfoType:
		info := &protobufs.PeerInfo{}
		if err := info.FromCanonicalBytes(payload); err != nil {
			e.logger.Debug("failed to unmarshal peer info", zap.Error(err))
			return tp2p.ValidationResultReject
		}

		if err := info.Validate(); err != nil {
			e.logger.Debug("peer info validation error", zap.Error(err))
			return tp2p.ValidationResultReject
		}

		// Cases are ordered from most to least severe; the first match wins.
		now := time.Now().UnixMilli()
		switch ts := info.Timestamp; {
		case ts < now-60000:
			e.logger.Debug("peer info timestamp too old, rejecting",
				zap.Int64("peer_timestamp", ts),
			)
			return tp2p.ValidationResultReject
		case ts < now-1000:
			e.logger.Debug("peer info timestamp too old, ignoring",
				zap.Int64("peer_timestamp", ts),
			)
			return tp2p.ValidationResultIgnore
		case ts > now+5000:
			e.logger.Debug("peer info timestamp too far in future",
				zap.Int64("peer_timestamp", ts),
			)
			return tp2p.ValidationResultIgnore
		}

	case protobufs.KeyRegistryType:
		registry := &protobufs.KeyRegistry{}
		if err := registry.FromCanonicalBytes(payload); err != nil {
			e.logger.Debug("failed to unmarshal key registry", zap.Error(err))
			return tp2p.ValidationResultReject
		}

		if err := registry.Validate(); err != nil {
			e.logger.Debug("key registry validation error", zap.Error(err))
			return tp2p.ValidationResultReject
		}

		// LastUpdated is converted to int64 so it compares against the clock.
		now := time.Now().UnixMilli()
		switch updated := int64(registry.LastUpdated); {
		case updated < now-60000:
			e.logger.Debug("key registry timestamp too old, rejecting")
			return tp2p.ValidationResultReject
		case updated < now-1000:
			e.logger.Debug("key registry timestamp too old")
			return tp2p.ValidationResultIgnore
		case updated > now+5000:
			e.logger.Debug("key registry timestamp too far in future")
			return tp2p.ValidationResultIgnore
		}

	default:
		e.logger.Debug("received unknown type", zap.Uint32("type", kind))
		return tp2p.ValidationResultIgnore
	}

	return tp2p.ValidationResultAccept
}
// validateAlertMessage is the pubsub validator for global alerts. Only
// payloads whose four byte big-endian type prefix is GlobalAlertType are
// inspected; any other prefix is ignored.
//
// An alert is rejected when it cannot be decoded, fails Validate, or when its
// signature over the alert message does not verify as Ed448 against
// e.alertPublicKey using the "GLOBAL_ALERT" domain bytes. The peer ID argument
// is unused.
func (e *GlobalConsensusEngine) validateAlertMessage(
	_ peer.ID,
	message *pb.Message,
) tp2p.ValidationResult {
	// Check if data is long enough to contain type prefix
	if len(message.Data) < 4 {
		e.logger.Debug(
			"message too short",
			zap.Int("data_length", len(message.Data)),
		)
		return tp2p.ValidationResultReject
	}

	// Read type prefix from first 4 bytes
	typePrefix := binary.BigEndian.Uint32(message.Data[:4])

	switch typePrefix {
	case protobufs.GlobalAlertType:
		alert := &protobufs.GlobalAlert{}
		if err := alert.FromCanonicalBytes(message.Data); err != nil {
			e.logger.Debug("failed to unmarshal alert", zap.Error(err))
			return tp2p.ValidationResultReject
		}

		if err := alert.Validate(); err != nil {
			e.logger.Debug("alert validation error", zap.Error(err))
			return tp2p.ValidationResultReject
		}

		valid, err := e.keyManager.ValidateSignature(
			crypto.KeyTypeEd448,
			e.alertPublicKey,
			[]byte(alert.Message),
			alert.Signature,
			[]byte("GLOBAL_ALERT"),
		)
		if err != nil || !valid {
			// Attach the verifier error, when there is one, so a verification
			// failure can be told apart from a signature that does not match.
			// zap.Error emits no field for a nil error.
			e.logger.Debug("alert signature invalid", zap.Error(err))
			return tp2p.ValidationResultReject
		}

	default:
		e.logger.Debug("received unknown type", zap.Uint32("type", typePrefix))
		return tp2p.ValidationResultIgnore
	}

	return tp2p.ValidationResultAccept
}