ceremonyclient/node/consensus/app/consensus_sync_provider.go
Cassandra Heart 53f7c2b5c9
v2.1.0.2 (#442)
* v2.1.0.2

* restore tweaks to simlibp2p

* fix: nil ref on size calc

* fix: panic should induce shutdown from event_distributor

* fix: friendlier initialization that requires less manual kickstarting for test/devnets

* fix: fewer available shards than provers should choose shard length

* fix: update stored worker registry, improve logging for debug mode

* fix: shut the fuck up, peer log

* qol: log value should be snake cased

* fix: non-archive snap sync issues

* fix: separate X448/Decaf448 signed keys, add onion key to registry

* fix: overflow arithmetic on frame number comparison

* fix: worker registration should be idempotent if inputs are same, otherwise permit updated records

* fix: remove global prover state from size calculation

* fix: divide by zero case

* fix: eager prover

* fix: broadcast listener default

* qol: diagnostic data for peer authenticator

* fix: master/worker connectivity issue in sparse networks

Tight coupling of the peer and its workers can sometimes interfere when the mesh is sparse, so give workers a pseudoidentity but publish messages with the proper peer key.

* fix: reorder steps of join creation

* fix: join verify frame source + ensure domain is properly padded (unnecessary but good for consistency)

* fix: add delegate to protobuf <-> reified join conversion

* fix: preempt prover from planning with no workers

* fix: use the unallocated workers to generate a proof

* qol: underflow causes join fail in first ten frames on test/devnets

* qol: small logging tweaks for easier log correlation in debug mode

* qol: use fisher-yates shuffle to ensure prover allocations are evenly distributed when scores are equal

* qol: separate decisional logic on post-enrollment confirmation into consensus engine, proposer, and worker manager where relevant, refactor out scoring

* reuse shard descriptors for both join planning and confirm/reject decisions

* fix: add missing interface method and amend test blossomsub to use new peer id basis

* fix: only check allocations if they exist

* fix: pomw mint proof data needs to be hierarchically under global intrinsic domain

* staging temporary state under diagnostics

* fix: first phase of distributed lock refactoring

* fix: compute intrinsic locking

* fix: hypergraph intrinsic locking

* fix: token intrinsic locking

* fix: update execution engines to support new locking model

* fix: adjust tests with new execution shape

* fix: weave in lock/unlock semantics to liveness provider

* fix: lock fallthrough, add missing allocation update

* qol: additional logging for diagnostics, also testnet/devnet handling for confirmations

* fix: establish grace period on halt scenario to permit recovery

* fix: support test/devnet defaults for coverage scenarios

* fix: nil ref on consensus halts for non-archive nodes

* fix: remove unnecessary prefix from prover ref

* add test coverage for fork choice behaviors and replay – once passing, blocker (2) is resolved

* fix: no fork replay on repeat for non-archive nodes, snap now behaves correctly

* rollup of pre-liveness check lock interactions

* ahead of tests, get the protobuf/metrics-related changes out so teams can prepare

* add test coverage for distributed lock behaviors – once passing, blocker (3) is resolved

* fix: blocker (3)

* Dev docs improvements (#445)

* Make install deps script more robust

* Improve testing instructions

* Worker node should stop upon OS SIGINT/SIGTERM signal (#447)

* move pebble close to Stop()

* move deferred Stop() to Start()

* add core id to worker stop log message

* create done os signal channel and stop worker upon message to it

---------

Co-authored-by: Cassandra Heart <7929478+CassOnMars@users.noreply.github.com>

---------

Co-authored-by: Daz <daz_the_corgi@proton.me>
Co-authored-by: Black Swan <3999712+blacks1ne@users.noreply.github.com>
2025-10-23 01:03:06 -05:00


package app
import (
"bufio"
"bytes"
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"math/big"
"net/http"
"os"
"path"
"path/filepath"
"strings"
"time"
"github.com/libp2p/go-libp2p/core/crypto"
"github.com/libp2p/go-libp2p/core/peer"
"github.com/pkg/errors"
"go.uber.org/zap"
"google.golang.org/grpc"
"source.quilibrium.com/quilibrium/monorepo/node/execution/intrinsics/token"
"source.quilibrium.com/quilibrium/monorepo/node/internal/frametime"
"source.quilibrium.com/quilibrium/monorepo/protobufs"
"source.quilibrium.com/quilibrium/monorepo/types/tries"
up2p "source.quilibrium.com/quilibrium/monorepo/utils/p2p"
)
// AppSyncProvider implements SyncProvider
type AppSyncProvider struct {
engine *AppConsensusEngine
}
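// Synchronize returns a pair of channels carrying the best frame this node
// can offer and any error encountered. With no stored frames it initializes
// genesis; otherwise it checks the latest stored frame against the hypergraph
// shard commits (hypersyncing on mismatch), syncs with the mesh, and falls
// back to the provided existing frame when nothing newer is available.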
func (p *AppSyncProvider) Synchronize(
existing **protobufs.AppShardFrame,
ctx context.Context,
) (<-chan **protobufs.AppShardFrame, <-chan error) {
dataCh := make(chan **protobufs.AppShardFrame, 1)
errCh := make(chan error, 1)
go func() {
defer close(dataCh)
defer close(errCh)
defer func() {
if r := recover(); r != nil {
errCh <- errors.Wrap(
errors.Errorf("fatal error encountered: %+v", r),
"synchronize",
)
}
}()
// Check if we have a current frame
p.engine.frameStoreMu.RLock()
hasFrame := len(p.engine.frameStore) > 0
p.engine.frameStoreMu.RUnlock()
if !hasFrame {
// No peers and no frame - we're the first node, initialize genesis
p.engine.logger.Info("no frame detected, initializing with genesis")
syncStatusCheck.WithLabelValues(p.engine.appAddressHex, "synced").Inc()
genesis := p.engine.initializeGenesis()
dataCh <- &genesis
errCh <- nil
return
}
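// Do not proceed with sync until the peerstore reflects at least the
// minimum prover count for this shard.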
peerCount := p.engine.pubsub.GetPeerstoreCount()
if peerCount < int(p.engine.minimumProvers()) {
errCh <- errors.Wrap(
errors.New("minimum provers not reached"),
"synchronize",
)
return
}
// We have frames, return the latest one
p.engine.frameStoreMu.RLock()
var latestFrame *protobufs.AppShardFrame
var maxFrameNumber uint64
for _, frame := range p.engine.frameStore {
if frame.Header != nil && frame.Header.FrameNumber > maxFrameNumber {
maxFrameNumber = frame.Header.FrameNumber
latestFrame = frame
}
}
p.engine.frameStoreMu.RUnlock()
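// Compare the hypergraph's shard commits for the latest frame against the
// frame header's state roots; any mismatch means local state is behind and a
// hypersync is requested from the frame's prover.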
if latestFrame != nil {
bits := up2p.GetBloomFilterIndices(p.engine.appAddress, 256, 3)
l2 := make([]byte, 32)
copy(l2, p.engine.appAddress[:min(len(p.engine.appAddress), 32)])
shardKey := tries.ShardKey{
L1: [3]byte(bits),
L2: [32]byte(l2),
}
shouldHypersync := false
comm, err := p.engine.hypergraph.GetShardCommits(
latestFrame.Header.FrameNumber,
p.engine.appAddress,
)
if err != nil {
p.engine.logger.Error("could not get commits", zap.Error(err))
} else {
for i, c := range comm {
if !bytes.Equal(c, latestFrame.Header.StateRoots[i]) {
shouldHypersync = true
break
}
}
if shouldHypersync {
p.hyperSyncWithProver(latestFrame.Header.Prover, shardKey)
}
}
}
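// On network 0, an empty hypergraph for the QUIL token shard is eligible for
// a snapshot reload rather than a full resync.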
// TODO(2.1.1): remove this
if p.engine.config.P2P.Network == 0 &&
bytes.Equal(p.engine.appAddress[:32], token.QUIL_TOKEN_ADDRESS[:]) {
// Empty, candidate for snapshot reload
if p.engine.hypergraph.GetSize(nil, nil).Cmp(big.NewInt(0)) == 0 {
config := p.engine.config.DB
cfgPath := config.Path
coreId := p.engine.coreId
if coreId > 0 && len(config.WorkerPaths) > int(coreId-1) {
cfgPath = config.WorkerPaths[coreId-1]
} else if coreId > 0 {
cfgPath = fmt.Sprintf(config.WorkerPathPrefix, coreId)
}
err := p.downloadSnapshot(
cfgPath,
p.engine.config.P2P.Network,
p.engine.appAddress,
)
if err != nil {
p.engine.logger.Warn(
"could not perform snapshot reload",
zap.Error(err),
)
}
}
}
err := p.syncWithMesh()
if err != nil {
if latestFrame != nil {
dataCh <- &latestFrame
} else if existing != nil {
dataCh <- existing
}
errCh <- err
return
}
if latestFrame != nil {
p.engine.logger.Info("returning latest frame")
dataCh <- &latestFrame
} else if existing != nil {
p.engine.logger.Info("returning existing frame")
dataCh <- existing
}
syncStatusCheck.WithLabelValues(p.engine.appAddressHex, "synced").Inc()
errCh <- nil
}()
return dataCh, errCh
}
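// syncWithMesh iterates the active provers for this app shard, resolves each
// prover's peer ID from its registered identity key, and pulls any newer
// frames from them one peer at a time.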
func (p *AppSyncProvider) syncWithMesh() error {
p.engine.logger.Info("synchronizing with peers")
latest, err := p.engine.appTimeReel.GetHead()
if err != nil {
return errors.Wrap(err, "sync")
}
peers, err := p.engine.proverRegistry.GetActiveProvers(p.engine.appAddress)
if err != nil || len(peers) <= 1 {
p.engine.logger.Info("no peers to sync from")
return nil
}
for _, candidate := range peers {
if bytes.Equal(candidate.Address, p.engine.getProverAddress()) {
continue
}
registry, err := p.engine.keyStore.GetKeyRegistryByProver(
candidate.Address,
)
if err != nil {
continue
}
if registry.IdentityKey == nil || registry.IdentityKey.KeyValue == nil {
continue
}
pub, err := crypto.UnmarshalEd448PublicKey(registry.IdentityKey.KeyValue)
if err != nil {
p.engine.logger.Warn("error unmarshaling identity key", zap.Error(err))
continue
}
peerID, err := peer.IDFromPublicKey(pub)
if err != nil {
p.engine.logger.Warn("error deriving peer id", zap.Error(err))
continue
}
head, err := p.engine.appTimeReel.GetHead()
if err != nil {
return errors.Wrap(err, "sync")
}
if latest.Header.FrameNumber < head.Header.FrameNumber {
latest = head
}
latest, err = p.syncWithPeer(latest, []byte(peerID))
if err != nil {
p.engine.logger.Debug("error syncing frame", zap.Error(err))
}
}
p.engine.logger.Info(
"returning leader frame",
zap.Uint64("frame_number", latest.Header.FrameNumber),
zap.Duration("frame_age", frametime.AppFrameSince(latest)),
)
return nil
}
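// syncWithPeer opens a direct "sync" channel to the given peer and requests
// successive frames after the latest known one, verifying each header and
// inserting the frame into the time reel until the peer has nothing newer.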
func (p *AppSyncProvider) syncWithPeer(
latest *protobufs.AppShardFrame,
peerId []byte,
) (*protobufs.AppShardFrame, error) {
p.engine.logger.Info(
"polling peer for new frames",
zap.String("peer_id", peer.ID(peerId).String()),
zap.Uint64("current_frame", latest.Header.FrameNumber),
)
syncTimeout := p.engine.config.Engine.SyncTimeout
dialCtx, cancelDial := context.WithTimeout(p.engine.ctx, syncTimeout)
defer cancelDial()
cc, err := p.engine.pubsub.GetDirectChannel(dialCtx, peerId, "sync")
if err != nil {
p.engine.logger.Debug(
"could not establish direct channel",
zap.Error(err),
)
return latest, errors.Wrap(err, "sync")
}
defer func() {
if err := cc.Close(); err != nil {
p.engine.logger.Error("error while closing connection", zap.Error(err))
}
}()
client := protobufs.NewAppShardServiceClient(cc)
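// Request frames one at a time; the loop ends when the peer stops returning
// the next expected frame or an error occurs.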
for {
getCtx, cancelGet := context.WithTimeout(p.engine.ctx, syncTimeout)
response, err := client.GetAppShardFrame(
getCtx,
&protobufs.GetAppShardFrameRequest{
Filter: p.engine.appAddress,
FrameNumber: latest.Header.FrameNumber + 1,
},
// The message size limits are swapped because the server is the one
// sending the data.
grpc.MaxCallRecvMsgSize(
p.engine.config.Engine.SyncMessageLimits.MaxSendMsgSize,
),
grpc.MaxCallSendMsgSize(
p.engine.config.Engine.SyncMessageLimits.MaxRecvMsgSize,
),
)
cancelGet()
if err != nil {
p.engine.logger.Debug(
"could not get frame",
zap.Error(err),
)
return latest, errors.Wrap(err, "sync")
}
if response == nil {
p.engine.logger.Debug("received no response from peer")
return latest, nil
}
if response.Frame == nil || response.Frame.Header == nil ||
response.Frame.Header.FrameNumber != latest.Header.FrameNumber+1 ||
response.Frame.Header.Timestamp < latest.Header.Timestamp {
p.engine.logger.Debug("received invalid response from peer")
return latest, nil
}
p.engine.logger.Info(
"received new leading frame",
zap.Uint64("frame_number", response.Frame.Header.FrameNumber),
zap.Duration("frame_age", frametime.AppFrameSince(response.Frame)),
)
if _, err := p.engine.frameProver.VerifyFrameHeader(
response.Frame.Header,
p.engine.blsConstructor,
); err != nil {
return latest, errors.Wrap(err, "sync")
}
err = p.engine.appTimeReel.Insert(p.engine.ctx, response.Frame)
if err != nil {
return latest, errors.Wrap(err, "sync")
}
latest = response.Frame
}
}
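// downloadSnapshot fetches the snapshot manifest for the given network and
// shard lookup key, validates each entry's name and SHA-256 digest, and
// downloads every listed file into <dbPath>/snapshot with retries and hash
// verification.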
func (p *AppSyncProvider) downloadSnapshot(
dbPath string,
network uint8,
lookupKey []byte,
) error {
base := "https://frame-snapshots.quilibrium.com"
keyHex := fmt.Sprintf("%x", lookupKey)
manifestURL := fmt.Sprintf("%s/%d/%s/manifest", base, network, keyHex)
resp, err := http.Get(manifestURL)
if err != nil {
return errors.Wrap(err, "download snapshot")
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return errors.Wrap(
fmt.Errorf("manifest http status %d", resp.StatusCode),
"download snapshot",
)
}
type mfLine struct {
Name string
Hash string // lowercase hex
}
var lines []mfLine
sc := bufio.NewScanner(resp.Body)
sc.Buffer(make([]byte, 0, 64*1024), 10*1024*1024) // handle large manifests
for sc.Scan() {
raw := strings.TrimSpace(sc.Text())
if raw == "" || strings.HasPrefix(raw, "#") {
continue
}
fields := strings.Fields(raw)
if len(fields) != 2 {
return errors.Wrap(
fmt.Errorf("invalid manifest line: %q", raw),
"download snapshot",
)
}
name := fields[0]
hash := strings.ToLower(fields[1])
// quick sanity check hash looks hex
if _, err := hex.DecodeString(hash); err != nil || len(hash) != 64 {
return errors.Wrap(
fmt.Errorf("invalid sha256 hex in manifest for %s: %q", name, hash),
"download snapshot",
)
}
lines = append(lines, mfLine{Name: name, Hash: hash})
}
if err := sc.Err(); err != nil {
return errors.Wrap(err, "download snapshot")
}
if len(lines) == 0 {
return errors.Wrap(errors.New("manifest is empty"), "download snapshot")
}
snapDir := path.Join(dbPath, "snapshot")
// Start fresh
_ = os.RemoveAll(snapDir)
if err := os.MkdirAll(snapDir, 0o755); err != nil {
return errors.Wrap(err, "download snapshot")
}
// Download each file with retries + hash verification
for _, entry := range lines {
srcURL := fmt.Sprintf("%s/%d/%s/%s", base, network, keyHex, entry.Name)
dstPath := filepath.Join(snapDir, entry.Name)
// ensure parent dir exists (manifest may list nested files like CURRENT,
// MANIFEST-xxxx, OPTIONS, *.sst)
if err := os.MkdirAll(filepath.Dir(dstPath), 0o755); err != nil {
return errors.Wrap(
fmt.Errorf("mkdir for %s: %w", dstPath, err),
"download snapshot",
)
}
if err := downloadWithRetryAndHash(
srcURL,
dstPath,
entry.Hash,
5,
); err != nil {
return errors.Wrap(
fmt.Errorf("downloading %s failed: %w", entry.Name, err),
"download snapshot",
)
}
}
return nil
}
// downloadWithRetryAndHash fetches url, stores in dstPath, verifies
// sha256 == expectedHex, and retries up to retries times. Writes atomically via
// a temporary file.
func downloadWithRetryAndHash(
url, dstPath, expectedHex string,
retries int,
) error {
var lastErr error
for attempt := 1; attempt <= retries; attempt++ {
if err := func() error {
resp, err := http.Get(url)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("http status %d", resp.StatusCode)
}
tmp, err := os.CreateTemp(filepath.Dir(dstPath), ".part-*")
if err != nil {
return err
}
defer func() {
tmp.Close()
_ = os.Remove(tmp.Name())
}()
h := sha256.New()
if _, err := io.Copy(io.MultiWriter(tmp, h), resp.Body); err != nil {
return err
}
sumHex := hex.EncodeToString(h.Sum(nil))
if !strings.EqualFold(sumHex, expectedHex) {
return fmt.Errorf(
"hash mismatch for %s: expected %s, got %s",
url,
expectedHex,
sumHex,
)
}
// fsync to be safe before rename
if err := tmp.Sync(); err != nil {
return err
}
// atomic replace
if err := os.Rename(tmp.Name(), dstPath); err != nil {
return err
}
return nil
}(); err != nil {
lastErr = err
// simple backoff: 200ms * attempt
time.Sleep(time.Duration(200*attempt) * time.Millisecond)
continue
}
return nil
}
return lastErr
}
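// hyperSyncWithProver resolves the prover's identity key to a libp2p peer ID,
// opens a direct "sync" channel, and streams all four hypergraph phase sets
// for the shard key. Failures before the stream is established are ignored.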
func (p *AppSyncProvider) hyperSyncWithProver(
prover []byte,
shardKey tries.ShardKey,
) {
registry, err := p.engine.signerRegistry.GetKeyRegistryByProver(prover)
if err != nil || registry == nil || registry.IdentityKey == nil {
return
}
pubKey, err := crypto.UnmarshalEd448PublicKey(registry.IdentityKey.KeyValue)
if err != nil {
return
}
peerId, err := peer.IDFromPublicKey(pubKey)
if err != nil {
return
}
ch, err := p.engine.pubsub.GetDirectChannel(
p.engine.ctx,
[]byte(peerId),
"sync",
)
if err != nil {
return
}
defer ch.Close()
client := protobufs.NewHypergraphComparisonServiceClient(ch)
str, err := client.HyperStream(p.engine.ctx)
if err != nil {
p.engine.logger.Error("error from sync", zap.Error(err))
return
}
p.hyperSyncVertexAdds(str, shardKey)
p.hyperSyncVertexRemoves(str, shardKey)
p.hyperSyncHyperedgeAdds(str, shardKey)
p.hyperSyncHyperedgeRemoves(str, shardKey)
}
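// hyperSyncVertexAdds syncs the vertex-add phase set for the shard key over
// the open hyper stream, then closes the send side of the stream.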
func (p *AppSyncProvider) hyperSyncVertexAdds(
str protobufs.HypergraphComparisonService_HyperStreamClient,
shardKey tries.ShardKey,
) {
err := p.engine.hypergraph.Sync(
str,
shardKey,
protobufs.HypergraphPhaseSet_HYPERGRAPH_PHASE_SET_VERTEX_ADDS,
)
if err != nil {
p.engine.logger.Error("error from sync", zap.Error(err))
}
str.CloseSend()
}
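// hyperSyncVertexRemoves syncs the vertex-remove phase set for the shard key
// over the open hyper stream, then closes the send side of the stream.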
func (p *AppSyncProvider) hyperSyncVertexRemoves(
str protobufs.HypergraphComparisonService_HyperStreamClient,
shardKey tries.ShardKey,
) {
err := p.engine.hypergraph.Sync(
str,
shardKey,
protobufs.HypergraphPhaseSet_HYPERGRAPH_PHASE_SET_VERTEX_REMOVES,
)
if err != nil {
p.engine.logger.Error("error from sync", zap.Error(err))
}
str.CloseSend()
}
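// hyperSyncHyperedgeAdds syncs the hyperedge-add phase set for the shard key
// over the open hyper stream, then closes the send side of the stream.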
func (p *AppSyncProvider) hyperSyncHyperedgeAdds(
str protobufs.HypergraphComparisonService_HyperStreamClient,
shardKey tries.ShardKey,
) {
err := p.engine.hypergraph.Sync(
str,
shardKey,
protobufs.HypergraphPhaseSet_HYPERGRAPH_PHASE_SET_HYPEREDGE_ADDS,
)
if err != nil {
p.engine.logger.Error("error from sync", zap.Error(err))
}
str.CloseSend()
}
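// hyperSyncHyperedgeRemoves syncs the hyperedge-remove phase set for the
// shard key over the open hyper stream, then closes the send side of the
// stream.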
func (p *AppSyncProvider) hyperSyncHyperedgeRemoves(
str protobufs.HypergraphComparisonService_HyperStreamClient,
shardKey tries.ShardKey,
) {
err := p.engine.hypergraph.Sync(
str,
shardKey,
protobufs.HypergraphPhaseSet_HYPERGRAPH_PHASE_SET_HYPEREDGE_REMOVES,
)
if err != nil {
p.engine.logger.Error("error from sync", zap.Error(err))
}
str.CloseSend()
}