ceremonyclient/node/tests/hypergraph_race_test.go
Cassandra Heart 1b2660b7df
v2.1.0.20 (#516)
* .20 testing

* Read in the debug by env variable (#514)

* v2.1.0.19

* enhanced error logging, fix seniority marker join blocker, fix sync message size limit defaults

* resolve signature failure

* additional error logging for merge-related signatures

* fix: one-shot sync message size, app shard TC signature size, collector/hotstuff race condition, expired joins blocking new joins due to pruning disable

* remove compat with old 2.0.0 blossomsub

* fix: resolve abandoned prover joins

* reload prover registry

* fix stale worker proposal edge

* add full sanity check on join before submitting to identify bug

* resolve non-fallthrough condition that should be fallthrough

* fix: resolve rare SIGFPE, fix orphan expired joins blocking workers from reallocating

* add reconnect fallback if no peers are found with variable reconnect time (#511)

Co-authored-by: Tyler Sturos <55340199+tjsturos@users.noreply.github.com>

* update base peer count to 1 (#513)

* fix: expired prover join frames, starting port ranges, proposer getting stuck, and seniority on joins

* fix: panic on shutdown, libp2p discovery picking inaccessible peers, coverage event check not in shutdown logic, amend app shard worker behavior to mirror global for prover root reconciliation

* fix: shutdown scenario quirks, reload hanging

* fix: do not bail out early on shutdown of coverage check

* fix: force registry refresh on worker waiting for registration

* add more logging to wait for prover

* fix: worker manager refreshes the filter on allocation, snapshots blocking close on shutdown

* tweak: force shutdown after five seconds for app worker

* fix: don't loop when shutting down

* fix: slight reordering, also added named workers to trace hanging shutdowns

* use deterministic key for peer ID of workers to stop flagging workers as Sybil attacks

* fix: remove pubsub stop from app consensus engine as it shouldn't manage pubsub lifecycle, integrate shutdown context to PerformSync to prevent stuck syncs from halting respawn

* fix: blossomsub pubsub interface does not properly track subscription status

* fix: subscribe order to avoid nil panic

* switch from dnsaddr to dns4

* add missing quic-v1

* additional logging to isolate respawn quirks

* fix: dnsaddr -> dns4 for blossomsub

* allow debug env var to be read

---------

Co-authored-by: Cassandra Heart <cassandra@quilibrium.com>
Co-authored-by: Tyler Sturos <55340199+tjsturos@users.noreply.github.com>
Co-authored-by: Cassandra Heart <7929478+CassOnMars@users.noreply.github.com>

* fix newPebbleDB constructor config param (#517)

* fix: high CPU overhead in initial worker behaviors/ongoing sync

* faster docker builds with better caching

* qol: add extra data to node info, and query metrics from command line

* leave proposals for overcrowded shards

* hub-and-spoke global message broadcasts

* small tweaks to cli output for join frames

---------

Co-authored-by: winged-pegasus <55340199+winged-pegasus@users.noreply.github.com>
Co-authored-by: Tyler Sturos <55340199+tjsturos@users.noreply.github.com>
Co-authored-by: Black Swan <3999712+blacks1ne@users.noreply.github.com>
2026-03-04 01:37:04 -06:00

207 lines
5.7 KiB
Go

package tests
import (
"bytes"
"crypto/rand"
"math/big"
"runtime"
"sync"
"testing"
"github.com/stretchr/testify/mock"
"go.uber.org/zap"
"source.quilibrium.com/quilibrium/monorepo/config"
hg "source.quilibrium.com/quilibrium/monorepo/hypergraph"
"source.quilibrium.com/quilibrium/monorepo/node/store"
"source.quilibrium.com/quilibrium/monorepo/types/mocks"
"source.quilibrium.com/quilibrium/monorepo/types/tries"
)
// vertexSpec describes one vertex to insert during a test run.
type vertexSpec struct {
	appAddr  [32]byte // application (shard-selecting) address
	dataAddr [32]byte // per-vertex data address, unique within a run
	commit   []byte   // commitment bytes attached to the vertex
	size     *big.Int // declared vertex size
}
// TestConcurrentAddVertexAndCommitRace verifies that serializing
// AddVertex batches and Commit calls with a mutex (the commitBarrier
// pattern) prevents partial-state tree roots.
//
// The test applies the same serialization that GlobalConsensusEngine
// uses: a mutex held across the entire AddVertex loop and around each
// Commit call. With this barrier, Commit can never observe a partially
// modified tree.
//
// Invariant checked per iteration: every concurrent Commit result must
// equal either the pre-batch baseline commitment or the post-batch final
// commitment — anything else means Commit interleaved with AddVertex.
func TestConcurrentAddVertexAndCommitRace(t *testing.T) {
	const (
		iterations     = 100
		verticesPerRun = 50
		// Multiple concurrent commit goroutines to increase contention.
		commitGoroutines = 4
	)
	// Ensure true parallelism.
	prevProcs := runtime.GOMAXPROCS(runtime.NumCPU())
	defer runtime.GOMAXPROCS(prevProcs)
	logger, _ := zap.NewDevelopment()
	for iter := 0; iter < iterations; iter++ {
		// Fresh mocks and a fresh hypergraph per iteration so runs are
		// independent.
		prover := &mocks.MockInclusionProver{}
		// 74-byte mock commitment; first byte fixed, remainder random so
		// each iteration produces distinct commitment material.
		mockCommit := make([]byte, 74)
		mockCommit[0] = 0x02
		rand.Read(mockCommit[1:])
		prover.On("CommitRaw", mock.Anything, mock.Anything).Return(mockCommit, nil)
		enc := &mocks.MockVerifiableEncryptor{}
		// In-memory store: no on-disk state between iterations.
		dbCfg := &config.DBConfig{InMemoryDONOTUSE: true, Path: ".configtest/store"}
		s := store.NewPebbleDB(logger, &config.Config{DB: dbCfg}, 0)
		hgStore := store.NewPebbleHypergraphStore(
			dbCfg, s, logger, enc, prover,
		)
		hgcrdt := hg.NewHypergraph(
			logger,
			hgStore,
			prover,
			[]int{},
			&Nopthenticator{},
			200,
		)
		// All vertices share the same appAddress so they land in the same shard,
		// maximizing tree contention.
		appAddr := [32]byte{0x10}
		// Commit baseline (frame 1) with no vertices — this is the
		// "before" state that a commit goroutine may validly capture
		// if it acquires the barrier before the AddVertex goroutine.
		baselineCommits, err := hgcrdt.Commit(1)
		if err != nil {
			t.Fatalf("iter %d: baseline commit failed: %v", iter, err)
		}
		// Prepare vertices to add. dataAddr mixes the vertex index and the
		// iteration counter so addresses are unique per run.
		specs := make([]vertexSpec, verticesPerRun)
		for i := 0; i < verticesPerRun; i++ {
			dataAddr := [32]byte{byte(i + 1), byte(iter), byte(i >> 8)}
			specs[i] = vertexSpec{
				appAddr:  appAddr,
				dataAddr: dataAddr,
				commit:   mockCommit,
				size:     big.NewInt(55),
			}
		}
		// commitBarrier mirrors the mutex in GlobalConsensusEngine that
		// serializes materialize (AddVertex loop) with
		// rebuildShardCommitments (Commit).
		var commitBarrier sync.Mutex
		var wg sync.WaitGroup
		// start gates all goroutines so they begin together once launched.
		start := make(chan struct{})
		type commitResult struct {
			commits map[tries.ShardKey][][]byte
			err     error
		}
		// One slot per commit goroutine; each writes only its own index,
		// and slots are read only after wg.Wait, so no extra locking needed.
		results := make([]commitResult, commitGoroutines)
		wg.Add(1 + commitGoroutines)
		// Goroutine A: add vertices one at a time (like materialize does).
		// Holds the barrier across the entire batch.
		go func() {
			defer wg.Done()
			<-start
			commitBarrier.Lock()
			defer commitBarrier.Unlock()
			for _, vs := range specs {
				v := hg.NewVertex(vs.appAddr, vs.dataAddr, vs.commit, vs.size)
				if addErr := hgcrdt.AddVertex(nil, v); addErr != nil {
					// Errorf (not Fatalf): Fatalf must not be called from a
					// non-test goroutine.
					t.Errorf("iter %d: AddVertex failed: %v", iter, addErr)
					return
				}
				// Yield between additions — without the barrier this would
				// allow Commit to interleave and capture partial state.
				runtime.Gosched()
			}
		}()
		// Goroutines B: commit the tree concurrently with vertex additions.
		// Each acquires the barrier around Commit, so it waits for any
		// in-progress AddVertex batch to finish.
		for g := 0; g < commitGoroutines; g++ {
			g := g // capture loop variable for the closure (pre-1.22 semantics)
			go func() {
				defer wg.Done()
				<-start
				// Stagger start to hit different points in the AddVertex sequence.
				for y := 0; y < g*3; y++ {
					runtime.Gosched()
				}
				commitBarrier.Lock()
				// Frame numbers are spaced per iteration/goroutine so each
				// Commit uses a distinct frame.
				results[g].commits, results[g].err = hgcrdt.Commit(
					uint64(iter*10 + g + 2),
				)
				commitBarrier.Unlock()
			}()
		}
		// Release all goroutines simultaneously.
		close(start)
		wg.Wait()
		for g, r := range results {
			if r.err != nil {
				t.Fatalf("iter %d goroutine %d: commit failed: %v", iter, g, r.err)
			}
		}
		// Final commit with ALL vertices present — the canonical state.
		expectedCommits, err := hgcrdt.Commit(uint64(iter*10 + commitGoroutines + 2))
		if err != nil {
			t.Fatalf("iter %d: expected commit failed: %v", iter, err)
		}
		// With the commitBarrier, each concurrent commit must reflect a
		// consistent state: either the baseline (0 vertices, committed
		// before AddVertex batch) or the final state (all vertices,
		// committed after). Any other result means Commit() interleaved
		// with AddVertex calls and captured partial state.
		for g, r := range results {
			matchesBaseline := commitMapsEqual(r.commits, baselineCommits)
			matchesFinal := commitMapsEqual(r.commits, expectedCommits)
			if !matchesBaseline && !matchesFinal {
				t.Fatalf(
					"iter %d goroutine %d: commit captured partial state "+
						"(tree root matches neither baseline nor final — "+
						"divergence detected)",
					iter, g,
				)
			}
		}
	}
}
// commitMapsEqual reports whether two commit maps are equal: identical
// shard-key sets, and for each key the same number of phase commitments
// with byte-for-byte identical contents.
func commitMapsEqual(a, b map[tries.ShardKey][][]byte) bool {
	if len(a) != len(b) {
		return false
	}
	for key, phases := range a {
		other, present := b[key]
		if !present || len(other) != len(phases) {
			return false
		}
		for idx, phase := range phases {
			if !bytes.Equal(phase, other[idx]) {
				return false
			}
		}
	}
	return true
}