ceremonyclient/node/consensus/provers/proposer.go
Cassandra Heart 53f7c2b5c9
v2.1.0.2 (#442)
* v2.1.0.2

* restore tweaks to simlibp2p

* fix: nil ref on size calc

* fix: panic should induce shutdown from event_distributor

* fix: friendlier initialization that requires less manual kickstarting for test/devnets

* fix: fewer available shards than provers should choose shard length

* fix: update stored worker registry, improve logging for debug mode

* fix: shut the fuck up, peer log

* qol: log value should be snake cased

* fix: non-archive snap sync issues

* fix: separate X448/Decaf448 signed keys, add onion key to registry

* fix: overflow arithmetic on frame number comparison

* fix: worker registration should be idempotent if inputs are same, otherwise permit updated records

* fix: remove global prover state from size calculation

* fix: divide by zero case

* fix: eager prover

* fix: broadcast listener default

* qol: diagnostic data for peer authenticator

* fix: master/worker connectivity issue in sparse networks

tight coupling of peer and workers can sometimes interfere if mesh is sparse, so give workers a pseudoidentity but publish messages with the proper peer key

* fix: reorder steps of join creation

* fix: join verify frame source + ensure domain is properly padded (unnecessary but good for consistency)

* fix: add delegate to protobuf <-> reified join conversion

* fix: preempt prover from planning with no workers

* fix: use the unallocated workers to generate a proof

* qol: underflow causes join fail in first ten frames on test/devnets

* qol: small logging tweaks for easier log correlation in debug mode

* qol: use fisher-yates shuffle to ensure prover allocations are evenly distributed when scores are equal

* qol: separate decisional logic on post-enrollment confirmation into consensus engine, proposer, and worker manager where relevant, refactor out scoring

* reuse shard descriptors for both join planning and confirm/reject decisions

* fix: add missing interface method and amend test blossomsub to use new peer id basis

* fix: only check allocations if they exist

* fix: pomw mint proof data needs to be hierarchically under global intrinsic domain

* staging temporary state under diagnostics

* fix: first phase of distributed lock refactoring

* fix: compute intrinsic locking

* fix: hypergraph intrinsic locking

* fix: token intrinsic locking

* fix: update execution engines to support new locking model

* fix: adjust tests with new execution shape

* fix: weave in lock/unlock semantics to liveness provider

* fix lock fallthrough, add missing allocation update

* qol: additional logging for diagnostics, also testnet/devnet handling for confirmations

* fix: establish grace period on halt scenario to permit recovery

* fix: support test/devnet defaults for coverage scenarios

* fix: nil ref on consensus halts for non-archive nodes

* fix: remove unnecessary prefix from prover ref

* add test coverage for fork choice behaviors and replay – once passing, blocker (2) is resolved

* fix: no fork replay on repeat for non-archive nodes, snap now behaves correctly

* rollup of pre-liveness check lock interactions

* ahead of tests, get the protobuf/metrics-related changes out so teams can prepare

* add test coverage for distributed lock behaviors – once passing, blocker (3) is resolved

* fix: blocker (3)

* Dev docs improvements (#445)

* Make install deps script more robust

* Improve testing instructions

* Worker node should stop upon OS SIGINT/SIGTERM signal (#447)

* move pebble close to Stop()

* move deferred Stop() to Start()

* add core id to worker stop log message

* create done os signal channel and stop worker upon message to it

---------

Co-authored-by: Cassandra Heart <7929478+CassOnMars@users.noreply.github.com>

---------

Co-authored-by: Daz <daz_the_corgi@proton.me>
Co-authored-by: Black Swan <3999712+blacks1ne@users.noreply.github.com>
2025-10-23 01:03:06 -05:00


package provers

import (
	"bytes"
	"crypto/rand"
	"encoding/hex"
	"math/big"
	"sort"
	"sync"

	"github.com/pkg/errors"
	"github.com/shopspring/decimal"
	"go.uber.org/zap"

	"source.quilibrium.com/quilibrium/monorepo/node/consensus/reward"
	"source.quilibrium.com/quilibrium/monorepo/types/store"
	"source.quilibrium.com/quilibrium/monorepo/types/tries"
	"source.quilibrium.com/quilibrium/monorepo/types/worker"
)

// Strategy selects how candidate shards are ranked when planning allocations.
type Strategy int

const (
	// RewardGreedy ranks shards by expected reward per worker.
	RewardGreedy Strategy = iota
	// DataGreedy ranks shards purely by state size, covering data first.
	DataGreedy
)

// WorldSizer provides state sizes for the world and its shards (in bytes).
type WorldSizer interface {
	// GetSize returns the state size in bytes under the given shard key and
	// path; passing nil for both yields the total world state size.
	GetSize(key *tries.ShardKey, path []int) *big.Int
}
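// A minimal stub satisfying WorldSizer, e.g. for tests (a sketch only;
// fixedSizer is a hypothetical helper, not part of this package's API):
//
//	type fixedSizer struct{ size *big.Int }
//
//	func (f fixedSizer) GetSize(*tries.ShardKey, []int) *big.Int {
//		return f.size
//	}
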
// ShardDescriptor describes a candidate shard allocation target.
type ShardDescriptor struct {
// Confirmation filter for the shard (routing key). Must be non-empty.
Filter []byte
	// Size in bytes of this shard's state (for reward proportionality).
Size uint64
// Ring attenuation factor (reward is divided by 2^Ring). Usually 0 unless
// you intentionally place on outer rings.
Ring uint8
	// Logical shard-group participation count for the sqrt divisor (>=1).
	// If you're assigning a worker to exactly one shard, use 1.
	Shards uint64
}
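// A hypothetical descriptor for illustration (the values are made up): a
// 1 GiB shard on the innermost ring, participating in a single shard group:
//
//	_ = ShardDescriptor{
//		Filter: []byte{0xde, 0xad, 0xbe, 0xef},
//		Size:   1 << 30,
//		Ring:   0,
//		Shards: 1,
//	}
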
// Proposal is a plan to allocate a specific worker to a shard filter.
type Proposal struct {
WorkerId uint
Filter []byte
ExpectedReward *big.Int // in base units
WorldStateBytes uint64
ShardSizeBytes uint64
Ring uint8
ShardsDenominator uint64
}
// scored is an internal struct for ranking proposals
type scored struct {
idx int
score *big.Int
}
// Manager ranks shards and assigns free workers to the best ones.
type Manager struct {
logger *zap.Logger
world WorldSizer
store store.WorkerStore
workerMgr worker.WorkerManager
// Static issuance parameters for planning
Units uint64
Strategy Strategy
mu sync.Mutex
isPlanning bool
}
// NewManager wires up a planning manager
func NewManager(
logger *zap.Logger,
world WorldSizer,
ws store.WorkerStore,
wm worker.WorkerManager,
units uint64,
strategy Strategy,
) *Manager {
return &Manager{
logger: logger.Named("allocation_manager"),
world: world,
store: ws,
workerMgr: wm,
Units: units,
Strategy: strategy,
}
}
// PlanAndAllocate picks up to maxAllocations of the best shard filters and
// calls WorkerManager.ProposeAllocations for the selected free workers.
// If maxAllocations == 0, it will use as many free workers as available.
func (m *Manager) PlanAndAllocate(
difficulty uint64,
shards []ShardDescriptor,
maxAllocations int,
) ([]Proposal, error) {
	m.mu.Lock()
	if m.isPlanning {
		m.mu.Unlock()
		m.logger.Debug("planning already in progress")
		return []Proposal{}, nil
	}
	// Claim the planning flag in the same critical section as the check so
	// two concurrent callers cannot both begin planning.
	m.isPlanning = true
	m.mu.Unlock()
	defer func() {
		m.mu.Lock()
		m.isPlanning = false
		m.mu.Unlock()
	}()
if len(shards) == 0 {
m.logger.Debug("no shards to allocate")
return nil, nil
}
// Enumerate free workers (unallocated).
all, err := m.workerMgr.RangeWorkers()
if err != nil {
return nil, errors.Wrap(err, "plan and allocate")
}
free := make([]uint, 0, len(all))
for _, w := range all {
if !w.Allocated {
free = append(free, w.CoreId)
}
}
if len(free) == 0 {
m.logger.Debug("no workers free")
return nil, nil
}
worldBytes := m.world.GetSize(nil, nil)
	if worldBytes.Sign() == 0 {
return nil, errors.Wrap(
errors.New("world size is zero"),
"plan and allocate",
)
}
// Pre-compute basis (independent of shard specifics).
basis := reward.PomwBasis(difficulty, worldBytes.Uint64(), m.Units)
scores, err := m.scoreShards(shards, basis, worldBytes)
if err != nil {
return nil, errors.Wrap(err, "plan and allocate")
}
if len(scores) == 0 {
m.logger.Debug("no scores")
return nil, nil
}
// Sort by score desc, then lexicographically by filter to keep order
// stable/deterministic.
sort.Slice(scores, func(i, j int) bool {
cmp := scores[i].score.Cmp(scores[j].score)
if cmp != 0 {
return cmp > 0
}
fi := shards[scores[i].idx].Filter
fj := shards[scores[j].idx].Filter
return bytes.Compare(fi, fj) < 0
})
// For reward-greedy strategy, randomize within equal-score groups so equally
// good shards are chosen fairly instead of skewing toward lexicographically
// earlier filters.
if m.Strategy != DataGreedy && len(scores) > 1 {
for start := 0; start < len(scores); {
end := start + 1
// Find run [start,end) where scores are equal.
for end < len(scores) && scores[end].score.Cmp(scores[start].score) == 0 {
end++
}
			// Shuffle the run with Fisher-Yates:
if end-start > 1 {
for i := end - 1; i > start; i-- {
n := big.NewInt(int64(i - start + 1))
r, err := rand.Int(rand.Reader, n)
if err == nil {
j := start + int(r.Int64())
scores[i], scores[j] = scores[j], scores[i]
}
}
}
start = end
}
}
	// Determine how many allocations we'll attempt.
limit := len(free)
if maxAllocations > 0 && maxAllocations < limit {
limit = maxAllocations
}
if limit > len(scores) {
limit = len(scores)
}
m.logger.Debug(
"deciding on scored proposals",
zap.Int("free_workers", len(free)),
zap.Int("max_allocations", maxAllocations),
zap.Int("scores", len(scores)),
zap.Int("limit", limit),
)
proposals := make([]Proposal, 0, limit)
// Assign top-k scored shards to the first k free workers.
for k := 0; k < limit; k++ {
sel := shards[scores[k].idx]
// Copy filter so we don't leak underlying slices.
filterCopy := make([]byte, len(sel.Filter))
copy(filterCopy, sel.Filter)
		// The expected reward was already computed as a *big.Int during
		// scoring.
		expBI := scores[k].score
proposals = append(proposals, Proposal{
WorkerId: free[k],
Filter: filterCopy,
ExpectedReward: expBI,
WorldStateBytes: worldBytes.Uint64(),
ShardSizeBytes: sel.Size,
Ring: sel.Ring,
ShardsDenominator: sel.Shards,
})
}
// Perform allocations
workerIds := []uint{}
filters := [][]byte{}
for _, p := range proposals {
workerIds = append(workerIds, p.WorkerId)
filters = append(filters, p.Filter)
}
m.logger.Debug("proposals collated", zap.Int("count", len(proposals)))
err = m.workerMgr.ProposeAllocations(workerIds, filters)
if err != nil {
		m.logger.Warn("propose allocations failed",
zap.Error(err),
)
}
return proposals, errors.Wrap(err, "plan and allocate")
}
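
// A minimal usage sketch (hypothetical wiring; logger, world, ws, wm, units,
// difficulty, and shards all stand in for real dependencies and inputs):
//
//	m := NewManager(logger, world, ws, wm, units, RewardGreedy)
//	proposals, err := m.PlanAndAllocate(difficulty, shards, 0)
//	if err != nil {
//		// ProposeAllocations failed; proposals still describe the plan.
//	}
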
func (m *Manager) scoreShards(
shards []ShardDescriptor,
basis *big.Int,
worldBytes *big.Int,
) ([]scored, error) {
scores := make([]scored, 0, len(shards))
for i, s := range shards {
if len(s.Filter) == 0 || s.Size == 0 {
m.logger.Debug(
"filtering out empty shard",
zap.String("filter", hex.EncodeToString(s.Filter)),
zap.Uint64("size", s.Size),
)
continue
}
if s.Shards == 0 {
s.Shards = 1
}
var score *big.Int
switch m.Strategy {
case DataGreedy:
// Pure data coverage: larger shards first.
			score = new(big.Int).SetUint64(s.Size)
default:
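			// Reward-greedy score, as computed step by step below:
			//   score = basis * Size/worldBytes / 2^(Ring+1) / sqrt(Shards)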
// factor = (stateSize / worldBytes)
factor := decimal.NewFromUint64(s.Size)
factor = factor.Mul(decimal.NewFromBigInt(basis, 0))
factor = factor.Div(decimal.NewFromBigInt(worldBytes, 0))
			// ring divisor: the loop below runs Ring+1 times, yielding
			// 2^(Ring+1)
divisor := int64(1)
for j := uint8(0); j < s.Ring+1; j++ {
divisor <<= 1
}
ringDiv := decimal.NewFromInt(divisor)
// shard factor = sqrt(Shards)
shardsSqrt, err := decimal.NewFromUint64(s.Shards).PowWithPrecision(
decimal.NewFromBigRat(big.NewRat(1, 2), 53),
53,
)
if err != nil {
return nil, errors.Wrap(err, "score shards")
}
			if shardsSqrt.IsZero() {
				return nil, errors.Wrap(
					errors.New("sqrt of shards denominator is zero"),
					"score shards",
				)
			}
m.logger.Debug(
"calculating score",
zap.Int("index", i),
zap.String("basis", basis.String()),
zap.String("size", big.NewInt(int64(s.Size)).String()),
zap.String("worldBytes", worldBytes.String()),
zap.String("factor", factor.String()),
zap.String("divisor", ringDiv.String()),
zap.String("shardsSqrt", shardsSqrt.String()),
)
factor = factor.Div(ringDiv)
factor = factor.Div(shardsSqrt)
score = factor.BigInt()
}
m.logger.Debug(
"adding score proposal",
zap.Int("index", i),
zap.String("score", score.String()),
)
scores = append(scores, scored{idx: i, score: score})
}
return scores, nil
}
// DecideJoins evaluates pending shard joins using the latest shard view. It
// uses the same scoring basis as initial planning. For each pending join:
// - If there exists a strictly better shard in the latest view, reject the
// existing one (this will result in a new join attempt).
// - Otherwise (tie or better), confirm the existing one.
func (m *Manager) DecideJoins(
difficulty uint64,
shards []ShardDescriptor,
pending [][]byte,
) error {
if len(pending) == 0 {
return nil
}
// If no shards remain, we should warn
if len(shards) == 0 {
m.logger.Warn("no shards available to decide")
return nil
}
	worldBytes := m.world.GetSize(nil, nil)
	if worldBytes.Sign() == 0 {
		// Guard the division in scoreShards, mirroring PlanAndAllocate.
		return errors.Wrap(errors.New("world size is zero"), "decide joins")
	}
	basis := reward.PomwBasis(difficulty, worldBytes.Uint64(), m.Units)
scores, err := m.scoreShards(shards, basis, worldBytes)
if err != nil {
return errors.Wrap(err, "decide joins")
}
type srec struct {
desc ShardDescriptor
score *big.Int
}
byHex := make(map[string]srec, len(shards))
var bestScore *big.Int
for _, sc := range scores {
s := shards[sc.idx]
key := hex.EncodeToString(s.Filter)
byHex[key] = srec{desc: s, score: sc.score}
if bestScore == nil || sc.score.Cmp(bestScore) > 0 {
bestScore = sc.score
}
}
// If nothing valid, reject everything.
if bestScore == nil {
reject := make([][]byte, 0, len(pending))
for _, p := range pending {
if len(p) == 0 {
continue
}
pc := make([]byte, len(p))
copy(pc, p)
reject = append(reject, pc)
}
return m.workerMgr.DecideAllocations(reject, nil)
}
reject := make([][]byte, 0, len(pending))
confirm := make([][]byte, 0, len(pending))
for _, p := range pending {
if len(p) == 0 {
continue
}
key := hex.EncodeToString(p)
rec, ok := byHex[key]
if !ok {
			// A pending shard missing from the latest view is rejected. This
			// can happen when a shard-out splits it into new divisions.
pc := make([]byte, len(p))
copy(pc, p)
reject = append(reject, pc)
continue
}
// Reject only if there exists a strictly better score.
if rec.score.Cmp(bestScore) < 0 {
pc := make([]byte, len(p))
copy(pc, p)
reject = append(reject, pc)
} else {
// Otherwise confirm
pc := make([]byte, len(p))
copy(pc, p)
confirm = append(confirm, pc)
}
}
return m.workerMgr.DecideAllocations(reject, confirm)
}
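
// For illustration (hypothetical values): if the latest view scores shard A
// equal to the best score and shard B strictly below it, a pending join on A
// is confirmed while a pending join on B is rejected, triggering a new join
// attempt:
//
//	pending := [][]byte{filterA, filterB}
//	_ = m.DecideJoins(difficulty, shards, pending)
//	// => DecideAllocations(reject=[filterB], confirm=[filterA])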