ceremonyclient/node/main.go
Cassandra Heart 53f7c2b5c9
v2.1.0.2 (#442)
* v2.1.0.2

* restore tweaks to simlibp2p

* fix: nil ref on size calc

* fix: panic should induce shutdown from event_distributor

* fix: friendlier initialization that requires less manual kickstarting for test/devnets

* fix: fewer available shards than provers should choose shard length

* fix: update stored worker registry, improve logging for debug mode

* fix: silence the noisy peer log

* qol: log value should be snake cased

* fix: non-archive snap sync issues

* fix: separate X448/Decaf448 signed keys, add onion key to registry

* fix: overflow arithmetic on frame number comparison (see the sketch below the changelog)

* fix: worker registration should be idempotent when the inputs are the same, otherwise permit updated records (see the sketch below the changelog)

* fix: remove global prover state from size calculation

* fix: divide by zero case

* fix: eager prover

* fix: broadcast listener default

* qol: diagnostic data for peer authenticator

* fix: master/worker connectivity issue in sparse networks

tight coupling of the peer and its workers can sometimes interfere when the mesh is sparse, so give workers a pseudoidentity for mesh membership while still publishing messages with the proper peer key (see the sketch below the changelog)

* fix: reorder steps of join creation

* fix: join verify frame source + ensure domain is properly padded (unnecessary but good for consistency)

* fix: add delegate to protobuf <-> reified join conversion

* fix: preempt prover from planning with no workers

* fix: use the unallocated workers to generate a proof

* qol: fix underflow that causes join failure in the first ten frames on test/devnets

* qol: small logging tweaks for easier log correlation in debug mode

* qol: use a Fisher-Yates shuffle to ensure prover allocations are evenly distributed when scores are equal (see the sketch below the changelog)

* qol: separate decisional logic on post-enrollment confirmation into consensus engine, proposer, and worker manager where relevant, refactor out scoring

* reuse shard descriptors for both join planning and confirm/reject decisions

* fix: add missing interface method and amend test blossomsub to use new peer id basis

* fix: only check allocations if they exist

* fix: pomw mint proof data needs to be hierarchically under global intrinsic domain

* staging temporary state under diagnostics

* fix: first phase of distributed lock refactoring

* fix: compute intrinsic locking

* fix: hypergraph intrinsic locking

* fix: token intrinsic locking

* fix: update execution engines to support new locking model

* fix: adjust tests with new execution shape

* fix: weave in lock/unlock semantics to liveness provider

* fix: lock fallthrough, add missing allocation update

* qol: additional logging for diagnostics, also testnet/devnet handling for confirmations

* fix: establish grace period on halt scenario to permit recovery

* fix: support test/devnet defaults for coverage scenarios

* fix: nil ref on consensus halts for non-archive nodes

* fix: remove unnecessary prefix from prover ref

* add test coverage for fork choice behaviors and replay – once passing, blocker (2) is resolved

* fix: no fork replay on repeat for non-archive nodes, snap now behaves correctly

* rollup of pre-liveness check lock interactions

* ahead of tests, get the protobuf/metrics-related changes out so teams can prepare

* add test coverage for distributed lock behaviors – once passing, blocker (3) is resolved

* fix: blocker (3)

* Dev docs improvements (#445)

* Make install deps script more robust

* Improve testing instructions

* Worker node should stop upon OS SIGINT/SIGTERM signal (#447)

* move pebble close to Stop()

* move deferred Stop() to Start()

* add core id to worker stop log message

* create done os signal channel and stop worker upon message to it

---------

Co-authored-by: Cassandra Heart <7929478+CassOnMars@users.noreply.github.com>

---------

Co-authored-by: Daz <daz_the_corgi@proton.me>
Co-authored-by: Black Swan <3999712+blacks1ne@users.noreply.github.com>
2025-10-23 01:03:06 -05:00
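
A minimal sketch of the overflow-safe frame number comparison referenced in the changelog above, assuming frame numbers are uint64; frameDistance and isBehind are hypothetical helpers, not the node's actual code.

package sketch

// frameDistance returns how far apart two frame numbers are without risking
// unsigned wraparound: subtracting the larger from the smaller would produce
// a huge uint64 rather than a negative value.
func frameDistance(a, b uint64) uint64 {
	if a >= b {
		return a - b
	}
	return b - a
}

// isBehind reports whether local lags remote by more than maxLag frames,
// checking the ordering before subtracting so the arithmetic cannot overflow.
func isBehind(local, remote, maxLag uint64) bool {
	return remote > local && remote-local > maxLag
}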
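
A minimal sketch of idempotent worker registration as described in the changelog, assuming registrations can be compared by a digest of their inputs; workerRecord, registrationDigest, and upsertWorker are hypothetical names, not the node's registry API.

package sketch

import (
	"bytes"
	"encoding/binary"

	"golang.org/x/crypto/sha3"
)

// workerRecord is a simplified stand-in for a stored worker registry entry.
type workerRecord struct {
	Core       uint32
	Multiaddr  string
	InputsHash [32]byte
}

// registrationDigest hashes the registration inputs so identical submissions
// can be recognized.
func registrationDigest(core uint32, multiaddr string) [32]byte {
	buf := new(bytes.Buffer)
	_ = binary.Write(buf, binary.BigEndian, core)
	buf.WriteString(multiaddr)
	return sha3.Sum256(buf.Bytes())
}

// upsertWorker is a no-op when the same inputs are re-submitted and otherwise
// permits the stored record to be updated.
func upsertWorker(registry map[uint32]workerRecord, core uint32, multiaddr string) {
	digest := registrationDigest(core, multiaddr)
	if existing, ok := registry[core]; ok && existing.InputsHash == digest {
		return // idempotent: identical inputs, nothing to change
	}
	registry[core] = workerRecord{Core: core, Multiaddr: multiaddr, InputsHash: digest}
}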
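
A minimal sketch of the sparse-mesh workaround described in the changelog: each worker gets a throwaway pseudoidentity for mesh membership, while published messages are still signed with the node's proper peer key. WorkerIdentity, NewWorkerIdentity, and SignEnvelope are hypothetical names; this is not the node's actual publishing path.

package sketch

import (
	"crypto/rand"

	"github.com/cloudflare/circl/sign/ed448"
)

// WorkerIdentity pairs a worker's mesh pseudoidentity with the peer key that
// is actually used to sign published messages.
type WorkerIdentity struct {
	MeshPub  ed448.PublicKey  // pseudoidentity announced to the mesh
	meshPriv ed448.PrivateKey // used only for mesh-level participation
	peerPriv ed448.PrivateKey // the node's proper peer key
}

// NewWorkerIdentity generates a fresh pseudoidentity for the worker but keeps
// the node's peer key for message attribution.
func NewWorkerIdentity(peerPriv ed448.PrivateKey) (*WorkerIdentity, error) {
	pub, priv, err := ed448.GenerateKey(rand.Reader)
	if err != nil {
		return nil, err
	}
	return &WorkerIdentity{MeshPub: pub, meshPriv: priv, peerPriv: peerPriv}, nil
}

// SignEnvelope signs the payload with the proper peer key so subscribers
// attribute the message to the node regardless of which worker relayed it.
func (w *WorkerIdentity) SignEnvelope(payload []byte) []byte {
	return ed448.Sign(w.peerPriv, payload, "")
}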
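
A minimal sketch of the Fisher-Yates tie-breaking shuffle mentioned in the changelog, assuming provers with equal scores are collected into a slice first; proverAllocation and shuffleEqualScores are hypothetical names, not the node's scoring code.

package sketch

import (
	"crypto/rand"
	"math/big"
)

// proverAllocation is a simplified stand-in for a prover allocation candidate.
type proverAllocation struct {
	ProverID string
	Score    uint64
}

// shuffleEqualScores performs an in-place Fisher-Yates shuffle over provers
// that share the same score, so allocation order does not systematically
// favor any of them.
func shuffleEqualScores(tied []proverAllocation) error {
	for i := len(tied) - 1; i > 0; i-- {
		// draw j uniformly from [0, i]
		n, err := rand.Int(rand.Reader, big.NewInt(int64(i+1)))
		if err != nil {
			return err
		}
		j := int(n.Int64())
		tied[i], tied[j] = tied[j], tied[i]
	}
	return nil
}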

//go:build !js && !wasm

package main

import (
"bytes"
"context"
_ "embed"
"encoding/hex"
"flag"
"fmt"
"log"
"math/big"
"net/http"
npprof "net/http/pprof"
"os"
"os/signal"
"path/filepath"
"runtime"
rdebug "runtime/debug"
"runtime/pprof"
"strconv"
"strings"
"syscall"
"time"
"github.com/cloudflare/circl/sign/ed448"
"github.com/libp2p/go-libp2p/core/crypto"
"github.com/libp2p/go-libp2p/core/peer"
"github.com/multiformats/go-multiaddr"
mn "github.com/multiformats/go-multiaddr/net"
"github.com/pbnjay/memory"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.uber.org/zap"
"golang.org/x/crypto/sha3"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"source.quilibrium.com/quilibrium/monorepo/config"
"source.quilibrium.com/quilibrium/monorepo/node/app"
qgrpc "source.quilibrium.com/quilibrium/monorepo/node/internal/grpc"
"source.quilibrium.com/quilibrium/monorepo/node/rpc"
"source.quilibrium.com/quilibrium/monorepo/node/store"
"source.quilibrium.com/quilibrium/monorepo/protobufs"
qruntime "source.quilibrium.com/quilibrium/monorepo/utils/runtime"
)

var (
configDirectory = flag.String(
"config",
filepath.Join(".", ".config"),
"the configuration directory",
)
peerId = flag.Bool(
"peer-id",
false,
"print the peer id to stdout from the config and exit",
)
cpuprofile = flag.String(
"cpuprofile",
"",
"write cpu profile to file",
)
memprofile = flag.String(
"memprofile",
"",
"write memory profile after 20m to this file",
)
pprofServer = flag.String(
"pprof-server",
"",
"enable pprof server on specified address (e.g. localhost:6060)",
)
prometheusServer = flag.String(
"prometheus-server",
"",
"enable prometheus server on specified address (e.g. localhost:8080)",
)
nodeInfo = flag.Bool(
"node-info",
false,
"print node related information",
)
debug = flag.Bool(
"debug",
false,
"sets log output to debug (verbose)",
)
dhtOnly = flag.Bool(
"dht-only",
false,
"sets a node to run strictly as a dht bootstrap peer (not full node)",
)
network = flag.Uint(
"network",
0,
"sets the active network for the node (mainnet = 0, primary testnet = 1)",
)
signatureCheck = flag.Bool(
"signature-check",
signatureCheckDefault(),
"enables or disables signature validation (default true or value of QUILIBRIUM_SIGNATURE_CHECK env var)",
)
core = flag.Int(
"core",
0,
"specifies the core of the process (defaults to zero, the initial launcher)",
)
parentProcess = flag.Int(
"parent-process",
0,
"specifies the parent process pid for a data worker",
)
compactDB = flag.Bool(
"compact-db",
false,
"compacts the database and exits",
)
dbConsole = flag.Bool(
"db-console",
false,
"starts the db console mode (does not run nodes)",
)
// *char flags
blockchar = "█"
bver = "Bloom"
char *string = &blockchar
ver *string = &bver
)

func signatureCheckDefault() bool {
envVarValue, envVarExists := os.LookupEnv("QUILIBRIUM_SIGNATURE_CHECK")
if envVarExists {
def, err := strconv.ParseBool(envVarValue)
if err == nil {
return def
} else {
fmt.Println(
"Invalid environment variable QUILIBRIUM_SIGNATURE_CHECK, must be 'true' or 'false':",
envVarValue,
)
}
}
return true
}

// monitorParentProcess watches parent process and signals quit channel if
// parent dies
func monitorParentProcess(
parentProcessId int,
quitCh chan struct{},
logger *zap.Logger,
) {
for {
time.Sleep(1 * time.Second)
proc, err := os.FindProcess(parentProcessId)
if err != nil {
logger.Error("parent process not found, terminating")
close(quitCh)
return
}
// Windows returns an error if the process is dead, nobody else does
if runtime.GOOS != "windows" {
err := proc.Signal(syscall.Signal(0))
if err != nil {
logger.Error("parent process not found, terminating")
close(quitCh)
return
}
}
}
}

func main() {
config.Flags(&char, &ver)
flag.Parse()
nodeConfig, err := config.LoadConfig(*configDirectory, "", false)
if err != nil {
log.Fatal("failed to load config", err)
}
if *dbConsole {
db, err := app.NewDBConsole(nodeConfig)
if err != nil {
log.Fatal(err)
}
db.Run()
os.Exit(0)
}
logger, closer, err := nodeConfig.CreateLogger(uint(*core), *debug)
if err != nil {
log.Fatal("failed to create logger", err)
}
defer closer.Close()
if *signatureCheck {
if runtime.GOOS == "windows" {
logger.Info("Signature check not available for windows yet, skipping...")
} else {
ex, err := os.Executable()
if err != nil {
logger.Panic(
"Failed to get executable path",
zap.Error(err),
zap.String("executable", ex),
)
}
b, err := os.ReadFile(ex)
if err != nil {
logger.Panic(
"Error encountered during signature check are you running this "+
"from source? (use --signature-check=false)",
zap.Error(err),
)
}
checksum := sha3.Sum256(b)
digest, err := os.ReadFile(ex + ".dgst")
if err != nil {
logger.Fatal("digest file not found", zap.Error(err))
}
parts := strings.Split(string(digest), " ")
if len(parts) != 2 {
logger.Fatal("Invalid digest file format")
}
digestBytes, err := hex.DecodeString(parts[1][:64])
if err != nil {
logger.Fatal("invalid digest file format", zap.Error(err))
}
if !bytes.Equal(checksum[:], digestBytes) {
logger.Fatal("invalid digest for node")
}
count := 0
for i := 1; i <= len(config.Signatories); i++ {
signatureFile := fmt.Sprintf(ex+".dgst.sig.%d", i)
sig, err := os.ReadFile(signatureFile)
if err != nil {
continue
}
pubkey, _ := hex.DecodeString(config.Signatories[i-1])
if !ed448.Verify(pubkey, digest, sig, "") {
logger.Fatal(
"failed signature check for signatory",
zap.Int("signatory", i),
)
}
count++
}
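// quorum check: at least ceil((n-4)/2) of the n configured signatories must
// have produced a valid signature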
if count < ((len(config.Signatories)-4)/2)+((len(config.Signatories)-4)%2) {
logger.Fatal("quorum on signatures not met")
}
logger.Info("signature check passed")
}
} else {
logger.Info("signature check disabled, skipping...")
}
if *core == 0 {
logger = logger.With(zap.String("process", "master"))
} else {
logger = logger.With(zap.String("process", fmt.Sprintf("worker %d", *core)))
}
if *memprofile != "" && *core == 0 {
go func() {
for {
time.Sleep(5 * time.Minute)
f, err := os.Create(*memprofile)
if err != nil {
logger.Fatal("failed to create memory profile file", zap.Error(err))
}
pprof.WriteHeapProfile(f)
f.Close()
}
}()
}
if *cpuprofile != "" && *core == 0 {
f, err := os.Create(*cpuprofile)
if err != nil {
logger.Fatal("failed to create cpu profile file", zap.Error(err))
}
defer f.Close()
pprof.StartCPUProfile(f)
defer pprof.StopCPUProfile()
}
if *pprofServer != "" && *core == 0 {
go func() {
mux := http.NewServeMux()
mux.HandleFunc("/debug/pprof/", npprof.Index)
mux.HandleFunc("/debug/pprof/cmdline", npprof.Cmdline)
mux.HandleFunc("/debug/pprof/profile", npprof.Profile)
mux.HandleFunc("/debug/pprof/symbol", npprof.Symbol)
mux.HandleFunc("/debug/pprof/trace", npprof.Trace)
logger.Fatal(
"Failed to start pprof server",
zap.Error(http.ListenAndServe(*pprofServer, mux)),
)
}()
}
if *prometheusServer != "" && *core == 0 {
go func() {
mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.Handler())
logger.Fatal(
"Failed to start prometheus server",
zap.Error(http.ListenAndServe(*prometheusServer, mux)),
)
}()
}
if *peerId {
printPeerID(logger, nodeConfig.P2P)
return
}
if *nodeInfo {
config, err := config.LoadConfig(*configDirectory, "", false)
if err != nil {
logger.Fatal("failed to load config", zap.Error(err))
}
printNodeInfo(logger, config)
return
}
if *core == 0 {
config.PrintLogo(*char)
config.PrintVersion(uint8(*network), *char, *ver)
fmt.Println(" ")
}
if *compactDB {
db := store.NewPebbleDB(logger, nodeConfig.DB, uint(*core))
if err := db.CompactAll(); err != nil {
logger.Fatal("failed to compact database", zap.Error(err))
}
if err := db.Close(); err != nil {
logger.Fatal("failed to close database", zap.Error(err))
}
return
}
if *network != 0 {
if nodeConfig.P2P.BootstrapPeers[0] == config.BootstrapPeers[0] {
logger.Fatal(
"node has specified to run outside of mainnet but is still " +
"using default bootstrap list. this will fail. exiting.",
)
}
nodeConfig.P2P.Network = uint8(*network)
logger.Warn(
"node is operating outside of mainnet be sure you intended to do this.",
)
}
if *dhtOnly {
done := make(chan os.Signal, 1)
signal.Notify(done, syscall.SIGINT, syscall.SIGTERM)
dht, err := app.NewDHTNode(logger, nodeConfig, 0)
if err != nil {
logger.Error("failed to start dht node", zap.Error(err))
}
go func() {
dht.Start()
}()
<-done
dht.Stop()
return
}
if len(nodeConfig.Engine.DataWorkerP2PMultiaddrs) == 0 {
maxProcs, numCPU := runtime.GOMAXPROCS(0), runtime.NumCPU()
if maxProcs > numCPU && !nodeConfig.Engine.AllowExcessiveGOMAXPROCS {
logger.Fatal(
"GOMAXPROCS is set higher than the number of available cpus.",
)
}
nodeConfig.Engine.DataWorkerCount = qruntime.WorkerCount(
nodeConfig.Engine.DataWorkerCount, true, true,
)
}
if len(nodeConfig.Engine.DataWorkerP2PMultiaddrs) !=
len(nodeConfig.Engine.DataWorkerStreamMultiaddrs) {
logger.Fatal("mismatch of worker count for p2p and stream multiaddrs")
}
if *core != 0 {
rdebug.SetMemoryLimit(nodeConfig.Engine.DataWorkerMemoryLimit)
if *parentProcess == 0 &&
len(nodeConfig.Engine.DataWorkerP2PMultiaddrs) == 0 {
logger.Fatal("parent process pid not specified")
}
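// derive this worker's stream listen address from the base multiaddr template
// and its core index, unless explicit per-worker stream multiaddrs are set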
rpcMultiaddr := fmt.Sprintf(
nodeConfig.Engine.DataWorkerBaseListenMultiaddr,
int(nodeConfig.Engine.DataWorkerBaseStreamPort)+*core-1,
)
if len(nodeConfig.Engine.DataWorkerStreamMultiaddrs) != 0 {
rpcMultiaddr = nodeConfig.Engine.DataWorkerStreamMultiaddrs[*core-1]
}
dataWorkerNode, err := app.NewDataWorkerNode(
logger,
nodeConfig,
uint(*core),
rpcMultiaddr,
*parentProcess,
)
if err != nil {
logger.Panic("failed to create data worker node", zap.Error(err))
}
if *parentProcess != 0 {
go monitorParentProcess(
*parentProcess,
dataWorkerNode.GetQuitChannel(),
logger,
)
}
done := make(chan os.Signal, 1)
signal.Notify(done, syscall.SIGINT, syscall.SIGTERM)
quitCh := make(chan struct{})
go func() {
err = dataWorkerNode.Start(done, quitCh)
if err != nil {
logger.Panic("failed to start data worker node", zap.Error(err))
close(quitCh)
}
}()
diskFullCh := make(chan error, 1)
monitor := store.NewDiskMonitor(
uint(*core),
*nodeConfig.DB,
logger,
diskFullCh,
)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
monitor.Start(ctx)
loop:
for {
select {
case <-diskFullCh:
dataWorkerNode.Stop()
case <-quitCh:
dataWorkerNode.Stop()
break loop
}
}
return
} else {
totalMemory := int64(memory.TotalMemory())
dataWorkerReservedMemory := int64(0)
if len(nodeConfig.Engine.DataWorkerStreamMultiaddrs) == 0 {
dataWorkerReservedMemory =
nodeConfig.Engine.DataWorkerMemoryLimit * int64(
nodeConfig.Engine.DataWorkerCount,
)
}
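// compare total system memory against the memory reserved for data workers:
// warn when the remaining overhead is negative or under 2 GiB, otherwise set
// this process's memory limit to 80% of that overhead and tighten GC, unless
// GOMEMLIMIT/GOGC are already set explicitly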
switch availableOverhead := totalMemory - dataWorkerReservedMemory; {
case totalMemory < dataWorkerReservedMemory:
logger.Warn(
"the memory allocated to data workers exceeds the total system memory",
zap.Int64("total_memory", totalMemory),
zap.Int64("data_worker_reserved_memory", dataWorkerReservedMemory),
)
logger.Warn("you are at risk of running out of memory during runtime")
case availableOverhead < 2*1024*1024*1024:
logger.Warn(
"the memory available to the node, unallocated to "+
"the data workers, is less than 2gb",
zap.Int64("available_overhead", availableOverhead),
)
logger.Warn("you are at risk of running out of memory during runtime")
default:
if _, limit := os.LookupEnv("GOMEMLIMIT"); !limit {
rdebug.SetMemoryLimit(availableOverhead * 8 / 10)
}
if _, explicitGOGC := os.LookupEnv("GOGC"); !explicitGOGC {
rdebug.SetGCPercent(10)
}
}
}
logger.Info("starting node...")
done := make(chan os.Signal, 1)
signal.Notify(done, syscall.SIGINT, syscall.SIGTERM)
// Create MasterNode for core 0
masterNode, err := app.NewMasterNode(logger, nodeConfig, uint(*core))
if err != nil {
logger.Panic("failed to create master node", zap.Error(err))
}
// Start the master node
quitCh := make(chan struct{})
go func() {
if err := masterNode.Start(quitCh); err != nil {
logger.Error("master node start error", zap.Error(err))
close(quitCh)
}
}()
defer masterNode.Stop()
if nodeConfig.ListenGRPCMultiaddr != "" {
srv, err := rpc.NewRPCServer(
nodeConfig,
masterNode.GetLogger(),
masterNode.GetKeyManager(),
masterNode.GetPubSub(),
masterNode.GetPeerInfoProvider(),
masterNode.GetWorkerManager(),
masterNode.GetProverRegistry(),
masterNode.GetExecutionEngineManager(),
)
if err != nil {
logger.Panic("failed to new rpc server", zap.Error(err))
}
if err := srv.Start(); err != nil {
logger.Panic("failed to start rpc server", zap.Error(err))
}
defer srv.Stop()
}
diskFullCh := make(chan error, 1)
monitor := store.NewDiskMonitor(
uint(*core),
*nodeConfig.DB,
logger,
diskFullCh,
)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
monitor.Start(ctx)
select {
case <-done:
case <-diskFullCh:
case <-quitCh:
}
}

func getPeerID(logger *zap.Logger, p2pConfig *config.P2PConfig) peer.ID {
peerPrivKey, err := hex.DecodeString(p2pConfig.PeerPrivKey)
if err != nil {
logger.Panic("error to decode peer private key",
zap.Error(errors.Wrap(err, "error unmarshaling peerkey")))
}
privKey, err := crypto.UnmarshalEd448PrivateKey(peerPrivKey)
if err != nil {
logger.Panic("error to unmarshal ed448 private key",
zap.Error(errors.Wrap(err, "error unmarshaling peerkey")))
}
pub := privKey.GetPublic()
id, err := peer.IDFromPublicKey(pub)
if err != nil {
logger.Panic("error to get peer id", zap.Error(err))
}
return id
}

func printPeerID(logger *zap.Logger, p2pConfig *config.P2PConfig) {
id := getPeerID(logger, p2pConfig)
fmt.Println("Peer ID: " + id.String())
}

func printNodeInfo(logger *zap.Logger, cfg *config.Config) {
if cfg.ListenGRPCMultiaddr == "" {
logger.Fatal("gRPC Not Enabled, Please Configure")
}
printPeerID(logger, cfg.P2P)
conn, err := ConnectToNode(logger, cfg)
if err != nil {
logger.Fatal(
"could not connect to node. if it is still booting, please wait.",
zap.Error(err),
)
}
defer conn.Close()
client := protobufs.NewNodeServiceClient(conn)
nodeInfo, err := FetchNodeInfo(client)
if err != nil {
logger.Panic("failed to fetch node info", zap.Error(err))
}
fmt.Println("Version: " + config.FormatVersion(nodeInfo.Version))
fmt.Println("Seniority: " + new(big.Int).SetBytes(
nodeInfo.PeerSeniority,
).String())
fmt.Println("Active Workers:", nodeInfo.Workers)
}

var defaultGrpcAddress = "localhost:8337"

// Connect to the node via GRPC
func ConnectToNode(logger *zap.Logger, nodeConfig *config.Config) (*grpc.ClientConn, error) {
addr := defaultGrpcAddress
if nodeConfig.ListenGRPCMultiaddr != "" {
ma, err := multiaddr.NewMultiaddr(nodeConfig.ListenGRPCMultiaddr)
if err != nil {
logger.Panic("error parsing multiaddr", zap.Error(err))
}
_, addr, err = mn.DialArgs(ma)
if err != nil {
logger.Panic("error getting dial args", zap.Error(err))
}
}
return qgrpc.DialContext(
context.Background(),
addr,
grpc.WithTransportCredentials(
insecure.NewCredentials(),
),
grpc.WithDefaultCallOptions(
grpc.MaxCallSendMsgSize(600*1024*1024),
grpc.MaxCallRecvMsgSize(600*1024*1024),
),
)
}

type TokenBalance struct {
Owned *big.Int
UnconfirmedOwned *big.Int
}

func FetchNodeInfo(
client protobufs.NodeServiceClient,
) (*protobufs.NodeInfoResponse, error) {
info, err := client.GetNodeInfo(
context.Background(),
&protobufs.GetNodeInfoRequest{},
)
if err != nil {
return nil, errors.Wrap(err, "error getting node info")
}
return info, nil
}