ceremonyclient/node/execution/intrinsics/metrics.go
Cassandra Heart 53f7c2b5c9
v2.1.0.2 (#442)
* v2.1.0.2

* restore tweaks to simlibp2p

* fix: nil ref on size calc

* fix: panic should induce shutdown from event_distributor

* fix: friendlier initialization that requires less manual kickstarting for test/devnets

* fix: fewer available shards than provers should choose shard length

* fix: update stored worker registry, improve logging for debug mode

* fix: silence the overly noisy peer log

* qol: log value should be snake cased

* fix: non-archive snap sync issues

* fix: separate X448/Decaf448 signed keys, add onion key to registry

* fix: overflow arithmetic on frame number comparison

* fix: worker registration should be idempotent if inputs are same, otherwise permit updated records

* fix: remove global prover state from size calculation

* fix: divide by zero case

* fix: eager prover

* fix: broadcast listener default

* qol: diagnostic data for peer authenticator

* fix: master/worker connectivity issue in sparse networks

Tight coupling of the peer and its workers can sometimes interfere with connectivity when the mesh is sparse, so give workers a pseudo-identity but publish messages with the proper peer key.

* fix: reorder steps of join creation

* fix: join verify frame source + ensure domain is properly padded (unnecessary but good for consistency)

* fix: add delegate to protobuf <-> reified join conversion

* fix: preempt prover from planning with no workers

* fix: use the unallocated workers to generate a proof

* qol: underflow causes join fail in first ten frames on test/devnets

* qol: small logging tweaks for easier log correlation in debug mode

* qol: use fisher-yates shuffle to ensure prover allocations are evenly distributed when scores are equal

* qol: separate decisional logic on post-enrollment confirmation into consensus engine, proposer, and worker manager where relevant, refactor out scoring

* reuse shard descriptors for both join planning and confirm/reject decisions

* fix: add missing interface method and amend test blossomsub to use new peer id basis

* fix: only check allocations if they exist

* fix: pomw mint proof data needs to be hierarchically under global intrinsic domain

* staging temporary state under diagnostics

* fix: first phase of distributed lock refactoring

* fix: compute intrinsic locking

* fix: hypergraph intrinsic locking

* fix: token intrinsic locking

* fix: update execution engines to support new locking model

* fix: adjust tests with new execution shape

* fix: weave in lock/unlock semantics to liveness provider

* fix: lock fallthrough, add missing allocation update

* qol: additional logging for diagnostics, also testnet/devnet handling for confirmations

* fix: establish grace period on halt scenario to permit recovery

* fix: support test/devnet defaults for coverage scenarios

* fix: nil ref on consensus halts for non-archive nodes

* fix: remove unnecessary prefix from prover ref

* add test coverage for fork choice behaviors and replay – once passing, blocker (2) is resolved

* fix: no fork replay on repeat for non-archive nodes, snap now behaves correctly

* rollup of pre-liveness check lock interactions

* ahead of tests, get the protobuf/metrics-related changes out so teams can prepare

* add test coverage for distributed lock behaviors – once passing, blocker (3) is resolved

* fix: blocker (3)

* Dev docs improvements (#445)

* Make install deps script more robust

* Improve testing instructions

* Worker node should stop upon OS SIGINT/SIGTERM signal (#447)

* move pebble close to Stop()

* move deferred Stop() to Start()

* add core id to worker stop log message

* create done os signal channel and stop worker upon message to it

---------

Co-authored-by: Cassandra Heart <7929478+CassOnMars@users.noreply.github.com>

---------

Co-authored-by: Daz <daz_the_corgi@proton.me>
Co-authored-by: Black Swan <3999712+blacks1ne@users.noreply.github.com>
2025-10-23 01:03:06 -05:00

229 lines
6.2 KiB
Go

package intrinsics
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
// Naming components shared by the metrics declared in this file; they are
// supplied as the Namespace and Subsystem options of each collector.
const (
	// metricsNamespace is the value used for the Namespace option.
	metricsNamespace = "quilibrium"

	// subsystem is the value used for the Subsystem option.
	subsystem = "intrinsics"
)
// Prometheus collectors for intrinsic operations. Every collector is created
// through promauto and uses metricsNamespace and subsystem for its Namespace
// and Subsystem options.
var (
	// ---- Materialize ----

	// MaterializeDuration is a histogram of the time taken to materialize
	// intrinsic state, bucketed with prometheus.DefBuckets.
	MaterializeDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "materialize_duration_seconds",
			Help:      "Time taken to materialize intrinsic state",
			Buckets:   prometheus.DefBuckets,
		},
		// intrinsic_type values: compute, global, hypergraph, token.
		[]string{"intrinsic_type"},
	)

	// MaterializeTotal counts materialize operations. The outcome is carried
	// by the status label instead of a separate errors counter.
	MaterializeTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "materialize_total",
			Help:      "Total number of materialize operations",
		},
		// status values: success, error.
		[]string{"intrinsic_type", "status"},
	)

	// ---- Validate ----

	// ValidateDuration is a histogram of the time taken to validate an
	// intrinsic step, bucketed with prometheus.DefBuckets.
	ValidateDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "validate_duration_seconds",
			Help:      "Time taken to validate an intrinsic step",
			Buckets:   prometheus.DefBuckets,
		},
		// Keyed by intrinsic type only: this is the overall timing.
		[]string{"intrinsic_type"},
	)

	// ValidateTotal counts successful validate operations per intrinsic type
	// and operation.
	ValidateTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "validate_total",
			Help:      "Total number of successful validate operations",
		},
		[]string{"intrinsic_type", "operation"},
	)

	// ValidateErrors counts failed validate operations per intrinsic type
	// and operation.
	ValidateErrors = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "validate_errors_total",
			Help:      "Total number of failed validate operations",
		},
		[]string{"intrinsic_type", "operation"},
	)

	// ---- Lock / Unlock ----

	// LockDuration is a histogram of the time taken to lock an intrinsic
	// step, bucketed with prometheus.DefBuckets.
	LockDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "lock_duration_seconds",
			Help:      "Time taken to lock an intrinsic step",
			Buckets:   prometheus.DefBuckets,
		},
		// Keyed by intrinsic type only: this is the overall timing.
		[]string{"intrinsic_type"},
	)

	// LockTotal counts successful lock operations per intrinsic type and
	// operation.
	LockTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "lock_total",
			Help:      "Total number of successful lock operations",
		},
		[]string{"intrinsic_type", "operation"},
	)

	// LockErrors counts failed lock operations per intrinsic type and
	// operation.
	LockErrors = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "lock_errors_total",
			Help:      "Total number of failed lock operations",
		},
		[]string{"intrinsic_type", "operation"},
	)

	// UnlockDuration is a histogram of the time taken to unlock an intrinsic
	// step, bucketed with prometheus.DefBuckets.
	UnlockDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "unlock_duration_seconds",
			Help:      "Time taken to unlock an intrinsic step",
			Buckets:   prometheus.DefBuckets,
		},
		// Keyed by intrinsic type only: this is the overall timing.
		[]string{"intrinsic_type"},
	)

	// UnlockTotal counts successful unlock operations per intrinsic type and
	// operation.
	UnlockTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "unlock_total",
			Help:      "Total number of successful unlock operations",
		},
		[]string{"intrinsic_type", "operation"},
	)

	// UnlockErrors counts failed unlock operations per intrinsic type and
	// operation.
	UnlockErrors = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "unlock_errors_total",
			Help:      "Total number of failed unlock operations",
		},
		[]string{"intrinsic_type", "operation"},
	)

	// ---- InvokeStep ----

	// InvokeStepDuration is a histogram of the time taken to execute an
	// intrinsic step, bucketed with prometheus.DefBuckets.
	InvokeStepDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "invoke_step_duration_seconds",
			Help:      "Time taken to execute an intrinsic step",
			Buckets:   prometheus.DefBuckets,
		},
		// Keyed by intrinsic type only: this is the overall timing.
		[]string{"intrinsic_type"},
	)

	// InvokeStepTotal counts successful invoke step operations per intrinsic
	// type and operation.
	InvokeStepTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "invoke_step_total",
			Help:      "Total number of successful invoke step operations",
		},
		[]string{"intrinsic_type", "operation"},
	)

	// InvokeStepErrors counts failed invoke step operations per intrinsic
	// type and operation.
	InvokeStepErrors = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "invoke_step_errors_total",
			Help:      "Total number of failed invoke step operations",
		},
		[]string{"intrinsic_type", "operation"},
	)

	// ---- Commit ----

	// CommitDuration is a histogram of the time taken to commit intrinsic
	// state, bucketed with prometheus.DefBuckets.
	CommitDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "commit_duration_seconds",
			Help:      "Time taken to commit intrinsic state",
			Buckets:   prometheus.DefBuckets,
		},
		[]string{"intrinsic_type"},
	)

	// CommitTotal counts successful commit operations per intrinsic type.
	CommitTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "commit_total",
			Help:      "Total number of successful commit operations",
		},
		[]string{"intrinsic_type"},
	)

	// CommitErrors counts failed commit operations per intrinsic type.
	CommitErrors = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "commit_errors_total",
			Help:      "Total number of failed commit operations",
		},
		[]string{"intrinsic_type"},
	)

	// ---- Per-operation and state metrics ----

	// OperationDuration is a histogram of the time taken by specific
	// operations within InvokeStep, bucketed with prometheus.DefBuckets.
	OperationDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "operation_duration_seconds",
			Help:      "Time taken for specific operations within InvokeStep",
			Buckets:   prometheus.DefBuckets,
		},
		[]string{"intrinsic_type", "operation"},
	)

	// StateSize is a gauge holding the current size of intrinsic state in
	// bytes, per intrinsic type.
	StateSize = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "state_size_bytes",
			Help:      "Current size of intrinsic state in bytes",
		},
		[]string{"intrinsic_type"},
	)

	// OperationCount counts specific operations per intrinsic type and
	// operation.
	OperationCount = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: subsystem,
			Name:      "operation_count_total",
			Help:      "Total count of specific operations by type",
		},
		[]string{"intrinsic_type", "operation"},
	)
)