From 72dfec697bf5beaf7cb1dce7705cf96b8aea0ca1 Mon Sep 17 00:00:00 2001 From: Cassandra Heart Date: Tue, 24 Feb 2026 06:20:44 -0600 Subject: [PATCH] additional logging to isolate respawn quirks --- node/consensus/app/app_consensus_engine.go | 4 ++++ node/datarpc/data_worker_ipc_server.go | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/node/consensus/app/app_consensus_engine.go b/node/consensus/app/app_consensus_engine.go index 2351d58..d2c2558 100644 --- a/node/consensus/app/app_consensus_engine.go +++ b/node/consensus/app/app_consensus_engine.go @@ -744,6 +744,7 @@ func NewAppConsensusEngine( // identify which worker(s) hang during shutdown. namedWorker := func(name string, fn func(lifecycle.SignalerContext, lifecycle.ReadyFunc)) lifecycle.ComponentWorker { return func(ctx lifecycle.SignalerContext, ready lifecycle.ReadyFunc) { + engine.logger.Debug("worker starting", zap.String("worker", name)) defer engine.logger.Debug("worker stopped", zap.String("worker", name)) fn(ctx, ready) } @@ -966,6 +967,7 @@ func NewAppConsensusEngine( } func (e *AppConsensusEngine) Stop(force bool) <-chan error { + e.logger.Info("app engine stopping", zap.Bool("force", force)) errChan := make(chan error, 1) // First, cancel context to signal all goroutines to stop @@ -2415,6 +2417,8 @@ func (e *AppConsensusEngine) startConsensus( e.timeoutAggregator.Start(ctx) <-lifecycle.AllReady(e.voteAggregator, e.timeoutAggregator) e.consensusParticipant.Start(ctx) + e.logger.Info("consensus started successfully", + zap.String("shard_address", e.appAddressHex)) return nil } diff --git a/node/datarpc/data_worker_ipc_server.go b/node/datarpc/data_worker_ipc_server.go index cdb77f9..e66326c 100644 --- a/node/datarpc/data_worker_ipc_server.go +++ b/node/datarpc/data_worker_ipc_server.go @@ -184,11 +184,13 @@ func (r *DataWorkerIPCServer) RespawnServer(filter []byte) error { // complete, but those handlers won't stop until the engine context is // cancelled. Reversing the order avoids a deadlock. if r.appConsensusEngine != nil { + r.logger.Info("respawning worker: stopping old engine") if r.cancel != nil { r.cancel() } <-r.appConsensusEngine.Stop(false) r.appConsensusEngine = nil + r.logger.Info("respawning worker: old engine stopped") } if r.server != nil { r.logger.Info("stopping server for respawn") @@ -285,16 +287,26 @@ func (r *DataWorkerIPCServer) RespawnServer(filter []byte) error { return errors.Wrap(err, "respawn server") } - r.ctx, r.cancel, _ = lifecycle.WithSignallerAndCancel(context.Background()) + var errCh <-chan error + r.ctx, r.cancel, errCh = lifecycle.WithSignallerAndCancel(context.Background()) // Capture engine and ctx in local variables to avoid race with subsequent RespawnServer calls engine := r.appConsensusEngine ctx := r.ctx + go func() { + if err, ok := <-errCh; ok && err != nil { + r.logger.Error("app engine fatal error during respawn", + zap.Error(err)) + } + }() + r.logger.Info("respawning worker: engine created, starting") go func() { if engine == nil { return } if err = engine.Start(ctx); err != nil { - r.logger.Error("error while running", zap.Error(err)) + r.logger.Error("respawning worker: engine start failed", zap.Error(err)) + } else { + r.logger.Info("respawning worker: engine started successfully") } }() }