additional logging to isolate respawn quirks

2026-02-25 12:27:24 +08:00 · 2026-02-24 06:20:44 -06:00 · 2026-02-24 06:20:44 -06:00 · 72dfec697b
commit 72dfec697b
parent 13a4fb24be
2 changed files with 18 additions and 2 deletions
--- a/node/consensus/app/app_consensus_engine.go
+++ b/node/consensus/app/app_consensus_engine.go
@@ -744,6 +744,7 @@ func NewAppConsensusEngine(
 	// identify which worker(s) hang during shutdown.
 	namedWorker := func(name string, fn func(lifecycle.SignalerContext, lifecycle.ReadyFunc)) lifecycle.ComponentWorker {
 		return func(ctx lifecycle.SignalerContext, ready lifecycle.ReadyFunc) {
+			engine.logger.Debug("worker starting", zap.String("worker", name))
 			defer engine.logger.Debug("worker stopped", zap.String("worker", name))
 			fn(ctx, ready)
 		}
@@ -966,6 +967,7 @@ func NewAppConsensusEngine(
 }

 func (e *AppConsensusEngine) Stop(force bool) <-chan error {
+	e.logger.Info("app engine stopping", zap.Bool("force", force))
 	errChan := make(chan error, 1)

 	// First, cancel context to signal all goroutines to stop
@@ -2415,6 +2417,8 @@ func (e *AppConsensusEngine) startConsensus(
 	e.timeoutAggregator.Start(ctx)
 	<-lifecycle.AllReady(e.voteAggregator, e.timeoutAggregator)
 	e.consensusParticipant.Start(ctx)
+	e.logger.Info("consensus started successfully",
+		zap.String("shard_address", e.appAddressHex))
 	return nil
 }

--- a/node/datarpc/data_worker_ipc_server.go
+++ b/node/datarpc/data_worker_ipc_server.go
@@ -184,11 +184,13 @@ func (r *DataWorkerIPCServer) RespawnServer(filter []byte) error {
 	// complete, but those handlers won't stop until the engine context is
 	// cancelled. Reversing the order avoids a deadlock.
 	if r.appConsensusEngine != nil {
+		r.logger.Info("respawning worker: stopping old engine")
 		if r.cancel != nil {
 			r.cancel()
 		}
 		<-r.appConsensusEngine.Stop(false)
 		r.appConsensusEngine = nil
+		r.logger.Info("respawning worker: old engine stopped")
 	}
 	if r.server != nil {
 		r.logger.Info("stopping server for respawn")
@@ -285,16 +287,26 @@ func (r *DataWorkerIPCServer) RespawnServer(filter []byte) error {
 			return errors.Wrap(err, "respawn server")
 		}

-		r.ctx, r.cancel, _ = lifecycle.WithSignallerAndCancel(context.Background())
+		var errCh <-chan error
+		r.ctx, r.cancel, errCh = lifecycle.WithSignallerAndCancel(context.Background())
 		// Capture engine and ctx in local variables to avoid race with subsequent RespawnServer calls
 		engine := r.appConsensusEngine
 		ctx := r.ctx
+		go func() {
+			if err, ok := <-errCh; ok && err != nil {
+				r.logger.Error("app engine fatal error during respawn",
+					zap.Error(err))
+			}
+		}()
+		r.logger.Info("respawning worker: engine created, starting")
 		go func() {
 			if engine == nil {
 				return
 			}
 			if err = engine.Start(ctx); err != nil {
-				r.logger.Error("error while running", zap.Error(err))
+				r.logger.Error("respawning worker: engine start failed", zap.Error(err))
+			} else {
+				r.logger.Info("respawning worker: engine started successfully")
 			}
 		}()
 	}