additional logging to isolate respawn quirks

This commit is contained in:
Cassandra Heart 2026-02-24 06:20:44 -06:00
parent 13a4fb24be
commit 72dfec697b
No known key found for this signature in database
GPG Key ID: 371083BFA6C240AA
2 changed files with 18 additions and 2 deletions

View File

@@ -744,6 +744,7 @@ func NewAppConsensusEngine(
// identify which worker(s) hang during shutdown.
namedWorker := func(name string, fn func(lifecycle.SignalerContext, lifecycle.ReadyFunc)) lifecycle.ComponentWorker {
return func(ctx lifecycle.SignalerContext, ready lifecycle.ReadyFunc) {
engine.logger.Debug("worker starting", zap.String("worker", name))
defer engine.logger.Debug("worker stopped", zap.String("worker", name))
fn(ctx, ready)
}
@@ -966,6 +967,7 @@ func NewAppConsensusEngine(
}
func (e *AppConsensusEngine) Stop(force bool) <-chan error {
e.logger.Info("app engine stopping", zap.Bool("force", force))
errChan := make(chan error, 1)
// First, cancel context to signal all goroutines to stop
@@ -2415,6 +2417,8 @@ func (e *AppConsensusEngine) startConsensus(
e.timeoutAggregator.Start(ctx)
<-lifecycle.AllReady(e.voteAggregator, e.timeoutAggregator)
e.consensusParticipant.Start(ctx)
e.logger.Info("consensus started successfully",
zap.String("shard_address", e.appAddressHex))
return nil
}

View File

@@ -184,11 +184,13 @@ func (r *DataWorkerIPCServer) RespawnServer(filter []byte) error {
// complete, but those handlers won't stop until the engine context is
// cancelled. Reversing the order avoids a deadlock.
if r.appConsensusEngine != nil {
r.logger.Info("respawning worker: stopping old engine")
if r.cancel != nil {
r.cancel()
}
<-r.appConsensusEngine.Stop(false)
r.appConsensusEngine = nil
r.logger.Info("respawning worker: old engine stopped")
}
if r.server != nil {
r.logger.Info("stopping server for respawn")
@@ -285,16 +287,26 @@ func (r *DataWorkerIPCServer) RespawnServer(filter []byte) error {
return errors.Wrap(err, "respawn server")
}
r.ctx, r.cancel, _ = lifecycle.WithSignallerAndCancel(context.Background())
var errCh <-chan error
r.ctx, r.cancel, errCh = lifecycle.WithSignallerAndCancel(context.Background())
// Capture engine and ctx in local variables to avoid race with subsequent RespawnServer calls
engine := r.appConsensusEngine
ctx := r.ctx
go func() {
if err, ok := <-errCh; ok && err != nil {
r.logger.Error("app engine fatal error during respawn",
zap.Error(err))
}
}()
r.logger.Info("respawning worker: engine created, starting")
go func() {
if engine == nil {
return
}
if err = engine.Start(ctx); err != nil {
r.logger.Error("error while running", zap.Error(err))
r.logger.Error("respawning worker: engine start failed", zap.Error(err))
} else {
r.logger.Info("respawning worker: engine started successfully")
}
}()
}