diff --git a/test/cli/autoconf/swarm_connect_test.go b/test/cli/autoconf/swarm_connect_test.go index 95c75d953..a9e58b7c9 100644 --- a/test/cli/autoconf/swarm_connect_test.go +++ b/test/cli/autoconf/swarm_connect_test.go @@ -2,7 +2,6 @@ package autoconf import ( "testing" - "time" "github.com/ipfs/kubo/test/cli/harness" "github.com/stretchr/testify/assert" @@ -21,10 +20,12 @@ func TestSwarmConnectWithAutoConf(t *testing.T) { t.Parallel() t.Run("AutoConf disabled - should work", func(t *testing.T) { + // Don't run subtests in parallel to avoid daemon startup conflicts testSwarmConnectWithAutoConfSetting(t, false, true) // expect success }) t.Run("AutoConf enabled - should work", func(t *testing.T) { + // Don't run subtests in parallel to avoid daemon startup conflicts testSwarmConnectWithAutoConfSetting(t, true, true) // expect success (fix the bug!) }) } @@ -44,17 +45,10 @@ func testSwarmConnectWithAutoConfSetting(t *testing.T, autoConfEnabled bool, exp "/dnsaddr/bootstrap.libp2p.io/p2p/QmbLHAnMoJPWSCR5Zhtx6BHJX9KiKNN6tpvbUcqanj75Nb", }) - // CRITICAL: Start the daemon first - this is the key requirement - // The daemon must be running and working properly + // Start the daemon node.StartDaemon() defer node.StopDaemon() - // Give daemon time to start up completely - time.Sleep(3 * time.Second) - - // Verify daemon is responsive - result := node.RunIPFS("id") - require.Equal(t, 0, result.ExitCode(), "Daemon should be responsive before testing swarm connect") t.Logf("Daemon is running and responsive. AutoConf enabled: %v", autoConfEnabled) // Now test swarm connect to a bootstrap peer @@ -62,7 +56,7 @@ func testSwarmConnectWithAutoConfSetting(t *testing.T, autoConfEnabled bool, exp // 1. The daemon is running // 2. The CLI should connect to the daemon via API // 3. The daemon should handle the swarm connect request - result = node.RunIPFS("swarm", "connect", "/dnsaddr/bootstrap.libp2p.io") + result := node.RunIPFS("swarm", "connect", "/dnsaddr/bootstrap.libp2p.io") // swarm connect should work regardless of AutoConf setting assert.Equal(t, 0, result.ExitCode(), diff --git a/test/cli/harness/node.go b/test/cli/harness/node.go index 7d0a90866..fa36e7d6c 100644 --- a/test/cli/harness/node.go +++ b/test/cli/harness/node.go @@ -265,23 +265,74 @@ func (n *Node) StartDaemonWithReq(req RunRequest, authorization string) *Node { if alive { log.Panicf("node %d is already running", n.ID) } - newReq := req - newReq.Path = n.IPFSBin - newReq.Args = append([]string{"daemon"}, req.Args...) - newReq.RunFunc = (*exec.Cmd).Start - log.Debugf("starting node %d", n.ID) - res := n.Runner.MustRun(newReq) + // Start the daemon with a simple retry mechanism + // Sometimes when tests run in parallel, daemon startup can fail transiently + var daemonStarted bool + var lastErr error + for attempt := 0; attempt < 3; attempt++ { + if attempt > 0 { + time.Sleep(time.Second) // Brief pause before retry + log.Debugf("retrying daemon start for node %d (attempt %d/3)", n.ID, attempt+1) + } - n.Daemon = res + func() { + defer func() { + if r := recover(); r != nil { + lastErr = fmt.Errorf("panic during daemon start: %v", r) + log.Debugf("node %d daemon start attempt %d failed: %v", n.ID, attempt+1, r) + } + }() - // Register the daemon process for cleanup tracking - if res.Cmd != nil && res.Cmd.Process != nil { - globalProcessTracker.RegisterProcess(res.Cmd.Process) + newReq := req + newReq.Path = n.IPFSBin + newReq.Args = append([]string{"daemon"}, req.Args...) + newReq.RunFunc = (*exec.Cmd).Start + + log.Debugf("starting node %d", n.ID) + res := n.Runner.MustRun(newReq) + + n.Daemon = res + + // Register the daemon process for cleanup tracking + if res.Cmd != nil && res.Cmd.Process != nil { + globalProcessTracker.RegisterProcess(res.Cmd.Process) + } + + log.Debugf("node %d started, checking API", n.ID) + n.WaitOnAPI(authorization) + daemonStarted = true + }() + + if daemonStarted { + break + } + } + + if !daemonStarted { + if lastErr != nil { + log.Panicf("node %d failed to start daemon after 3 attempts: %v", n.ID, lastErr) + } else { + log.Panicf("node %d failed to start daemon after 3 attempts", n.ID) + } + } + + // Wait for daemon to be fully ready by checking it can respond to commands + // This is more reliable than just checking the API endpoint + maxRetries := 30 + for i := 0; i < maxRetries; i++ { + result := n.RunIPFS("id") + if result.ExitCode() == 0 { + log.Debugf("node %d daemon is fully responsive", n.ID) + break + } + if i == maxRetries-1 { + log.Panicf("node %d daemon not responsive after %d retries. stderr: %s", + n.ID, maxRetries, result.Stderr.String()) + } + time.Sleep(200 * time.Millisecond) } - log.Debugf("node %d started, checking API", n.ID) - n.WaitOnAPI(authorization) return n }