diff --git a/config/provide.go b/config/provide.go
index 9408afb10..5f900aaa8 100644
--- a/config/provide.go
+++ b/config/provide.go
@@ -1,8 +1,11 @@
 package config
 
 import (
+	"fmt"
 	"strings"
 	"time"
+
+	"github.com/libp2p/go-libp2p-kad-dht/amino"
 )
 
 const (
@@ -101,3 +104,67 @@ func ParseProvideStrategy(s string) ProvideStrategy {
 	}
 	return strategy
 }
+
+// ValidateProvideConfig validates the Provide configuration according to DHT requirements.
+func ValidateProvideConfig(cfg *Provide) error {
+	// Validate Provide.DHT.Interval
+	if !cfg.DHT.Interval.IsDefault() {
+		interval := cfg.DHT.Interval.WithDefault(DefaultProvideDHTInterval)
+		if interval > amino.DefaultProvideValidity {
+			return fmt.Errorf("Provide.DHT.Interval (%v) must be less than or equal to DHT provider record validity (%v)", interval, amino.DefaultProvideValidity)
+		}
+		if interval < 0 {
+			return fmt.Errorf("Provide.DHT.Interval must be non-negative, got %v", interval)
+		}
+	}
+
+	// Validate MaxWorkers
+	if !cfg.DHT.MaxWorkers.IsDefault() {
+		maxWorkers := cfg.DHT.MaxWorkers.WithDefault(DefaultProvideDHTMaxWorkers)
+		if maxWorkers <= 0 {
+			return fmt.Errorf("Provide.DHT.MaxWorkers must be positive, got %d", maxWorkers)
+		}
+	}
+
+	// Validate DedicatedPeriodicWorkers
+	if !cfg.DHT.DedicatedPeriodicWorkers.IsDefault() {
+		workers := cfg.DHT.DedicatedPeriodicWorkers.WithDefault(DefaultProvideDHTDedicatedPeriodicWorkers)
+		if workers < 0 {
+			return fmt.Errorf("Provide.DHT.DedicatedPeriodicWorkers must be non-negative, got %d", workers)
+		}
+	}
+
+	// Validate DedicatedBurstWorkers
+	if !cfg.DHT.DedicatedBurstWorkers.IsDefault() {
+		workers := cfg.DHT.DedicatedBurstWorkers.WithDefault(DefaultProvideDHTDedicatedBurstWorkers)
+		if workers < 0 {
+			return fmt.Errorf("Provide.DHT.DedicatedBurstWorkers must be non-negative, got %d", workers)
+		}
+	}
+
+	// Validate MaxProvideConnsPerWorker
+	if !cfg.DHT.MaxProvideConnsPerWorker.IsDefault() {
+		conns := cfg.DHT.MaxProvideConnsPerWorker.WithDefault(DefaultProvideDHTMaxProvideConnsPerWorker)
+		if conns <= 0 {
+			return fmt.Errorf("Provide.DHT.MaxProvideConnsPerWorker must be positive, got %d", conns)
+		}
+	}
+
+	// Validate KeyStoreBatchSize
+	if !cfg.DHT.KeyStoreBatchSize.IsDefault() {
+		batchSize := cfg.DHT.KeyStoreBatchSize.WithDefault(DefaultProvideDHTKeyStoreBatchSize)
+		if batchSize <= 0 {
+			return fmt.Errorf("Provide.DHT.KeyStoreBatchSize must be positive, got %d", batchSize)
+		}
+	}
+
+	// Validate OfflineDelay
+	if !cfg.DHT.OfflineDelay.IsDefault() {
+		delay := cfg.DHT.OfflineDelay.WithDefault(DefaultProvideDHTOfflineDelay)
+		if delay < 0 {
+			return fmt.Errorf("Provide.DHT.OfflineDelay must be non-negative, got %v", delay)
+		}
+	}
+
+	return nil
+}
diff --git a/config/provide_test.go b/config/provide_test.go
index 2ec1b85e1..213271eb0 100644
--- a/config/provide_test.go
+++ b/config/provide_test.go
@@ -1,6 +1,12 @@
 package config
 
-import "testing"
+import (
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
 
 func TestParseProvideStrategy(t *testing.T) {
 	tests := []struct {
@@ -25,3 +31,77 @@ func TestParseProvideStrategy(t *testing.T) {
 		}
 	}
 }
+
+func TestValidateProvideConfig_Interval(t *testing.T) {
+	tests := []struct {
+		name     string
+		interval time.Duration
+		wantErr  bool
+		errMsg   string
+	}{
+		{"valid default (22h)", 22 * time.Hour, false, ""},
+		{"valid max (48h)", 48 * time.Hour, false, ""},
+		{"valid small (1h)", 1 * time.Hour, false, ""},
+		{"valid zero (disabled)", 0, false, ""},
+		{"invalid over limit (49h)", 49 * time.Hour, true, "must be less than or equal to DHT provider record validity"},
+		{"invalid over limit (72h)", 72 * time.Hour, true, "must be less than or equal to DHT provider record validity"},
+		{"invalid negative", -1 * time.Hour, true, "must be non-negative"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			cfg := &Provide{
+				DHT: ProvideDHT{
+					Interval: NewOptionalDuration(tt.interval),
+				},
+			}
+
+			err := ValidateProvideConfig(cfg)
+
+			if tt.wantErr {
+				require.Error(t, err, "expected error for interval=%v", tt.interval)
+				if tt.errMsg != "" {
+					assert.Contains(t, err.Error(), tt.errMsg, "error message mismatch")
+				}
+			} else {
+				require.NoError(t, err, "unexpected error for interval=%v", tt.interval)
+			}
+		})
+	}
+}
+
+func TestValidateProvideConfig_MaxWorkers(t *testing.T) {
+	tests := []struct {
+		name       string
+		maxWorkers int64
+		wantErr    bool
+		errMsg     string
+	}{
+		{"valid default", 16, false, ""},
+		{"valid high", 100, false, ""},
+		{"valid low", 1, false, ""},
+		{"invalid zero", 0, true, "must be positive"},
+		{"invalid negative", -1, true, "must be positive"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			cfg := &Provide{
+				DHT: ProvideDHT{
+					MaxWorkers: NewOptionalInteger(tt.maxWorkers),
+				},
+			}
+
+			err := ValidateProvideConfig(cfg)
+
+			if tt.wantErr {
+				require.Error(t, err, "expected error for maxWorkers=%d", tt.maxWorkers)
+				if tt.errMsg != "" {
+					assert.Contains(t, err.Error(), tt.errMsg, "error message mismatch")
+				}
+			} else {
+				require.NoError(t, err, "unexpected error for maxWorkers=%d", tt.maxWorkers)
+			}
+		})
+	}
+}
diff --git a/core/node/groups.go b/core/node/groups.go
index 50c7955ba..9e6433a32 100644
--- a/core/node/groups.go
+++ b/core/node/groups.go
@@ -431,6 +431,11 @@ func IPFS(ctx context.Context, bcfg *BuildCfg) fx.Option {
 		return fx.Error(err)
 	}
 
+	// Validate Provide configuration
+	if err := config.ValidateProvideConfig(&cfg.Provide); err != nil {
+		return fx.Error(err)
+	}
+
 	// Auto-sharding settings
 	shardingThresholdString := cfg.Import.UnixFSHAMTDirectorySizeThreshold.WithDefault(config.DefaultUnixFSHAMTDirectorySizeThreshold)
 	shardSingThresholdInt, err := humanize.ParseBytes(shardingThresholdString)
diff --git a/core/node/provider.go b/core/node/provider.go
index 0658cfeee..b692fa8cd 100644
--- a/core/node/provider.go
+++ b/core/node/provider.go
@@ -208,20 +208,20 @@ func LegacyProviderOpt(reprovideInterval time.Duration, strategy string, acceler
 		expectedProvideSpeed := reprovideInterval / probableBigBlockstore
 		if avgProvideSpeed > expectedProvideSpeed {
 			logger.Errorf(`
-๐Ÿ””๐Ÿ””๐Ÿ”” YOU MAY BE FALLING BEHIND DHT REPROVIDES! ๐Ÿ””๐Ÿ””๐Ÿ””
+๐Ÿ””๐Ÿ””๐Ÿ”” Reprovide Operations Too Slow ๐Ÿ””๐Ÿ””๐Ÿ””
 
-โš ๏ธ Your system might be struggling to keep up with DHT reprovides!
-This means your content could be partially or completely inaccessible on the network.
-We observed that you recently provided %d keys at an average rate of %v per key.
+Your node may be falling behind on DHT reprovides, which could affect content availability.
 
-๐Ÿ•‘ An attempt to estimate your blockstore size timed out after 5 minutes,
-implying your blockstore might be exceedingly large. Assuming a considerable
-size of 10TiB, it would take %v to provide the complete set.
+Observed: %d keys at %v per key
+Estimated: Assuming 10TiB blockstore, would take %v to complete
+โฐ Must finish within %v (Provide.DHT.Interval)
 
-โฐ The total provide time needs to stay under your reprovide interval (%v) to prevent falling behind!
+Solutions (try in order):
+1. Enable Provide.DHT.SweepEnabled=true (recommended)
+2. Increase Provide.DHT.MaxWorkers if needed
+3. Enable Routing.AcceleratedDHTClient=true (last resort, resource intensive)
 
-๐Ÿ’ก Consider enabling the Accelerated DHT to enhance your system performance. See:
-https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient`,
+Learn more: https://github.com/ipfs/kubo/blob/master/docs/config.md#provide`,
 				keysProvided, avgProvideSpeed, avgProvideSpeed*probableBigBlockstore, reprovideInterval)
 			return false
 		}
@@ -237,18 +237,20 @@ https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtcli
 
 		if avgProvideSpeed > expectedProvideSpeed {
 			logger.Errorf(`
-๐Ÿ””๐Ÿ””๐Ÿ”” YOU ARE FALLING BEHIND DHT REPROVIDES! ๐Ÿ””๐Ÿ””๐Ÿ””
+๐Ÿ””๐Ÿ””๐Ÿ”” Reprovide Operations Too Slow ๐Ÿ””๐Ÿ””๐Ÿ””
 
-โš ๏ธ Your system is struggling to keep up with DHT reprovides!
-This means your content could be partially or completely inaccessible on the network.
-We observed that you recently provided %d keys at an average rate of %v per key.
+Your node is falling behind on DHT reprovides, which will affect content availability.
 
-๐Ÿ’พ Your total CID count is ~%d which would total at %v reprovide process.
+Observed: %d keys at %v per key
+Confirmed: ~%d total CIDs requiring %v to complete
+โฐ Must finish within %v (Provide.DHT.Interval)
 
-โฐ The total provide time needs to stay under your reprovide interval (%v) to prevent falling behind!
+Solutions (try in order):
+1. Enable Provide.DHT.SweepEnabled=true (recommended)
+2. Increase Provide.DHT.MaxWorkers if needed
+3. Enable Routing.AcceleratedDHTClient=true (last resort, resource intensive)
 
-๐Ÿ’ก Consider enabling the Accelerated DHT to enhance your reprovide throughput. See:
-https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient`,
+Learn more: https://github.com/ipfs/kubo/blob/master/docs/config.md#provide`,
 			keysProvided, avgProvideSpeed, count, avgProvideSpeed*time.Duration(count), reprovideInterval)
 		}
 		return false
diff --git a/docs/changelogs/v0.38.md b/docs/changelogs/v0.38.md
index 1fd774f04..045715e68 100644
--- a/docs/changelogs/v0.38.md
+++ b/docs/changelogs/v0.38.md
@@ -38,17 +38,25 @@ Read more about the new system below.
 
 #### ๐Ÿงน Experimental Sweeping DHT Provider
 
-A new experimental DHT provider is available as an alternative to both the default provider and the resource-intensive [accelerated DHT client](https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient). Enable it via [`Provide.DHT.SweepEnabled`](https://github.com/ipfs/kubo/blob/master/docs/config.md#providedhtssweepenabled).
-
-> [!NOTE]
-> This feature is experimental and opt-in. In the future, it will become the default and replace the legacy system. Some commands like `ipfs stats provide` and `ipfs routing provide` are not yet available with sweep mode. Run `ipfs provide --help` for alternatives.
+A new experimental DHT provider is available as an alternative to both the default provider and the resource-intensive [accelerated DHT client](https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient). Enable it via [`Provide.DHT.SweepEnabled`](https://github.com/ipfs/kubo/blob/master/docs/config.md#providedhtsweepenabled).
 
 **How it works:** Instead of providing keys one-by-one, the sweep provider systematically explores DHT keyspace regions in batches.
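+
+For example, sweep mode can be enabled on an existing node with `ipfs config` and takes effect after a daemon restart (illustrative command, assuming a default repo):
+
+```console
+$ ipfs config --json Provide.DHT.SweepEnabled true
+```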
 
+> *Diagram: Reprovide Cycle Comparison*
+>
+> The diagram shows how sweep mode avoids the hourly traffic spikes of Accelerated DHT while maintaining similar effectiveness. By grouping CIDs into keyspace regions and processing them in batches, sweep mode reduces memory overhead and creates predictable network patterns.
+
 **Benefits for large-scale operations:** Handles hundreds of thousands of CIDs with reduced memory and network connections, spreads operations evenly to eliminate resource spikes, maintains state across restarts through persistent keystore, and provides better metrics visibility.
 
 **Monitoring and debugging:** Legacy mode (`SweepEnabled=false`) tracks `provider_reprovider_provide_count` and `provider_reprovider_reprovide_count`, while sweep mode (`SweepEnabled=true`) tracks `total_provide_count_total`. Enable debug logging with `GOLOG_LOG_LEVEL=error,provider=debug,dht/provider=debug` to see detailed logs from either system.
 
+> [!NOTE]
+> This feature is experimental and opt-in. In the future, it will become the default and replace the legacy system. Some commands like `ipfs stats provide` and `ipfs routing provide` are not yet available with sweep mode. Run `ipfs provide --help` for alternatives.
+
 For configuration details, see [`Provide.DHT`](https://github.com/ipfs/kubo/blob/master/docs/config.md#providedht). For metrics documentation, see [Provide metrics](https://github.com/ipfs/kubo/blob/master/docs/metrics.md#provide).
 
 #### ๐Ÿ“Š Exposed DHT metrics
diff --git a/docs/config.md b/docs/config.md
index 9ef96514d..a69b045f7 100644
--- a/docs/config.md
+++ b/docs/config.md
@@ -131,7 +131,7 @@ config file at runtime.
     - [`Provide.DHT`](#providedht)
       - [`Provide.DHT.MaxWorkers`](#providedhtmaxworkers)
       - [`Provide.DHT.Interval`](#providedhtinterval)
-      - [`Provide.DHT.SweepEnabled`](#providedhtssweepenabled)
+      - [`Provide.DHT.SweepEnabled`](#providedhtsweepenabled)
       - [`Provide.DHT.DedicatedPeriodicWorkers`](#providedhtdedicatedperiodicworkers)
       - [`Provide.DHT.DedicatedBurstWorkers`](#providedhtdedicatedburstworkers)
       - [`Provide.DHT.MaxProvideConnsPerWorker`](#providedhtmaxprovideconnsperworker)
@@ -2026,6 +2026,21 @@ by providing it a channel of all the keys it is expected to contain according
 to the [`Provide.Strategy`](#providestrategy). During this operation, all keys
 in the `Keystore` are purged, and only the given ones remain scheduled.
 
+> *Diagram: Reprovide Cycle Comparison*
+>
+> The diagram above visualizes the performance patterns:
+>
+> - **Legacy mode**: Individual (slow) provides per CID; can struggle with large datasets
+> - **Sweep mode**: Even distribution following the keyspace sweep described above, with low resource usage
+> - **Accelerated DHT**: Hourly traffic spikes with high resource usage
+>
+> Sweep mode provides similar effectiveness to Accelerated DHT but with steady resource usage - better for machines with limited CPU, memory, or network bandwidth.
+
 > [!NOTE]
 > This feature is opt-in for now, but will become the default in a future release.
 > Eventually, this configuration flag will be removed once the feature is stable.
@@ -2400,8 +2415,8 @@ When it is enabled:
 - The provider will now use a keyspace sweeping mode allowing to keep alive CID
   sets that are multiple orders of magnitude larger.
 - **Note:** For improved provide/reprovide operations specifically, consider using
-  [`Provide.DHT.SweepEnabled`](#providedhtssweepenabled) instead, which offers similar
-  benefits with lower resource consumption.
+  [`Provide.DHT.SweepEnabled`](#providedhtsweepenabled) instead, which offers similar
+  benefits without the hourly traffic spikes.
 - The standard Bucket-Routing-Table DHT will still run for the DHT server (if the
   DHT server is enabled). This means the classical routing table will still be used
   to answer other nodes.