docs(provide): validation and reprovide cycle visualization (#10977)

* docs: improve slow reprovide warning messages

simplify warning text and provide actionable solutions in order of preference

* feat(config): add validation for Provide.DHT settings

- validate interval doesn't exceed DHT record validity (48h)
- validate worker counts and other parameters are within valid ranges
- improve slow reprovide warning messages to reference config parameter
- add tests for all validation cases

* docs: add reprovide cycle visualization

shows traffic patterns of legacy vs sweep vs accelerated DHT
Marcin Rataj committed 2025-09-19 18:47:30 +02:00 (via GitHub)
commit 07f017f01d (parent 9faefe316f)
6 changed files with 203 additions and 26 deletions


@@ -1,8 +1,11 @@
 package config
 
 import (
 	"fmt"
 	"strings"
+	"time"
+
+	"github.com/libp2p/go-libp2p-kad-dht/amino"
 )
 
 const (
@@ -101,3 +104,67 @@ func ParseProvideStrategy(s string) ProvideStrategy {
 	}
 	return strategy
 }
+
+// ValidateProvideConfig validates the Provide configuration according to DHT requirements.
+func ValidateProvideConfig(cfg *Provide) error {
+	// Validate Provide.DHT.Interval
+	if !cfg.DHT.Interval.IsDefault() {
+		interval := cfg.DHT.Interval.WithDefault(DefaultProvideDHTInterval)
+		if interval > amino.DefaultProvideValidity {
+			return fmt.Errorf("Provide.DHT.Interval (%v) must be less than or equal to DHT provider record validity (%v)", interval, amino.DefaultProvideValidity)
+		}
+		if interval < 0 {
+			return fmt.Errorf("Provide.DHT.Interval must be non-negative, got %v", interval)
+		}
+	}
+
+	// Validate MaxWorkers
+	if !cfg.DHT.MaxWorkers.IsDefault() {
+		maxWorkers := cfg.DHT.MaxWorkers.WithDefault(DefaultProvideDHTMaxWorkers)
+		if maxWorkers <= 0 {
+			return fmt.Errorf("Provide.DHT.MaxWorkers must be positive, got %d", maxWorkers)
+		}
+	}
+
+	// Validate DedicatedPeriodicWorkers
+	if !cfg.DHT.DedicatedPeriodicWorkers.IsDefault() {
+		workers := cfg.DHT.DedicatedPeriodicWorkers.WithDefault(DefaultProvideDHTDedicatedPeriodicWorkers)
+		if workers < 0 {
+			return fmt.Errorf("Provide.DHT.DedicatedPeriodicWorkers must be non-negative, got %d", workers)
+		}
+	}
+
+	// Validate DedicatedBurstWorkers
+	if !cfg.DHT.DedicatedBurstWorkers.IsDefault() {
+		workers := cfg.DHT.DedicatedBurstWorkers.WithDefault(DefaultProvideDHTDedicatedBurstWorkers)
+		if workers < 0 {
+			return fmt.Errorf("Provide.DHT.DedicatedBurstWorkers must be non-negative, got %d", workers)
+		}
+	}
+
+	// Validate MaxProvideConnsPerWorker
+	if !cfg.DHT.MaxProvideConnsPerWorker.IsDefault() {
+		conns := cfg.DHT.MaxProvideConnsPerWorker.WithDefault(DefaultProvideDHTMaxProvideConnsPerWorker)
+		if conns <= 0 {
+			return fmt.Errorf("Provide.DHT.MaxProvideConnsPerWorker must be positive, got %d", conns)
+		}
+	}
+
+	// Validate KeyStoreBatchSize
+	if !cfg.DHT.KeyStoreBatchSize.IsDefault() {
+		batchSize := cfg.DHT.KeyStoreBatchSize.WithDefault(DefaultProvideDHTKeyStoreBatchSize)
+		if batchSize <= 0 {
+			return fmt.Errorf("Provide.DHT.KeyStoreBatchSize must be positive, got %d", batchSize)
+		}
+	}
+
+	// Validate OfflineDelay
+	if !cfg.DHT.OfflineDelay.IsDefault() {
+		delay := cfg.DHT.OfflineDelay.WithDefault(DefaultProvideDHTOfflineDelay)
+		if delay < 0 {
+			return fmt.Errorf("Provide.DHT.OfflineDelay must be non-negative, got %v", delay)
+		}
+	}
+
+	return nil
+}

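Taken together, these checks let a caller reject a bad `Provide.DHT` section before the node is built. A minimal sketch of such a call site, assuming the exported `Provide`/`ProvideDHT` types and the `NewOptionalDuration` helper used in the tests below, and the `github.com/ipfs/kubo/config` import path:

```go
package main

import (
	"fmt"
	"time"

	"github.com/ipfs/kubo/config"
)

func main() {
	// 72h exceeds the 48h DHT provider record validity, so validation must fail.
	cfg := &config.Provide{
		DHT: config.ProvideDHT{
			Interval: config.NewOptionalDuration(72 * time.Hour),
		},
	}
	if err := config.ValidateProvideConfig(cfg); err != nil {
		fmt.Println("rejected:", err)
	}
}
```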

@@ -1,6 +1,12 @@
 package config
 
-import "testing"
+import (
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
 
 func TestParseProvideStrategy(t *testing.T) {
 	tests := []struct {
@@ -25,3 +31,77 @@ func TestParseProvideStrategy(t *testing.T) {
 		}
 	}
 }
+
+func TestValidateProvideConfig_Interval(t *testing.T) {
+	tests := []struct {
+		name     string
+		interval time.Duration
+		wantErr  bool
+		errMsg   string
+	}{
+		{"valid default (22h)", 22 * time.Hour, false, ""},
+		{"valid max (48h)", 48 * time.Hour, false, ""},
+		{"valid small (1h)", 1 * time.Hour, false, ""},
+		{"valid zero (disabled)", 0, false, ""},
+		{"invalid over limit (49h)", 49 * time.Hour, true, "must be less than or equal to DHT provider record validity"},
+		{"invalid over limit (72h)", 72 * time.Hour, true, "must be less than or equal to DHT provider record validity"},
+		{"invalid negative", -1 * time.Hour, true, "must be non-negative"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			cfg := &Provide{
+				DHT: ProvideDHT{
+					Interval: NewOptionalDuration(tt.interval),
+				},
+			}
+			err := ValidateProvideConfig(cfg)
+			if tt.wantErr {
+				require.Error(t, err, "expected error for interval=%v", tt.interval)
+				if tt.errMsg != "" {
+					assert.Contains(t, err.Error(), tt.errMsg, "error message mismatch")
+				}
+			} else {
+				require.NoError(t, err, "unexpected error for interval=%v", tt.interval)
+			}
+		})
+	}
+}
+
+func TestValidateProvideConfig_MaxWorkers(t *testing.T) {
+	tests := []struct {
+		name       string
+		maxWorkers int64
+		wantErr    bool
+		errMsg     string
+	}{
+		{"valid default", 16, false, ""},
+		{"valid high", 100, false, ""},
+		{"valid low", 1, false, ""},
+		{"invalid zero", 0, true, "must be positive"},
+		{"invalid negative", -1, true, "must be positive"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			cfg := &Provide{
+				DHT: ProvideDHT{
+					MaxWorkers: NewOptionalInteger(tt.maxWorkers),
+				},
+			}
+			err := ValidateProvideConfig(cfg)
+			if tt.wantErr {
+				require.Error(t, err, "expected error for maxWorkers=%d", tt.maxWorkers)
+				if tt.errMsg != "" {
+					assert.Contains(t, err.Error(), tt.errMsg, "error message mismatch")
+				}
+			} else {
+				require.NoError(t, err, "unexpected error for maxWorkers=%d", tt.maxWorkers)
+			}
+		})
+	}
+}

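To run just these new cases with standard Go tooling (from the repository root):

```console
$ go test ./config -run TestValidateProvideConfig -v
```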

@@ -431,6 +431,11 @@ func IPFS(ctx context.Context, bcfg *BuildCfg) fx.Option {
 		return fx.Error(err)
 	}
 
+	// Validate Provide configuration
+	if err := config.ValidateProvideConfig(&cfg.Provide); err != nil {
+		return fx.Error(err)
+	}
+
 	// Auto-sharding settings
 	shardingThresholdString := cfg.Import.UnixFSHAMTDirectorySizeThreshold.WithDefault(config.DefaultUnixFSHAMTDirectorySizeThreshold)
 	shardSingThresholdInt, err := humanize.ParseBytes(shardingThresholdString)

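Because the check runs inside the fx dependency graph, a failing validation aborts node construction instead of surfacing later at runtime. A self-contained sketch of that failure mode using `go.uber.org/fx` directly (the error text is illustrative):

```go
package main

import (
	"errors"
	"fmt"

	"go.uber.org/fx"
)

func main() {
	// fx.Error short-circuits the app: no constructors run, and Err()
	// reports the validation failure immediately.
	app := fx.New(fx.Error(errors.New("Provide.DHT.Interval (72h) must be <= 48h")))
	if err := app.Err(); err != nil {
		fmt.Println("node construction aborted:", err)
	}
}
```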

@@ -208,20 +208,20 @@ func LegacyProviderOpt(reprovideInterval time.Duration, strategy string, acceler
 	expectedProvideSpeed := reprovideInterval / probableBigBlockstore
 	if avgProvideSpeed > expectedProvideSpeed {
 		logger.Errorf(`
-🔔🔔🔔 YOU MAY BE FALLING BEHIND DHT REPROVIDES! 🔔🔔🔔
+🔔🔔🔔 Reprovide Operations Too Slow 🔔🔔🔔
 
-Your system might be struggling to keep up with DHT reprovides!
-This means your content could be partially or completely inaccessible on the network.
-We observed that you recently provided %d keys at an average rate of %v per key.
+Your node may be falling behind on DHT reprovides, which could affect content availability.
 
-🕑 An attempt to estimate your blockstore size timed out after 5 minutes,
-implying your blockstore might be exceedingly large. Assuming a considerable
-size of 10TiB, it would take %v to provide the complete set.
+Observed: %d keys at %v per key
+Estimated: Assuming 10TiB blockstore, would take %v to complete
+Must finish within %v (Provide.DHT.Interval)
 
-The total provide time needs to stay under your reprovide interval (%v) to prevent falling behind!
+Solutions (try in order):
+1. Enable Provide.DHT.SweepEnabled=true (recommended)
+2. Increase Provide.DHT.MaxWorkers if needed
+3. Enable Routing.AcceleratedDHTClient=true (last resort, resource intensive)
 
-💡 Consider enabling the Accelerated DHT to enhance your system performance. See:
-https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient`,
+Learn more: https://github.com/ipfs/kubo/blob/master/docs/config.md#provide`,
 			keysProvided, avgProvideSpeed, avgProvideSpeed*probableBigBlockstore, reprovideInterval)
 		return false
 	}
@@ -237,18 +237,20 @@ https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtcli
 	if avgProvideSpeed > expectedProvideSpeed {
 		logger.Errorf(`
-🔔🔔🔔 YOU ARE FALLING BEHIND DHT REPROVIDES! 🔔🔔🔔
+🔔🔔🔔 Reprovide Operations Too Slow 🔔🔔🔔
 
-Your system is struggling to keep up with DHT reprovides!
-This means your content could be partially or completely inaccessible on the network.
-We observed that you recently provided %d keys at an average rate of %v per key.
+Your node is falling behind on DHT reprovides, which will affect content availability.
 
-💾 Your total CID count is ~%d which would total at %v reprovide process.
+Observed: %d keys at %v per key
+Confirmed: ~%d total CIDs requiring %v to complete
+Must finish within %v (Provide.DHT.Interval)
 
-The total provide time needs to stay under your reprovide interval (%v) to prevent falling behind!
+Solutions (try in order):
+1. Enable Provide.DHT.SweepEnabled=true (recommended)
+2. Increase Provide.DHT.MaxWorkers if needed
+3. Enable Routing.AcceleratedDHTClient=true (last resort, resource intensive)
 
-💡 Consider enabling the Accelerated DHT to enhance your reprovide throughput. See:
-https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient`,
+Learn more: https://github.com/ipfs/kubo/blob/master/docs/config.md#provide`,
 			keysProvided, avgProvideSpeed, count, avgProvideSpeed*time.Duration(count), reprovideInterval)
 	}
 	return false

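The numbered solutions correspond to ordinary config changes; for example, with the stock `ipfs config` CLI (values illustrative):

```console
$ ipfs config --json Provide.DHT.SweepEnabled true
$ ipfs config --json Provide.DHT.MaxWorkers 32
$ ipfs config --json Routing.AcceleratedDHTClient true
```

The daemon must be restarted for the new settings to take effect.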

@@ -38,17 +38,25 @@ Read more about the new system below.
 #### 🧹 Experimental Sweeping DHT Provider
 
-A new experimental DHT provider is available as an alternative to both the default provider and the resource-intensive [accelerated DHT client](https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient). Enable it via [`Provide.DHT.SweepEnabled`](https://github.com/ipfs/kubo/blob/master/docs/config.md#providedhtssweepenabled).
-
-> [!NOTE]
-> This feature is experimental and opt-in. In the future, it will become the default and replace the legacy system. Some commands like `ipfs stats provide` and `ipfs routing provide` are not yet available with sweep mode. Run `ipfs provide --help` for alternatives.
+A new experimental DHT provider is available as an alternative to both the default provider and the resource-intensive [accelerated DHT client](https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient). Enable it via [`Provide.DHT.SweepEnabled`](https://github.com/ipfs/kubo/blob/master/docs/config.md#providedhtsweepenabled).
+
+**How it works:** Instead of providing keys one-by-one, the sweep provider systematically explores DHT keyspace regions in batches.
+
+> <picture>
+>   <source media="(prefers-color-scheme: dark)" srcset="https://github.com/user-attachments/assets/f6e06b08-7fee-490c-a681-1bf440e16e27">
+>   <source media="(prefers-color-scheme: light)" srcset="https://github.com/user-attachments/assets/e1662d7c-f1be-4275-a9ed-f2752fcdcabe">
+>   <img alt="Reprovide Cycle Comparison" src="https://github.com/user-attachments/assets/e1662d7c-f1be-4275-a9ed-f2752fcdcabe">
+> </picture>
+>
+> The diagram shows how sweep mode avoids the hourly traffic spikes of the Accelerated DHT while maintaining similar effectiveness. By grouping CIDs into keyspace regions and processing them in batches, sweep mode reduces memory overhead and creates predictable network patterns.
+
+**Benefits for large-scale operations:** Handles hundreds of thousands of CIDs with reduced memory and network connections, spreads operations evenly to eliminate resource spikes, maintains state across restarts through a persistent keystore, and provides better metrics visibility.
+
+**Monitoring and debugging:** Legacy mode (`SweepEnabled=false`) tracks `provider_reprovider_provide_count` and `provider_reprovider_reprovide_count`, while sweep mode (`SweepEnabled=true`) tracks `total_provide_count_total`. Enable debug logging with `GOLOG_LOG_LEVEL=error,provider=debug,dht/provider=debug` to see detailed logs from either system.
+
+> [!NOTE]
+> This feature is experimental and opt-in. In the future, it will become the default and replace the legacy system. Some commands like `ipfs stats provide` and `ipfs routing provide` are not yet available with sweep mode. Run `ipfs provide --help` for alternatives.
 
 For configuration details, see [`Provide.DHT`](https://github.com/ipfs/kubo/blob/master/docs/config.md#providedht). For metrics documentation, see [Provide metrics](https://github.com/ipfs/kubo/blob/master/docs/metrics.md#provide).
 
 #### 📊 Exposed DHT metrics

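For reference, the debug logging mentioned in the changelog text above is enabled via an environment variable at daemon start:

```console
$ GOLOG_LOG_LEVEL=error,provider=debug,dht/provider=debug ipfs daemon
```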

@@ -131,7 +131,7 @@ config file at runtime.
 - [`Provide.DHT`](#providedht)
 - [`Provide.DHT.MaxWorkers`](#providedhtmaxworkers)
 - [`Provide.DHT.Interval`](#providedhtinterval)
-- [`Provide.DHT.SweepEnabled`](#providedhtssweepenabled)
+- [`Provide.DHT.SweepEnabled`](#providedhtsweepenabled)
 - [`Provide.DHT.DedicatedPeriodicWorkers`](#providedhtdedicatedperiodicworkers)
 - [`Provide.DHT.DedicatedBurstWorkers`](#providedhtdedicatedburstworkers)
 - [`Provide.DHT.MaxProvideConnsPerWorker`](#providedhtmaxprovideconnsperworker)
@@ -2026,6 +2026,21 @@ by providing it a channel of all the keys it is expected to contain according
 to the [`Provide.Strategy`](#providestrategy). During this operation,
 all keys in the `Keystore` are purged, and only the given ones remain scheduled.
 
+> <picture>
+>   <source media="(prefers-color-scheme: dark)" srcset="https://github.com/user-attachments/assets/f6e06b08-7fee-490c-a681-1bf440e16e27">
+>   <source media="(prefers-color-scheme: light)" srcset="https://github.com/user-attachments/assets/e1662d7c-f1be-4275-a9ed-f2752fcdcabe">
+>   <img alt="Reprovide Cycle Comparison" src="https://github.com/user-attachments/assets/e1662d7c-f1be-4275-a9ed-f2752fcdcabe">
+> </picture>
+>
+> The diagram above visualizes the performance patterns:
+>
+> - **Legacy mode**: Individual (slow) provides per CID; can struggle with large datasets
+> - **Sweep mode**: Even distribution that matches the keyspace sweep described above, with low resource usage
+> - **Accelerated DHT**: Hourly traffic spikes with high resource usage
+>
+> Sweep mode provides similar effectiveness to the Accelerated DHT but with steady resource usage, making it a better fit for machines with limited CPU, memory, or network bandwidth.
+
 > [!NOTE]
 > This feature is opt-in for now, but will become the default in a future release.
 > Eventually, this configuration flag will be removed once the feature is stable.
@@ -2400,8 +2415,8 @@ When it is enabled:
 - The provider will now use a keyspace sweeping mode allowing to keep alive
   CID sets that are multiple orders of magnitude larger.
 - **Note:** For improved provide/reprovide operations specifically, consider using
-  [`Provide.DHT.SweepEnabled`](#providedhtssweepenabled) instead, which offers similar
-  benefits with lower resource consumption.
+  [`Provide.DHT.SweepEnabled`](#providedhtsweepenabled) instead, which offers similar
+  benefits without the hourly traffic spikes.
 - The standard Bucket-Routing-Table DHT will still run for the DHT server (if
   the DHT server is enabled). This means the classical routing table will
   still be used to answer other nodes.
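
To make the keyspace-region idea above concrete, here is a small illustrative sketch. It is not the actual provider implementation; bucketing by the first byte of each key's SHA-256 digest is an assumption chosen for brevity:

```go
package main

import (
	"crypto/sha256"
	"fmt"
)

// groupByPrefix buckets keys into keyspace "regions" by the first byte of
// their SHA-256 digest, so each region can be swept (provided) in one batch
// instead of issuing one DHT operation per key.
func groupByPrefix(keys []string) map[byte][]string {
	regions := make(map[byte][]string)
	for _, k := range keys {
		d := sha256.Sum256([]byte(k))
		regions[d[0]] = append(regions[d[0]], k)
	}
	return regions
}

func main() {
	regions := groupByPrefix([]string{"cid-a", "cid-b", "cid-c"})
	for prefix, ks := range regions {
		fmt.Printf("region %02x: %d keys\n", prefix, len(ks))
	}
}
```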