fix(cli): deduplicate dag stat blocks by multihash

Since Kubo v0.12.0, blocks are stored by multihash, so identical
data with different CIDs (e.g., CIDv0 vs CIDv1) is stored once.
`dag stat` now reflects actual storage by deduplicating blocks by
multihash instead of by CID.

Updated help text to clarify deduplication behavior and to note that
`dag export` uses CID-based keying and may include duplicates.

Added regression test for multihash deduplication.
This commit is contained in:
Marcin Rataj 2025-12-09 23:27:04 +01:00
parent d29c0b9c01
commit f0ea79b117
4 changed files with 40 additions and 7 deletions

View File

@ -377,7 +377,10 @@ var DagStatCmd = &cmds.Command{
'ipfs dag stat' fetches a DAG and returns various statistics about it.
Statistics include size and number of blocks.
Note: This command skips duplicate blocks in reporting both size and the number of blocks
Note: Duplicate blocks are identified by content hash (multihash) to reflect
actual disk usage. Identical data referenced via different CIDs is counted
once. 'dag export' uses CID-based keying and may include the same data
multiple times if referenced by different CIDs.
`,
},
Arguments: []cmds.Argument{

View File

@ -7,8 +7,9 @@ import (
mdag "github.com/ipfs/boxo/ipld/merkledag"
"github.com/ipfs/boxo/ipld/merkledag/traverse"
cid "github.com/ipfs/go-cid"
cmds "github.com/ipfs/go-ipfs-cmds"
mh "github.com/multiformats/go-multihash"
"github.com/ipfs/kubo/core/commands/cmdenv"
"github.com/ipfs/kubo/core/commands/cmdutils"
"github.com/ipfs/kubo/core/commands/e"
@ -26,7 +27,10 @@ func dagStat(req *cmds.Request, res cmds.ResponseEmitter, env cmds.Environment)
}
nodeGetter := mdag.NewSession(req.Context, api.Dag())
cidSet := cid.NewSet()
// Use multihash set for deduplication to reflect actual storage.
// Since Kubo v0.12.0, blocks are stored by multihash, so identical
// data with different CIDs (e.g., CIDv0 vs CIDv1) is stored once.
mhSet := mh.NewSet()
dagStatSummary := &DagStatSummary{DagStatsArray: []*DagStat{}}
for _, a := range req.Arguments {
p, err := cmdutils.PathOrCidPath(a)
@ -54,11 +58,11 @@ func dagStat(req *cmds.Request, res cmds.ResponseEmitter, env cmds.Environment)
currentNodeSize := uint64(len(current.Node.RawData()))
dagstats.Size += currentNodeSize
dagstats.NumBlocks++
if !cidSet.Has(current.Node.Cid()) {
// Visit returns true if this multihash was not seen before
if mhSet.Visit(current.Node.Cid().Hash()) {
dagStatSummary.incrementTotalSize(currentNodeSize)
}
dagStatSummary.incrementRedundantSize(currentNodeSize)
cidSet.Add(current.Node.Cid())
if progressive {
if err := res.Emit(dagStatSummary); err != nil {
return err
@ -74,7 +78,7 @@ func dagStat(req *cmds.Request, res cmds.ResponseEmitter, env cmds.Environment)
}
}
dagStatSummary.UniqueBlocks = cidSet.Len()
dagStatSummary.UniqueBlocks = mhSet.Len()
dagStatSummary.calculateSummary()
if err := res.Emit(dagStatSummary); err != nil {

View File

@ -11,7 +11,8 @@ This release was brought to you by the [Shipyard](https://ipshipyard.com/) team.
- [Overview](#overview)
- [🔦 Highlights](#-highlights)
- [Routing V1 HTTP API now exposed by default](#routing-v1-http-api-now-exposed-by-default)
- [Track total size when adding pins](#track-total-size-when-adding-pins]
- [Track total size when adding pins](#track-total-size-when-adding-pins)
- [Fixed `ipfs dag stat` block counting](#fixed-ipfs-dag-stat-block-counting)
- [📝 Changelog](#-changelog)
- [👨‍👩‍👧‍👦 Contributors](#-contributors)
@ -32,6 +33,10 @@ Example output:
Fetched/Processed 336 nodes (83 MB)
```
#### Fixed `ipfs dag stat` block counting
Since Kubo v0.12.0, blocks are stored by multihash, so the same data is stored only once regardless of which CID references it. The `dag stat` command now reflects actual storage by deduplicating blocks by content hash (e.g., data referenced via both CIDv0 and CIDv1 is counted once). See `ipfs dag stat --help` for more details.
### 📝 Changelog
### 👨‍👩‍👧‍👦 Contributors

View File

@ -104,6 +104,27 @@ func TestDag(t *testing.T) {
stat := node.RunIPFS("dag", "stat", "--progress=false", node1Cid, node2Cid)
assert.Equal(t, content, stat.Stdout.Bytes())
})
t.Run("dag stat deduplicates by multihash", func(t *testing.T) {
t.Parallel()
node := harness.NewT(t).NewNode().Init().StartDaemon()
// Add content and get CIDv0 with dag-pb (not raw leaves)
cidV0 := node.IPFSAddStr("hello world", "--cid-version=0", "--raw-leaves=false")
// Convert to CIDv1 (same multihash, different CID)
cidV1 := node.IPFS("cid", "format", "-v", "1", "-b", "base32", cidV0).Stdout.Trimmed()
// Run dag stat with both CIDs - should deduplicate by multihash
stat := node.RunIPFS("dag", "stat", "--progress=false", "--enc=json", cidV0, cidV1)
var data Data
err := json.Unmarshal(stat.Stdout.Bytes(), &data)
require.NoError(t, err)
// Same block referenced via CIDv0 and CIDv1 should be counted once
assert.Equal(t, 1, data.UniqueBlocks, "same data via different CIDs should be 1 unique block")
assert.Equal(t, 2.0, data.Ratio, "ratio should be 2.0 (2 refs to 1 block)")
})
}
func TestDagImportFastProvide(t *testing.T) {