mirror of
https://github.com/ipfs/kubo.git
synced 2026-02-21 10:27:46 +08:00
fix(cli): deduplicate dag stat blocks by multihash
Since Kubo v0.12.0, blocks are stored by multihash, so identical data with different CIDs (e.g., CIDv0 vs CIDv1) is stored once. `dag stat` now reflects actual storage by using multihash-based deduplication instead of CID-based deduplication. Updated the help text to clarify the deduplication behavior and to note that `dag export` uses CID-based keying and may include duplicates. Added a regression test for multihash deduplication.
This commit is contained in:
parent
d29c0b9c01
commit
f0ea79b117
@ -377,7 +377,10 @@ var DagStatCmd = &cmds.Command{
|
||||
'ipfs dag stat' fetches a DAG and returns various statistics about it.
|
||||
Statistics include size and number of blocks.
|
||||
|
||||
Note: This command skips duplicate blocks in reporting both size and the number of blocks
|
||||
Note: Duplicate blocks are identified by content hash (multihash) to reflect
|
||||
actual disk usage. Identical data referenced via different CIDs is counted
|
||||
once. 'dag export' uses CID-based keying and may include the same data
|
||||
multiple times if referenced by different CIDs.
|
||||
`,
|
||||
},
|
||||
Arguments: []cmds.Argument{
|
||||
|
||||
@ -7,8 +7,9 @@ import (
|
||||
|
||||
mdag "github.com/ipfs/boxo/ipld/merkledag"
|
||||
"github.com/ipfs/boxo/ipld/merkledag/traverse"
|
||||
cid "github.com/ipfs/go-cid"
|
||||
cmds "github.com/ipfs/go-ipfs-cmds"
|
||||
mh "github.com/multiformats/go-multihash"
|
||||
|
||||
"github.com/ipfs/kubo/core/commands/cmdenv"
|
||||
"github.com/ipfs/kubo/core/commands/cmdutils"
|
||||
"github.com/ipfs/kubo/core/commands/e"
|
||||
@ -26,7 +27,10 @@ func dagStat(req *cmds.Request, res cmds.ResponseEmitter, env cmds.Environment)
|
||||
}
|
||||
nodeGetter := mdag.NewSession(req.Context, api.Dag())
|
||||
|
||||
cidSet := cid.NewSet()
|
||||
// Use multihash set for deduplication to reflect actual storage.
|
||||
// Since Kubo v0.12.0, blocks are stored by multihash, so identical
|
||||
// data with different CIDs (e.g., CIDv0 vs CIDv1) is stored once.
|
||||
mhSet := mh.NewSet()
|
||||
dagStatSummary := &DagStatSummary{DagStatsArray: []*DagStat{}}
|
||||
for _, a := range req.Arguments {
|
||||
p, err := cmdutils.PathOrCidPath(a)
|
||||
@ -54,11 +58,11 @@ func dagStat(req *cmds.Request, res cmds.ResponseEmitter, env cmds.Environment)
|
||||
currentNodeSize := uint64(len(current.Node.RawData()))
|
||||
dagstats.Size += currentNodeSize
|
||||
dagstats.NumBlocks++
|
||||
if !cidSet.Has(current.Node.Cid()) {
|
||||
// Visit returns true if this multihash was not seen before
|
||||
if mhSet.Visit(current.Node.Cid().Hash()) {
|
||||
dagStatSummary.incrementTotalSize(currentNodeSize)
|
||||
}
|
||||
dagStatSummary.incrementRedundantSize(currentNodeSize)
|
||||
cidSet.Add(current.Node.Cid())
|
||||
if progressive {
|
||||
if err := res.Emit(dagStatSummary); err != nil {
|
||||
return err
|
||||
@ -74,7 +78,7 @@ func dagStat(req *cmds.Request, res cmds.ResponseEmitter, env cmds.Environment)
|
||||
}
|
||||
}
|
||||
|
||||
dagStatSummary.UniqueBlocks = cidSet.Len()
|
||||
dagStatSummary.UniqueBlocks = mhSet.Len()
|
||||
dagStatSummary.calculateSummary()
|
||||
|
||||
if err := res.Emit(dagStatSummary); err != nil {
|
||||
|
||||
@ -11,7 +11,8 @@ This release was brought to you by the [Shipyard](https://ipshipyard.com/) team.
|
||||
- [Overview](#overview)
|
||||
- [🔦 Highlights](#-highlights)
|
||||
- [Routing V1 HTTP API now exposed by default](#routing-v1-http-api-now-exposed-by-default)
|
||||
- [Track total size when adding pins](#track-total-size-when-adding-pins]
|
||||
- [Track total size when adding pins](#track-total-size-when-adding-pins)
|
||||
- [Fixed `ipfs dag stat` block counting](#fixed-ipfs-dag-stat-block-counting)
|
||||
- [📝 Changelog](#-changelog)
|
||||
- [👨‍👩‍👧‍👦 Contributors](#-contributors)
|
||||
|
||||
@ -32,6 +33,10 @@ Example output:
|
||||
Fetched/Processed 336 nodes (83 MB)
|
||||
```
|
||||
|
||||
#### Fixed `ipfs dag stat` block counting
|
||||
|
||||
Since Kubo v0.12.0, blocks are stored by multihash, so the same data is stored only once regardless of which CID references it. The `dag stat` command now reflects actual storage by deduplicating blocks by content hash (e.g., data referenced via both CIDv0 and CIDv1 is counted once). See `ipfs dag stat --help` for more details.
|
||||
|
||||
### 📝 Changelog
|
||||
|
||||
### 👨‍👩‍👧‍👦 Contributors
|
||||
|
||||
@ -104,6 +104,27 @@ func TestDag(t *testing.T) {
|
||||
stat := node.RunIPFS("dag", "stat", "--progress=false", node1Cid, node2Cid)
|
||||
assert.Equal(t, content, stat.Stdout.Bytes())
|
||||
})
|
||||
|
||||
t.Run("dag stat deduplicates by multihash", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
node := harness.NewT(t).NewNode().Init().StartDaemon()
|
||||
|
||||
// Add content and get CIDv0 with dag-pb (not raw leaves)
|
||||
cidV0 := node.IPFSAddStr("hello world", "--cid-version=0", "--raw-leaves=false")
|
||||
|
||||
// Convert to CIDv1 (same multihash, different CID)
|
||||
cidV1 := node.IPFS("cid", "format", "-v", "1", "-b", "base32", cidV0).Stdout.Trimmed()
|
||||
|
||||
// Run dag stat with both CIDs - should deduplicate by multihash
|
||||
stat := node.RunIPFS("dag", "stat", "--progress=false", "--enc=json", cidV0, cidV1)
|
||||
var data Data
|
||||
err := json.Unmarshal(stat.Stdout.Bytes(), &data)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Same block referenced via CIDv0 and CIDv1 should be counted once
|
||||
assert.Equal(t, 1, data.UniqueBlocks, "same data via different CIDs should be 1 unique block")
|
||||
assert.Equal(t, 2.0, data.Ratio, "ratio should be 2.0 (2 refs to 1 block)")
|
||||
})
|
||||
}
|
||||
|
||||
func TestDagImportFastProvide(t *testing.T) {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user