test(cli): improve HAMT threshold tests with exact +1 byte verification

- add UnixFSDataType() helper to directly check UnixFS type via protobuf
- refactor threshold tests to use exact +1 byte calculations instead of +1 file
- verify directory type directly (ft.TDirectory vs ft.THAMTShard) instead of
  inferring from link count
- clean up helper function signatures by removing unused cidLength parameter
This commit is contained in:
Marcin Rataj 2026-01-27 22:38:24 +01:00
parent 9b04a587f7
commit 120800fe8b
3 changed files with 105 additions and 67 deletions

View File

@ -8,6 +8,7 @@ import (
"testing"
"time"
ft "github.com/ipfs/boxo/ipld/unixfs"
"github.com/ipfs/kubo/config"
"github.com/ipfs/kubo/test/cli/harness"
"github.com/ipfs/kubo/test/cli/testutils"
@ -251,21 +252,20 @@ func TestAdd(t *testing.T) {
// Create directory exactly at the 256KiB threshold using links estimation.
// Links estimation: size = numFiles * (nameLen + cidLen)
// 4096 * (30 + 34) = 4096 * 64 = 262144 = threshold exactly
// With > comparison: stays as basic directory
// With >= comparison: converts to HAMT
// Threshold uses > not >=, so directory at exact threshold stays basic.
const numFiles, nameLen = 4096, 30
err = createDirectoryForHAMTLinksEstimation(randDir, cidV0Length, numFiles, nameLen, nameLen, seed)
err = createDirectoryForHAMTLinksEstimation(randDir, numFiles, nameLen, nameLen, seed)
require.NoError(t, err)
cidStr := node.IPFS("add", "-r", "-Q", randDir).Stdout.Trimmed()
// Should remain a basic directory (threshold uses > not >=)
root, err := node.InspectPBNode(cidStr)
assert.NoError(t, err)
require.Equal(t, numFiles, len(root.Links), "expected basic directory at exact threshold")
// Verify it's a basic directory by checking UnixFS type
fsType, err := node.UnixFSDataType(cidStr)
require.NoError(t, err)
require.Equal(t, ft.TDirectory, fsType, "expected basic directory (type=1) at exact threshold")
})
t.Run("over UnixFSHAMTDirectorySizeThreshold=256KiB (links estimation)", func(t *testing.T) {
t.Run("1 byte over UnixFSHAMTDirectorySizeThreshold=256KiB (links estimation)", func(t *testing.T) {
t.Parallel()
node := harness.NewT(t).NewNode().Init(profile)
node.StartDaemon()
@ -274,19 +274,19 @@ func TestAdd(t *testing.T) {
randDir, err := os.MkdirTemp(node.Dir, seed)
require.NoError(t, err)
// Create directory just over the 256KiB threshold using links estimation.
// Links estimation: size = numFiles * (nameLen + cidLen)
// 4097 * (30 + 34) = 4097 * 64 = 262208 > 262144, exceeds threshold
const numFiles, nameLen = 4097, 30
err = createDirectoryForHAMTLinksEstimation(randDir, cidV0Length, numFiles, nameLen, nameLen, seed)
// Create directory exactly 1 byte over the 256KiB threshold using links estimation.
// Links estimation: size = sum(nameLen + cidLen) for each file
// 4095 * (30 + 34) + 1 * (31 + 34) = 262080 + 65 = 262145 = threshold + 1
const numFiles, nameLen, lastNameLen = 4096, 30, 31
err = createDirectoryForHAMTLinksEstimation(randDir, numFiles, nameLen, lastNameLen, seed)
require.NoError(t, err)
cidStr := node.IPFS("add", "-r", "-Q", randDir).Stdout.Trimmed()
// Should be HAMT sharded (root links <= fanout of 256)
root, err := node.InspectPBNode(cidStr)
assert.NoError(t, err)
require.LessOrEqual(t, len(root.Links), 256, "expected HAMT directory when over threshold")
// Verify it's a HAMT directory by checking UnixFS type
fsType, err := node.UnixFSDataType(cidStr)
require.NoError(t, err)
require.Equal(t, ft.THAMTShard, fsType, "expected HAMT directory (type=5) when 1 byte over threshold")
})
})
@ -349,24 +349,23 @@ func TestAdd(t *testing.T) {
require.NoError(t, err)
// Create directory exactly at the 256KiB threshold using block estimation.
// Block estimation: size = baseOverhead + numFiles * LinkSerializedSize
// LinkSerializedSize(11, 36, 0) = 55 bytes per link
// 4766 * 55 + 14 = 262130 + 14 = 262144 = threshold exactly
// With > comparison: stays as basic directory
// With >= comparison: converts to HAMT
const numFiles, nameLen = 4766, 11
err = createDirectoryForHAMTBlockEstimation(randDir, cidV1Length, numFiles, nameLen, nameLen, seed)
// Block estimation: size = dataFieldSize + sum(LinkSerializedSize)
// - dataFieldSerializedSize() = 4 bytes (empty dir, no mode/mtime)
// - LinkSerializedSize(nameLen=11) = 55 bytes, LinkSerializedSize(nameLen=21) = 65 bytes
// Total: 4765 * 55 + 65 + 4 = 262144 = threshold exactly
const numFiles, nameLen, lastNameLen = 4766, 11, 21
err = createDirectoryForHAMTBlockEstimation(randDir, numFiles, nameLen, lastNameLen, seed)
require.NoError(t, err)
cidStr := node.IPFS("add", "-r", "-Q", randDir).Stdout.Trimmed()
// Should remain a basic directory (threshold uses > not >=)
root, err := node.InspectPBNode(cidStr)
assert.NoError(t, err)
require.Equal(t, numFiles, len(root.Links), "expected basic directory at exact threshold")
// Verify it's a basic directory by checking UnixFS type
fsType, err := node.UnixFSDataType(cidStr)
require.NoError(t, err)
require.Equal(t, ft.TDirectory, fsType, "expected basic directory (type=1) at exact threshold")
})
t.Run("over UnixFSHAMTDirectorySizeThreshold=256KiB (block estimation)", func(t *testing.T) {
t.Run("1 byte over UnixFSHAMTDirectorySizeThreshold=256KiB (block estimation)", func(t *testing.T) {
t.Parallel()
node := harness.NewT(t).NewNode().Init(profile)
node.StartDaemon()
@ -375,19 +374,18 @@ func TestAdd(t *testing.T) {
randDir, err := os.MkdirTemp(node.Dir, seed)
require.NoError(t, err)
// Create directory just over the 256KiB threshold using block estimation.
// Block estimation: size = baseOverhead + numFiles * LinkSerializedSize
// 4767 * 55 + 14 = 262185 + 14 = 262199 > 262144, exceeds threshold
const numFiles, nameLen = 4767, 11
err = createDirectoryForHAMTBlockEstimation(randDir, cidV1Length, numFiles, nameLen, nameLen, seed)
// Create directory exactly 1 byte over the 256KiB threshold.
// Same as above but lastNameLen=22 adds 1 byte: 4765 * 55 + 66 + 4 = 262145
const numFiles, nameLen, lastNameLen = 4766, 11, 22
err = createDirectoryForHAMTBlockEstimation(randDir, numFiles, nameLen, lastNameLen, seed)
require.NoError(t, err)
cidStr := node.IPFS("add", "-r", "-Q", randDir).Stdout.Trimmed()
// Should be HAMT sharded (root links <= fanout of 256)
root, err := node.InspectPBNode(cidStr)
assert.NoError(t, err)
require.LessOrEqual(t, len(root.Links), 256, "expected HAMT directory when over threshold")
// Verify it's a HAMT directory by checking UnixFS type
fsType, err := node.UnixFSDataType(cidStr)
require.NoError(t, err)
require.Equal(t, ft.THAMTShard, fsType, "expected HAMT directory (type=5) when 1 byte over threshold")
})
})
@ -398,8 +396,8 @@ func TestAdd(t *testing.T) {
setupTestDir := func(t *testing.T, node *harness.Node) string {
testDir, err := os.MkdirTemp(node.Dir, "hidden-test")
require.NoError(t, err)
require.NoError(t, os.WriteFile(filepath.Join(testDir, "visible.txt"), []byte("visible"), 0644))
require.NoError(t, os.WriteFile(filepath.Join(testDir, ".hidden"), []byte("hidden"), 0644))
require.NoError(t, os.WriteFile(filepath.Join(testDir, "visible.txt"), []byte("visible"), 0o644))
require.NoError(t, os.WriteFile(filepath.Join(testDir, ".hidden"), []byte("hidden"), 0o644))
return testDir
}
@ -447,8 +445,8 @@ func TestAdd(t *testing.T) {
setupTestDir := func(t *testing.T, node *harness.Node) string {
testDir, err := os.MkdirTemp(node.Dir, "empty-dirs-test")
require.NoError(t, err)
require.NoError(t, os.Mkdir(filepath.Join(testDir, "empty-subdir"), 0755))
require.NoError(t, os.WriteFile(filepath.Join(testDir, "file.txt"), []byte("content"), 0644))
require.NoError(t, os.Mkdir(filepath.Join(testDir, "empty-subdir"), 0o755))
require.NoError(t, os.WriteFile(filepath.Join(testDir, "file.txt"), []byte("content"), 0o644))
return testDir
}
@ -502,14 +500,14 @@ func TestAdd(t *testing.T) {
// Top-level file and symlink
targetFile := filepath.Join(testDir, "target.txt")
require.NoError(t, os.WriteFile(targetFile, []byte("target content"), 0644))
require.NoError(t, os.WriteFile(targetFile, []byte("target content"), 0o644))
require.NoError(t, os.Symlink("target.txt", filepath.Join(testDir, "link.txt")))
// Nested file and symlink in sub-sub directory
subsubdir := filepath.Join(testDir, "subdir", "subsubdir")
require.NoError(t, os.MkdirAll(subsubdir, 0755))
require.NoError(t, os.MkdirAll(subsubdir, 0o755))
nestedTarget := filepath.Join(subsubdir, "nested-target.txt")
require.NoError(t, os.WriteFile(nestedTarget, []byte("nested content"), 0644))
require.NoError(t, os.WriteFile(nestedTarget, []byte("nested content"), 0o644))
require.NoError(t, os.Symlink("nested-target.txt", filepath.Join(subsubdir, "nested-link.txt")))
return testDir
@ -806,32 +804,22 @@ func TestAddFastProvide(t *testing.T) {
}
// createDirectoryForHAMTLinksEstimation creates a directory with the specified number
// of files using the links-based size estimation formula (size = numFiles * (nameLen + cidLen)).
// of files for testing links-based size estimation (size = sum of nameLen + cidLen).
// Used by legacy profiles (unixfs-v0-2015).
//
// Threshold behavior: boxo uses > comparison, so directory at exact threshold stays basic.
// Use DirBasicFiles for basic directory test, DirHAMTFiles for HAMT directory test.
//
// The lastNameLen parameter allows the last file to have a different name length,
// enabling exact +1 byte threshold tests.
//
// See boxo/ipld/unixfs/io/directory.go sizeBelowThreshold() for the links estimation.
func createDirectoryForHAMTLinksEstimation(dirPath string, cidLength, numFiles, nameLen, lastNameLen int, seed string) error {
func createDirectoryForHAMTLinksEstimation(dirPath string, numFiles, nameLen, lastNameLen int, seed string) error {
return createDeterministicFiles(dirPath, numFiles, nameLen, lastNameLen, seed)
}
// createDirectoryForHAMTBlockEstimation creates a directory with the specified number
// of files using the block-based size estimation formula (LinkSerializedSize with protobuf overhead).
// of files for testing block-based size estimation (LinkSerializedSize with protobuf overhead).
// Used by modern profiles (unixfs-v1-2025).
//
// Threshold behavior: boxo uses > comparison, so directory at exact threshold stays basic.
// Use DirBasicFiles for basic directory test, DirHAMTFiles for HAMT directory test.
//
// The lastNameLen parameter allows the last file to have a different name length,
// enabling exact +1 byte threshold tests.
//
// See boxo/ipld/unixfs/io/directory.go estimatedBlockSize() for the block estimation.
func createDirectoryForHAMTBlockEstimation(dirPath string, cidLength, numFiles, nameLen, lastNameLen int, seed string) error {
func createDirectoryForHAMTBlockEstimation(dirPath string, numFiles, nameLen, lastNameLen int, seed string) error {
return createDeterministicFiles(dirPath, numFiles, nameLen, lastNameLen, seed)
}
@ -870,7 +858,7 @@ func createDeterministicFiles(dirPath string, numFiles, nameLen, lastNameLen int
filePath := filepath.Join(dirPath, filename)
// Create file with 1-byte content for non-zero tsize
if err := os.WriteFile(filePath, []byte("x"), 0644); err != nil {
if err := os.WriteFile(filePath, []byte("x"), 0o644); err != nil {
return err
}
}

View File

@ -7,6 +7,7 @@ import (
"strings"
"testing"
ft "github.com/ipfs/boxo/ipld/unixfs"
"github.com/ipfs/kubo/test/cli/harness"
"github.com/ipfs/kubo/test/cli/testutils"
"github.com/stretchr/testify/assert"
@ -133,7 +134,7 @@ func TestCIDProfiles(t *testing.T) {
carOutputDir := os.Getenv("CID_PROFILES_CAR_OUTPUT")
exportCARs := carOutputDir != ""
if exportCARs {
if err := os.MkdirAll(carOutputDir, 0755); err != nil {
if err := os.MkdirAll(carOutputDir, 0o755); err != nil {
t.Fatalf("failed to create CAR output directory: %v", err)
}
t.Logf("CAR export enabled, writing to: %s", carOutputDir)
@ -276,14 +277,19 @@ func runProfileTests(t *testing.T, exp cidProfileExpectations, carOutputDir stri
basicLastNameLen = exp.DirBasicNameLen
}
if exp.HAMTSizeEstimation == "block" {
err = createDirectoryForHAMTBlockEstimation(randDir, cidLen, exp.DirBasicFiles, exp.DirBasicNameLen, basicLastNameLen, seed)
err = createDirectoryForHAMTBlockEstimation(randDir, exp.DirBasicFiles, exp.DirBasicNameLen, basicLastNameLen, seed)
} else {
err = createDirectoryForHAMTLinksEstimation(randDir, cidLen, exp.DirBasicFiles, exp.DirBasicNameLen, basicLastNameLen, seed)
err = createDirectoryForHAMTLinksEstimation(randDir, exp.DirBasicFiles, exp.DirBasicNameLen, basicLastNameLen, seed)
}
require.NoError(t, err)
cidStr := node.IPFS("add", "-r", "-Q", randDir).Stdout.Trimmed()
// Verify it's a basic directory by checking UnixFS type
fsType, err := node.UnixFSDataType(cidStr)
require.NoError(t, err)
require.Equal(t, ft.TDirectory, fsType, "expected basic directory (type=1) at exact threshold")
root, err := node.InspectPBNode(cidStr)
assert.NoError(t, err)
require.Equal(t, exp.DirBasicFiles, len(root.Links),
@ -339,18 +345,24 @@ func runProfileTests(t *testing.T, exp cidProfileExpectations, carOutputDir stri
lastNameLen = exp.DirHAMTNameLen
}
if exp.HAMTSizeEstimation == "block" {
err = createDirectoryForHAMTBlockEstimation(randDir, cidLen, exp.DirHAMTFiles, exp.DirHAMTNameLen, lastNameLen, seed)
err = createDirectoryForHAMTBlockEstimation(randDir, exp.DirHAMTFiles, exp.DirHAMTNameLen, lastNameLen, seed)
} else {
err = createDirectoryForHAMTLinksEstimation(randDir, cidLen, exp.DirHAMTFiles, exp.DirHAMTNameLen, lastNameLen, seed)
err = createDirectoryForHAMTLinksEstimation(randDir, exp.DirHAMTFiles, exp.DirHAMTNameLen, lastNameLen, seed)
}
require.NoError(t, err)
cidStr := node.IPFS("add", "-r", "-Q", randDir).Stdout.Trimmed()
// Verify it's a HAMT directory by checking UnixFS type
fsType, err := node.UnixFSDataType(cidStr)
require.NoError(t, err)
require.Equal(t, ft.THAMTShard, fsType, "expected HAMT directory (type=5) when over threshold")
// HAMT root has at most fanout links (actual count depends on hash distribution)
root, err := node.InspectPBNode(cidStr)
assert.NoError(t, err)
require.LessOrEqual(t, len(root.Links), exp.HAMTFanout,
"expected HAMT directory with <= %d links", exp.HAMTFanout)
"expected HAMT directory root to have <= %d links", exp.HAMTFanout)
// Verify hash function
verifyHashFunction(t, node, cidStr, exp.HashFunc)

View File

@ -3,8 +3,47 @@ package harness
import (
"bytes"
"encoding/json"
mdag "github.com/ipfs/boxo/ipld/merkledag"
ft "github.com/ipfs/boxo/ipld/unixfs"
pb "github.com/ipfs/boxo/ipld/unixfs/pb"
)
// UnixFSDataType returns the UnixFS DataType for the given CID by fetching the
// raw block and parsing the protobuf. This directly checks the Type field in
// the UnixFS Data message (https://specs.ipfs.tech/unixfs/#data).
//
// Common types:
// - ft.TDirectory (1) = basic flat directory
// - ft.THAMTShard (5) = HAMT sharded directory
func (n *Node) UnixFSDataType(cid string) (pb.Data_DataType, error) {
log.Debugf("node %d block get %s", n.ID, cid)
var blockData bytes.Buffer
res := n.Runner.MustRun(RunRequest{
Path: n.IPFSBin,
Args: []string{"block", "get", cid},
CmdOpts: []CmdOpt{RunWithStdout(&blockData)},
})
if res.Err != nil {
return 0, res.Err
}
// Parse dag-pb block
protoNode, err := mdag.DecodeProtobuf(blockData.Bytes())
if err != nil {
return 0, err
}
// Parse UnixFS data
fsNode, err := ft.FSNodeFromBytes(protoNode.Data())
if err != nil {
return 0, err
}
return fsNode.Type(), nil
}
// InspectPBNode uses dag-json output of 'ipfs dag get' to inspect
// "Logical Format" of DAG-PB as defined in
// https://web.archive.org/web/20250403194752/https://ipld.io/specs/codecs/dag-pb/spec/#logical-format
@ -28,7 +67,6 @@ func (n *Node) InspectPBNode(cid string) (PBNode, error) {
return root, err
}
return root, nil
}
// Define structs to match the JSON for