kubo/test/cli/agent_version_unicode_test.go
Marcin Rataj f6a9b347cb
fix(cmds): cleanup unicode identify strings (#9465)
preserve private use characters as specified
in https://github.com/libp2p/specs/pull/491
enforce 128 rune limit on untrusted peer data
2025-09-19 04:46:38 +02:00

221 lines
7.1 KiB
Go
Raw Blame History

package cli
import (
"strings"
"testing"
"github.com/ipfs/kubo/core/commands/cmdutils"
"github.com/stretchr/testify/assert"
)
// TestCleanAndTrimUnicode is a table-driven test for cmdutils.CleanAndTrim.
//
// Each case feeds one input string through CleanAndTrim and compares the
// result with the expected string. The cases cover:
//   - visible text in several scripts, emoji and combining marks, which is
//     expected to come back unchanged;
//   - control and invisible format characters, which are expected to come
//     back as U+FFFD (the Unicode replacement character);
//   - surrounding ASCII spaces, which are expected to be trimmed;
//   - inputs longer than 128 runes, which are expected to be cut to 128 runes.
//
// Expected values that contain U+FFFD are spelled with the \uFFFD escape so
// the literals stay unambiguous regardless of editor or transport encoding.
func TestCleanAndTrimUnicode(t *testing.T) {
	tests := []struct {
		name     string
		input    string
		expected string
	}{
		{
			name:     "Basic ASCII",
			input:    "kubo/1.0.0",
			expected: "kubo/1.0.0",
		},
		{
			name:     "Polish characters preserved",
			input:    "test-ąęćłńóśźż",
			expected: "test-ąęćłńóśźż",
		},
		{
			name:     "Chinese characters preserved",
			input:    "版本-中文测试",
			expected: "版本-中文测试",
		},
		{
			name:     "Arabic text preserved",
			input:    "اختبار-العربية",
			expected: "اختبار-العربية",
		},
		{
			name:     "Emojis preserved",
			input:    "version-1.0-🚀-🎉",
			expected: "version-1.0-🚀-🎉",
		},
		{
			name:     "Complex Unicode with combining marks preserved",
			input:    "h̸̢̢̢̢̢̢̢̢̢̢e̵̵̵̵̵̵̵̵̵̵l̷̷̷̷̷̷̷̷̷̷l̶̶̶̶̶̶̶̶̶̶o̴̴̴̴̴̴̴̴̴̴",
			expected: "h̸̢̢̢̢̢̢̢̢̢̢e̵̵̵̵̵̵̵̵̵̵l̷̷̷̷̷̷̷̷̷̷l̶̶̶̶̶̶̶̶̶̶o̴̴̴̴̴̴̴̴̴̴", // Preserved as-is (only 50 runes)
		},
		{
			name:     "Long text with combining marks truncated at 128",
			input:    strings.Repeat("ẽ̸̢̛̖̬͈͉͖͇͈̭̥́̓̌̾͊̊̂̄̍̅̂͌́", 10), // Very long text (260 runes)
			expected: "ẽ̸̢̛̖̬͈͉͖͇͈̭̥́̓̌̾͊̊̂̄̍̅̂͌́ẽ̸̢̛̖̬͈͉͖͇͈̭̥́̓̌̾͊̊̂̄̍̅̂͌́ẽ̸̢̛̖̬͈͉͖͇͈̭̥́̓̌̾͊̊̂̄̍̅̂͌́ẽ̸̢̛̖̬͈͉͖͇͈̭̥́̓̌̾͊̊̂̄̍̅̂͌́ẽ̸̢̛̖̬͈͉͖͇͈̭̥́̓̌̾͊̊̂̄̍̅̂", // Truncated at 128 runes
		},
		{
			name:     "Zero-width characters replaced with U+FFFD",
			input:    "test\u200Bzero\u200Cwidth\u200D\uFEFFchars",
			expected: "test\uFFFDzero\uFFFDwidth\uFFFD\uFFFDchars",
		},
		{
			name:     "RTL/LTR override replaced with U+FFFD",
			input:    "test\u202Drtl\u202Eltr\u202Aoverride",
			expected: "test\uFFFDrtl\uFFFDltr\uFFFDoverride",
		},
		{
			name:     "Bidi isolates replaced with U+FFFD",
			input:    "test\u2066bidi\u2067isolate\u2068text\u2069end",
			expected: "test\uFFFDbidi\uFFFDisolate\uFFFDtext\uFFFDend",
		},
		{
			name:     "Control characters replaced with U+FFFD",
			input:    "test\x00null\x1Fescape\x7Fdelete",
			expected: "test\uFFFDnull\uFFFDescape\uFFFDdelete",
		},
		{
			name:     "Combining marks preserved",
			input:    "e\u0301\u0302\u0303\u0304\u0305", // e with 5 combining marks
			expected: "e\u0301\u0302\u0303\u0304\u0305", // All preserved
		},
		{
			name:     "No truncation at 70 characters",
			input:    strings.Repeat("1234567890", 7), // exactly 70 ASCII characters
			expected: strings.Repeat("1234567890", 7),
		},
		{
			name:     "No truncation with Unicode - 70 rockets preserved",
			input:    strings.Repeat("🚀", 70),
			expected: strings.Repeat("🚀", 70),
		},
		{
			name:     "Empty string",
			input:    "",
			expected: "",
		},
		{
			name:     "Only whitespace with control chars",
			input:    " \t\n ",
			expected: "\uFFFD\uFFFD", // Tab and newline become U+FFFD, spaces trimmed
		},
		{
			name:     "Leading and trailing whitespace",
			input:    " test ",
			expected: "test",
		},
		{
			name:     "Complex mix - invisible chars replaced with U+FFFD, Unicode preserved",
			input:    "kubo/1.0-🚀\u200B h̸̢̏̔ḛ̶̽̀s̵t\u202E-ąęł-中文",
			expected: "kubo/1.0-🚀\uFFFD h̸̢̏̔ḛ̶̽̀s̵t\uFFFD-ąęł-中文",
		},
		{
			name:     "Emoji with skin tone preserved",
			input:    "👍🏽", // Thumbs up with skin tone modifier
			expected: "👍🏽", // Preserved as-is
		},
		{
			name:     "Mixed scripts preserved",
			input:    "Hello-你好-مرحبا-Здравствуйте",
			expected: "Hello-你好-مرحبا-Здравствуйте",
		},
		{
			name:     "Format characters replaced with U+FFFD",
			input:    "test\u00ADsoft\u2060word\u206Fnom\u200Ebreak",
			expected: "test\uFFFDsoft\uFFFDword\uFFFDnom\uFFFDbreak", // Soft hyphen, word joiner, etc replaced
		},
		{
			name:     "Complex Unicode text with many combining marks (91 runes, no truncation)",
			input:    "ț̸̢͙̞̖̏̔ȩ̶̰͓̪͎̱̠̥̳͔̽̀̃̿̌̾̀͗̕̕͜s̵̢̛̖̬͈͉͖͇͈̭̥̃́̓̌̾͊̊̂̄̍̅̂͌́ͅţ̴̯̹̪͖͓̘̊́̑̄̋̈́͐̈́̔̇̄̂́̎̓͛͠ͅ test",
			expected: "ț̸̢͙̞̖̏̔ȩ̶̰͓̪͎̱̠̥̳͔̽̀̃̿̌̾̀͗̕̕͜s̵̢̛̖̬͈͉͖͇͈̭̥̃́̓̌̾͊̊̂̄̍̅̂͌́ͅţ̴̯̹̪͖͓̘̊́̑̄̋̈́͐̈́̔̇̄̂́̎̓͛͠ͅ test", // Not truncated (91 < 128)
		},
		{
			name:     "Truncation at 128 characters",
			input:    strings.Repeat("a", 150),
			expected: strings.Repeat("a", 128),
		},
		{
			name:     "Truncation with Unicode at 128",
			input:    strings.Repeat("🚀", 150),
			expected: strings.Repeat("🚀", 128),
		},
		{
			name:     "Private use characters preserved (per spec)",
			input:    "test\uE000\uF8FF", // Private use area characters
			expected: "test\uE000\uF8FF", // Should be preserved
		},
		{
			name:     "U+FFFD replacement for multiple categories",
			input:    "a\x00b\u200Cc\u202Ed",    // control, format chars
			expected: "a\uFFFDb\uFFFDc\uFFFDd", // All replaced with U+FFFD
		},
	}

	// Run every case as its own subtest so a failure is reported under the
	// case name.
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := cmdutils.CleanAndTrim(tt.input)
			assert.Equal(t, tt.expected, result, "CleanAndTrim(%q) = %q, want %q", tt.input, result, tt.expected)
		})
	}
}
// TestCleanAndTrimIdempotent verifies that feeding the output of
// cmdutils.CleanAndTrim back into CleanAndTrim yields the same string.
func TestCleanAndTrimIdempotent(t *testing.T) {
	samples := [...]string{
		"test-ąęćłńóśźż",
		"版本-中文测试",
		"version-1.0-🚀-🎉",
		"h̸e̵l̷l̶o̴ w̸o̵r̷l̶d̴",
		"test\u200Bzero\u200Cwidth",
	}

	for i := range samples {
		sample := samples[i]

		// The second pass operates on the already-cleaned string and must
		// not change it any further.
		firstPass := cmdutils.CleanAndTrim(sample)
		secondPass := cmdutils.CleanAndTrim(firstPass)

		assert.Equal(t, firstPass, secondPass, "CleanAndTrim should be idempotent for %q", sample)
	}
}
// TestCleanAndTrimSecurity verifies that the output of cmdutils.CleanAndTrim
// no longer contains selected invisible or dangerous code points: zero-width
// characters, bidirectional embedding/override controls, and ASCII control
// characters.
func TestCleanAndTrimSecurity(t *testing.T) {
	// securityCase pairs an input with a predicate that must hold for the
	// cleaned output.
	type securityCase struct {
		name  string
		input string
		check func(string) bool
	}

	cases := []securityCase{
		{
			name:  "No zero-width spaces",
			input: "test\u200B\u200C\u200Dtest",
			check: func(out string) bool {
				// None of U+200B, U+200C, U+200D may remain.
				return !strings.ContainsAny(out, "\u200B\u200C\u200D")
			},
		},
		{
			name:  "No bidi overrides",
			input: "test\u202A\u202B\u202C\u202D\u202Etest",
			check: func(out string) bool {
				// None of U+202A through U+202E may remain.
				return !strings.ContainsAny(out, "\u202A\u202B\u202C\u202D\u202E")
			},
		},
		{
			name:  "No control characters",
			input: "test\x00\x01\x02\x1F\x7Ftest",
			check: func(out string) bool {
				// Reject any rune below U+0020 as well as DEL (U+007F).
				isControl := func(r rune) bool {
					return r < 0x20 || r == 0x7F
				}
				return strings.IndexFunc(out, isControl) == -1
			},
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			cleaned := cmdutils.CleanAndTrim(tc.input)
			assert.True(t, tc.check(cleaned), "Security check failed for %q -> %q", tc.input, cleaned)
		})
	}
}