From e7bc2fbad1d9d723ec6e2d459a37b46a6e059708 Mon Sep 17 00:00:00 2001 From: Robert Fratto Date: Mon, 29 Apr 2024 12:14:01 -0400 Subject: [PATCH] internal/units: create replacement for github.com/alecthomas/units package This commit creates a replacement for the github.com/alecthomas/units package. A replacement is desirable because the upstream package doesn't work very well with the Alloy syntax: * We use the UnmarshalText implementation of Base2Bytes, which treats metric (MB) and IEC (MiB) suffixes as the same. * Base2Bytes always reports values as IEC suffixes when marshalling them back, which can confuse users into thinking a unit conversion has occurred somewhere. I think it's potentially confusing to users that setting a limit of 4MB actually sets a limit of 4MiB, which is ~4% less than what the user intended. The new implementation supports parsing the same input as the old package, including complex byte sizes such as `4MiB3KiB`, though simplified byte sizes is preferred (`4099KiB`), and the simplified forms are returned when marshaling back into a string. --- internal/units/scanner.go | 37 +++ .../fuzz/Fuzz_Compare/20c62d19f67262c0 | 2 + .../fuzz/Fuzz_Compare/47e2335362c157c8 | 2 + .../fuzz/Fuzz_Compare/bb9c4a861dbff943 | 2 + .../fuzz/Fuzz_Compare/cb083f134466af85 | 2 + .../fuzz/Fuzz_Compare/fee633b7369dbe46 | 2 + .../fuzz/Fuzz_UnmarshalText/e9e3ffbe3b3a072c | 2 + internal/units/units.go | 214 ++++++++++++++++++ internal/units/units_test.go | 202 +++++++++++++++++ 9 files changed, 465 insertions(+) create mode 100644 internal/units/scanner.go create mode 100644 internal/units/testdata/fuzz/Fuzz_Compare/20c62d19f67262c0 create mode 100644 internal/units/testdata/fuzz/Fuzz_Compare/47e2335362c157c8 create mode 100644 internal/units/testdata/fuzz/Fuzz_Compare/bb9c4a861dbff943 create mode 100644 internal/units/testdata/fuzz/Fuzz_Compare/cb083f134466af85 create mode 100644 internal/units/testdata/fuzz/Fuzz_Compare/fee633b7369dbe46 create mode 100644 internal/units/testdata/fuzz/Fuzz_UnmarshalText/e9e3ffbe3b3a072c create mode 100644 internal/units/units.go create mode 100644 internal/units/units_test.go diff --git a/internal/units/scanner.go b/internal/units/scanner.go new file mode 100644 index 0000000000..5272b58a23 --- /dev/null +++ b/internal/units/scanner.go @@ -0,0 +1,37 @@ +package units + +type scanner struct { + text string + offset int +} + +func newScanner(in string) *scanner { + return &scanner{text: in, offset: 0} +} + +// Next returns true if there are more bytes to scan. It does not advance the scanner. +func (s *scanner) Next() bool { + return s.offset < len(s.text) +} + +// Scan returns the next byte and advances the scanner. +func (s *scanner) Scan() byte { + ch := s.text[s.offset] + s.offset++ + return ch +} + +// String returns the substring up to the current offset. +func (s *scanner) String() string { + return s.text[:s.offset] +} + +// Peek returns the byte at the current offset without advancing the scanner. +func (s *scanner) Peek() byte { + return s.text[s.offset] +} + +// Rem returns the number of bytes remaining in the scanner. +func (s *scanner) Rem() int { + return len(s.text) - s.offset +} diff --git a/internal/units/testdata/fuzz/Fuzz_Compare/20c62d19f67262c0 b/internal/units/testdata/fuzz/Fuzz_Compare/20c62d19f67262c0 new file mode 100644 index 0000000000..a00bce773e --- /dev/null +++ b/internal/units/testdata/fuzz/Fuzz_Compare/20c62d19f67262c0 @@ -0,0 +1,2 @@ +go test fuzz v1 +string("8EiB") diff --git a/internal/units/testdata/fuzz/Fuzz_Compare/47e2335362c157c8 b/internal/units/testdata/fuzz/Fuzz_Compare/47e2335362c157c8 new file mode 100644 index 0000000000..e06121e2bf --- /dev/null +++ b/internal/units/testdata/fuzz/Fuzz_Compare/47e2335362c157c8 @@ -0,0 +1,2 @@ +go test fuzz v1 +string("10.B") diff --git a/internal/units/testdata/fuzz/Fuzz_Compare/bb9c4a861dbff943 b/internal/units/testdata/fuzz/Fuzz_Compare/bb9c4a861dbff943 new file mode 100644 index 0000000000..30d8dab100 --- /dev/null +++ b/internal/units/testdata/fuzz/Fuzz_Compare/bb9c4a861dbff943 @@ -0,0 +1,2 @@ +go test fuzz v1 +string("0B00B") diff --git a/internal/units/testdata/fuzz/Fuzz_Compare/cb083f134466af85 b/internal/units/testdata/fuzz/Fuzz_Compare/cb083f134466af85 new file mode 100644 index 0000000000..9091313396 --- /dev/null +++ b/internal/units/testdata/fuzz/Fuzz_Compare/cb083f134466af85 @@ -0,0 +1,2 @@ +go test fuzz v1 +string("00B") diff --git a/internal/units/testdata/fuzz/Fuzz_Compare/fee633b7369dbe46 b/internal/units/testdata/fuzz/Fuzz_Compare/fee633b7369dbe46 new file mode 100644 index 0000000000..dc08befa9a --- /dev/null +++ b/internal/units/testdata/fuzz/Fuzz_Compare/fee633b7369dbe46 @@ -0,0 +1,2 @@ +go test fuzz v1 +string("00B0B") diff --git a/internal/units/testdata/fuzz/Fuzz_UnmarshalText/e9e3ffbe3b3a072c b/internal/units/testdata/fuzz/Fuzz_UnmarshalText/e9e3ffbe3b3a072c new file mode 100644 index 0000000000..9d84267315 --- /dev/null +++ b/internal/units/testdata/fuzz/Fuzz_UnmarshalText/e9e3ffbe3b3a072c @@ -0,0 +1,2 @@ +go test fuzz v1 +string("A") diff --git a/internal/units/units.go b/internal/units/units.go new file mode 100644 index 0000000000..5a41b1c427 --- /dev/null +++ b/internal/units/units.go @@ -0,0 +1,214 @@ +// Package units provides functionality for parsing and displaying multiples of +// bytes. +package units + +import ( + "encoding" + "errors" + "fmt" + "strconv" +) + +var ( + // ErrInvalidSyntax is returned when a byte string cannot be parsed. + ErrInvalidSyntax = errors.New("invalid syntax") + + // ErrOverflow is returned when a byte string is too large to be represented. + ErrOverflow = errors.New("byte size overflows int64") +) + +type Bytes int64 + +var ( + _ encoding.TextUnmarshaler = (*Bytes)(nil) + _ encoding.TextMarshaler = Bytes(0) +) + +const ( + Byte Bytes = 1 + + Kilobyte = 1000 * Byte + Megabyte = 1000 * Kilobyte + Gigabyte = 1000 * Megabyte + Terabyte = 1000 * Gigabyte + Petabyte = 1000 * Terabyte + Exabyte = 1000 * Petabyte + + Kibibyte = 1024 * Byte + Mebibyte = 1024 * Kibibyte + Gibibyte = 1024 * Mebibyte + Tebibyte = 1024 * Gibibyte + Pebibyte = 1024 * Tebibyte + Exbibyte = 1024 * Pebibyte +) + +var unitMap = map[string]Bytes{ + "": Byte, + "b": Byte, + "B": Byte, + "kB": Kilobyte, + "KB": Kilobyte, + "MB": Megabyte, + "GB": Gigabyte, + "TB": Terabyte, + "PB": Petabyte, + "EB": Exabyte, + + "KiB": Kibibyte, + "MiB": Mebibyte, + "GiB": Gibibyte, + "TiB": Tebibyte, + "PiB": Pebibyte, + "EiB": Exbibyte, +} + +// UnmarshalText parses a byte size from a string. Byte sizes are represented +// as sequences of number and unit pairs with no whitespace. Units are +// represented either as IEC units (KiB, MiB, GiB, etc) or metric units (kB or +// KB, MB, GB, etc). +// +// Multiple sequences of byte sizes can be provided in a single string, such as +// "4MB2KB". The sum of across all sequences is returned. +func (b *Bytes) UnmarshalText(text []byte) error { + if len(text) == 0 { + return ErrInvalidSyntax + } + + // Byte offset while scanning through input. + s := newScanner(string(text)) + + // Parse optional leading sign. + sign := 1 + switch s.Peek() { + case '-': + sign = -1 + s.Scan() // Advance scanner + case '+': + sign = 1 // This is redundant, but added for clarity. + s.Scan() // Advance scanner + } + + var sum Bytes + + for s.Next() { + // Find digit components. + numberText, err := scanNumberString(s) + if err != nil { + return err + } + number, err := strconv.ParseInt(numberText, 10, 64) + if err != nil { + return ErrInvalidSyntax + } + + unit, ok := unitMap[scanUnitString(s)] + if !ok { + return ErrInvalidSyntax + } + + newBytes := Bytes(number * int64(unit)) + if newBytes/unit != Bytes(number) { + return ErrOverflow + } else if sum+newBytes < sum { + return ErrOverflow + } + + sum += newBytes + } + + *b = Bytes(sign) * sum + return nil +} + +func scanNumberString(s *scanner) (string, error) { + var str string + + for s.Next() { + ch := s.Peek() + + if '0' <= ch && ch <= '9' { + str += string(ch) + _ = s.Scan() // Advance the scanner. + continue + } + + break + } + + if len(str) == 0 { + return "", ErrInvalidSyntax + } + return str, nil +} + +func scanUnitString(s *scanner) string { + var str string + + // Scan until a non-number character. + for s.Next() { + ch := s.Peek() + if ch < '0' || ch > '9' { + str += string(ch) + _ = s.Scan() // Advance the scanner. + continue + } + + break + } + + return str +} + +// MarshalText returns the string representation of b. See [Bytes.String] for +// more information. +func (b Bytes) MarshalText() ([]byte, error) { + return []byte(b.String()), nil +} + +// String returns a string representing the bytes in human-readable form. Bytes +// are returned in the highest possible unit that retains accuracy. If b is a +// multiple of 1024, the IEC binary prefixes are used (KiB, MiB, GiB, etc). +// Otherwise, the SI decimal prefixes are used (kB, MB, GB, etc). +// +// Byte sizes are always displayed as whole numbers, and are represented in the +// highest possible prefix that preserves precision. For example, 1024 bytes +// would be represented as "1KiB", while 1025 bytes would be represented as +// "1025". +func (b Bytes) String() string { + if b == 0 { + return "0" + } + + var metricSuffixes = []string{"", "kB", "MB", "GB", "TB", "PB", "EB"} + var iecSuffixes = []string{"", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB"} + + var suffixOffset int + + isMetric := b%1000 == 0 + + switch { + case isMetric: + for b%1000 == 0 && suffixOffset <= len(metricSuffixes)-1 { + // Divide by 1000, increase suffix offset. + b /= 1000 + suffixOffset++ + } + + if suffixOffset == 0 { + return fmt.Sprintf("%d", b) + } + return fmt.Sprintf("%d%s", b, metricSuffixes[suffixOffset]) + + default: + for b%1024 == 0 && suffixOffset < len(iecSuffixes)-1 { + // Divide by 1024, increase suffix offset. + b /= 1024 + suffixOffset++ + } + + if suffixOffset == 0 { + return fmt.Sprintf("%d", b) + } + return fmt.Sprintf("%d%s", b, iecSuffixes[suffixOffset]) + } +} diff --git a/internal/units/units_test.go b/internal/units/units_test.go new file mode 100644 index 0000000000..b41d6f85a3 --- /dev/null +++ b/internal/units/units_test.go @@ -0,0 +1,202 @@ +package units_test + +import ( + "errors" + "strings" + "testing" + + oldunits "github.com/alecthomas/units" + "github.com/grafana/alloy/internal/units" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +var testCases = []struct { + text string + bytes units.Bytes +}{ + {"0", 0}, + {"512", 512}, + + {"1kB", 1 * units.Kilobyte}, + {"2MB", 2 * units.Megabyte}, + {"3GB", 3 * units.Gigabyte}, + {"4TB", 4 * units.Terabyte}, + {"5PB", 5 * units.Petabyte}, + {"6EB", 6 * units.Exabyte}, + + {"7KiB", 7 * units.Kibibyte}, + {"8MiB", 8 * units.Mebibyte}, + {"9GiB", 9 * units.Gibibyte}, + {"10TiB", 10 * units.Tebibyte}, + {"11PiB", 11 * units.Pebibyte}, + {"7EiB", 7 * units.Exbibyte}, // 7EiB is the highest we can go without overflowing int64 +} + +// Fuzz_UnmarshalText ensures that nothing panics when calling UnmarshalText. +func Fuzz_UnmarshalText(f *testing.F) { + f.Add("0") + f.Add("512") + f.Add("7KiB") + f.Add("8MiB") + f.Add("9GiB") + f.Add("10TiB") + f.Add("11PiB") + f.Add("7EiB") + f.Add("-3MiB") + f.Add("3MiB4KiB5B") + + f.Fuzz(func(t *testing.T, text string) { + var bytes units.Bytes + _ = bytes.UnmarshalText([]byte(text)) + }) +} + +// Fuzz_String ensures that nothing panics when calling String. +func Fuzz_String(f *testing.F) { + for _, tc := range testCases { + f.Add(int64(tc.bytes)) + } + + f.Fuzz(func(t *testing.T, bytes int64) { + _ = units.Bytes(bytes).String() + }) +} + +// Fuzz_Compare ensures that anything parseable by oldunits.UnmarshalText can be parsed by units.UnmarshalText. +func Fuzz_Compare(f *testing.F) { + for _, tc := range testCases { + f.Add(tc.text) + } + + // Ensure that spaces are not permitted in oldunits, since they're not + // permitted here either. + f.Add("4 MiB") + f.Add("4MiB 2KiB") + + f.Fuzz(func(t *testing.T, text string) { + if strings.Contains(text, ".") { + t.Skip() + } + + var oldType oldunits.Base2Bytes + if err := oldType.UnmarshalText([]byte(text)); err != nil { + t.Skip() + } + + var newType units.Bytes + err := newType.UnmarshalText([]byte(text)) + if errors.Is(err, units.ErrOverflow) { + t.Skip() + } + + require.NoError(t, err, "%q should not have failed to parse", text) + }) +} + +func TestBytes_UnmarshalText(t *testing.T) { + for _, tc := range testCases { + expect := tc.bytes + + var actual units.Bytes + if assert.NoError(t, actual.UnmarshalText([]byte(tc.text)), "Parsing %q should not have failed", tc.text) { + assert.Equal(t, tc.bytes, actual, "Parsing %q should have resulted in %d, but got %d", tc.text, expect, actual) + } + } + + t.Run("Multiple units", func(t *testing.T) { + in := "4MB3kB100B" + expect := (4 * units.Megabyte) + (3 * units.Kilobyte) + (100 * units.Byte) + + var actual units.Bytes + require.NoError(t, actual.UnmarshalText([]byte(in)), "%q should not have failed to parse", in) + require.Equal(t, expect, actual, "%q did not parse correctly", in) + }) + + t.Run("KB and kB are equivalent", func(t *testing.T) { + strings := []string{"15KB", "15kB"} + expect := 15 * units.Kilobyte + + for _, str := range strings { + var actual units.Bytes + if assert.NoError(t, actual.UnmarshalText([]byte(str)), "Parsing %q should not have failed", str) { + assert.Equal(t, expect, actual, "Parsing %q should have resulted in %d, but got %d", str, expect, actual) + } + } + }) + + t.Run("Leading positive sign", func(t *testing.T) { + for _, tc := range testCases { + text := "+" + tc.text + expect := tc.bytes + + var actual units.Bytes + if assert.NoError(t, actual.UnmarshalText([]byte(text)), "Parsing %q should not have failed", text) { + assert.Equal(t, tc.bytes, actual, "Parsing %q should have resulted in %d, but got %d", text, expect, actual) + } + } + }) + + t.Run("Leading negative sign", func(t *testing.T) { + for _, tc := range testCases { + text := "-" + tc.text + expect := tc.bytes + + var actual units.Bytes + if assert.NoError(t, actual.UnmarshalText([]byte(text)), "Parsing %q should not have failed", text) { + assert.Equal(t, -tc.bytes, actual, "Parsing %q should have resulted in %d, but got %d", text, -expect, actual) + } + } + }) + + t.Run("Detect overflows", func(t *testing.T) { + var actual units.Bytes + err := actual.UnmarshalText([]byte("10000EB")) + require.ErrorIs(t, err, units.ErrOverflow, "Parsing 10000EB should have resulted in an overflow error") + }) + + t.Run("Detect overflows of sequence", func(t *testing.T) { + var actual units.Bytes + err := actual.UnmarshalText([]byte("7EB5EB")) + require.ErrorIs(t, err, units.ErrOverflow, "Parsing 7EB5EB should have resulted in an overflow error") + }) +} + +func splitNumberSuffix(in string) (string, string) { + suffixOffSet := len(in) + +Loop: + for i := 0; i < len(in); i++ { + switch { + case i == 0 && (in[i] == '-' || in[i] == '+'): + suffixOffSet = i + 1 + case '0' <= in[i] && in[i] <= '9': + suffixOffSet = i + 1 + default: + break Loop + } + } + + return in[:suffixOffSet], in[suffixOffSet:] +} + +func TestBytes_String(t *testing.T) { + for _, tc := range testCases { + expect := tc.text + actual := tc.bytes.String() + assert.Equal(t, expect, actual, "String() should have returned %q, but got %q", expect, actual) + } + + t.Run("Negative numbers", func(t *testing.T) { + for _, tc := range testCases { + // Ignore 0, as -0 gets simplified to 0. + if tc.text == "0" { + continue + } + + expect := "-" + tc.text + actual := (-tc.bytes).String() + assert.Equal(t, expect, actual, "String() should have returned %q, but got %q", expect, actual) + } + }) +}