From e7bc2fbad1d9d723ec6e2d459a37b46a6e059708 Mon Sep 17 00:00:00 2001
From: Robert Fratto <robertfratto@gmail.com>
Date: Mon, 29 Apr 2024 12:14:01 -0400
Subject: [PATCH] internal/units: create replacement for
 github.com/alecthomas/units package

This commit creates a replacement for the github.com/alecthomas/units
package. A replacement is desirable because the upstream package doesn't
work very well with the Alloy syntax:

* We use the UnmarshalText implementation of Base2Bytes, which treats
  metric (MB) and IEC (MiB) suffixes as the same.
* Base2Bytes always reports values as IEC suffixes when marshalling them
  back, which can confuse users into thinking a unit conversion has
  occurred somewhere.

I think it's potentially confusing to users that setting a limit of 4MB
actually sets a limit of 4MiB, which is ~4% less than what the user
intended.

The new implementation supports parsing the same input as the old
package, including complex byte sizes such as `4MiB3KiB`, though
simplified byte sizes is preferred (`4099KiB`), and the simplified forms
are returned when marshaling back into a string.
---
 internal/units/scanner.go                     |  37 +++
 .../fuzz/Fuzz_Compare/20c62d19f67262c0        |   2 +
 .../fuzz/Fuzz_Compare/47e2335362c157c8        |   2 +
 .../fuzz/Fuzz_Compare/bb9c4a861dbff943        |   2 +
 .../fuzz/Fuzz_Compare/cb083f134466af85        |   2 +
 .../fuzz/Fuzz_Compare/fee633b7369dbe46        |   2 +
 .../fuzz/Fuzz_UnmarshalText/e9e3ffbe3b3a072c  |   2 +
 internal/units/units.go                       | 214 ++++++++++++++++++
 internal/units/units_test.go                  | 202 +++++++++++++++++
 9 files changed, 465 insertions(+)
 create mode 100644 internal/units/scanner.go
 create mode 100644 internal/units/testdata/fuzz/Fuzz_Compare/20c62d19f67262c0
 create mode 100644 internal/units/testdata/fuzz/Fuzz_Compare/47e2335362c157c8
 create mode 100644 internal/units/testdata/fuzz/Fuzz_Compare/bb9c4a861dbff943
 create mode 100644 internal/units/testdata/fuzz/Fuzz_Compare/cb083f134466af85
 create mode 100644 internal/units/testdata/fuzz/Fuzz_Compare/fee633b7369dbe46
 create mode 100644 internal/units/testdata/fuzz/Fuzz_UnmarshalText/e9e3ffbe3b3a072c
 create mode 100644 internal/units/units.go
 create mode 100644 internal/units/units_test.go

diff --git a/internal/units/scanner.go b/internal/units/scanner.go
new file mode 100644
index 0000000000..5272b58a23
--- /dev/null
+++ b/internal/units/scanner.go
@@ -0,0 +1,37 @@
+package units
+
+type scanner struct {
+	text   string
+	offset int
+}
+
+func newScanner(in string) *scanner {
+	return &scanner{text: in, offset: 0}
+}
+
+// Next returns true if there are more bytes to scan. It does not advance the scanner.
+func (s *scanner) Next() bool {
+	return s.offset < len(s.text)
+}
+
+// Scan returns the next byte and advances the scanner.
+func (s *scanner) Scan() byte {
+	ch := s.text[s.offset]
+	s.offset++
+	return ch
+}
+
+// String returns the substring up to the current offset.
+func (s *scanner) String() string {
+	return s.text[:s.offset]
+}
+
+// Peek returns the byte at the current offset without advancing the scanner.
+func (s *scanner) Peek() byte {
+	return s.text[s.offset]
+}
+
+// Rem returns the number of bytes remaining in the scanner.
+func (s *scanner) Rem() int {
+	return len(s.text) - s.offset
+}
diff --git a/internal/units/testdata/fuzz/Fuzz_Compare/20c62d19f67262c0 b/internal/units/testdata/fuzz/Fuzz_Compare/20c62d19f67262c0
new file mode 100644
index 0000000000..a00bce773e
--- /dev/null
+++ b/internal/units/testdata/fuzz/Fuzz_Compare/20c62d19f67262c0
@@ -0,0 +1,2 @@
+go test fuzz v1
+string("8EiB")
diff --git a/internal/units/testdata/fuzz/Fuzz_Compare/47e2335362c157c8 b/internal/units/testdata/fuzz/Fuzz_Compare/47e2335362c157c8
new file mode 100644
index 0000000000..e06121e2bf
--- /dev/null
+++ b/internal/units/testdata/fuzz/Fuzz_Compare/47e2335362c157c8
@@ -0,0 +1,2 @@
+go test fuzz v1
+string("10.B")
diff --git a/internal/units/testdata/fuzz/Fuzz_Compare/bb9c4a861dbff943 b/internal/units/testdata/fuzz/Fuzz_Compare/bb9c4a861dbff943
new file mode 100644
index 0000000000..30d8dab100
--- /dev/null
+++ b/internal/units/testdata/fuzz/Fuzz_Compare/bb9c4a861dbff943
@@ -0,0 +1,2 @@
+go test fuzz v1
+string("0B00B")
diff --git a/internal/units/testdata/fuzz/Fuzz_Compare/cb083f134466af85 b/internal/units/testdata/fuzz/Fuzz_Compare/cb083f134466af85
new file mode 100644
index 0000000000..9091313396
--- /dev/null
+++ b/internal/units/testdata/fuzz/Fuzz_Compare/cb083f134466af85
@@ -0,0 +1,2 @@
+go test fuzz v1
+string("00B")
diff --git a/internal/units/testdata/fuzz/Fuzz_Compare/fee633b7369dbe46 b/internal/units/testdata/fuzz/Fuzz_Compare/fee633b7369dbe46
new file mode 100644
index 0000000000..dc08befa9a
--- /dev/null
+++ b/internal/units/testdata/fuzz/Fuzz_Compare/fee633b7369dbe46
@@ -0,0 +1,2 @@
+go test fuzz v1
+string("00B0B")
diff --git a/internal/units/testdata/fuzz/Fuzz_UnmarshalText/e9e3ffbe3b3a072c b/internal/units/testdata/fuzz/Fuzz_UnmarshalText/e9e3ffbe3b3a072c
new file mode 100644
index 0000000000..9d84267315
--- /dev/null
+++ b/internal/units/testdata/fuzz/Fuzz_UnmarshalText/e9e3ffbe3b3a072c
@@ -0,0 +1,2 @@
+go test fuzz v1
+string("A")
diff --git a/internal/units/units.go b/internal/units/units.go
new file mode 100644
index 0000000000..5a41b1c427
--- /dev/null
+++ b/internal/units/units.go
@@ -0,0 +1,214 @@
+// Package units provides functionality for parsing and displaying multiples of
+// bytes.
+package units
+
+import (
+	"encoding"
+	"errors"
+	"fmt"
+	"strconv"
+)
+
+var (
+	// ErrInvalidSyntax is returned when a byte string cannot be parsed.
+	ErrInvalidSyntax = errors.New("invalid syntax")
+
+	// ErrOverflow is returned when a byte string is too large to be represented.
+	ErrOverflow = errors.New("byte size overflows int64")
+)
+
+type Bytes int64
+
+var (
+	_ encoding.TextUnmarshaler = (*Bytes)(nil)
+	_ encoding.TextMarshaler   = Bytes(0)
+)
+
+const (
+	Byte Bytes = 1
+
+	Kilobyte = 1000 * Byte
+	Megabyte = 1000 * Kilobyte
+	Gigabyte = 1000 * Megabyte
+	Terabyte = 1000 * Gigabyte
+	Petabyte = 1000 * Terabyte
+	Exabyte  = 1000 * Petabyte
+
+	Kibibyte = 1024 * Byte
+	Mebibyte = 1024 * Kibibyte
+	Gibibyte = 1024 * Mebibyte
+	Tebibyte = 1024 * Gibibyte
+	Pebibyte = 1024 * Tebibyte
+	Exbibyte = 1024 * Pebibyte
+)
+
+var unitMap = map[string]Bytes{
+	"":   Byte,
+	"b":  Byte,
+	"B":  Byte,
+	"kB": Kilobyte,
+	"KB": Kilobyte,
+	"MB": Megabyte,
+	"GB": Gigabyte,
+	"TB": Terabyte,
+	"PB": Petabyte,
+	"EB": Exabyte,
+
+	"KiB": Kibibyte,
+	"MiB": Mebibyte,
+	"GiB": Gibibyte,
+	"TiB": Tebibyte,
+	"PiB": Pebibyte,
+	"EiB": Exbibyte,
+}
+
+// UnmarshalText parses a byte size from a string. Byte sizes are represented
+// as sequences of number and unit pairs with no whitespace. Units are
+// represented either as IEC units (KiB, MiB, GiB, etc) or metric units (kB or
+// KB, MB, GB, etc).
+//
+// Multiple sequences of byte sizes can be provided in a single string, such as
+// "4MB2KB". The sum of across all sequences is returned.
+func (b *Bytes) UnmarshalText(text []byte) error {
+	if len(text) == 0 {
+		return ErrInvalidSyntax
+	}
+
+	// Byte offset while scanning through input.
+	s := newScanner(string(text))
+
+	// Parse optional leading sign.
+	sign := 1
+	switch s.Peek() {
+	case '-':
+		sign = -1
+		s.Scan() // Advance scanner
+	case '+':
+		sign = 1 // This is redundant, but added for clarity.
+		s.Scan() // Advance scanner
+	}
+
+	var sum Bytes
+
+	for s.Next() {
+		// Find digit components.
+		numberText, err := scanNumberString(s)
+		if err != nil {
+			return err
+		}
+		number, err := strconv.ParseInt(numberText, 10, 64)
+		if err != nil {
+			return ErrInvalidSyntax
+		}
+
+		unit, ok := unitMap[scanUnitString(s)]
+		if !ok {
+			return ErrInvalidSyntax
+		}
+
+		newBytes := Bytes(number * int64(unit))
+		if newBytes/unit != Bytes(number) {
+			return ErrOverflow
+		} else if sum+newBytes < sum {
+			return ErrOverflow
+		}
+
+		sum += newBytes
+	}
+
+	*b = Bytes(sign) * sum
+	return nil
+}
+
+func scanNumberString(s *scanner) (string, error) {
+	var str string
+
+	for s.Next() {
+		ch := s.Peek()
+
+		if '0' <= ch && ch <= '9' {
+			str += string(ch)
+			_ = s.Scan() // Advance the scanner.
+			continue
+		}
+
+		break
+	}
+
+	if len(str) == 0 {
+		return "", ErrInvalidSyntax
+	}
+	return str, nil
+}
+
+func scanUnitString(s *scanner) string {
+	var str string
+
+	// Scan until a non-number character.
+	for s.Next() {
+		ch := s.Peek()
+		if ch < '0' || ch > '9' {
+			str += string(ch)
+			_ = s.Scan() // Advance the scanner.
+			continue
+		}
+
+		break
+	}
+
+	return str
+}
+
+// MarshalText returns the string representation of b. See [Bytes.String] for
+// more information.
+func (b Bytes) MarshalText() ([]byte, error) {
+	return []byte(b.String()), nil
+}
+
+// String returns a string representing the bytes in human-readable form. Bytes
+// are returned in the highest possible unit that retains accuracy. If b is a
+// multiple of 1024, the IEC binary prefixes are used (KiB, MiB, GiB, etc).
+// Otherwise, the SI decimal prefixes are used (kB, MB, GB, etc).
+//
+// Byte sizes are always displayed as whole numbers, and are represented in the
+// highest possible prefix that preserves precision. For example, 1024 bytes
+// would be represented as "1KiB", while 1025 bytes would be represented as
+// "1025".
+func (b Bytes) String() string {
+	if b == 0 {
+		return "0"
+	}
+
+	var metricSuffixes = []string{"", "kB", "MB", "GB", "TB", "PB", "EB"}
+	var iecSuffixes = []string{"", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB"}
+
+	var suffixOffset int
+
+	isMetric := b%1000 == 0
+
+	switch {
+	case isMetric:
+		for b%1000 == 0 && suffixOffset <= len(metricSuffixes)-1 {
+			// Divide by 1000, increase suffix offset.
+			b /= 1000
+			suffixOffset++
+		}
+
+		if suffixOffset == 0 {
+			return fmt.Sprintf("%d", b)
+		}
+		return fmt.Sprintf("%d%s", b, metricSuffixes[suffixOffset])
+
+	default:
+		for b%1024 == 0 && suffixOffset < len(iecSuffixes)-1 {
+			// Divide by 1024, increase suffix offset.
+			b /= 1024
+			suffixOffset++
+		}
+
+		if suffixOffset == 0 {
+			return fmt.Sprintf("%d", b)
+		}
+		return fmt.Sprintf("%d%s", b, iecSuffixes[suffixOffset])
+	}
+}
diff --git a/internal/units/units_test.go b/internal/units/units_test.go
new file mode 100644
index 0000000000..b41d6f85a3
--- /dev/null
+++ b/internal/units/units_test.go
@@ -0,0 +1,202 @@
+package units_test
+
+import (
+	"errors"
+	"strings"
+	"testing"
+
+	oldunits "github.com/alecthomas/units"
+	"github.com/grafana/alloy/internal/units"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+var testCases = []struct {
+	text  string
+	bytes units.Bytes
+}{
+	{"0", 0},
+	{"512", 512},
+
+	{"1kB", 1 * units.Kilobyte},
+	{"2MB", 2 * units.Megabyte},
+	{"3GB", 3 * units.Gigabyte},
+	{"4TB", 4 * units.Terabyte},
+	{"5PB", 5 * units.Petabyte},
+	{"6EB", 6 * units.Exabyte},
+
+	{"7KiB", 7 * units.Kibibyte},
+	{"8MiB", 8 * units.Mebibyte},
+	{"9GiB", 9 * units.Gibibyte},
+	{"10TiB", 10 * units.Tebibyte},
+	{"11PiB", 11 * units.Pebibyte},
+	{"7EiB", 7 * units.Exbibyte}, // 7EiB is the highest we can go without overflowing int64
+}
+
+// Fuzz_UnmarshalText ensures that nothing panics when calling UnmarshalText.
+func Fuzz_UnmarshalText(f *testing.F) {
+	f.Add("0")
+	f.Add("512")
+	f.Add("7KiB")
+	f.Add("8MiB")
+	f.Add("9GiB")
+	f.Add("10TiB")
+	f.Add("11PiB")
+	f.Add("7EiB")
+	f.Add("-3MiB")
+	f.Add("3MiB4KiB5B")
+
+	f.Fuzz(func(t *testing.T, text string) {
+		var bytes units.Bytes
+		_ = bytes.UnmarshalText([]byte(text))
+	})
+}
+
+// Fuzz_String ensures that nothing panics when calling String.
+func Fuzz_String(f *testing.F) {
+	for _, tc := range testCases {
+		f.Add(int64(tc.bytes))
+	}
+
+	f.Fuzz(func(t *testing.T, bytes int64) {
+		_ = units.Bytes(bytes).String()
+	})
+}
+
+// Fuzz_Compare ensures that anything parseable by oldunits.UnmarshalText can be parsed by units.UnmarshalText.
+func Fuzz_Compare(f *testing.F) {
+	for _, tc := range testCases {
+		f.Add(tc.text)
+	}
+
+	// Ensure that spaces are not permitted in oldunits, since they're not
+	// permitted here either.
+	f.Add("4 MiB")
+	f.Add("4MiB 2KiB")
+
+	f.Fuzz(func(t *testing.T, text string) {
+		if strings.Contains(text, ".") {
+			t.Skip()
+		}
+
+		var oldType oldunits.Base2Bytes
+		if err := oldType.UnmarshalText([]byte(text)); err != nil {
+			t.Skip()
+		}
+
+		var newType units.Bytes
+		err := newType.UnmarshalText([]byte(text))
+		if errors.Is(err, units.ErrOverflow) {
+			t.Skip()
+		}
+
+		require.NoError(t, err, "%q should not have failed to parse", text)
+	})
+}
+
+func TestBytes_UnmarshalText(t *testing.T) {
+	for _, tc := range testCases {
+		expect := tc.bytes
+
+		var actual units.Bytes
+		if assert.NoError(t, actual.UnmarshalText([]byte(tc.text)), "Parsing %q should not have failed", tc.text) {
+			assert.Equal(t, tc.bytes, actual, "Parsing %q should have resulted in %d, but got %d", tc.text, expect, actual)
+		}
+	}
+
+	t.Run("Multiple units", func(t *testing.T) {
+		in := "4MB3kB100B"
+		expect := (4 * units.Megabyte) + (3 * units.Kilobyte) + (100 * units.Byte)
+
+		var actual units.Bytes
+		require.NoError(t, actual.UnmarshalText([]byte(in)), "%q should not have failed to parse", in)
+		require.Equal(t, expect, actual, "%q did not parse correctly", in)
+	})
+
+	t.Run("KB and kB are equivalent", func(t *testing.T) {
+		strings := []string{"15KB", "15kB"}
+		expect := 15 * units.Kilobyte
+
+		for _, str := range strings {
+			var actual units.Bytes
+			if assert.NoError(t, actual.UnmarshalText([]byte(str)), "Parsing %q should not have failed", str) {
+				assert.Equal(t, expect, actual, "Parsing %q should have resulted in %d, but got %d", str, expect, actual)
+			}
+		}
+	})
+
+	t.Run("Leading positive sign", func(t *testing.T) {
+		for _, tc := range testCases {
+			text := "+" + tc.text
+			expect := tc.bytes
+
+			var actual units.Bytes
+			if assert.NoError(t, actual.UnmarshalText([]byte(text)), "Parsing %q should not have failed", text) {
+				assert.Equal(t, tc.bytes, actual, "Parsing %q should have resulted in %d, but got %d", text, expect, actual)
+			}
+		}
+	})
+
+	t.Run("Leading negative sign", func(t *testing.T) {
+		for _, tc := range testCases {
+			text := "-" + tc.text
+			expect := tc.bytes
+
+			var actual units.Bytes
+			if assert.NoError(t, actual.UnmarshalText([]byte(text)), "Parsing %q should not have failed", text) {
+				assert.Equal(t, -tc.bytes, actual, "Parsing %q should have resulted in %d, but got %d", text, -expect, actual)
+			}
+		}
+	})
+
+	t.Run("Detect overflows", func(t *testing.T) {
+		var actual units.Bytes
+		err := actual.UnmarshalText([]byte("10000EB"))
+		require.ErrorIs(t, err, units.ErrOverflow, "Parsing 10000EB should have resulted in an overflow error")
+	})
+
+	t.Run("Detect overflows of sequence", func(t *testing.T) {
+		var actual units.Bytes
+		err := actual.UnmarshalText([]byte("7EB5EB"))
+		require.ErrorIs(t, err, units.ErrOverflow, "Parsing 7EB5EB should have resulted in an overflow error")
+	})
+}
+
+func splitNumberSuffix(in string) (string, string) {
+	suffixOffSet := len(in)
+
+Loop:
+	for i := 0; i < len(in); i++ {
+		switch {
+		case i == 0 && (in[i] == '-' || in[i] == '+'):
+			suffixOffSet = i + 1
+		case '0' <= in[i] && in[i] <= '9':
+			suffixOffSet = i + 1
+		default:
+			break Loop
+		}
+	}
+
+	return in[:suffixOffSet], in[suffixOffSet:]
+}
+
+func TestBytes_String(t *testing.T) {
+	for _, tc := range testCases {
+		expect := tc.text
+		actual := tc.bytes.String()
+		assert.Equal(t, expect, actual, "String() should have returned %q, but got %q", expect, actual)
+	}
+
+	t.Run("Negative numbers", func(t *testing.T) {
+		for _, tc := range testCases {
+			// Ignore 0, as -0 gets simplified to 0.
+			if tc.text == "0" {
+				continue
+			}
+
+			expect := "-" + tc.text
+			actual := (-tc.bytes).String()
+			assert.Equal(t, expect, actual, "String() should have returned %q, but got %q", expect, actual)
+		}
+	})
+}