Skip to content

Commit

Permalink
internal/units: create replacement for github.com/alecthomas/units pa…
Browse files Browse the repository at this point in the history
…ckage

This commit creates a replacement for the github.com/alecthomas/units
package. A replacement is desirable because the upstream package doesn't
work very well with the Alloy syntax:

* We use the UnmarshalText implementation of Base2Bytes, which treats
  metric (MB) and IEC (MiB) suffixes as the same.
* Base2Bytes always reports values as IEC suffixes when marshalling them
  back, which can confuse users into thinking a unit conversion has
  occurred somewhere.

I think it's potentially confusing to users that setting a limit of 4MB
actually sets a limit of 4MiB, which is ~4% less than what the user
intended.

The new implementation supports parsing the same input as the old
package, including complex byte sizes such as `4MiB3KiB`, though
simplified byte sizes are preferred (`4099KiB`), and the simplified forms
are returned when marshaling back into a string.
  • Loading branch information
rfratto committed Apr 29, 2024
1 parent fafbaa3 commit e7bc2fb
Show file tree
Hide file tree
Showing 9 changed files with 465 additions and 0 deletions.
37 changes: 37 additions & 0 deletions internal/units/scanner.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package units

// scanner is a forward-only, byte-oriented cursor over a string.
type scanner struct {
	text   string // Full input being scanned.
	offset int    // Index into text of the next unread byte.
}

// newScanner returns a scanner positioned at the first byte of in.
func newScanner(in string) *scanner {
	sc := new(scanner)
	sc.text = in
	return sc
}

// Next reports whether at least one unread byte remains. The scanner is not
// advanced.
func (s *scanner) Next() bool {
	return s.Rem() > 0
}

// Scan consumes and returns the next unread byte. It panics if no bytes
// remain, so callers must check Next first.
func (s *scanner) Scan() byte {
	ch := s.Peek()
	s.offset++
	return ch
}

// String returns the text consumed so far: everything before the current
// offset.
func (s *scanner) String() string {
	consumed := s.text[:s.offset]
	return consumed
}

// Peek returns the next unread byte without consuming it. It panics if no
// bytes remain, so callers must check Next first.
func (s *scanner) Peek() byte {
	return s.text[s.offset]
}

// Rem returns how many bytes have not been consumed yet.
func (s *scanner) Rem() int {
	total := len(s.text)
	return total - s.offset
}
2 changes: 2 additions & 0 deletions internal/units/testdata/fuzz/Fuzz_Compare/20c62d19f67262c0
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
go test fuzz v1
string("8EiB")
2 changes: 2 additions & 0 deletions internal/units/testdata/fuzz/Fuzz_Compare/47e2335362c157c8
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
go test fuzz v1
string("10.B")
2 changes: 2 additions & 0 deletions internal/units/testdata/fuzz/Fuzz_Compare/bb9c4a861dbff943
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
go test fuzz v1
string("0B00B")
2 changes: 2 additions & 0 deletions internal/units/testdata/fuzz/Fuzz_Compare/cb083f134466af85
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
go test fuzz v1
string("00B")
2 changes: 2 additions & 0 deletions internal/units/testdata/fuzz/Fuzz_Compare/fee633b7369dbe46
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
go test fuzz v1
string("00B0B")
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
go test fuzz v1
string("A")
214 changes: 214 additions & 0 deletions internal/units/units.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
// Package units provides functionality for parsing and displaying multiples of
// bytes.
package units

import (
"encoding"
"errors"
"fmt"
"strconv"
)

// Sentinel errors returned when parsing a byte size with
// [Bytes.UnmarshalText].
var (
// ErrInvalidSyntax is returned when a byte string cannot be parsed.
ErrInvalidSyntax = errors.New("invalid syntax")

// ErrOverflow is returned when a byte string is too large to be represented
// as an int64.
ErrOverflow = errors.New("byte size overflows int64")
)

// Bytes is a size expressed as a number of bytes, stored as a signed 64-bit
// integer.
type Bytes int64

// Compile-time checks that Bytes implements the encoding text interfaces:
// unmarshaling through a pointer and marshaling through a value.
var (
_ encoding.TextUnmarshaler = (*Bytes)(nil)
_ encoding.TextMarshaler = Bytes(0)
)

// Sizes of the supported units, expressed in bytes.
const (
	// Byte is the base unit; every other unit is a whole multiple of it.
	Byte Bytes = 1

	// Metric (SI) units: successive powers of 1000.
	Kilobyte Bytes = 1e3
	Megabyte Bytes = 1e6
	Gigabyte Bytes = 1e9
	Terabyte Bytes = 1e12
	Petabyte Bytes = 1e15
	Exabyte  Bytes = 1e18

	// IEC (binary) units: successive powers of 1024.
	Kibibyte Bytes = 1 << 10
	Mebibyte Bytes = 1 << 20
	Gibibyte Bytes = 1 << 30
	Tebibyte Bytes = 1 << 40
	Pebibyte Bytes = 1 << 50
	Exbibyte Bytes = 1 << 60
)

// unitMap maps each unit suffix recognized when parsing to its size in bytes.
// Keys are matched exactly, so suffixes are case-sensitive.
var unitMap = map[string]Bytes{
// A number with no suffix, or with a bare "b"/"B", is a plain byte count.
"": Byte,
"b": Byte,
"B": Byte,
// Metric units (multiples of 1000). Kilobyte is accepted as "kB" and "KB".
"kB": Kilobyte,
"KB": Kilobyte,
"MB": Megabyte,
"GB": Gigabyte,
"TB": Terabyte,
"PB": Petabyte,
"EB": Exabyte,

// IEC units (multiples of 1024).
"KiB": Kibibyte,
"MiB": Mebibyte,
"GiB": Gibibyte,
"TiB": Tebibyte,
"PiB": Pebibyte,
"EiB": Exbibyte,
}

// UnmarshalText parses a byte size from text and stores it in b. A byte size
// is an optional leading sign ('+' or '-') followed by one or more
// number/unit pairs with no whitespace, such as "4MiB" or "4MB2KB". Numbers
// are unsigned decimal integers. Units are IEC units (KiB, MiB, GiB, etc),
// metric units (kB or KB, MB, GB, etc), or plain bytes (b, B, or no unit at
// all).
//
// When multiple pairs are provided, the sum across all pairs is stored.
//
// UnmarshalText returns ErrInvalidSyntax if text is empty, consists only of a
// sign, or contains a malformed number or an unknown unit. It returns
// ErrOverflow if a number, a number/unit pair, or the running sum does not
// fit in an int64. b is left unmodified when an error is returned.
func (b *Bytes) UnmarshalText(text []byte) error {
	if len(text) == 0 {
		return ErrInvalidSyntax
	}

	s := newScanner(string(text))

	// Parse optional leading sign.
	sign := Bytes(1)
	switch s.Peek() {
	case '-':
		sign = -1
		s.Scan() // Advance past the sign.
	case '+':
		s.Scan() // Advance past the sign.
	}

	// A sign on its own is not a byte size; at least one number must follow.
	if !s.Next() {
		return ErrInvalidSyntax
	}

	// The magnitude is accumulated as a non-negative value and negated at the
	// end, so the most negative int64 cannot be parsed.
	var sum Bytes

	for s.Next() {
		numberText, err := scanNumberString(s)
		if err != nil {
			return err
		}
		number, err := strconv.ParseInt(numberText, 10, 64)
		if err != nil {
			// numberText only holds digits, so a range error means the
			// number itself is too large rather than malformed.
			if errors.Is(err, strconv.ErrRange) {
				return ErrOverflow
			}
			return ErrInvalidSyntax
		}

		unit, ok := unitMap[scanUnitString(s)]
		if !ok {
			return ErrInvalidSyntax
		}

		// Detect multiplication overflow by checking that dividing the
		// product by the unit gives back the original number.
		newBytes := Bytes(number) * unit
		if newBytes/unit != Bytes(number) {
			return ErrOverflow
		}
		// Both operands are non-negative, so a smaller result means the
		// addition wrapped around.
		if sum+newBytes < sum {
			return ErrOverflow
		}

		sum += newBytes
	}

	*b = sign * sum
	return nil
}

// scanNumberString consumes the run of ASCII digits at the scanner's current
// position and returns it. It returns ErrInvalidSyntax, without advancing the
// scanner, if the next byte is not a digit or no bytes remain.
func scanNumberString(s *scanner) (string, error) {
	start := s.offset

	for s.Next() {
		if ch := s.Peek(); ch < '0' || ch > '9' {
			break
		}
		s.Scan() // Advance past the digit.
	}

	if s.offset == start {
		return "", ErrInvalidSyntax
	}

	// Slice the digits out of the input rather than building the string one
	// byte at a time.
	return s.text[start:s.offset], nil
}

// scanUnitString consumes bytes from the scanner's current position up to,
// but not including, the next ASCII digit (or the end of input) and returns
// them unmodified. The result is empty if the next byte is a digit or no
// bytes remain.
func scanUnitString(s *scanner) string {
	start := s.offset

	// Scan until a digit is found.
	for s.Next() {
		if ch := s.Peek(); '0' <= ch && ch <= '9' {
			break
		}
		s.Scan() // Advance past the unit byte.
	}

	// Slice the unit out of the input so that the bytes are returned exactly
	// as written; converting each byte with string(ch) would re-encode
	// non-ASCII bytes as UTF-8 code points.
	return s.text[start:s.offset]
}

// MarshalText encodes b as text using the same representation as
// [Bytes.String]. The returned error is always nil.
func (b Bytes) MarshalText() ([]byte, error) {
	text := b.String()
	return []byte(text), nil
}

// String returns a string representing the bytes in human-readable form.
// Byte sizes are always displayed as whole numbers, using the largest prefix
// of the chosen family that represents b exactly:
//
//   - If b is a multiple of 1000, the SI decimal prefixes are used (kB, MB,
//     GB, etc).
//   - Otherwise, if b is a multiple of 1024, the IEC binary prefixes are used
//     (KiB, MiB, GiB, etc).
//   - Otherwise, b is displayed as a plain number of bytes with no suffix.
//
// For example, 1000 bytes is represented as "1kB", 1024 bytes as "1KiB",
// 1024000 bytes as "1024kB", and 1025 bytes as "1025". Zero is always
// represented as "0".
func (b Bytes) String() string {
	if b == 0 {
		return "0"
	}

	// The suffix at index i denotes base^i bytes; index 0 is plain bytes.
	metricSuffixes := [...]string{"", "kB", "MB", "GB", "TB", "PB", "EB"}
	iecSuffixes := [...]string{"", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB"}

	// Metric prefixes take priority when b is divisible by both bases.
	base, suffixes := Bytes(1024), iecSuffixes[:]
	if b%1000 == 0 {
		base, suffixes = 1000, metricSuffixes[:]
	}

	// Divide by the base for as long as no precision is lost, never moving
	// past the largest available suffix.
	var suffixOffset int
	for b%base == 0 && suffixOffset < len(suffixes)-1 {
		b /= base
		suffixOffset++
	}

	// Format the underlying integer; the suffix at offset 0 is empty, so
	// sizes that could not be reduced are printed as a bare number.
	return fmt.Sprintf("%d%s", int64(b), suffixes[suffixOffset])
}
Loading

0 comments on commit e7bc2fb

Please sign in to comment.