Skip to content

Commit

Permalink
tweak name and docs
Browse files Browse the repository at this point in the history
  • Loading branch information
Yu-zh committed Jan 9, 2025
1 parent 426a9a2 commit 9f0abfa
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 89 deletions.
134 changes: 64 additions & 70 deletions builtin/charsview.mbt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2024 International Digital Economy Academy
// Copyright 2025 International Digital Economy Academy
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -13,48 +13,41 @@
// limitations under the License.

///|
/// A `CharsView` represents a slice of a String, storing UTF-16 character indices.
///
/// # Fields
///
/// - `str`: The source String being viewed
/// - `start`: Starting UTF-16 code unit index into the string
/// - `end`: Ending UTF-16 code unit index into the string (not included)
/// - `len`: Number of Unicode codepoints (characters) in this view
///
/// Note that `start` and `end` are UTF-16 code unit indices, while `len`
/// counts actual Unicode codepoints which may span multiple UTF-16 code units.
struct CharsView {
/// A `StringView` represents a view of a String that maintains proper Unicode
/// character boundaries. It allows safe access to a substring while correctly
/// handling characters that span multiple UTF-16 code units (surrogate pairs).
struct StringView {
// # Fields
//
// Note that `start` and `end` are UTF-16 code unit indices, while `len`
// counts actual Unicode codepoints which may span multiple UTF-16 code units.
//
// `str`: The source String being viewed. The view refers to this string
// through the `start`/`end` indices below.
str : String
// `start`: Starting UTF-16 code unit index into `str` (included).
start : Int
// `end`: Ending UTF-16 code unit index into `str` (not included).
end : Int
// `len`: Number of Unicode codepoints (characters) in this view. This is a
// character count, not a code unit count, so it can be smaller than
// `end - start` when the view contains surrogate pairs.
len : Int
}

///|
/// A `CharOffset` represents an offset into a string based on UTF-16 code units.
/// A `StringIndex` represents a Unicode-aware position within a string.
///
/// This is an opaque type that represents positions within a string based on
/// raw UTF-16 code unit offsets. This ensures that string slicing operations
/// work with proper character boundaries and do not split surrogate pairs.
///
/// The offset counts the number of UTF-16 code units from the start of the
/// string. This makes it safe to use for string indexing operations since it
/// will always align with character boundaries.
///
/// This type is intentionally opaque - it can only be constructed through the
/// `CharOffset::offset_by(str: String, char_offset: Int) -> CharOffset` method
/// and cannot be manipulated directly as an integer.
type CharOffset Int derive(Show, Eq)
/// This type is intentionally opaque - it is recommended to use the
/// `StringIndex::index_at(str: String, char_index: Int) -> StringIndex` method
/// to construct a `StringIndex`.
type StringIndex Int derive(Show, Eq)

///|
/// Returns the UTF-16 code unit index of the character at the given offset.
/// Returns `None` if the offset is beyond the end of the string.
fn offset_by(str : String, char_offset : Int, start~ : Int = 0) -> Int? {
///|
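/// Returns the UTF-16 code unit index after skipping `char_index` Unicode
/// characters, starting from the given `start` position. Returns `None` if
/// fewer than `char_index` characters remain or if the resulting index is at
/// or beyond the end of the string.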
fn index_at(str : String, char_index : Int, start~ : Int = 0) -> Int? {
let str_len = str.length()
let mut utf16_offset = start
let mut char_count = 0
while utf16_offset < str_len && char_count < char_offset {
while utf16_offset < str_len && char_count < char_index {
let c1 = str[utf16_offset]
// check if this is a surrogate pair
if is_leading_surrogate(c1) && utf16_offset + 1 < str_len {
Expand All @@ -72,82 +65,79 @@ fn offset_by(str : String, char_offset : Int, start~ : Int = 0) -> Int? {
// Return None if:
// 1. We couldn't reach the requested character offset
// 2. The resulting offset is beyond the end of the string
if char_count < char_offset || utf16_offset >= str_len {
if char_count < char_index || utf16_offset >= str_len {
None
} else {
Some(utf16_offset)
}
}

///|
/// Returns a `CharOffset` representing the position after skipping `char_offset` characters.
///
/// This method counts Unicode characters (codepoints) from the start of the string and returns
/// a `CharOffset` representing that position. Returns `None` if the offset would go beyond the
/// end of the string.
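/// Returns a `StringIndex` representing the position after skipping `char_index`
/// Unicode characters, counted from the start of the string. Returns `None` if
/// that position is at or beyond the end of the string.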
///
/// # Examples
///
/// ```
/// let str = "🤣🤣🤣"
/// let offset = CharOffset::offset_by(str, 1) // Skip 1 character
/// inspect!(offset, content="Some(CharOffset(2))") // Points to second emoji
/// let offset = StringIndex::index_at(str, 1) // Skip 1 character
/// inspect!(offset, content="Some(StringIndex(2))") // Points to second emoji
///
/// let offset = CharOffset::offset_by(str, 2) // Skip 2 characters
/// inspect!(offset, content="Some(CharOffset(4))") // Points to third emoji
/// let offset = StringIndex::index_at(str, 2) // Skip 2 characters
/// inspect!(offset, content="Some(StringIndex(4))") // Points to third emoji
///
/// let offset = CharOffset::offset_by(str, 3) // Skip 3 characters
/// let offset = StringIndex::index_at(str, 3) // Skip 3 characters
/// inspect!(offset, content="None") // Beyond end of string
/// ```
///
pub fn CharOffset::offset_by(str : String, char_offset : Int) -> CharOffset? {
match offset_by(str, char_offset, start=0) {
pub fn StringIndex::index_at(str : String, char_index : Int) -> StringIndex? {
// Delegate to the private `index_at` helper, always counting characters from
// the beginning of `str` (UTF-16 code unit index 0).
match index_at(str, char_index, start=0) {
// The helper yields a raw `Int?`; re-wrapping each arm gives the result the
// declared `StringIndex?` type.
// NOTE(review): presumably relies on Int-to-StringIndex newtype conversion — confirm.
Some(utf16_offset) => Some(utf16_offset)
None => None
}
}

///|
/// Creates a `CharOffset` from an integer.
/// Creates a `StringIndex` from an integer.
///
/// This is an unsafe operation that assumes the integer is a valid UTF-16
/// code unit index. It does not perform any validation or bounds checking.
pub fn CharOffset::unsafe_from_int(i : Int) -> CharOffset {
pub fn StringIndex::unsafe_from_int(i : Int) -> StringIndex {
// The integer is returned unchanged: nothing here checks that `i` is within
// any string's bounds or that it falls on a character boundary.
i
}

///|
pub fn length(self : CharsView) -> Int {
///|
/// Returns the number of Unicode characters in this view.
pub fn length(self : StringView) -> Int {
// Returns the stored `len` field directly; the underlying string is not
// rescanned on each call.
self.len
}

///|
/// Creates a `CharsView` into a `String`.
/// Creates a `StringView` into a `String`.
///
/// # Example
///
/// ```
/// let str = "Hello🤣🤣🤣"
/// guard let Some(start) = CharOffset::offset_by(str, 1)
/// guard let Some(end) = CharOffset::offset_by(str, 6)
/// guard let Some(start) = StringIndex::index_at(str, 1)
/// guard let Some(end) = StringIndex::index_at(str, 6)
/// let view = str[start:end]
/// inspect!(view, content=
/// #|"ello🤣"
/// )
/// ```
pub fn op_as_view(
self : String,
start~ : CharOffset = 0,
end? : CharOffset
) -> CharsView {
start~ : StringIndex = 0,
end? : StringIndex
) -> StringView {
let str_len = self.length()
let start = start._
let end = match end {
Some(e) => e._
None => str_len
}
guard start >= 0 && start <= end && end <= str_len else {
abort("Invalid index for CharsView")
abort("Invalid index for StringView")
}
let mut len = 0
for index = start; index < end; index = index + 1 {
Expand All @@ -167,37 +157,41 @@ pub fn op_as_view(
}

///|
/// Creates a new `CharsView` from an existing `CharsView`.
/// Creates a new `StringView` from an existing `StringView`.
///
/// # Example
///
/// ```
/// let str = "Hello🤣🤣🤣"
/// guard let Some(start) = CharOffset::offset_by(str, 1)
/// guard let Some(end) = CharOffset::offset_by(str, 7)
/// guard let Some(start) = StringIndex::index_at(str, 1)
/// guard let Some(end) = StringIndex::index_at(str, 7)
/// let view = str[start:end]
/// let view2 = view[1:5]
/// inspect!(view2, content=
/// #|"llo🤣"
/// )
/// ```
pub fn op_as_view(self : CharsView, start~ : Int = 0, end? : Int) -> CharsView {
pub fn op_as_view(
self : StringView,
start~ : Int = 0,
end? : Int
) -> StringView {
let cv_len = self.len
match end {
Some(end) => {
guard start >= 0 && start <= end && end <= cv_len else {
abort("Invalid index for CharsView")
abort("Invalid index for StringView")
}
guard let Some(start) = offset_by(self.str, start, start=self.start)
// TODO: provide offset_by_rev or offset_by2 to avoid repeatedly iterate the string
guard let Some(end) = offset_by(self.str, end, start=self.start)
guard let Some(start) = index_at(self.str, start, start=self.start)
// TODO: provide index_at_rev or index_at2 to avoid repeatedly iterating over the string
guard let Some(end) = index_at(self.str, end, start=self.start)
{ str: self.str, start, end, len: end - start }
}
None => {
guard start >= 0 && start <= cv_len else {
abort("Invalid index for CharsView")
abort("Invalid index for StringView")
}
guard let Some(start) = offset_by(self.str, start, start=self.start)
guard let Some(start) = index_at(self.str, start, start=self.start)
{ str: self.str, start, end: self.end, len: cv_len - start }
}
}
Expand All @@ -212,13 +206,13 @@ pub fn op_as_view(self : CharsView, start~ : Int = 0, end? : Int) -> CharsView {
///
/// ```
/// let str = "Hello🤣🤣🤣"
/// guard let Some(start) = CharOffset::offset_by(str, 1)
/// guard let Some(end) = CharOffset::offset_by(str, 6)
/// guard let Some(start) = StringIndex::index_at(str, 1)
/// guard let Some(end) = StringIndex::index_at(str, 6)
/// let view = str[start:end]
/// inspect!(view[0], content="'e'")
/// inspect!(view[4], content="'🤣'")
/// ```
pub fn op_get(self : CharsView, index : Int) -> Char {
pub fn op_get(self : StringView, index : Int) -> Char {
guard index >= 0 && index < self.len else {
abort(
"index out of bounds: the len is from 0 to \{self.len} but the index is \{index}",
Expand Down Expand Up @@ -253,7 +247,7 @@ pub fn op_get(self : CharsView, index : Int) -> Char {
}

///|
pub impl Show for CharsView with output(self, logger) {
pub impl Show for StringView with output(self, logger) {
// Extract the viewed range from the source string using the view's
// `start`/`end` bounds, then delegate to `String::output` so the view is
// written to `logger` the same way a String would be.
let substr = self.str.substring(start=self.start, end=self.end)
String::output(substr, logger)
}
38 changes: 19 additions & 19 deletions builtin/charsview_test.mbt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2024 International Digital Economy Academy
// Copyright 2025 International Digital Economy Academy
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -12,28 +12,28 @@
// See the License for the specific language governing permissions and
// limitations under the License.

test "offset_by" {
test "index_at" {
let str = "Hello"
let offset = CharOffset::offset_by(str, 3)
inspect!(offset, content="Some(CharOffset(3))")
let offset = CharOffset::offset_by(str, 5)
let offset = StringIndex::index_at(str, 3)
inspect!(offset, content="Some(StringIndex(3))")
let offset = StringIndex::index_at(str, 5)
inspect!(offset, content="None")
}

test "offset_by with surrogate pairs" {
test "index_at with surrogate pairs" {
let str = "🤣🤣🤣"
let offset = CharOffset::offset_by(str, 1)
inspect!(offset, content="Some(CharOffset(2))")
let offset = CharOffset::offset_by(str, 2)
inspect!(offset, content="Some(CharOffset(4))")
let offset = CharOffset::offset_by(str, 3)
let offset = StringIndex::index_at(str, 1)
inspect!(offset, content="Some(StringIndex(2))")
let offset = StringIndex::index_at(str, 2)
inspect!(offset, content="Some(StringIndex(4))")
let offset = StringIndex::index_at(str, 3)
inspect!(offset, content="None")
}

test "charsview basic" {
let str = "Hello🤣🤣🤣"
guard let Some(start) = CharOffset::offset_by(str, 1)
guard let Some(end) = CharOffset::offset_by(str, 6)
guard let Some(start) = StringIndex::index_at(str, 1)
guard let Some(end) = StringIndex::index_at(str, 6)
inspect!(
str[start:],
content=
Expand Down Expand Up @@ -62,8 +62,8 @@ test "charsview basic" {

test "charsview basic2" {
let str = "He🤣🤣🤣llo"
guard let Some(start) = CharOffset::offset_by(str, 1)
guard let Some(end) = CharOffset::offset_by(str, 7)
guard let Some(start) = StringIndex::index_at(str, 1)
guard let Some(end) = StringIndex::index_at(str, 7)
inspect!(
str[start:],
content=
Expand Down Expand Up @@ -92,8 +92,8 @@ test "charsview basic2" {

test "charsview subview" {
let str = "Hello🤣🤣🤣"
guard let Some(start) = CharOffset::offset_by(str, 1)
guard let Some(end) = CharOffset::offset_by(str, 6)
guard let Some(start) = StringIndex::index_at(str, 1)
guard let Some(end) = StringIndex::index_at(str, 6)
let view = str[start:end]
inspect!(
view[1:],
Expand Down Expand Up @@ -123,8 +123,8 @@ test "charsview subview" {

test "charsview op_get" {
let str = "Hello🤣🤣🤣"
guard let Some(start) = CharOffset::offset_by(str, 1)
guard let Some(end) = CharOffset::offset_by(str, 6)
guard let Some(start) = StringIndex::index_at(str, 1)
guard let Some(end) = StringIndex::index_at(str, 6)
let view = str[start:end]
inspect!(view[0], content="'e'")
inspect!(view[4], content="'🤣'")
Expand Down

0 comments on commit 9f0abfa

Please sign in to comment.