Version 11 improving results
AuracleTech committed Feb 20, 2024
1 parent 2c947cc commit 98bced2
Showing 15 changed files with 329 additions and 608 deletions.
24 changes: 17 additions & 7 deletions CHANGELOG.md
@@ -2,16 +2,26 @@

## [9.0.1] - 2024-02-07

-### Update
-
-- Updated dependencies to latest versions.
-
-### Performance
-
-- Attempt to improve performance by turning the Tokenizer into an iterator, performance worsened, so I reverted to the current implementation.
-
-## [9.0.0] - 2023-08-16
-
-### Changed
-
-- Users can now configure whitespace characters, replacing the previous hardcoded approach.
+- updated dependencies to latest versions
+- attempted to improve performance by turning the `Tokenizer` into an iterator; performance worsened, so I reverted to the current implementation
+
+## [9.0.0] - 2023-08-16
+
+- users can now configure whitespace characters, replacing the previous hardcoded approach
+
+## [10.0.0] - 2024-02-14
+
+- `Duo` struct replacing the `duos` macro
+- `next` method for `Tokenizer` is now `seek`
+- `SeekResult` enum for `Tokenizer::seek`
+
+## [11.0.0] - 2024-02-18
+
+- `SeekResult` nuked
+- `Tokenizer::seek` renamed to `Tokenizer::consume`
+- `Tokenizer::consume` now returns `Result<Option<Token>>`
+- prevent the compiler from optimizing away benchmarks
+- tests are now in separate files
+- future-proofing tests by expect-matching the whole `Token` struct
+- `serialization` feature nuked
+- fixed `Tokenizer::tokenize_all` not returning errors properly
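
A minimal migration sketch for the 11.0.0 entries above, contrasting the old `seek`/`SeekResult` loop with the new `consume` loop (both loops are taken from the README and example diffs further down; `tokenizer` is assumed to be a `Tokenizer` built as in that example):

```rust
// Before 11.0.0: seek() returned a SeekResult that had to be matched.
while let Ok(tokenize_result) = tokenizer.seek() {
    match tokenize_result {
        SeekResult::Match(token) => println!("{:?}", token),
        SeekResult::Skipped => continue,
        SeekResult::End => break,
    }
}

// Since 11.0.0: consume() returns Result<Option<Token>>, so `?` plus
// `while let` is enough; non-preserved (skipped) tokens never surface.
while let Some(token) = tokenizer.consume()? {
    println!("{:?}", token);
}
```
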
4 changes: 1 addition & 3 deletions Cargo.lock

Some generated files are not rendered by default.

5 changes: 1 addition & 4 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "jayce"
version = "10.0.1"
version = "11.0.0"
edition = "2021"
description = "jayce is a tokenizer 🌌"
repository = "https://github.com/AuracleTech/jayce"
@@ -10,11 +10,9 @@ license = "MIT"
bytecount = "0.6.7"
lazy_static = "1.4.0"
regex = "1.10.3"
-serde = { version = "1.0.196", features = ["derive"], optional = true }

[dev-dependencies]
criterion = { version = "0.5.1", features = ["html_reports"] }
-serde_json = "1.0.113"

[[bench]]
name = "initialization"
@@ -26,5 +24,4 @@ harness = false

[features]
default = ["bytecount/runtime-dispatch-simd"]
-serialization = ["serde"]
generic-simd = ["bytecount/generic-simd"]
38 changes: 13 additions & 25 deletions README.md
@@ -5,7 +5,7 @@ jayce is a tokenizer 🌌
##### Example

```rust
-use jayce::{Duo, SeekResult, Tokenizer};
+use jayce::{Duo, Tokenizer};

const SOURCE: &str = "Excalibur = 5000$; // Your own language!";

@@ -26,12 +26,8 @@ lazy_static::lazy_static! {
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut tokenizer = Tokenizer::new(SOURCE, &DUOS);

-    while let Ok(tokenize_result) = tokenizer.seek() {
-        match tokenize_result {
-            SeekResult::Match(token) => println!("{:?}", token),
-            SeekResult::Skipped => continue,
-            SeekResult::End => break,
-        }
+    while let Some(token) = tokenizer.consume()? {
+        println!("{:?}", token);
    }

    Ok(())
@@ -49,34 +45,26 @@ Token { kind: "semicolon", value: ";", pos: (1, 18) }

##### Info

-`tokenizer.seek()` returns `Result`
-
-1. `Ok(SeekResult)` seeking next token is successful
-2. `Err(error)` an error occurs
-
-`SeekResult`
-
-3. `Match(Token<T>)` match found
-4. `Skipped` match found but token is not preserved
-5. `End` end of source
-
-`tokenizer.tokenize_all()` returns `Result`
-
-1. `Ok(Vec<Tokens>)` tokens are found
-2. `Err(error)` an error occurs
+`Tokenizer::consume` returns `Result<Option<Token>>`
+
+1. `Ok(Some(Token))` match found
+2. `Ok(None)` end of source
+3. `Err` an error occurs
+
+`Tokenizer::tokenize_all` returns `Result<Vec<Token>>`
+
+1. `Ok(Vec<Token>)` tokens matched
+2. `Err` an error occurs
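
A minimal sketch of handling both shapes explicitly, assuming the `SOURCE` and `DUOS` definitions from the example above and a `main` that returns `Result`:

```rust
let mut tokenizer = Tokenizer::new(SOURCE, &DUOS);
loop {
    match tokenizer.consume() {
        Ok(Some(token)) => println!("{:?}", token), // match found
        Ok(None) => break,                          // end of source reached
        Err(err) => return Err(err),                // no duo matched at the cursor
    }
}

// Or collect every preserved token at once; errors are propagated.
let tokens = Tokenizer::new(SOURCE, &DUOS).tokenize_all()?;
println!("{} tokens", tokens.len());
```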

##### Performances

-tokenization of [Yuumi](https://github.com/AuracleTech/yuumi) project's language tokens
-
-- `3.8 milliseconds` with referenced tokens and serialization disabled
-- `5.0 milliseconds` with owned tokens and serialization available
+initialization in ~`3 nanoseconds`
+tokenization of [Yuumi](https://github.com/AuracleTech/yuumi) in ~`4 milliseconds`

##### Features

-- `serialization`
- `generic-simd`
-- `runtime-dispatch-simd` enabled by default, to disable modify `Cargo.toml` as follows
+- `runtime-dispatch-simd` default enabled; to disable, modify `Cargo.toml` as follows

```toml
jayce = { version = "X.X.X", default-features = false }
8 changes: 8 additions & 0 deletions TODOS.md
@@ -1,3 +1,11 @@
# TODO

- [ ] Replace `lazy_static` by `LazyLock` when available in stable std release

+- [ ] Multi-threading support
+- [ ] Improve performance and precision by removing `^` and parsing the whole file at once
+- [ ] Brainstorm `Duo` and duos structure
+- [ ] Improve `Duo::new` by `unwrap` on regex result
+- [ ]
+
+- [ ] VERIFY `README`
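
A sketch of what the first item above could look like once `std::sync::LazyLock` reaches stable; the duo kinds and patterns here are illustrative, not the project's actual table:

```rust
use std::sync::LazyLock;

use jayce::Duo;

// Hypothetical replacement for the current lazy_static! block.
static DUOS: LazyLock<Vec<Duo<&'static str>>> = LazyLock::new(|| {
    vec![
        Duo::new("whitespace", r"^\s+", false),
        Duo::new("price", r"^[0-9]+\$", true),
        Duo::new("semicolon", r"^;", true),
    ]
});
```
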
10 changes: 3 additions & 7 deletions examples/example.rs
@@ -1,4 +1,4 @@
-use jayce::{Duo, SeekResult, Tokenizer};
+use jayce::{Duo, Tokenizer};

const SOURCE: &str = "Excalibur = 5000$; // Your own language!";

@@ -19,12 +19,8 @@ lazy_static::lazy_static! {
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut tokenizer = Tokenizer::new(SOURCE, &DUOS);

-    while let Ok(tokenize_result) = tokenizer.seek() {
-        match tokenize_result {
-            SeekResult::Match(token) => println!("{:?}", token),
-            SeekResult::Skipped => continue,
-            SeekResult::End => break,
-        }
+    while let Some(token) = tokenizer.consume()? {
+        println!("{:?}", token);
    }

    Ok(())
3 changes: 0 additions & 3 deletions src/internal.rs
@@ -1,9 +1,6 @@
use crate::Duo;
-#[cfg(feature = "serialization")]
-use serde::{Deserialize, Serialize};

#[derive(Debug, PartialEq, Clone, Copy)]
-#[cfg_attr(feature = "serialization", derive(Serialize, Deserialize))]
pub enum KindsRust {
    Whitespace,
    CommentLine,
91 changes: 73 additions & 18 deletions src/lib.rs
@@ -2,17 +2,16 @@ pub mod internal;
use regex::Regex;

pub struct Duo<T> {
-    kind: T,
-    regex: Regex,
-    preserve: bool,
+    pub kind: T,
+    pub regex: Regex,
+    pub preserve: bool,
}

impl<T> Duo<T> {
    pub fn new(kind: T, regex: &str, preserve: bool) -> Self {
-        let regex = Regex::new(regex).unwrap();
        Self {
            kind,
-            regex,
+            regex: Regex::new(regex).unwrap(),
            preserve,
        }
    }
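
Since this commit makes the `Duo` fields `pub`, a duo can also be built directly instead of going through `Duo::new`; a small sketch (the kind and pattern are illustrative, and it assumes the caller depends on the `regex` crate):

```rust
use jayce::Duo;
use regex::Regex;

// Building a duo without the constructor, so the caller can surface the
// regex compilation error instead of unwrapping.
fn semicolon_duo() -> Result<Duo<&'static str>, regex::Error> {
    Ok(Duo {
        kind: "semicolon",
        regex: Regex::new(r"^;")?,
        preserve: true,
    })
}
```
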
@@ -26,16 +25,72 @@ pub struct Tokenizer<'a, T> {
    column: usize,
}

-#[cfg(feature = "serialization")]
-mod tokenizer_owned;
-#[cfg(feature = "serialization")]
-pub use tokenizer_owned::SeekResult;
-#[cfg(feature = "serialization")]
-pub use tokenizer_owned::Token;
-
-#[cfg(not(feature = "serialization"))]
-mod tokenizer_ref;
-#[cfg(not(feature = "serialization"))]
-pub use tokenizer_ref::SeekResult;
-#[cfg(not(feature = "serialization"))]
-pub use tokenizer_ref::Token;
+#[derive(Debug, Clone, PartialEq)]
+pub struct Token<'a, T> {
+    pub kind: &'a T,
+    pub value: &'a str,
+    pub pos: (usize, usize),
+}
+
+impl<'a, T> Tokenizer<'a, T> {
+    #[inline]
+    pub fn new(source: &'a str, duos: &'a [Duo<T>]) -> Self {
+        Self {
+            source,
+            duos,
+            cursor: 0,
+            line: 1,
+            column: 1,
+        }
+    }
+
+    pub fn consume(&mut self) -> Result<Option<Token<'a, T>>, Box<dyn std::error::Error>> {
+        while self.cursor < self.source.len() {
+            let mut matched = false;
+
+            for duo in self.duos.iter() {
+                if let Some(result) = duo.regex.find(&self.source[self.cursor..]) {
+                    let value: &str = result.as_str();
+                    let token_pos = (self.line, self.column);
+                    let len = result.len();
+                    self.cursor += len;
+                    let newlines_count = bytecount::count(value.as_bytes(), b'\n');
+                    if newlines_count > 0 {
+                        self.line += newlines_count;
+                        self.column = len - value.rfind('\n').unwrap_or(1);
+                    } else {
+                        self.column += len;
+                    }
+
+                    if duo.preserve {
+                        return Ok(Some(Token {
+                            kind: &duo.kind,
+                            value,
+                            pos: token_pos,
+                        }));
+                    } else {
+                        matched = true;
+                        break;
+                    }
+                }
+            }
+
+            if !matched {
+                return Err(format!(
+                    "Failed to match at line {}, column {}.",
+                    self.line, self.column
+                ))?;
+            }
+        }
+
+        Ok(None)
+    }
+
+    pub fn tokenize_all(&mut self) -> Result<Vec<Token<'a, T>>, Box<dyn std::error::Error>> {
+        let mut tokens = Vec::new();
+        while let Some(token) = self.consume()? {
+            tokens.push(token);
+        }
+        Ok(tokens)
+    }
+}
84 changes: 0 additions & 84 deletions src/tokenizer_owned.rs

This file was deleted.

