diff --git a/Cargo.lock b/Cargo.lock
index 893b110..a2a2c43 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -241,7 +241,7 @@ checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c"
 
 [[package]]
 name = "jayce"
-version = "9.1.2"
+version = "10.0.0"
 dependencies = [
  "bytecount",
  "criterion",
diff --git a/Cargo.toml b/Cargo.toml
index cde729a..53a48b5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "jayce"
-version = "9.1.2"
+version = "10.0.0"
 edition = "2021"
 description = "jayce is a tokenizer 🌌"
 repository = "https://github.com/AuracleTech/jayce"
diff --git a/README.md b/README.md
index ffc373b..71c6227 100644
--- a/README.md
+++ b/README.md
@@ -5,30 +5,34 @@ jayce is a tokenizer 🌌
 
 ##### Example
 
 ```rust
-use jayce::{duos, Tokenizer};
-use regex::Regex;
+use jayce::{Duo, SeekResult, Tokenizer};
 
 const SOURCE: &str = "Excalibur = 5000$; // Your own language!";
 
-lazy_static::lazy_static! (
-    static ref DUOS: Vec<(&'static str, Regex)> = duos![
-        "whitespace", r"^[^\S\n]+",
-        "comment_line", r"^//(.*)",
-        "comment_block", r"^/\*(.|\n)*?\*/",
-        "newline", r"^\n",
-
-        "price", r"^[0-9]+\$",
-        "semicolon", r"^;",
-        "operator", r"^=",
-        "name", r"^[a-zA-Z_]+"
+lazy_static::lazy_static! {
+    static ref DUOS: Vec<Duo<&'static str>> = vec![
+        // Token name, regular expression, and whether the token is preserved
+        Duo::new("whitespace", r"^[^\S\n]+", false),
+        Duo::new("commentLine", r"^//(.*)", false),
+        Duo::new("commentBlock", r"^/\*(.|\n)*?\*/", false),
+        Duo::new("newline", r"^\n", false),
+
+        Duo::new("price", r"^[0-9]+\$", true),
+        Duo::new("semicolon", r"^;", true),
+        Duo::new("operator", r"^=", true),
+        Duo::new("name", r"^[a-zA-Z_]+", true)
     ];
-);
+}
 
 fn main() -> Result<(), Box<dyn std::error::Error>> {
     let mut tokenizer = Tokenizer::new(SOURCE, &DUOS);
-    while let Some(token) = tokenizer.next()? {
-        println!("{:?}", token);
+    while let Ok(tokenize_result) = tokenizer.seek() {
+        match tokenize_result {
+            SeekResult::Match(token) => println!("{:?}", token),
+            SeekResult::Skipped => continue,
+            SeekResult::End => break,
+        }
     }
 
     Ok(())
@@ -39,31 +43,32 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
 ```
 
 ```rust,ignore
 Token { kind: "name", value: "Excalibur", pos: (1, 1) }
-Token { kind: "whitespace", value: " ", pos: (1, 10) }
 Token { kind: "operator", value: "=", pos: (1, 11) }
-Token { kind: "whitespace", value: " ", pos: (1, 12) }
 Token { kind: "price", value: "5000$", pos: (1, 13) }
 Token { kind: "semicolon", value: ";", pos: (1, 18) }
-Token { kind: "whitespace", value: " ", pos: (1, 19) }
-Token { kind: "comment_line", value: "// Your own language!", pos: (1, 20) }
 ```
 
 ##### Info
 
-`next` possible `Result`
+`tokenizer.seek()` returns a `Result`
+
+1. `Ok(SeekResult)` Seeking the next token succeeded
+2. `Err(error)` An error occurs
+
+`SeekResult`
 
-1. `Ok(Some(token))` Match is found
-2. `Ok(None)` End of source
-3. `Err(error)` An error occurs
+1. `Match(Token)` Match found
+2. `Skipped` Match found but the token is not preserved
+3. `End` End of source
 
-`tokenize_all` possible `Result`
+`tokenizer.tokenize_all()` returns a `Result`
 
 1. `Ok(Vec<Token>)` Tokens are found
 2. `Err(error)` An error occurs
 
 ##### Performances
 
-tokenization of [29 639](https://github.com/AuracleTech/yuumi) rust language tokens
+tokenization of the [Yuumi](https://github.com/AuracleTech/yuumi) project's Rust tokens
 
 - `3.8 milliseconds` with referenced tokens and serialization disabled
 - `5.0 milliseconds` with owned tokens and serialization available
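Editor's note: the README documents `tokenize_all` but only demonstrates the `seek` loop. A minimal sketch of the batch path against the same v10 API, reusing the `DUOS` table from the example above (a sketch, not part of this diff; `&DUOS` works because the `duos` argument accepts anything that derefs to `&[Duo<T>]`):

```rust
use jayce::{Duo, Tokenizer};

const SOURCE: &str = "Excalibur = 5000$; // Your own language!";

lazy_static::lazy_static! {
    static ref DUOS: Vec<Duo<&'static str>> = vec![
        Duo::new("whitespace", r"^[^\S\n]+", false),
        Duo::new("commentLine", r"^//(.*)", false),
        Duo::new("commentBlock", r"^/\*(.|\n)*?\*/", false),
        Duo::new("newline", r"^\n", false),
        Duo::new("price", r"^[0-9]+\$", true),
        Duo::new("semicolon", r"^;", true),
        Duo::new("operator", r"^=", true),
        Duo::new("name", r"^[a-zA-Z_]+", true),
    ];
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut tokenizer = Tokenizer::new(SOURCE, &DUOS);

    // tokenize_all drives seek() internally: it collects Match results,
    // skips non-preserved duos, and stops at End.
    let tokens = tokenizer.tokenize_all()?;
    assert_eq!(tokens.len(), 4); // name, operator, price, semicolon

    Ok(())
}
```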
diff --git a/TODOS.md b/TODOS.md
index 7df9b8b..bae0340 100644
--- a/TODOS.md
+++ b/TODOS.md
@@ -1,7 +1,3 @@
 # TODO
 
 - [ ] Replace `lazy_static` by `LazyLock` when available in stable std release
-- [ ] Clippy linting
-- [ ] README review
-- [x] Preserve optional duos
-- [ ] Implement `Serialize` and `Deserialize` for `Token` without duplication
diff --git a/examples/example.rs b/examples/example.rs
index d6d90a1..46fcd78 100644
--- a/examples/example.rs
+++ b/examples/example.rs
@@ -1,4 +1,4 @@
-use jayce::{Duo, SeekResult};
+use jayce::{Duo, SeekResult, Tokenizer};
 
 const SOURCE: &str = "Excalibur = 5000$; // Your own language!";
 
@@ -17,11 +17,11 @@ lazy_static::lazy_static! {
 }
 
 fn main() -> Result<(), Box<dyn std::error::Error>> {
-    let mut tokenizer = jayce::Tokenizer::new(SOURCE, &DUOS);
+    let mut tokenizer = Tokenizer::new(SOURCE, &DUOS);
 
     while let Ok(tokenize_result) = tokenizer.seek() {
         match tokenize_result {
-            SeekResult::Token(token) => println!("{:?}", token),
+            SeekResult::Match(token) => println!("{:?}", token),
             SeekResult::Skipped => continue,
             SeekResult::End => break,
         }
diff --git a/src/lib.rs b/src/lib.rs
index 49d7d48..4305025 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -8,12 +8,12 @@ pub struct Duo<T> {
 }
 
 impl<T> Duo<T> {
-    pub fn new(kind: T, regex: &str, _preserve: bool) -> Self {
+    pub fn new(kind: T, regex: &str, preserve: bool) -> Self {
         let regex = Regex::new(regex).unwrap();
         Self {
             kind,
             regex,
-            preserve: _preserve,
+            preserve,
         }
     }
 }
@@ -29,10 +29,13 @@ pub struct Tokenizer<'a, T> {
 #[cfg(feature = "serialization")]
 mod tokenizer_owned;
 #[cfg(feature = "serialization")]
+pub use tokenizer_owned::SeekResult;
+#[cfg(feature = "serialization")]
 pub use tokenizer_owned::Token;
 
 #[cfg(not(feature = "serialization"))]
 mod tokenizer_ref;
+#[cfg(not(feature = "serialization"))]
 pub use tokenizer_ref::SeekResult;
 #[cfg(not(feature = "serialization"))]
 pub use tokenizer_ref::Token;
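Editor's note: the `src/lib.rs` hunk matters beyond the rename. Before this change, `pub use tokenizer_ref::SeekResult;` was not feature-gated, yet `tokenizer_ref` is only compiled without the `serialization` feature; the hunk adds an owned `SeekResult` export and gates the borrowed one, so `jayce::{SeekResult, Token}` resolves under either build. A sketch of feature-agnostic calling code (assumes the `internal::DUOS_RUST` table used by the test suite is available to consumers):

```rust
use jayce::internal::DUOS_RUST;
use jayce::{SeekResult, Tokenizer};

// Compiles with or without `--features serialization`, because lib.rs
// re-exports exactly one SeekResult/Token pair for the active feature set.
fn count_preserved_tokens(source: &str) -> usize {
    let mut tokenizer = Tokenizer::new(source, &DUOS_RUST);
    let mut count = 0;
    while let Ok(result) = tokenizer.seek() {
        match result {
            SeekResult::Match(_) => count += 1,
            SeekResult::Skipped => continue,
            SeekResult::End => break,
        }
    }
    // Note: a seek() error (no duo matched) also ends this loop silently.
    count
}
```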
diff --git a/src/tokenizer_owned.rs b/src/tokenizer_owned.rs
index 625f844..f21fff5 100644
--- a/src/tokenizer_owned.rs
+++ b/src/tokenizer_owned.rs
@@ -1,7 +1,13 @@
-use crate::Tokenizer;
-use regex::Regex;
+use crate::{Duo, Tokenizer};
 use serde::{Deserialize, Serialize};
 
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub enum SeekResult<T> {
+    Match(Token<T>),
+    Skipped,
+    End,
+}
+
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Token<T> {
     pub kind: T,
@@ -14,7 +20,7 @@ where
     T: Clone,
 {
     #[inline]
-    pub fn new(source: &'a str, duos: &'static [(T, Regex)]) -> Self {
+    pub fn new(source: &'a str, duos: &'a [Duo<T>]) -> Self {
         Self {
             source,
             duos,
@@ -24,19 +30,23 @@ where
         }
    }
 
-    pub fn next(&mut self) -> Result<Option<Token<T>>, Box<dyn std::error::Error>> {
+    pub fn seek(&mut self) -> Result<SeekResult<T>, Box<dyn std::error::Error>> {
         if self.cursor >= self.source.len() {
-            return Ok(None);
+            return Ok(SeekResult::End);
         }
 
-        for (kind, regex) in self.duos.iter() {
-            if let Some(result) = regex.find(&self.source[self.cursor..]) {
+        for duo in self.duos.iter() {
+            if let Some(result) = duo.regex.find(&self.source[self.cursor..]) {
                 let value: &str = result.as_str();
 
-                let token = Token {
-                    kind: kind.clone(),
-                    value: value.to_string(),
-                    pos: (self.line, self.column),
+                let token = if duo.preserve {
+                    SeekResult::Match(Token {
+                        kind: duo.kind.clone(),
+                        value: value.to_string(),
+                        pos: (self.line, self.column),
+                    })
+                } else {
+                    SeekResult::Skipped
                 };
 
                 let len = result.len();
@@ -49,7 +59,7 @@ where
                     self.column += len;
                 }
 
-                return Ok(Some(token));
+                return Ok(token);
             }
         }
 
@@ -61,8 +71,12 @@ where
     pub fn tokenize_all(&mut self) -> Result<Vec<Token<T>>, Box<dyn std::error::Error>> {
         let mut tokens = Vec::new();
 
-        while let Some(token) = self.next()? {
-            tokens.push(token);
+        while let Ok(tokenize_result) = self.seek() {
+            match tokenize_result {
+                SeekResult::Match(token) => tokens.push(token),
+                SeekResult::Skipped => continue,
+                SeekResult::End => break,
+            }
         }
 
         Ok(tokens)
diff --git a/src/tokenizer_ref.rs b/src/tokenizer_ref.rs
index a0ec351..ad2bd2c 100644
--- a/src/tokenizer_ref.rs
+++ b/src/tokenizer_ref.rs
@@ -1,7 +1,7 @@
 use crate::{Duo, Tokenizer};
 
 pub enum SeekResult<'a, T> {
-    Token(Token<'a, T>),
+    Match(Token<'a, T>),
     Skipped,
     End,
 }
@@ -35,7 +35,7 @@ impl<'a, T> Tokenizer<'a, T> {
                 let value: &str = result.as_str();
 
                 let token = if duo.preserve {
-                    SeekResult::Token(Token {
+                    SeekResult::Match(Token {
                         kind: &duo.kind,
                         value,
                         pos: (self.line, self.column),
@@ -68,7 +68,7 @@ impl<'a, T> Tokenizer<'a, T> {
         let mut tokens = Vec::new();
         while let Ok(tokenize_result) = self.seek() {
             match tokenize_result {
-                SeekResult::Token(token) => tokens.push(token),
+                SeekResult::Match(token) => tokens.push(token),
                 SeekResult::Skipped => continue,
                 SeekResult::End => break,
             }
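Editor's note: with the `serialization` feature, `Token` and the new owned `SeekResult` both derive `Serialize`/`Deserialize`, so a token stream can round-trip through JSON. A small sketch (assumes the feature is enabled and `serde_json` is on hand, as in the test suite; deserializing `kind` as `String` relies on serde encoding a fieldless enum kind by its variant name):

```rust
use jayce::internal::DUOS_RUST;
use jayce::{Token, Tokenizer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut tokenizer = Tokenizer::new("let x = 1;", &DUOS_RUST);
    let tokens = tokenizer.tokenize_all()?;

    // Owned tokens carry String values, so nothing borrows from the source.
    let json = serde_json::to_string(&tokens)?;

    // Round-trip into an owned kind representation.
    let restored: Vec<Token<String>> = serde_json::from_str(&json)?;
    assert_eq!(tokens.len(), restored.len());

    Ok(())
}
```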
diff --git a/tests/experiment.rs b/tests/experiment.rs
index 260c886..54612e6 100644
--- a/tests/experiment.rs
+++ b/tests/experiment.rs
@@ -1,6 +1,6 @@
 use jayce::{
     internal::{KindsRust, DUOS_RUST},
-    Duo, Tokenizer,
+    Duo, SeekResult, Tokenizer,
 };
 use lazy_static::lazy_static;
 
@@ -247,14 +247,12 @@ fn verify<T>(source: &str, duos: &Vec<Duo<T>>, expected: &[(T, &str, (usize, usi
 where
     T: PartialEq + std::fmt::Debug,
 {
-    use jayce::SeekResult;
-
     let mut tokenizer = Tokenizer::new(source, duos);
 
     for (kind, value, (line, column)) in expected {
         let result = tokenizer.seek().unwrap();
         let token = match result {
-            SeekResult::Token(token) => token,
+            SeekResult::Match(token) => token,
             SeekResult::Skipped => continue,
             SeekResult::End => panic!("No token found when expected"),
         };
@@ -274,24 +272,25 @@ where
     let result = tokenizer.seek().unwrap();
 
     match result {
-        SeekResult::Token(token) => panic!("Unexpected token: {:?}", token),
+        SeekResult::Match(token) => panic!("Unexpected token: {:?}", token),
         SeekResult::Skipped => panic!("Unexpected skipped token"),
         SeekResult::End => {}
     };
 }
 
 #[cfg(feature = "serialization")]
-fn verify<T>(source: &str, duos: &'static [(T, Regex)], expected: &[(T, &str, (usize, usize))])
+fn verify<T>(source: &str, duos: &Vec<Duo<T>>, expected: &[(T, &str, (usize, usize))])
 where
     T: PartialEq + std::fmt::Debug + Clone,
 {
     let mut tokenizer = Tokenizer::new(source, duos);
 
     for (kind, value, (line, column)) in expected {
-        let token = match tokenizer.next() {
-            Ok(Some(token)) => token,
-            Ok(None) => panic!("No token found when expected"),
-            Err(err) => panic!("Error while tokenizing: {}", err),
+        let result = tokenizer.seek().unwrap();
+        let token = match result {
+            SeekResult::Match(token) => token,
+            SeekResult::Skipped => continue,
+            SeekResult::End => panic!("No token found when expected"),
         };
 
         println!(
@@ -306,10 +305,12 @@ where
         assert_eq!(column, &token.pos.1);
     }
 
-    match tokenizer.next() {
-        Ok(Some(token)) => panic!("Unexpected token: {:?}", token),
-        Ok(None) => {}
-        Err(err) => panic!("Error while tokenizing: {}", err),
+    let result = tokenizer.seek().unwrap();
+
+    match result {
+        SeekResult::Match(token) => panic!("Unexpected token: {:?}", token),
+        SeekResult::Skipped => panic!("Unexpected skipped token"),
+        SeekResult::End => {}
     };
 }
 
@@ -344,15 +345,20 @@ use vulkanalia::prelude::v1_0::*;";
 
 #[test]
 fn serialization_collection() {
     let mut tokenizer = Tokenizer::new(SOURCE_SERIALIZATION, &DUOS_RUST);
-
     let tokens = tokenizer.tokenize_all().unwrap();
 
     let serialized = serde_json::to_string(&tokens).unwrap();
     let mut deserialized: Vec<Token<&str>> = serde_json::from_str(&serialized).unwrap();
 
-    while let Some(token) = tokenizer.next().unwrap() {
+    while let Ok(result) = tokenizer.seek() {
+        let token = match result {
+            SeekResult::Match(token) => token,
+            SeekResult::Skipped => continue,
+            SeekResult::End => break,
+        };
+
         let deser_token = deserialized.pop().unwrap();
 
-        let token_kind = match deser_token.kind {
+        let deser_token_kind = match deser_token.kind {
             "CommentLine" => KindsRust::CommentLine,
             "Newline" => KindsRust::Newline,
             "Keyword" => KindsRust::Keyword,
@@ -367,6 +373,6 @@ fn serialization_collection() {
         assert_eq!(token.value, deser_token.value);
         assert_eq!(token.pos.0, deser_token.pos.0);
         assert_eq!(token.pos.1, deser_token.pos.1);
-        assert_eq!(token.kind, token_kind);
+        assert_eq!(token.kind, deser_token_kind);
     }
 }
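Editor's note on `serialization_collection`: `tokenize_all` runs the tokenizer to the end of the source, so the `while let Ok(result) = tokenizer.seek()` loop that follows sees `SeekResult::End` immediately and the per-token assertions never execute; `deserialized.pop()` would also walk the deserialized tokens in reverse order. A hypothetical order-preserving variant (names and structure are illustrative, not part of this diff):

```rust
#[test]
fn serialization_roundtrip_ordered() {
    let mut tokenizer = Tokenizer::new(SOURCE_SERIALIZATION, &DUOS_RUST);
    let tokens = tokenizer.tokenize_all().unwrap();

    let serialized = serde_json::to_string(&tokens).unwrap();
    let deserialized: Vec<Token<String>> = serde_json::from_str(&serialized).unwrap();

    assert_eq!(tokens.len(), deserialized.len());
    for (token, deser_token) in tokens.iter().zip(deserialized.iter()) {
        // For a fieldless enum, serde's variant name matches the Debug output.
        assert_eq!(format!("{:?}", token.kind), deser_token.kind);
        assert_eq!(token.value, deser_token.value);
        assert_eq!(token.pos, deser_token.pos);
    }
}
```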