Implement Duo everywhere, preserve bool and SeekResult
AuracleTech committed Feb 14, 2024
1 parent a644841 commit 1299afc
Showing 9 changed files with 96 additions and 72 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "jayce"
version = "9.1.2"
version = "10.0.0"
edition = "2021"
description = "jayce is a tokenizer 🌌"
repository = "https://github.com/AuracleTech/jayce"
57 changes: 31 additions & 26 deletions README.md
@@ -5,30 +5,34 @@ jayce is a tokenizer 🌌
##### Example

```rust
use jayce::{duos, Tokenizer};
use regex::Regex;
use jayce::{Duo, SeekResult, Tokenizer};

const SOURCE: &str = "Excalibur = 5000$; // Your own language!";

lazy_static::lazy_static! (
static ref DUOS: Vec<(&'static str, Regex)> = duos![
"whitespace", r"^[^\S\n]+",
"comment_line", r"^//(.*)",
"comment_block", r"^/\*(.|\n)*?\*/",
"newline", r"^\n",

"price", r"^[0-9]+\$",
"semicolon", r"^;",
"operator", r"^=",
"name", r"^[a-zA-Z_]+"
lazy_static::lazy_static! {
static ref DUOS: Vec<Duo<&'static str>> = vec![
// Token name, regular expression, and if we preserve the token
Duo::new("whitespace", r"^[^\S\n]+", false),
Duo::new("commentLine", r"^//(.*)", false),
Duo::new("commentBlock", r"^/\*(.|\n)*?\*/", false),
Duo::new("newline", r"^\n", false),

Duo::new("price", r"^[0-9]+\$", true),
Duo::new("semicolon", r"^;", true),
Duo::new("operator", r"^=", true),
Duo::new("name", r"^[a-zA-Z_]+", true)
];
);
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut tokenizer = Tokenizer::new(SOURCE, &DUOS);

while let Some(token) = tokenizer.next()? {
println!("{:?}", token);
while let Ok(tokenize_result) = tokenizer.seek() {
match tokenize_result {
SeekResult::Match(token) => println!("{:?}", token),
SeekResult::Skipped => continue,
SeekResult::End => break,
}
}

Ok(())
@@ -39,31 +43,32 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {

```rust,ignore
Token { kind: "name", value: "Excalibur", pos: (1, 1) }
Token { kind: "whitespace", value: " ", pos: (1, 10) }
Token { kind: "operator", value: "=", pos: (1, 11) }
Token { kind: "whitespace", value: " ", pos: (1, 12) }
Token { kind: "price", value: "5000$", pos: (1, 13) }
Token { kind: "semicolon", value: ";", pos: (1, 18) }
Token { kind: "whitespace", value: " ", pos: (1, 19) }
Token { kind: "comment_line", value: "// Your own language!", pos: (1, 20) }
```

##### Info

`next` possible `Result`
`tokenizer.seek()` returns `Result`

1. `Ok(SeekResult)` Seeking next token is successful
2. `Err(error)` An error occurs

`SeekResult`

1. `Ok(Some(token))` Match is found
2. `Ok(None)` End of source
3. `Err(error)` An error occurs
3. `Match(Token<T>)` Match found
4. `Skipped` Match found but token is not preserved
5. `End` End of source

`tokenize_all` possible `Result`
`tokenizer.tokenize_all()` returns `Result`

1. `Ok(Vec<Tokens>)` Tokens are found
2. `Err(error)` An error occurs
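
For instance, a minimal sketch of collecting every preserved token in one call (the input string and the expected count in the comments are illustrative, not taken from the repository):

```rust
use jayce::{Duo, Tokenizer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Patterns marked `false` are matched but dropped from the output.
    let duos: Vec<Duo<&str>> = vec![
        Duo::new("whitespace", r"^[^\S\n]+", false),
        Duo::new("price", r"^[0-9]+\$", true),
        Duo::new("operator", r"^=", true),
        Duo::new("name", r"^[a-zA-Z_]+", true),
    ];

    let mut tokenizer = Tokenizer::new("Excalibur = 5000$", &duos);

    // `tokenize_all` drives `seek` internally and keeps only preserved tokens:
    // here "Excalibur", "=", and "5000$", with the whitespace skipped.
    let tokens = tokenizer.tokenize_all()?;
    println!("{} tokens", tokens.len());

    Ok(())
}
```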

##### Performances

tokenization of [29 639](https://github.com/AuracleTech/yuumi) rust language tokens
tokenization of [Yuumi](https://github.com/AuracleTech/yuumi) project's language tokens

- `3.8 milliseconds` with referenced tokens and serialization disabled
- `5.0 milliseconds` with owned tokens and serialization available
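
The owned-token configuration corresponds to the `serialization` cfg flag in `src/lib.rs`; assuming that flag is exposed as a Cargo feature of the same name, a dependency entry enabling it might look like this:

```toml
# Assumed manifest entry: the feature name is inferred from the
# #[cfg(feature = "serialization")] attributes in src/lib.rs.
[dependencies]
jayce = { version = "10.0.0", features = ["serialization"] }
```
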
4 changes: 0 additions & 4 deletions TODOS.md
@@ -1,7 +1,3 @@
# TODO

- [ ] Replace `lazy_static` by `LazyLock` when available in stable std release
- [ ] Clippy linting
- [ ] README review
- [x] Preserve optional duos
- [ ] Implement `Serialize` and `Deserialize` for `Token` without duplication
6 changes: 3 additions & 3 deletions examples/example.rs
@@ -1,4 +1,4 @@
use jayce::{Duo, SeekResult};
use jayce::{Duo, SeekResult, Tokenizer};

const SOURCE: &str = "Excalibur = 5000$; // Your own language!";

@@ -17,11 +17,11 @@ lazy_static::lazy_static! {
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut tokenizer = jayce::Tokenizer::new(SOURCE, &DUOS);
let mut tokenizer = Tokenizer::new(SOURCE, &DUOS);

while let Ok(tokenize_result) = tokenizer.seek() {
match tokenize_result {
SeekResult::Token(token) => println!("{:?}", token),
SeekResult::Match(token) => println!("{:?}", token),
SeekResult::Skipped => continue,
SeekResult::End => break,
}
7 changes: 5 additions & 2 deletions src/lib.rs
@@ -8,12 +8,12 @@ pub struct Duo<T> {
}

impl<T> Duo<T> {
pub fn new(kind: T, regex: &str, _preserve: bool) -> Self {
pub fn new(kind: T, regex: &str, preserve: bool) -> Self {
let regex = Regex::new(regex).unwrap();
Self {
kind,
regex,
preserve: _preserve,
preserve,
}
}
}
@@ -29,10 +29,13 @@ pub struct Tokenizer<'a, T> {
#[cfg(feature = "serialization")]
mod tokenizer_owned;
#[cfg(feature = "serialization")]
pub use tokenizer_owned::SeekResult;
#[cfg(feature = "serialization")]
pub use tokenizer_owned::Token;

#[cfg(not(feature = "serialization"))]
mod tokenizer_ref;
#[cfg(not(feature = "serialization"))]
pub use tokenizer_ref::SeekResult;
#[cfg(not(feature = "serialization"))]
pub use tokenizer_ref::Token;
42 changes: 28 additions & 14 deletions src/tokenizer_owned.rs
@@ -1,7 +1,13 @@
use crate::Tokenizer;
use regex::Regex;
use crate::{Duo, Tokenizer};
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum SeekResult<T> {
Match(Token<T>),
Skipped,
End,
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token<T> {
pub kind: T,
@@ -14,7 +20,7 @@ where
T: Clone,
{
#[inline]
pub fn new(source: &'a str, duos: &'static [(T, Regex)]) -> Self {
pub fn new(source: &'a str, duos: &'a [Duo<T>]) -> Self {
Self {
source,
duos,
@@ -24,19 +30,23 @@ }
}
}

pub fn next(&mut self) -> Result<Option<Token<T>>, Box<dyn std::error::Error>> {
pub fn seek(&mut self) -> Result<SeekResult<T>, Box<dyn std::error::Error>> {
if self.cursor >= self.source.len() {
return Ok(None);
return Ok(SeekResult::End);
}

for (kind, regex) in self.duos.iter() {
if let Some(result) = regex.find(&self.source[self.cursor..]) {
for duo in self.duos.iter() {
if let Some(result) = duo.regex.find(&self.source[self.cursor..]) {
let value: &str = result.as_str();

let token = Token {
kind: kind.clone(),
value: value.to_string(),
pos: (self.line, self.column),
let token = if duo.preserve {
SeekResult::Match(Token {
kind: duo.kind.clone(),
value: value.to_string(),
pos: (self.line, self.column),
})
} else {
SeekResult::Skipped
};

let len = result.len();
@@ -49,7 +59,7 @@ where
self.column += len;
}

return Ok(Some(token));
return Ok(token);
}
}

@@ -61,8 +71,12 @@ where

pub fn tokenize_all(&mut self) -> Result<Vec<Token<T>>, Box<dyn std::error::Error>> {
let mut tokens = Vec::new();
while let Some(token) = self.next()? {
tokens.push(token);
while let Ok(tokenize_result) = self.seek() {
match tokenize_result {
SeekResult::Match(token) => tokens.push(token),
SeekResult::Skipped => continue,
SeekResult::End => break,
}
}

Ok(tokens)
6 changes: 3 additions & 3 deletions src/tokenizer_ref.rs
@@ -1,7 +1,7 @@
use crate::{Duo, Tokenizer};

pub enum SeekResult<'a, T> {
Token(Token<'a, T>),
Match(Token<'a, T>),
Skipped,
End,
}
@@ -35,7 +35,7 @@ impl<'a, T> Tokenizer<'a, T> {
let value: &str = result.as_str();

let token = if duo.preserve {
SeekResult::Token(Token {
SeekResult::Match(Token {
kind: &duo.kind,
value,
pos: (self.line, self.column),
@@ -68,7 +68,7 @@ impl<'a, T> Tokenizer<'a, T> {
let mut tokens = Vec::new();
while let Ok(tokenize_result) = self.seek() {
match tokenize_result {
SeekResult::Token(token) => tokens.push(token),
SeekResult::Match(token) => tokens.push(token),
SeekResult::Skipped => continue,
SeekResult::End => break,
}
42 changes: 24 additions & 18 deletions tests/experiment.rs
@@ -1,6 +1,6 @@
use jayce::{
internal::{KindsRust, DUOS_RUST},
Duo, Tokenizer,
Duo, SeekResult, Tokenizer,
};
use lazy_static::lazy_static;

@@ -247,14 +247,12 @@ fn verify<T>(source: &str, duos: &Vec<Duo<T>>, expected: &[(T, &str, (usize, usi
where
T: PartialEq + std::fmt::Debug,
{
use jayce::SeekResult;

let mut tokenizer = Tokenizer::new(source, duos);

for (kind, value, (line, column)) in expected {
let result = tokenizer.seek().unwrap();
let token = match result {
SeekResult::Token(token) => token,
SeekResult::Match(token) => token,
SeekResult::Skipped => continue,
SeekResult::End => panic!("No token found when expected"),
};
@@ -274,24 +272,25 @@ where
let result = tokenizer.seek().unwrap();

match result {
SeekResult::Token(token) => panic!("Unexpected token: {:?}", token),
SeekResult::Match(token) => panic!("Unexpected token: {:?}", token),
SeekResult::Skipped => panic!("Unexpected skipped token"),
SeekResult::End => {}
};
}

#[cfg(feature = "serialization")]
fn verify<T>(source: &str, duos: &'static [(T, Regex)], expected: &[(T, &str, (usize, usize))])
fn verify<T>(source: &str, duos: &Vec<Duo<T>>, expected: &[(T, &str, (usize, usize))])
where
T: PartialEq + std::fmt::Debug + Clone,
{
let mut tokenizer = Tokenizer::new(source, duos);

for (kind, value, (line, column)) in expected {
let token = match tokenizer.next() {
Ok(Some(token)) => token,
Ok(None) => panic!("No token found when expected"),
Err(err) => panic!("Error while tokenizing: {}", err),
let result = tokenizer.seek().unwrap();
let token = match result {
SeekResult::Match(token) => token,
SeekResult::Skipped => continue,
SeekResult::End => panic!("No token found when expected"),
};

println!(
@@ -306,10 +305,12 @@ where
assert_eq!(column, &token.pos.1);
}

match tokenizer.next() {
Ok(Some(token)) => panic!("Unexpected token: {:?}", token),
Ok(None) => {}
Err(err) => panic!("Error while tokenizing: {}", err),
let result = tokenizer.seek().unwrap();

match result {
SeekResult::Match(token) => panic!("Unexpected token: {:?}", token),
SeekResult::Skipped => panic!("Unexpected skipped token"),
SeekResult::End => {}
};
}

@@ -344,15 +345,20 @@ use vulkanalia::prelude::v1_0::*;";
#[test]
fn serialization_collection() {
let mut tokenizer = Tokenizer::new(SOURCE_SERIALIZATION, &DUOS_RUST);

let tokens = tokenizer.tokenize_all().unwrap();

let serialized = serde_json::to_string(&tokens).unwrap();
let mut deserialized: Vec<jayce::Token<&str>> = serde_json::from_str(&serialized).unwrap();

while let Some(token) = tokenizer.next().unwrap() {
while let Ok(result) = tokenizer.seek() {
let token = match result {
SeekResult::Match(token) => token,
SeekResult::Skipped => continue,
SeekResult::End => break,
};

let deser_token = deserialized.pop().unwrap();
let token_kind = match deser_token.kind {
let deser_token_kind = match deser_token.kind {
"CommentLine" => KindsRust::CommentLine,
"Newline" => KindsRust::Newline,
"Keyword" => KindsRust::Keyword,
@@ -367,6 +373,6 @@ fn serialization_collection() {
assert_eq!(token.value, deser_token.value);
assert_eq!(token.pos.0, deser_token.pos.0);
assert_eq!(token.pos.1, deser_token.pos.1);
assert_eq!(token.kind, token_kind);
assert_eq!(token.kind, deser_token_kind);
}
}
