Skip to content

Commit

Permalink
New serialization feature and update dependencies
Browse files Browse the repository at this point in the history
  • Loading branch information
AuracleTech committed Feb 8, 2024
1 parent 6f17204 commit d9b9e13
Show file tree
Hide file tree
Showing 6 changed files with 267 additions and 91 deletions.
4 changes: 3 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 7 additions & 5 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
[package]
name = "jayce"
version = "9.0.2"
version = "9.1.0"
edition = "2021"
description = "jayce is a tokenizer 🌌"
repository = "https://github.com/AuracleTech/jayce"
license = "MIT"

[dependencies]
bytecount = { version = "0.6.3", features = ["runtime-dispatch-simd"] }
bytecount = "0.6.7"
lazy_static = "1.4.0"
regex = "1.8.4"
regex = "1.10.3"
serde = { version = "1.0.196", optional = true }

[dev-dependencies]
criterion = { version = "0.5.1", features = ["html_reports"] }
serde_json = "1.0.113"

[[bench]]
name = "initialization"
Expand All @@ -23,6 +25,6 @@ name = "tokenize"
harness = false

[features]
default = ["runtime-dispatch-simd"]
runtime-dispatch-simd = ["bytecount/runtime-dispatch-simd"]
default = ["bytecount/runtime-dispatch-simd"]
serialization = ["serde"]
generic-simd = ["bytecount/generic-simd"]
20 changes: 10 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,18 +63,18 @@ Token { kind: "comment_line", value: "// Your own language!", pos: (1, 20) }

##### Performances

initialization in `1.83 nanoseconds` and tokenization of [29 639](https://github.com/AuracleTech/yuumi) tokens in `3.85 milliseconds`
tokenization of [29 639](https://github.com/AuracleTech/yuumi) Rust language tokens

SIMD acceleration enabled by default, modify `Cargo.toml` as follows to disable
- `3.8 milliseconds` with referenced tokens and serialization disabled

```toml
jayce = { version = "X.X.X", default-features = false }
```

##### Changelog
- `5.0 milliseconds` with owned tokens and serialization available

> `7.0.2` is `442%` faster than version `4.0.1` from making everything precompiled
##### Features

> `9.0.0` is `30%` slower than version `8.1.0` to support custom whitespaces & comments
`serialization`
`generic-simd`
`runtime-dispatch-simd` is enabled by default; to disable it, modify `Cargo.toml` as follows

> `9.0.2` is `5%` faster than version `9.0.1` by enabling SIMD acceleration by default
```toml
jayce = { version = "X.X.X", default-features = false }
```
151 changes: 77 additions & 74 deletions src/internal.rs
Original file line number Diff line number Diff line change
@@ -1,74 +1,77 @@
use regex::Regex;

/// Token kinds produced when tokenizing Rust source with `DUOS_RUST`.
///
/// The enum is fieldless, so it derives `Copy`, `Eq` and `Hash` in addition
/// to the original traits; this lets kinds be passed by value and used as
/// keys in hash-based collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Duos {
    /// Run of whitespace that contains no newline.
    Whitespace,
    /// `// ...` comment up to the end of the line.
    CommentLine,
    /// `/* ... */` comment, possibly spanning several lines.
    CommentBlock,
    /// A single `\n`.
    Newline,

    /// One of the reserved words listed in the `DUOS_RUST` keyword pattern.
    Keyword,
    /// Double-quoted string literal.
    String,
    /// Single-quoted character literal.
    Char,
    /// Lifetime such as `'a` or `'static`.
    Lifetime,
    /// One of `=`, `+`, `-`, `*`, `/`, `%`.
    Operator,
    /// Identifier made of ASCII letters, digits and `_`.
    Identifier,
    /// Run of decimal digits.
    Integer,
    /// Digits, a dot, then digits.
    Float,
    /// `::`
    DoubleColon,
    /// `;`
    Semicolon,
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `(`
    OpenParen,
    /// `)`
    CloseParen,
    /// `[`
    OpenBracket,
    /// `]`
    CloseBracket,
    /// `,`
    Comma,
    /// `#`
    Hash,
    /// `.`
    Dot,
    /// `:`
    Colon,
    /// `|`
    Pipe,
    /// `<`
    OpenAngle,
    /// `>`
    CloseAngle,
    /// `^`
    Caret,
    /// `&`
    TempBorrow,
    /// `?`
    Question,
    /// `!`
    MacroExclamation,
}

lazy_static::lazy_static! {
    /// Kind/pattern pairs used to tokenize Rust source.
    ///
    /// Order is significant: the tokenizer walks the pairs in order and
    /// returns the first pattern that matches, so a more specific pattern
    /// must come before any pattern that matches one of its prefixes.
    /// Every pattern is anchored with `^` so it only matches at the cursor.
    pub static ref DUOS_RUST: Vec<(Duos, Regex)> = crate::duos!(
        Duos::Whitespace, r"^[^\S\n]+",
        Duos::CommentLine, r"^//(.*)",
        Duos::CommentBlock, r"^/\*(.|\n)*?\*/",
        Duos::Newline, r"^\n",

        Duos::Keyword, r"^(mut|let|if|else|fn|struct|enum|match|use|mod|pub|crate|impl|trait|for|while|loop|break|continue|return|as|const|static|type|where|unsafe|extern|ref|self|super|in|move|dyn|abstract|async|await|become|box|do|final|macro|override|priv|typeof|unsized|virtual|yield)\b",
        Duos::String, r#"^"[^"]*""#,
        Duos::Char, r"^'(.|\\n)'",
        Duos::Lifetime, r"^'(?:[a-z_][a-z0-9_]*|static)\b",
        Duos::Operator, r"^(=|\+|-|\*|/|%)",
        Duos::Identifier, r"^[a-zA-Z_][a-zA-Z0-9_]*",
        // Float must precede Integer: `^\d+` matches the integer part of any
        // float, so with Integer first the Float pattern could never win.
        Duos::Float, r"^\d+\.\d+",
        Duos::Integer, r"^\d+",
        Duos::DoubleColon, r"^::",
        Duos::Semicolon, r"^;",
        Duos::OpenBrace, r"^\{",
        Duos::CloseBrace, r"^\}",
        Duos::OpenParen, r"^\(",
        Duos::CloseParen, r"^\)",
        Duos::OpenBracket, r"^\[",
        Duos::CloseBracket, r"^\]",
        Duos::Comma, r"^,",
        Duos::Hash, r"^#",
        Duos::Dot, r"^\.",
        Duos::Colon, r"^:",
        Duos::Pipe, r"^\|",
        Duos::OpenAngle, r"^<",
        Duos::CloseAngle, r"^>",
        Duos::Caret, r"^\^",
        Duos::TempBorrow, r"^&",
        Duos::Question, r"^\?",
        Duos::MacroExclamation, r"^!"
    );
}
use regex::Regex;
#[cfg(feature = "serialization")]
use serde::{Deserialize, Serialize};

/// Token kinds produced when tokenizing Rust source with `DUOS_RUST`.
///
/// The enum is fieldless, so it also derives `Eq` and `Hash`; this lets kinds
/// be used as keys in hash-based collections. With the `serialization`
/// feature enabled it additionally derives serde's `Serialize` and
/// `Deserialize`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serialization", derive(Serialize, Deserialize))]
pub enum Duos {
    /// Run of whitespace that contains no newline.
    Whitespace,
    /// `// ...` comment up to the end of the line.
    CommentLine,
    /// `/* ... */` comment, possibly spanning several lines.
    CommentBlock,
    /// A single `\n`.
    Newline,

    /// One of the reserved words listed in the `DUOS_RUST` keyword pattern.
    Keyword,
    /// Double-quoted string literal.
    String,
    /// Single-quoted character literal.
    Char,
    /// Lifetime such as `'a` or `'static`.
    Lifetime,
    /// One of `=`, `+`, `-`, `*`, `/`, `%`.
    Operator,
    /// Identifier made of ASCII letters, digits and `_`.
    Identifier,
    /// Run of decimal digits.
    Integer,
    /// Digits, a dot, then digits.
    Float,
    /// `::`
    DoubleColon,
    /// `;`
    Semicolon,
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `(`
    OpenParen,
    /// `)`
    CloseParen,
    /// `[`
    OpenBracket,
    /// `]`
    CloseBracket,
    /// `,`
    Comma,
    /// `#`
    Hash,
    /// `.`
    Dot,
    /// `:`
    Colon,
    /// `|`
    Pipe,
    /// `<`
    OpenAngle,
    /// `>`
    CloseAngle,
    /// `^`
    Caret,
    /// `&`
    TempBorrow,
    /// `?`
    Question,
    /// `!`
    MacroExclamation,
}

lazy_static::lazy_static! {
    /// Kind/pattern pairs used to tokenize Rust source.
    ///
    /// Order is significant: the tokenizer walks the pairs in order and
    /// returns the first pattern that matches, so a more specific pattern
    /// must come before any pattern that matches one of its prefixes.
    /// Every pattern is anchored with `^` so it only matches at the cursor.
    pub static ref DUOS_RUST: Vec<(Duos, Regex)> = crate::duos!(
        Duos::Whitespace, r"^[^\S\n]+",
        Duos::CommentLine, r"^//(.*)",
        Duos::CommentBlock, r"^/\*(.|\n)*?\*/",
        Duos::Newline, r"^\n",

        Duos::Keyword, r"^(mut|let|if|else|fn|struct|enum|match|use|mod|pub|crate|impl|trait|for|while|loop|break|continue|return|as|const|static|type|where|unsafe|extern|ref|self|super|in|move|dyn|abstract|async|await|become|box|do|final|macro|override|priv|typeof|unsized|virtual|yield)\b",
        Duos::String, r#"^"[^"]*""#,
        Duos::Char, r"^'(.|\\n)'",
        Duos::Lifetime, r"^'(?:[a-z_][a-z0-9_]*|static)\b",
        Duos::Operator, r"^(=|\+|-|\*|/|%)",
        Duos::Identifier, r"^[a-zA-Z_][a-zA-Z0-9_]*",
        // Float must precede Integer: `^\d+` matches the integer part of any
        // float, so with Integer first the Float pattern could never win.
        Duos::Float, r"^\d+\.\d+",
        Duos::Integer, r"^\d+",
        Duos::DoubleColon, r"^::",
        Duos::Semicolon, r"^;",
        Duos::OpenBrace, r"^\{",
        Duos::CloseBrace, r"^\}",
        Duos::OpenParen, r"^\(",
        Duos::CloseParen, r"^\)",
        Duos::OpenBracket, r"^\[",
        Duos::CloseBracket, r"^\]",
        Duos::Comma, r"^,",
        Duos::Hash, r"^#",
        Duos::Dot, r"^\.",
        Duos::Colon, r"^:",
        Duos::Pipe, r"^\|",
        Duos::OpenAngle, r"^<",
        Duos::CloseAngle, r"^>",
        Duos::Caret, r"^\^",
        Duos::TempBorrow, r"^&",
        Duos::Question, r"^\?",
        Duos::MacroExclamation, r"^!"
    );
}
74 changes: 74 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
pub mod internal;
use regex::Regex;

#[cfg(feature = "serialization")]
use serde::{Deserialize, Serialize};

/// Builds a `Vec` of `(kind, Regex)` pairs from alternating kind and pattern
/// arguments: `duos!(kind_a, r"^a", kind_b, r"^b")`.
///
/// A trailing comma after the last pattern is accepted.
///
/// `Regex` is referenced unqualified, so a type with that name must be in
/// scope where the macro is invoked.
///
/// # Panics
///
/// Panics if any pattern fails to compile, because the result of
/// `Regex::new` is unwrapped.
#[macro_export]
macro_rules! duos {
    ($($kind:expr, $pattern:expr),* $(,)?) => {
        vec![ $( ($kind, Regex::new($pattern).unwrap()) ),* ]
    };
}

Expand All @@ -12,13 +15,15 @@ pub struct Tokenizer<'a, T> {
column: usize,
}

/// A token that borrows both its kind and its text.
///
/// Compiled only when the `serialization` feature is disabled.
#[cfg(not(feature = "serialization"))]
#[derive(Debug, Clone, PartialEq)]
pub struct Token<'a, T> {
/// Borrowed token kind.
pub kind: &'a T,
/// Borrowed token text — presumably the slice of the source that matched; confirm against the tokenizer impl.
pub value: &'a str,
/// Token position — presumably `(line, column)` as in the README example; confirm against the tokenizer impl.
pub pos: (usize, usize),
}

#[cfg(not(feature = "serialization"))]
impl<'a, T> Tokenizer<'a, T> {
#[inline]
pub fn new(source: &'a str, duos: &'static [(T, Regex)]) -> Self {
Expand Down Expand Up @@ -75,3 +80,72 @@ impl<'a, T> Tokenizer<'a, T> {
Ok(tokens)
}
}

/// A token that owns its kind and its text, so it can be serialized and deserialized with serde.
///
/// Compiled only when the `serialization` feature is enabled.
#[cfg(feature = "serialization")]
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token<T> {
/// Token kind, cloned from the matching kind/pattern pair.
pub kind: T,
/// Owned copy of the matched text.
pub value: String,
/// `(line, column)` of the tokenizer when the token was matched.
pub pos: (usize, usize),
}

/// Tokenizer methods used when the `serialization` feature is enabled.
///
/// Tokens produced here own their kind and text, hence the `T: Clone` bound.
#[cfg(feature = "serialization")]
impl<'a, T> Tokenizer<'a, T>
where
    T: Clone,
{
    /// Creates a tokenizer over `source` that tries the `duos` kind/pattern
    /// pairs in order. The cursor starts at byte 0, at line 1, column 1.
    #[inline]
    pub fn new(source: &'a str, duos: &'static [(T, Regex)]) -> Self {
        Self {
            source,
            duos,
            cursor: 0,
            line: 1,
            column: 1,
        }
    }

    /// Returns the next token, or `Ok(None)` once the whole source is consumed.
    ///
    /// The pairs are tried in order and the first pattern that matches a
    /// non-empty span starting exactly at the cursor wins. Columns and
    /// lengths are counted in bytes.
    ///
    /// # Errors
    ///
    /// Returns an error naming the current line and column when no pattern
    /// matches at the cursor.
    pub fn next(&mut self) -> Result<Option<Token<T>>, Box<dyn std::error::Error>> {
        if self.cursor >= self.source.len() {
            return Ok(None);
        }

        let rest = &self.source[self.cursor..];

        for (kind, regex) in self.duos.iter() {
            let found = match regex.find(rest) {
                // Only accept a match that starts at the cursor and consumes
                // input. A later match (unanchored pattern) would be emitted
                // with the wrong position and silently skip text; an empty
                // match would never advance the cursor, so `tokenize_all`
                // would loop forever.
                Some(found) if found.start() == 0 && found.end() > 0 => found,
                _ => continue,
            };

            let value = found.as_str();
            let len = value.len();

            let token = Token {
                kind: kind.clone(),
                value: value.to_owned(),
                pos: (self.line, self.column),
            };

            self.cursor += len;
            match value.rfind('\n') {
                Some(last_newline) => {
                    self.line += bytecount::count(value.as_bytes(), b'\n');
                    // Bytes after the last newline, plus one for the 1-based column.
                    self.column = len - last_newline;
                }
                None => self.column += len,
            }

            return Ok(Some(token));
        }

        Err(format!(
            "Failed to match at line {}, column {}.",
            self.line, self.column
        )
        .into())
    }

    /// Consumes the remaining source and returns every token in order.
    ///
    /// # Errors
    ///
    /// Stops at, and returns, the first error reported by [`Self::next`].
    pub fn tokenize_all(&mut self) -> Result<Vec<Token<T>>, Box<dyn std::error::Error>> {
        let mut tokens = Vec::new();
        while let Some(token) = self.next()? {
            tokens.push(token);
        }

        Ok(tokens)
    }
}
Loading

0 comments on commit d9b9e13

Please sign in to comment.