From d9b9e13e4a14c607e6e2adf074156f8409e37a18 Mon Sep 17 00:00:00 2001 From: AuracleTech Date: Wed, 7 Feb 2024 21:49:48 -0500 Subject: [PATCH] New serialization feature and update dependencies --- Cargo.lock | 4 +- Cargo.toml | 12 ++-- README.md | 20 +++--- src/internal.rs | 151 ++++++++++++++++++++++---------------------- src/lib.rs | 74 ++++++++++++++++++++++ tests/experiment.rs | 97 +++++++++++++++++++++++++++- 6 files changed, 267 insertions(+), 91 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b6ab659..b5895ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -241,12 +241,14 @@ checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] name = "jayce" -version = "9.0.2" +version = "9.1.0" dependencies = [ "bytecount", "criterion", "lazy_static", "regex", + "serde", + "serde_json", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 513b5e2..fd3f531 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,18 +1,20 @@ [package] name = "jayce" -version = "9.0.2" +version = "9.1.0" edition = "2021" description = "jayce is a tokenizer 🌌" repository = "https://github.com/AuracleTech/jayce" license = "MIT" [dependencies] -bytecount = { version = "0.6.3", features = ["runtime-dispatch-simd"] } +bytecount = "0.6.7" lazy_static = "1.4.0" -regex = "1.8.4" +regex = "1.10.3" +serde = { version = "1.0.196", optional = true } [dev-dependencies] criterion = { version = "0.5.1", features = ["html_reports"] } +serde_json = "1.0.113" [[bench]] name = "initialization" @@ -23,6 +25,6 @@ name = "tokenize" harness = false [features] -default = ["runtime-dispatch-simd"] -runtime-dispatch-simd = ["bytecount/runtime-dispatch-simd"] +default = ["bytecount/runtime-dispatch-simd"] +serialization = ["serde"] generic-simd = ["bytecount/generic-simd"] diff --git a/README.md b/README.md index aded0c0..47631b9 100644 --- a/README.md +++ b/README.md @@ -63,18 +63,18 @@ Token { kind: "comment_line", value: "// Your own language!", pos: (1, 20) } ##### Performances 
-initialization in `1.83 nanoseconds` and tokenization of [29 639](https://github.com/AuracleTech/yuumi) tokens in `3.85 milliseconds` +tokenization of [29 639](https://github.com/AuracleTech/yuumi) rust language tokens -SIMD acceleration enabled by default, modify `Cargo.toml` as follows to disable +- `3.8 milliseconds` with referenced tokens and serialization disabled -```toml -jayce = { version = "X.X.X", default-features = false } -``` - -##### Changelog +- `5.0 milliseconds` with owned tokens and serialization available -> `7.0.2` is `442%` faster than version `4.0.1` from making everything precompiled +##### Features -> `9.0.0` is `30%` slower than version `8.1.0` to support custom whitespaces & comments +`serialization` +`generic-simd` +`runtime-dispatch-simd` enabled by default, to disable modify `Cargo.toml` as follows -> `9.0.2` is `5%` faster than version `9.0.1` by enabling SIMD acceleration by default +```toml +jayce = { version = "X.X.X", default-features = false } +``` diff --git a/src/internal.rs b/src/internal.rs index 8fc4ecf..0d2f0e9 100644 --- a/src/internal.rs +++ b/src/internal.rs @@ -1,74 +1,77 @@ -use regex::Regex; - -#[derive(Debug, Clone, PartialEq)] -pub enum Duos { - Whitespace, - CommentLine, - CommentBlock, - Newline, - - Keyword, - String, - Char, - Lifetime, - Operator, - Identifier, - Integer, - Float, - DoubleColon, - Semicolon, - OpenBrace, - CloseBrace, - OpenParen, - CloseParen, - OpenBracket, - CloseBracket, - Comma, - Hash, - Dot, - Colon, - Pipe, - OpenAngle, - CloseAngle, - Caret, - TempBorrow, - Question, - MacroExclamation, -} - -lazy_static::lazy_static! 
{ -pub static ref DUOS_RUST: Vec<(Duos, Regex)> = crate::duos!( -Duos::Whitespace, r"^[^\S\n]+", -Duos::CommentLine, r"^//(.*)", -Duos::CommentBlock, r"^/\*(.|\n)*?\*/", -Duos::Newline, r"^\n", - -Duos::Keyword, r"^(mut|let|if|else|fn|struct|enum|match|use|mod|pub|crate|impl|trait|for|while|loop|break|continue|return|as|const|static|type|where|unsafe|extern|ref|self|super|in|move|dyn|abstract|async|await|become|box|do|final|macro|override|priv|typeof|unsized|virtual|yield)\b", -Duos::String, r#"^"[^"]*""#, -Duos::Char, r"^'(.|\\n)'", -Duos::Lifetime, r"^'(?:[a-z_][a-z0-9_]*|static)\b", -Duos::Operator, r"^(=|\+|-|\*|/|%)", -Duos::Identifier, r"^[a-zA-Z_][a-zA-Z0-9_]*", -Duos::Integer, r"^\d+", -Duos::Float, r"^\d+\.\d+", -Duos::DoubleColon, r"^::", -Duos::Semicolon, r"^;", -Duos::OpenBrace, r"^\{", -Duos::CloseBrace, r"^\}", -Duos::OpenParen, r"^\(", -Duos::CloseParen, r"^\)", -Duos::OpenBracket, r"^\[", -Duos::CloseBracket, r"^\]", -Duos::Comma, r"^,", -Duos::Hash, r"^#", -Duos::Dot, r"^\.", -Duos::Colon, r"^:", -Duos::Pipe, r"^\|", -Duos::OpenAngle, r"^<", -Duos::CloseAngle, r"^>", -Duos::Caret, r"^\^", -Duos::TempBorrow, r"^&", -Duos::Question, r"^\?", -Duos::MacroExclamation, r"^!" -); -} +use regex::Regex; +#[cfg(feature = "serialization")] +use serde::{Deserialize, Serialize}; + +#[derive(Debug, PartialEq, Clone, Copy)] +#[cfg_attr(feature = "serialization", derive(Serialize, Deserialize))] +pub enum Duos { + Whitespace, + CommentLine, + CommentBlock, + Newline, + + Keyword, + String, + Char, + Lifetime, + Operator, + Identifier, + Integer, + Float, + DoubleColon, + Semicolon, + OpenBrace, + CloseBrace, + OpenParen, + CloseParen, + OpenBracket, + CloseBracket, + Comma, + Hash, + Dot, + Colon, + Pipe, + OpenAngle, + CloseAngle, + Caret, + TempBorrow, + Question, + MacroExclamation, +} + +lazy_static::lazy_static! 
{ +pub static ref DUOS_RUST: Vec<(Duos, Regex)> = crate::duos!( +Duos::Whitespace, r"^[^\S\n]+", +Duos::CommentLine, r"^//(.*)", +Duos::CommentBlock, r"^/\*(.|\n)*?\*/", +Duos::Newline, r"^\n", + +Duos::Keyword, r"^(mut|let|if|else|fn|struct|enum|match|use|mod|pub|crate|impl|trait|for|while|loop|break|continue|return|as|const|static|type|where|unsafe|extern|ref|self|super|in|move|dyn|abstract|async|await|become|box|do|final|macro|override|priv|typeof|unsized|virtual|yield)\b", +Duos::String, r#"^"[^"]*""#, +Duos::Char, r"^'(.|\\n)'", +Duos::Lifetime, r"^'(?:[a-z_][a-z0-9_]*|static)\b", +Duos::Operator, r"^(=|\+|-|\*|/|%)", +Duos::Identifier, r"^[a-zA-Z_][a-zA-Z0-9_]*", +Duos::Integer, r"^\d+", +Duos::Float, r"^\d+\.\d+", +Duos::DoubleColon, r"^::", +Duos::Semicolon, r"^;", +Duos::OpenBrace, r"^\{", +Duos::CloseBrace, r"^\}", +Duos::OpenParen, r"^\(", +Duos::CloseParen, r"^\)", +Duos::OpenBracket, r"^\[", +Duos::CloseBracket, r"^\]", +Duos::Comma, r"^,", +Duos::Hash, r"^#", +Duos::Dot, r"^\.", +Duos::Colon, r"^:", +Duos::Pipe, r"^\|", +Duos::OpenAngle, r"^<", +Duos::CloseAngle, r"^>", +Duos::Caret, r"^\^", +Duos::TempBorrow, r"^&", +Duos::Question, r"^\?", +Duos::MacroExclamation, r"^!" +); +} diff --git a/src/lib.rs b/src/lib.rs index b7dda8b..e92872f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,9 @@ pub mod internal; use regex::Regex; +#[cfg(feature = "serialization")] +use serde::{Deserialize, Serialize}; + #[macro_export] macro_rules! 
duos(($($kind:expr, $pattern:expr),*) => { vec![ $( ($kind, Regex::new($pattern).unwrap()) ),* ] };);
@@ -12,6 +15,7 @@ pub struct Tokenizer<'a, T> {
     column: usize,
 }
 
+#[cfg(not(feature = "serialization"))]
 #[derive(Debug, Clone, PartialEq)]
 pub struct Token<'a, T> {
     pub kind: &'a T,
@@ -19,6 +23,7 @@ pub struct Token<'a, T> {
     pub pos: (usize, usize),
 }
 
+#[cfg(not(feature = "serialization"))]
 impl<'a, T> Tokenizer<'a, T> {
     #[inline]
     pub fn new(source: &'a str, duos: &'static [(T, Regex)]) -> Self {
@@ -75,3 +80,72 @@ impl<'a, T> Tokenizer<'a, T> {
         Ok(tokens)
     }
 }
+
+#[cfg(feature = "serialization")]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct Token<T> {
+    pub kind: T,
+    pub value: String,
+    pub pos: (usize, usize),
+}
+
+#[cfg(feature = "serialization")]
+impl<'a, T> Tokenizer<'a, T>
+where
+    T: Clone,
+{
+    #[inline]
+    pub fn new(source: &'a str, duos: &'static [(T, Regex)]) -> Self {
+        Self {
+            source,
+            duos,
+            cursor: 0,
+            line: 1,
+            column: 1,
+        }
+    }
+
+    pub fn next(&mut self) -> Result<Option<Token<T>>, Box<dyn std::error::Error>> {
+        if self.cursor >= self.source.len() {
+            return Ok(None);
+        }
+
+        for (kind, regex) in self.duos.iter() {
+            if let Some(result) = regex.find(&self.source[self.cursor..]) {
+                let value: &str = result.as_str();
+
+                let token = Token {
+                    kind: kind.clone(),
+                    value: value.to_string(),
+                    pos: (self.line, self.column),
+                };
+
+                let len = result.len();
+                self.cursor += len;
+                let newlines_count = bytecount::count(value.as_bytes(), b'\n');
+                if newlines_count > 0 {
+                    self.line += newlines_count;
+                    self.column = len - value.rfind('\n').unwrap_or(1);
+                } else {
+                    self.column += len;
+                }
+
+                return Ok(Some(token));
+            }
+        }
+
+        Err(format!(
+            "Failed to match at line {}, column {}.",
+            self.line, self.column
+        ))?
+    }
+
+    pub fn tokenize_all(&mut self) -> Result<Vec<Token<T>>, Box<dyn std::error::Error>> {
+        let mut tokens = Vec::new();
+        while let Some(token) = self.next()?
{
+            tokens.push(token);
+        }
+
+        Ok(tokens)
+    }
+}
diff --git a/tests/experiment.rs b/tests/experiment.rs
index a1987b7..91c1b50 100644
--- a/tests/experiment.rs
+++ b/tests/experiment.rs
@@ -8,7 +8,7 @@ use regex::Regex;
 
 // Custom duos
 
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Clone)]
 enum CustomDuos {
     Whitespace,
     CommentLine,
@@ -241,6 +241,7 @@ fn unexpected() {
 
 // Verify function
 
+#[cfg(not(feature = "serialization"))]
 fn verify<T>(source: &str, duos: &'static [(T, Regex)], expected: &[(T, &str, (usize, usize))])
 where
     T: PartialEq + std::fmt::Debug,
@@ -272,3 +273,97 @@ where
         Err(err) => panic!("Error while tokenizing: {}", err),
     };
 }
+
+#[cfg(feature = "serialization")]
+fn verify<T>(source: &str, duos: &'static [(T, Regex)], expected: &[(T, &str, (usize, usize))])
+where
+    T: PartialEq + std::fmt::Debug + Clone,
+{
+    let mut tokenizer = Tokenizer::new(source, duos);
+
+    for (kind, value, (line, column)) in expected {
+        let token = match tokenizer.next() {
+            Ok(Some(token)) => token,
+            Ok(None) => panic!("No token found when expected"),
+            Err(err) => panic!("Error while tokenizing: {}", err),
+        };
+
+        println!(
+            "Expected {:?} got {:?}",
+            (kind, value, (line, column)),
+            token,
+        );
+
+        assert_eq!(kind, &token.kind);
+        assert_eq!(value, &token.value);
+        assert_eq!(line, &token.pos.0);
+        assert_eq!(column, &token.pos.1);
+    }
+
+    match tokenizer.next() {
+        Ok(Some(token)) => panic!("Unexpected token: {:?}", token),
+        Ok(None) => {}
+        Err(err) => panic!("Error while tokenizing: {}", err),
+    };
+}
+
+#[cfg(feature = "serialization")]
+#[test]
+fn serialization_one() {
+    use jayce::Token;
+    let token = Token {
+        kind: "example_kind",
+        value: "example_value".to_string(),
+        pos: (1, 2),
+    };
+
+    let serialized = serde_json::to_string(&token).unwrap();
+    let deserialized: Token<&str> = serde_json::from_str(&serialized).unwrap();
+
+    assert_eq!(token, deserialized);
+}
+
+#[cfg(feature = "serialization")]
+const SOURCE_SERIALIZATION: &str = r"use crate::{
+    
app::AppData,
+    camera::Camera,
+    model::{self, Model},
+    texture::{self, Texture},
+};
+use anyhow::{anyhow, Result};
+use std::collections::HashMap;
+use vulkanalia::prelude::v1_0::*;";
+
+#[cfg(feature = "serialization")]
+#[test]
+fn serialization_collection() {
+    let mut tokenizer = Tokenizer::new(SOURCE_SERIALIZATION, &DUOS_RUST);
+
+    let mut tokens = Vec::new();
+    while let Some(token) = tokenizer.next().unwrap() {
+        tokens.push(token);
+    }
+
+    let serialized = serde_json::to_string(&tokens).unwrap();
+    let mut deserialized: Vec<jayce::Token<&str>> = serde_json::from_str(&serialized).unwrap();
+
+    while let Some(token) = tokenizer.next().unwrap() {
+        let deser_token = deserialized.pop().unwrap();
+        let token_kind = match deser_token.kind {
+            "CommentLine" => Duos::CommentLine,
+            "Newline" => Duos::Newline,
+            "Keyword" => Duos::Keyword,
+            "Whitespace" => Duos::Whitespace,
+            "Operator" => Duos::Operator,
+            "Identifier" => Duos::Identifier,
+            "Integer" => Duos::Integer,
+            "Semicolon" => Duos::Semicolon,
+            "CommentBlock" => Duos::CommentBlock,
+            _ => panic!("Unexpected token kind"),
+        };
+        assert_eq!(token.value, deser_token.value);
+        assert_eq!(token.pos.0, deser_token.pos.0);
+        assert_eq!(token.pos.1, deser_token.pos.1);
+        assert_eq!(token.kind, token_kind);
+    }
+}