-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
FEAT: add parsing (partial) implementation and CI workflow for regex …
…engine
- Loading branch information
Showing
9 changed files
with
404 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# CI workflow: runs the Go tests under regex_engine/ on every push to main
# and on every pull request that targets main.
name: RegEx Engine CI

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

jobs:
  test:
    name: Test
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      - name: Set up Go
        uses: actions/setup-go@v4
        with:
          # Keep the quotes: an unquoted 1.20 is read by YAML as the number 1.2.
          go-version: '1.20'

      - name: Run tests in 'regex_engine'
        run: |
          cd regex_engine
          go test ./...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Running plain `make` executes the vet target (which formats first).
.DEFAULT_GOAL := vet

# Format every package in the module.
fmt:
	go fmt ./...
.PHONY: fmt

# Format, then run golint over every package.
lint: fmt
	golint ./...
.PHONY: lint

# Format, then run go vet over every package.
vet: fmt
	go vet ./...
.PHONY: vet

# Vet, then run the shadow analyzer over every package.
shadow: vet
	shadow ./...
.PHONY: shadow
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Go Concepts | ||
Repo for basic tutorial-based Golang study | ||
|
||
--- | ||
|
||
### todo | ||
- [ ] running and testing pipeline
- [ ] parsing
  - [ ] or
  - [ ] repeat
  - [ ] repeatspecified
- [ ] compiling
- [ ] matching engine
|
||
# regex engine | ||
* based on [this blogpost](https://rhaeguard.github.io/posts/regex/) | ||
|
||
## run | ||
* `go test ./...` | ||
|
||
## notes | ||
* 3 stages:
  * `parse` - create tokens from the regex string
  * `build state machine (compile)` - create a state machine from the tokens
  * `match` - match an input string against the state machine
* `NFA` - non-deterministic finite automaton
* `DFA` - deterministic finite automaton
|
||
## references | ||
* [wikipedia on NFA](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
package evaluator | ||
|
||
// func TestNfa(t *testing.T) { | ||
|
||
// var data = []struct { | ||
// email string | ||
// validity bool | ||
// }{ | ||
// {email: "valid_email@example.com", validity: true}, | ||
// {email: "john.doe@email.com", validity: true}, | ||
// {email: "user_name@email.org", validity: true}, | ||
// {email: "support@email.io", validity: true}, | ||
// {email: "contact@123.com", validity: true}, | ||
// {email: "sales@email.biz", validity: true}, | ||
// {email: "test_email@email.test", validity: true}, | ||
// {email: "random.email@email.xyz", validity: true}, | ||
// {email: "user@domain12345.com", validity: true}, | ||
// {email: "user@12345domain.com", validity: true}, | ||
// // invalid when compared against our regex | ||
// {email: "alice.smith123@email.co.uk", validity: false}, | ||
// {email: "invalid.email@", validity: false}, | ||
// {email: ".invalid@email.com", validity: false}, | ||
// {email: "email@invalid..com", validity: false}, | ||
// {email: "user@-invalid.com", validity: false}, | ||
// {email: "user@invalid-.com", validity: false}, | ||
// {email: "user@in valid.com", validity: false}, | ||
// {email: "user@.com", validity: false}, | ||
// {email: "user@.co", validity: false}, | ||
// {email: "user@domain.c", validity: false}, | ||
// {email: "user@domain.1a", validity: false}, | ||
// {email: "user@domain.c0m", validity: false}, | ||
// {email: "user@domain..com", validity: false}, | ||
// {email: "user@.email.com", validity: false}, | ||
// {email: "user@emai.l.com", validity: false}, | ||
// {email: "user@e_mail.com", validity: false}, | ||
// {email: "user@e+mail.com", validity: false}, | ||
// {email: "user@e^mail.com", validity: false}, | ||
// {email: "user@e*mail.com", validity: false}, | ||
// {email: "user@e.mail.com", validity: false}, | ||
// {email: "user@e_mail.net", validity: false}, | ||
// {email: "user@sub.domain.com", validity: false}, | ||
// {email: "user@sub-domain.com", validity: false}, | ||
// {email: "user@sub.domain12345.com", validity: false}, | ||
// {email: "user@sub.domain-12345.com", validity: false}, | ||
// {email: "user@-sub.domain.com", validity: false}, | ||
// {email: "user@sub-.domain.com", validity: false}, | ||
// {email: "user@domain-.com", validity: false}, | ||
// {email: "user@sub.domain.c0m", validity: false}, | ||
// {email: "user@sub.domain.c", validity: false}, | ||
// {email: "user@sub.domain.1a", validity: false}, | ||
// {email: "user@sub.domain.c0m", validity: false}, | ||
// {email: "user@sub.domain..com", validity: false}, | ||
// {email: "user@sub.domain.c0m", validity: false}, | ||
// {email: "user@sub.domain..com", validity: false}, | ||
// {email: "user@sub.domain.c0m", validity: false}, | ||
// } | ||
|
||
// ctx := parse(`[a-zA-Z][a-zA-Z0-9_.]+@[a-zA-Z0-9]+.[a-zA-Z]{2,}`) | ||
// nfa := toNfa(ctx) | ||
|
||
// for _, instance := range data { | ||
// t.Run(fmt.Sprintf("Test: '%s'", instance.email), func(t *testing.T) { | ||
// result := nfa.check(instance.email, -1) | ||
// if result != instance.validity { | ||
// t.Logf("Expected: %t, got: %t\n", instance.validity, result) | ||
// t.Fail() | ||
// } | ||
// }) | ||
// } | ||
// } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
module regex_engine | ||
|
||
go 1.18 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
package main |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
package parser | ||
|
||
import "fmt" | ||
|
||
// tokenType tags a token with the kind of regex construct it came from.
type tokenType uint8

// Token kinds. Values are assigned in declaration order starting at zero.
const (
	group           tokenType = iota // "( ... )"; value is the []token parsed inside
	bracket                          // "[ ... ]"; value is a map[uint8]bool of accepted bytes
	or                               // "|" alternation
	repeat                           // repetition operator
	literal                          // a single byte taken verbatim; value is that byte
	groupUncaptured                  // presumably a non-capturing group; unused in the visible code
)

// token is one parsed unit of a regular expression.
type token struct {
	// tokenType says which construct this token represents.
	tokenType tokenType
	// value is the payload; its dynamic type depends on tokenType.
	value any
}

// parseContext tracks how far parsing has progressed and what it has produced.
type parseContext struct {
	// pos is the index into the regex string currently being examined.
	pos int
	// tokens accumulates the tokens recognised so far.
	tokens []token
}
|
||
func parse(regex string) *parseContext { | ||
ctx := &parseContext{pos: 0, tokens: []token{}} | ||
for ctx.pos < len(regex) { | ||
process(regex, ctx) | ||
ctx.pos++ | ||
} | ||
|
||
return ctx | ||
} | ||
|
||
func process(regex string, ctx *parseContext) { | ||
ch := regex[ctx.pos] | ||
switch ch { | ||
case '(': | ||
// it's a group | ||
groupCtx := &parseContext{ | ||
pos: ctx.pos, | ||
tokens: []token{}, | ||
} | ||
parseGroup(regex, groupCtx) | ||
ctx.tokens = append(ctx.tokens, token{ | ||
tokenType: group, | ||
value: groupCtx.tokens, | ||
}) | ||
case '[': | ||
// it's a bracket expression | ||
parseBracket(regex, ctx) | ||
case '|': | ||
// it's an OR operator | ||
parseOr(regex, ctx) | ||
case '*': | ||
// it's a repeat operator | ||
// remaining repeat operators are + and ?, but they can be specified using brackets | ||
// a* == a{0,} | ||
// a+ == a{1,} | ||
// a? == a{0,1} | ||
parseRepeat(regex, ctx) | ||
case '{': | ||
parseRepeatSpecified(regex, ctx) | ||
default: | ||
// it's a literal (nothing matched) | ||
t := token{tokenType: literal, value: ch} | ||
ctx.tokens = append(ctx.tokens, t) | ||
} | ||
|
||
} | ||
|
||
func parseGroup(regex string, ctx *parseContext) { | ||
ctx.pos++ | ||
for regex[ctx.pos] != ')' { | ||
process(regex, ctx) | ||
ctx.pos++ | ||
} | ||
} | ||
|
||
func parseBracket(regex string, ctx *parseContext) { | ||
ctx.pos++ | ||
var literals []string | ||
for regex[ctx.pos] != ']' { | ||
ch := regex[ctx.pos] | ||
|
||
if ch == '-' { | ||
// range indicator | ||
next := regex[ctx.pos+1] | ||
prev := literals[len(literals)-1][0] | ||
literals[len(literals)-1] = fmt.Sprintf("%c%c", prev, next) // <3-2> | ||
ctx.pos++ | ||
} else { | ||
literals = append(literals, fmt.Sprintf("%c", ch)) | ||
} | ||
|
||
ctx.pos++ | ||
} | ||
|
||
literalsSet := map[uint8]bool{} // because literals ranges can overlap | ||
|
||
for _, l := range literals { | ||
for i := l[0]; i <= l[1]; i++ { | ||
literalsSet[i] = true | ||
} | ||
} | ||
|
||
ctx.tokens = append(ctx.tokens, token{ | ||
tokenType: bracket, | ||
value: literalsSet, | ||
}) | ||
} | ||
|
||
func parseOr(regex string, ctx *parseContext) { | ||
// TODO: implement | ||
} | ||
|
||
func parseRepeat(regex string, ctx *parseContext) { | ||
// TODO: implement | ||
} | ||
|
||
func parseRepeatSpecified(regex string, ctx *parseContext) { | ||
// TODO: implement | ||
} |
Oops, something went wrong.