FEAT: add parsing (partial) implementation and CI workflow for regex engine
MKaczkow · Dec 8, 2024 · fa1e7ce · fa1e7ce
1 parent 39a8791
commit fa1e7ce
Show file tree

Hide file tree

Showing 9 changed files with 404 additions and 0 deletions.
diff --git a/.github/workflows/regex-engine-ci.yml b/.github/workflows/regex-engine-ci.yml
@@ -0,0 +1,25 @@
+# Continuous-integration workflow for the code under regex_engine/.
+name: RegEx Engine CI
+
+# Runs on every push to main and on every pull request targeting main.
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+jobs:
+  test:
+    name: Test
+    runs-on: ubuntu-latest
+
+    steps:
+    # Fetch the repository contents into the runner workspace.
+    - uses: actions/checkout@v3
+
+    - name: Set up Go
+      uses: actions/setup-go@v4
+      with:
+        # Quoted so YAML keeps the string "1.20" instead of reading the number 1.2.
+        go-version: '1.20'
+
+    # The Go module lives in the regex_engine subdirectory, so the tests are
+    # run from there rather than from the repository root.
+    - name: Run tests in 'regex_engine'
+      run: |
+        cd regex_engine
+        go test ./...
diff --git a/README.md b/README.md
@@ -15,6 +15,7 @@ Repo for basic tutorial-based Golang study
 	- [ ] compiler
 	- [ ] macros
 	- [x] interpreter
+- [ ] regex engine
 - [ ] web crawler with `colly`
 - [ ] do `gilded rose` kata
 - [x] smth with `Hanoi tower`

diff --git a/regex_engine/Makefile b/regex_engine/Makefile
@@ -0,0 +1,18 @@
+# Developer convenience targets. Running plain `make` executes `vet`
+# (which in turn runs `fmt` first).
+.DEFAULT_GOAL := vet
+
+# Reformat every package in the module with `go fmt`.
+fmt:
+	go fmt ./...
+.PHONY: fmt
+
+# Format, then lint.
+# NOTE(review): `golint` is an external tool that must be installed
+# separately — confirm it is still the intended linter.
+lint: fmt
+	golint ./...
+.PHONY: lint
+
+# Format, then run `go vet` on every package.
+vet: fmt
+	go vet ./...
+.PHONY: vet
+
+# Format and vet, then run the `shadow` command on every package.
+# NOTE(review): `shadow` is not part of the Go distribution; presumably the
+# variable-shadowing analyzer from golang.org/x/tools — confirm and document
+# how to install it.
+shadow: vet
+	shadow ./...
+.PHONY: shadow
+
diff --git a/regex_engine/README.md b/regex_engine/README.md
@@ -0,0 +1,30 @@
+# Go Concepts
+Repo for basic tutorial-based Golang study  
+
+---
+
+### todo
+- [ ] running and testing pipeline
+- [ ] parsing 
+  - [ ] or
+  - [ ] repeat 
+  - [ ] repeatspecified
+- [ ] compiling
+- [ ] matching engine
+
+# regex engine
+* based on [this blogpost](https://rhaeguard.github.io/posts/regex/)
+
+## run
+* `go test ./...`
+
+## notes
+* 3 stages:
+  * `parse` - create tokens from string
+  * `build state machine (compile)` - create state machine from tokens
+  * `match` - match string with state machine
+* `NFA` - non-deterministic finite automaton
+* `DFA` - deterministic finite automaton
+
+## references
+* [wikipedia on NFA](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton)
diff --git a/regex_engine/evaluator/match_test.go b/regex_engine/evaluator/match_test.go
@@ -0,0 +1,70 @@
+package evaluator
+
+// func TestNfa(t *testing.T) {
+
+// 	var data = []struct {
+// 		email    string
+// 		validity bool
+// 	}{
+// 		{email: "valid_email@example.com", validity: true},
+// 		{email: "john.doe@email.com", validity: true},
+// 		{email: "user_name@email.org", validity: true},
+// 		{email: "support@email.io", validity: true},
+// 		{email: "contact@123.com", validity: true},
+// 		{email: "sales@email.biz", validity: true},
+// 		{email: "test_email@email.test", validity: true},
+// 		{email: "random.email@email.xyz", validity: true},
+// 		{email: "user@domain12345.com", validity: true},
+// 		{email: "user@12345domain.com", validity: true},
+// 		// invalid when compared against our regex
+// 		{email: "alice.smith123@email.co.uk", validity: false},
+// 		{email: "invalid.email@", validity: false},
+// 		{email: ".invalid@email.com", validity: false},
+// 		{email: "email@invalid..com", validity: false},
+// 		{email: "user@-invalid.com", validity: false},
+// 		{email: "user@invalid-.com", validity: false},
+// 		{email: "user@in valid.com", validity: false},
+// 		{email: "user@.com", validity: false},
+// 		{email: "user@.co", validity: false},
+// 		{email: "user@domain.c", validity: false},
+// 		{email: "user@domain.1a", validity: false},
+// 		{email: "user@domain.c0m", validity: false},
+// 		{email: "user@domain..com", validity: false},
+// 		{email: "user@.email.com", validity: false},
+// 		{email: "user@emai.l.com", validity: false},
+// 		{email: "user@e_mail.com", validity: false},
+// 		{email: "user@e+mail.com", validity: false},
+// 		{email: "user@e^mail.com", validity: false},
+// 		{email: "user@e*mail.com", validity: false},
+// 		{email: "user@e.mail.com", validity: false},
+// 		{email: "user@e_mail.net", validity: false},
+// 		{email: "user@sub.domain.com", validity: false},
+// 		{email: "user@sub-domain.com", validity: false},
+// 		{email: "user@sub.domain12345.com", validity: false},
+// 		{email: "user@sub.domain-12345.com", validity: false},
+// 		{email: "user@-sub.domain.com", validity: false},
+// 		{email: "user@sub-.domain.com", validity: false},
+// 		{email: "user@domain-.com", validity: false},
+// 		{email: "user@sub.domain.c0m", validity: false},
+// 		{email: "user@sub.domain.c", validity: false},
+// 		{email: "user@sub.domain.1a", validity: false},
+// 		{email: "user@sub.domain.c0m", validity: false},
+// 		{email: "user@sub.domain..com", validity: false},
+// 		{email: "user@sub.domain.c0m", validity: false},
+// 		{email: "user@sub.domain..com", validity: false},
+// 		{email: "user@sub.domain.c0m", validity: false},
+// 	}
+
+// 	ctx := parse(`[a-zA-Z][a-zA-Z0-9_.]+@[a-zA-Z0-9]+.[a-zA-Z]{2,}`)
+// 	nfa := toNfa(ctx)
+
+// 	for _, instance := range data {
+// 		t.Run(fmt.Sprintf("Test: '%s'", instance.email), func(t *testing.T) {
+// 			result := nfa.check(instance.email, -1)
+// 			if result != instance.validity {
+// 				t.Logf("Expected: %t, got: %t\n", instance.validity, result)
+// 				t.Fail()
+// 			}
+// 		})
+// 	}
+// }
diff --git a/regex_engine/go.mod b/regex_engine/go.mod
@@ -0,0 +1,3 @@
+module regex_engine
+
+go 1.18
diff --git a/regex_engine/main.go b/regex_engine/main.go
@@ -0,0 +1 @@
+package main
diff --git a/regex_engine/parser/parser.go b/regex_engine/parser/parser.go
@@ -0,0 +1,124 @@
+package parser
+
+import "fmt"
+
+// tokenType identifies which kind of regex element a token represents.
+type tokenType uint8
+
+// Token kinds, numbered consecutively from zero in declaration order.
+const (
+	group tokenType = iota
+	bracket
+	or
+	repeat
+	literal
+	groupUncaptured
+)
+
+// token is one parsed element of a regular expression.
+//
+// tokenType says which kind of element it is; value carries the payload,
+// whose dynamic type depends on the kind: a uint8 for a literal, a
+// map[uint8]bool for a bracket expression, and a []token for a group.
+type token struct {
+	tokenType tokenType
+	value     any
+}
+
+// parseContext holds the state of a parse in progress: pos is the byte index
+// into the regex string currently being examined, and tokens collects the
+// tokens produced so far.
+type parseContext struct {
+	pos    int
+	tokens []token
+}
+
+// parse tokenizes regex from its first byte to its last and returns the
+// context that accumulated the resulting tokens.
+func parse(regex string) *parseContext {
+	ctx := &parseContext{tokens: []token{}}
+
+	// process may itself move ctx.pos forward when it consumes more than one
+	// byte; the post statement then steps over the last byte it consumed.
+	for ; ctx.pos < len(regex); ctx.pos++ {
+		process(regex, ctx)
+	}
+
+	return ctx
+}
+
+// process handles the construct that starts at regex[ctx.pos] and appends
+// the resulting token (if any) to ctx.tokens.
+//
+// On return ctx.pos indexes the last byte that was consumed; the caller is
+// expected to advance past it.
+func process(regex string, ctx *parseContext) {
+	ch := regex[ctx.pos]
+	switch ch {
+	case '(':
+		// A group is parsed into its own context so that its tokens end up
+		// nested inside a single group token.
+		groupCtx := &parseContext{
+			pos:    ctx.pos,
+			tokens: []token{},
+		}
+		parseGroup(regex, groupCtx)
+		ctx.tokens = append(ctx.tokens, token{
+			tokenType: group,
+			value:     groupCtx.tokens,
+		})
+		// Resume after the group. Without this the outer context would
+		// still point at '(' and the group's contents (and its ')') would
+		// be tokenized a second time as top-level tokens.
+		ctx.pos = groupCtx.pos
+	case '[':
+		// it's a bracket expression
+		parseBracket(regex, ctx)
+	case '|':
+		// it's an OR operator
+		parseOr(regex, ctx)
+	case '*':
+		// it's a repeat operator
+		// remaining repeat operators are + and ?, but they can be specified using braces
+		// a* == a{0,}
+		// a+ == a{1,}
+		// a? == a{0,1}
+		parseRepeat(regex, ctx)
+	case '{':
+		parseRepeatSpecified(regex, ctx)
+	default:
+		// it's a literal (nothing matched)
+		t := token{tokenType: literal, value: ch}
+		ctx.tokens = append(ctx.tokens, t)
+	}
+}
+
+// parseGroup tokenizes the contents of a parenthesised group into ctx.
+//
+// On entry ctx.pos must index the opening '('. On return ctx.pos indexes the
+// closing ')'. If the group is never closed, parsing stops at the end of the
+// input instead of panicking with an out-of-range index, and ctx.pos is
+// left >= len(regex).
+func parseGroup(regex string, ctx *parseContext) {
+	ctx.pos++ // step over '('
+	// The bounds check also covers process having advanced ctx.pos itself.
+	for ctx.pos < len(regex) && regex[ctx.pos] != ')' {
+		process(regex, ctx)
+		ctx.pos++
+	}
+}
+
+// parseBracket parses a bracket expression such as [a-z0-9_] and appends one
+// bracket token whose value is the set (map[uint8]bool) of accepted bytes.
+//
+// On entry ctx.pos must index the opening '['. On return ctx.pos indexes the
+// closing ']', or is len(regex) if the expression is never closed.
+//
+// A '-' forms a range only when it stands between two characters; as the
+// first or last character inside the brackets it is taken literally.
+func parseBracket(regex string, ctx *parseContext) {
+	ctx.pos++ // step over '['
+
+	// Each entry is either a single character ("a") or the two endpoints of
+	// a range ("az").
+	var literals []string
+	for ctx.pos < len(regex) && regex[ctx.pos] != ']' {
+		ch := regex[ctx.pos]
+
+		isRange := ch == '-' &&
+			len(literals) > 0 &&
+			ctx.pos+1 < len(regex) &&
+			regex[ctx.pos+1] != ']'
+
+		if isRange {
+			// Turn the previous entry into a range ending at the next
+			// character, and consume that character as well.
+			next := regex[ctx.pos+1]
+			prev := literals[len(literals)-1][0]
+			literals[len(literals)-1] = fmt.Sprintf("%c%c", prev, next)
+			ctx.pos++
+		} else {
+			literals = append(literals, fmt.Sprintf("%c", ch))
+		}
+
+		ctx.pos++
+	}
+
+	literalsSet := map[uint8]bool{} // because literals ranges can overlap
+
+	for _, l := range literals {
+		// Use the last byte as the upper bound: single-character entries
+		// have length 1, so indexing l[1] would panic.
+		for i := l[0]; i <= l[len(l)-1]; i++ {
+			literalsSet[i] = true
+		}
+	}
+
+	ctx.tokens = append(ctx.tokens, token{
+		tokenType: bracket,
+		value:     literalsSet,
+	})
+}
+
+// parseOr is the handler for the '|' alternation operator.
+// It is not implemented yet: it currently does nothing, so no token is
+// produced for '|'.
+func parseOr(regex string, ctx *parseContext) {
+	// TODO: implement
+}
+
+// parseRepeat is the handler for the '*' repeat operator.
+// It is not implemented yet: it currently does nothing, so no token is
+// produced for '*'.
+func parseRepeat(regex string, ctx *parseContext) {
+	// TODO: implement
+}
+
+// parseRepeatSpecified is the handler for a '{' that opens an explicit
+// repetition count such as a{0,1}.
+// It is not implemented yet: it currently does nothing, so no token is
+// produced for '{' and ctx.pos is not moved past the braces.
+func parseRepeatSpecified(regex string, ctx *parseContext) {
+	// TODO: implement
+}