Skip to content

Commit

Permalink
FEAT: add parsing (partial) implementation and CI workflow for regex …
Browse files Browse the repository at this point in the history
…engine
  • Loading branch information
MKaczkow committed Dec 8, 2024
1 parent 39a8791 commit fa1e7ce
Show file tree
Hide file tree
Showing 9 changed files with 404 additions and 0 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/regex-engine-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Runs the regex_engine module's Go tests on every push and pull request
# that targets the main branch.
name: RegEx Engine CI

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

jobs:
  test:
    name: Test
    runs-on: ubuntu-latest

    steps:
      # v4/v5 of these actions run on the Node 20 runtime; the previously
      # pinned v3/v4 majors target the deprecated Node 16 runtime.
      - uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.20'

      # The Go module lives in a subdirectory, so run the tests from there.
      - name: Run tests in 'regex_engine'
        working-directory: regex_engine
        run: go test ./...
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Repo for basic tutorial-based Golang study
- [ ] compiler
- [ ] macros
- [x] interpreter
- [ ] regex engine
- [ ] web crawler with `colly`
- [ ] do `gilded rose` kata
- [x] smth with `Hanoi tower`
Expand Down
18 changes: 18 additions & 0 deletions regex_engine/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Developer tasks for the regex_engine module.
# Running `make` with no arguments runs the `vet` target.
.DEFAULT_GOAL := vet

# None of the targets below produce a file named after the target.
.PHONY: fmt lint vet shadow

# Format every package in the module.
fmt:
	go fmt ./...

# Format, then run golint over every package.
lint: fmt
	golint ./...

# Format, then run go vet over every package.
vet: fmt
	go vet ./...

# Run vet (and therefore fmt) first, then the shadow analyzer.
shadow: vet
	shadow ./...

30 changes: 30 additions & 0 deletions regex_engine/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Go Concepts
Repo for basic tutorial-based Golang study

---

### todo
- [ ] running and testing pipeline
- [ ] parsing
- [ ] or
- [ ] repeat
- [ ] repeatspecified
- [ ] compiling
- [ ] matching engine

# regex engine
* based on [this blogpost](https://rhaeguard.github.io/posts/regex/)

## run
* `go test ./...`

## notes
* 3 stages:
  * `parse` - turn the regex string into a list of tokens
  * `build state machine (compile)` - turn the tokens into a state machine
  * `match` - run the input string against the state machine
* `NFA` - non-deterministic finite automaton
* `DFA` - deterministic finite automaton

## references
* [wikipedia on NFA](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton)
70 changes: 70 additions & 0 deletions regex_engine/evaluator/match_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package evaluator

// func TestNfa(t *testing.T) {

// var data = []struct {
// email string
// validity bool
// }{
// {email: "valid_email@example.com", validity: true},
// {email: "john.doe@email.com", validity: true},
// {email: "user_name@email.org", validity: true},
// {email: "support@email.io", validity: true},
// {email: "contact@123.com", validity: true},
// {email: "sales@email.biz", validity: true},
// {email: "test_email@email.test", validity: true},
// {email: "random.email@email.xyz", validity: true},
// {email: "user@domain12345.com", validity: true},
// {email: "user@12345domain.com", validity: true},
// // invalid when compared against our regex
// {email: "alice.smith123@email.co.uk", validity: false},
// {email: "invalid.email@", validity: false},
// {email: ".invalid@email.com", validity: false},
// {email: "email@invalid..com", validity: false},
// {email: "user@-invalid.com", validity: false},
// {email: "user@invalid-.com", validity: false},
// {email: "user@in valid.com", validity: false},
// {email: "user@.com", validity: false},
// {email: "user@.co", validity: false},
// {email: "user@domain.c", validity: false},
// {email: "user@domain.1a", validity: false},
// {email: "user@domain.c0m", validity: false},
// {email: "user@domain..com", validity: false},
// {email: "user@.email.com", validity: false},
// {email: "user@emai.l.com", validity: false},
// {email: "user@e_mail.com", validity: false},
// {email: "user@e+mail.com", validity: false},
// {email: "user@e^mail.com", validity: false},
// {email: "user@e*mail.com", validity: false},
// {email: "user@e.mail.com", validity: false},
// {email: "user@e_mail.net", validity: false},
// {email: "user@sub.domain.com", validity: false},
// {email: "user@sub-domain.com", validity: false},
// {email: "user@sub.domain12345.com", validity: false},
// {email: "user@sub.domain-12345.com", validity: false},
// {email: "user@-sub.domain.com", validity: false},
// {email: "user@sub-.domain.com", validity: false},
// {email: "user@domain-.com", validity: false},
// {email: "user@sub.domain.c0m", validity: false},
// {email: "user@sub.domain.c", validity: false},
// {email: "user@sub.domain.1a", validity: false},
// {email: "user@sub.domain.c0m", validity: false},
// {email: "user@sub.domain..com", validity: false},
// {email: "user@sub.domain.c0m", validity: false},
// {email: "user@sub.domain..com", validity: false},
// {email: "user@sub.domain.c0m", validity: false},
// }

// ctx := parse(`[a-zA-Z][a-zA-Z0-9_.]+@[a-zA-Z0-9]+.[a-zA-Z]{2,}`)
// nfa := toNfa(ctx)

// for _, instance := range data {
// t.Run(fmt.Sprintf("Test: '%s'", instance.email), func(t *testing.T) {
// result := nfa.check(instance.email, -1)
// if result != instance.validity {
// t.Logf("Expected: %t, got: %t\n", instance.validity, result)
// t.Fail()
// }
// })
// }
// }
3 changes: 3 additions & 0 deletions regex_engine/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Module definition for the regex engine; no external dependencies are required.
module regex_engine

// Go language version this module is written against.
go 1.18
1 change: 1 addition & 0 deletions regex_engine/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
// Command regex_engine is a placeholder entry point for the regex engine
// module. The engine itself lives in the sub-packages.
package main

// main intentionally does nothing yet. It exists because a package named
// main must declare func main for the module to build.
func main() {}
124 changes: 124 additions & 0 deletions regex_engine/parser/parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package parser

import "fmt"

// tokenType identifies which kind of regex construct a token represents.
type tokenType uint8

// Token kinds, numbered sequentially from 0 in declaration order.
const (
	// group is a parenthesized group; its value is the nested []token.
	group tokenType = iota
	// bracket is a bracket expression; its value is a map[uint8]bool set.
	bracket
	// or is declared for the '|' operator; nothing in this file emits it yet.
	or
	// repeat is declared for repetition; nothing in this file emits it yet.
	repeat
	// literal is a single byte taken verbatim from the regex.
	literal
	// groupUncaptured is declared but nothing in this file emits it yet.
	groupUncaptured
)

// token is one parsed element of a regular expression.
type token struct {
	// tokenType tells which kind of construct this token is.
	tokenType tokenType
	// value is the kind-specific payload: the byte for a literal, the set of
	// accepted bytes for a bracket, the nested tokens for a group.
	value any
}

// parseContext carries the parser's progress through a regex string.
type parseContext struct {
	// pos is the index of the byte currently being processed.
	pos int
	// tokens accumulates the tokens produced so far, in input order.
	tokens []token
}

// parse tokenizes regex from left to right and returns the context that
// holds the resulting tokens.
//
// Each call to process handles the construct starting at ctx.pos and leaves
// ctx.pos on the last byte it consumed; the loop then steps past that byte.
// For an empty regex the returned token slice is empty but non-nil.
func parse(regex string) *parseContext {
	ctx := &parseContext{tokens: []token{}}
	for ; ctx.pos < len(regex); ctx.pos++ {
		process(regex, ctx)
	}
	return ctx
}

// process parses the construct that starts at regex[ctx.pos] and records the
// result in ctx.
//
// On return ctx.pos is the index of the last byte that belongs to the
// construct; callers advance past it themselves before calling process again.
// ctx.pos must be a valid index into regex.
func process(regex string, ctx *parseContext) {
	ch := regex[ctx.pos]
	switch ch {
	case '(':
		// A group is parsed in its own context so that its tokens end up
		// nested inside one group token rather than directly in ctx.tokens.
		groupCtx := &parseContext{
			pos:    ctx.pos,
			tokens: []token{},
		}
		parseGroup(regex, groupCtx)
		ctx.tokens = append(ctx.tokens, token{
			tokenType: group,
			value:     groupCtx.tokens,
		})
		// Continue after the group. Without this the caller would resume
		// right behind '(' and parse the group's body a second time.
		ctx.pos = groupCtx.pos
	case '[':
		// Bracket expression such as [a-z0-9].
		parseBracket(regex, ctx)
	case '|':
		// Alternation operator.
		parseOr(regex, ctx)
	case '*':
		// Repeat operator. '+' and '?' are not dispatched here; the same
		// repetitions can be written with braces:
		// a* == a{0,}
		// a+ == a{1,}
		// a? == a{0,1}
		parseRepeat(regex, ctx)
	case '{':
		// Explicit repetition bounds such as {2,5}.
		parseRepeatSpecified(regex, ctx)
	default:
		// Anything else is a literal byte.
		ctx.tokens = append(ctx.tokens, token{tokenType: literal, value: ch})
	}
}

// parseGroup parses the body of a parenthesized group.
//
// On entry ctx.pos is the index of the opening '('. Every construct up to the
// matching ')' is handed to process, which appends its tokens to ctx.tokens.
// On return ctx.pos is the index of the closing ')'.
//
// The function has no error return, so a group that is never closed panics
// with a descriptive message instead of indexing past the end of regex.
func parseGroup(regex string, ctx *parseContext) {
	start := ctx.pos
	for ctx.pos++; ; ctx.pos++ {
		if ctx.pos >= len(regex) {
			panic(fmt.Sprintf("parser: unterminated group starting at position %d: missing ')'", start))
		}
		if regex[ctx.pos] == ')' {
			return
		}
		process(regex, ctx)
	}
}

// parseBracket parses a bracket expression such as [a-z0-9_] and appends one
// bracket token whose value is the set (map[uint8]bool) of accepted bytes.
//
// On entry ctx.pos is the index of the opening '['; on return it is the index
// of the closing ']'.
//
// Members are either single bytes or ranges written lo-hi. A '-' that is the
// first or last member, or that directly follows a completed range, is taken
// as a literal '-'. A range whose upper bound is below its lower bound adds
// nothing to the set. Overlapping members are fine because the result is a
// set.
//
// The function has no error return, so an expression that is never closed
// panics with a descriptive message instead of indexing past the end of regex.
func parseBracket(regex string, ctx *parseContext) {
	start := ctx.pos
	ctx.pos++

	literalsSet := map[uint8]bool{}
	for {
		if ctx.pos >= len(regex) {
			panic(fmt.Sprintf("parser: unterminated bracket expression starting at position %d: missing ']'", start))
		}

		lo := regex[ctx.pos]
		if lo == ']' {
			break
		}

		// "lo-hi" is a range only when a real upper bound follows the dash;
		// in "a-]" the dash is an ordinary member handled on the next pass.
		hi := lo
		if ctx.pos+2 < len(regex) && regex[ctx.pos+1] == '-' && regex[ctx.pos+2] != ']' {
			hi = regex[ctx.pos+2]
			ctx.pos += 2
		}

		// Count in int: a uint8 counter would wrap around and never exceed
		// an upper bound of 255.
		for c := int(lo); c <= int(hi); c++ {
			literalsSet[uint8(c)] = true
		}

		ctx.pos++
	}

	ctx.tokens = append(ctx.tokens, token{
		tokenType: bracket,
		value:     literalsSet,
	})
}

// parseOr is the handler for the '|' alternation operator.
//
// It is not implemented yet: the call adds no token and leaves ctx.pos
// untouched, so a '|' in the regex currently produces nothing.
func parseOr(regex string, ctx *parseContext) {
	// TODO: implement
}

// parseRepeat is the handler for the '*' repeat operator.
//
// It is not implemented yet: the call adds no token and leaves ctx.pos
// untouched, so a '*' in the regex currently produces nothing.
func parseRepeat(regex string, ctx *parseContext) {
	// TODO: implement
}

// parseRepeatSpecified is the handler for a '{' that opens explicit
// repetition bounds.
//
// It is not implemented yet: the call adds no token and leaves ctx.pos
// untouched, so only the '{' itself is skipped by the caller.
func parseRepeatSpecified(regex string, ctx *parseContext) {
	// TODO: implement
}
Loading

0 comments on commit fa1e7ce

Please sign in to comment.