Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Show off DFA's strength #5

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Copyright 2015-2016 yatt.top
Copyright 2015-2018 yatt.top

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down
11 changes: 2 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,5 @@ Regex | Input | Time Cost (Java Native) | Time Cost (DFA Regex)
- brackets

### Todo List:
- capture groups `(<name>)`
- some zero width tokens `\b`
- look forward / look back `(?=)` `(?<)`
- anchor points `^` `$`
- ranged set `[0-9]`



*Be free to file issues or promote unsupported features that you want to see to make this project better.*
- [POSIX-Extended Regex](http://www.boost.org/doc/libs/1_44_0/libs/regex/doc/html/boost_regex/syntax/basic_extended.html) syntax support
- Linear time searching
148 changes: 148 additions & 0 deletions checkstyle.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
<?xml version="1.0"?>
<!DOCTYPE module PUBLIC
"-//Puppy Crawl//DTD Check Configuration 1.3//EN"
"http://checkstyle.sourceforge.net/dtds/configuration_1_3.dtd">

<!--
Checkstyle configuration that checks the Google coding conventions from Google Java Style
that can be found at https://google.github.io/styleguide/javaguide.html.
Checkstyle is very configurable. Be sure to read the documentation at
http://checkstyle.sf.net (or in your downloaded distribution).
To completely disable a check, just comment it out or delete it from the file.
Authors: Max Vetrenko, Ruslan Diachenko, Roman Ivanov.
-->

<module name = "Checker">
<property name="charset" value="UTF-8"/>

<property name="fileExtensions" value="java, properties, xml"/>
<!-- Checks for whitespace -->
<!-- See http://checkstyle.sf.net/config_whitespace.html -->
<module name="FileTabCharacter">
<property name="eachLine" value="true"/>
</module>

<module name="TreeWalker">
<module name="OuterTypeFilename"/>
<module name="IllegalTokenText">
<property name="tokens" value="STRING_LITERAL, CHAR_LITERAL"/>
<property name="format" value="\\u00(09|0(a|A)|0(c|C)|0(d|D)|22|27|5(C|c))|\\(0(10|11|12|14|15|42|47)|134)"/>
<property name="message" value="Consider using special escape sequence instead of octal value or Unicode escaped value."/>
</module>
<module name="AvoidEscapedUnicodeCharacters">
<property name="allowEscapesForControlCharacters" value="true"/>
<property name="allowByTailComment" value="true"/>
<property name="allowNonPrintableEscapes" value="true"/>
</module>
<module name="OneTopLevelClass"/>
<module name="NoLineWrap"/>
<module name="EmptyBlock">
<property name="option" value="TEXT"/>
<property name="tokens" value="LITERAL_TRY, LITERAL_FINALLY, LITERAL_IF, LITERAL_ELSE, LITERAL_SWITCH"/>
</module>
<module name="NeedBraces"/>
<module name="LeftCurly"/>
<module name="RightCurly">
<property name="id" value="RightCurlySame"/>
<property name="tokens" value="LITERAL_TRY, LITERAL_CATCH, LITERAL_FINALLY, LITERAL_IF, LITERAL_ELSE, LITERAL_DO"/>
</module>
<module name="RightCurly">
<property name="id" value="RightCurlyAlone"/>
<property name="option" value="alone"/>
<property name="tokens" value="CLASS_DEF, METHOD_DEF, CTOR_DEF, LITERAL_FOR, LITERAL_WHILE, STATIC_INIT, INSTANCE_INIT"/>
</module>
<module name="WhitespaceAround">
<property name="allowEmptyConstructors" value="true"/>
<property name="allowEmptyMethods" value="true"/>
<property name="allowEmptyTypes" value="true"/>
<property name="allowEmptyLoops" value="true"/>
<message key="ws.notFollowed"
value="WhitespaceAround: ''{0}'' is not followed by whitespace. Empty blocks may only be represented as '{}' when not part of a multi-block statement (4.1.3)"/>
<message key="ws.notPreceded"
value="WhitespaceAround: ''{0}'' is not preceded with whitespace."/>
</module>
<module name="OneStatementPerLine"/>
<module name="MultipleVariableDeclarations"/>
<module name="ArrayTypeStyle"/>
<module name="MissingSwitchDefault"/>
<module name="FallThrough"/>
<module name="UpperEll"/>
<module name="ModifierOrder"/>
<module name="EmptyLineSeparator">
<property name="allowNoEmptyLineBetweenFields" value="true"/>
</module>
<module name="SeparatorWrap">
<property name="id" value="SeparatorWrapDot"/>
<property name="tokens" value="DOT"/>
<property name="option" value="nl"/>
</module>
<module name="SeparatorWrap">
<property name="id" value="SeparatorWrapComma"/>
<property name="tokens" value="COMMA"/>
<property name="option" value="EOL"/>
</module>
<module name="SeparatorWrap">
<!-- ELLIPSIS is EOL until https://github.com/google/styleguide/issues/258 -->
<property name="id" value="SeparatorWrapEllipsis"/>
<property name="tokens" value="ELLIPSIS"/>
<property name="option" value="EOL"/>
</module>
<module name="SeparatorWrap">
<!-- ARRAY_DECLARATOR is EOL until https://github.com/google/styleguide/issues/259 -->
<property name="id" value="SeparatorWrapArrayDeclarator"/>
<property name="tokens" value="ARRAY_DECLARATOR"/>
<property name="option" value="EOL"/>
</module>
<module name="SeparatorWrap">
<property name="id" value="SeparatorWrapMethodRef"/>
<property name="tokens" value="METHOD_REF"/>
<property name="option" value="nl"/>
</module>
<module name="NoFinalizer"/>
<module name="GenericWhitespace">
<message key="ws.followed"
value="GenericWhitespace ''{0}'' is followed by whitespace."/>
<message key="ws.preceded"
value="GenericWhitespace ''{0}'' is preceded with whitespace."/>
<message key="ws.illegalFollow"
value="GenericWhitespace ''{0}'' should followed by whitespace."/>
<message key="ws.notPreceded"
value="GenericWhitespace ''{0}'' is not preceded with whitespace."/>
</module>
<!--<module name="OverloadMethodsDeclarationOrder"/>-->
<module name="VariableDeclarationUsageDistance"/>
<module name="MethodParamPad"/>
<module name="ParenPad"/>
<!--<module name="OperatorWrap">-->
<!--<property name="option" value="NL"/>-->
<!--<property name="tokens" value="BAND, BOR, BSR, BXOR, DIV, EQUAL, GE, GT, LAND, LE, LITERAL_INSTANCEOF, LOR, LT, MINUS, MOD, NOT_EQUAL, PLUS, QUESTION, SL, SR, STAR, METHOD_REF "/>-->
<!--</module>-->
<module name="AnnotationLocation">
<property name="id" value="AnnotationLocationMostCases"/>
<property name="tokens" value="CLASS_DEF, INTERFACE_DEF, ENUM_DEF, METHOD_DEF, CTOR_DEF"/>
</module>
<module name="AnnotationLocation">
<property name="id" value="AnnotationLocationVariables"/>
<property name="tokens" value="VARIABLE_DEF"/>
<property name="allowSamelineMultipleAnnotations" value="true"/>
</module>
<module name="EmptyCatchBlock">
<property name="exceptionVariableName" value="expected"/>
</module>
<!--<module name="AvoidNestedBlocks"/>-->
<!-- Additional checks on top of the Google style set configured above.
     EmptyCatchBlock, MultipleVariableDeclarations, NeedBraces, LeftCurly and RightCurly are
     already registered earlier in this TreeWalker. Registering them a second time reports each
     violation twice, and a second EmptyCatchBlock with default settings cancels the "expected"
     exception-variable allowance configured above, so they are not repeated here. -->
<module name="EmptyStatement" />
<module name="MissingOverride"/>
<module name="ParameterAssignment"/>
<module name="StringLiteralEquality"/>
<module name="RedundantImport"/>
<module name="UnusedImports"/>
<module name="WhitespaceAfter"/>
<module name="UnnecessaryParentheses" />
<!-- NOTE(review): SuppressWarningsHolder only collects @SuppressWarnings annotations; it is
     normally paired with a SuppressWarningsFilter under Checker, and none is declared in this
     file. Confirm whether suppression via annotations is actually intended. -->
<module name="SuppressWarningsHolder" />
</module>
</module>
50 changes: 40 additions & 10 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@

<groupId>top.yatt.dfargx</groupId>
<artifactId>dfargx</artifactId>
<version>0.2.1</version>
<version>0.2.2-SNAPSHOT</version>
<packaging>jar</packaging>

<name>DFA-Regex</name>
<url>https://github.com/zbdzzg/DFA-Regex</url>
<description>A pretty fast regex engine built in java using pure DFA.</description>
<url>https://github.com/zhztheplayer/DFA-Regex</url>
<description>A Java DFA regex engine implementation.</description>
<inceptionYear>2015</inceptionYear>

<distributionManagement>
Expand All @@ -36,23 +36,23 @@
</licenses>

<scm>
<connection>scm:git:git@github.com:zbdzzg/DFA-Regex.git</connection>
<developerConnection>scm:git:git@github.com:zbdzzg/DFA-Regex.git</developerConnection>
<url>https://github.com/zbdzzg/DFA-Regex</url>
<connection>scm:git:git@github.com:zhztheplayer/DFA-Regex.git</connection>
<developerConnection>scm:git:git@github.com:zhztheplayer/DFA-Regex.git</developerConnection>
<url>https://github.com/zhztheplayer/DFA-Regex</url>
<tag>HEAD</tag>
</scm>

<developers>
<developer>
<id>zbdzzg</id>
<id>zhz</id>
<name>Zhang Hongze</name>
<email>zbdzzg@sina.cn</email>
<email>talktozhz@126.com</email>
</developer>
</developers>

<issueManagement>
<system>GitHub Issues</system>
<url>https://github.com/zbdzzg/DFA-Regex/issues</url>
<url>https://github.com/zhztheplayer/DFA-Regex/issues</url>
</issueManagement>
<profiles>
<profile>
Expand Down Expand Up @@ -120,7 +120,37 @@
</execution>
</executions>
</plugin>

<!-- Runs Checkstyle as part of the build, using the checkstyle.xml rule set referenced below. -->
<!-- NOTE(review): no <version> is declared for the plugin in this block; confirm it is pinned
     elsewhere (e.g. pluginManagement) so builds stay reproducible. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-checkstyle-plugin</artifactId>
<executions>
<execution>
<id>verify-style</id>
<!-- The "check" goal is bound to the process-classes phase. -->
<phase>process-classes</phase>
<goals>
<goal>check</goal>
</goals>
</execution>
</executions>
<configuration>
<!-- Rule set location. -->
<configLocation>checkstyle.xml</configLocation>
<logViolationsToConsole>true</logViolationsToConsole>
<!-- Test sources are included in addition to the main source directory listed below. -->
<includeTestSourceDirectory>true</includeTestSourceDirectory>
<sourceDirectories>
<sourceDirectory>
${project.build.sourceDirectory}
</sourceDirectory>
</sourceDirectories>
</configuration>
<dependencies>
<!-- Pins the Checkstyle engine version used by the plugin. -->
<!-- https://mvnrepository.com/artifact/com.puppycrawl.tools/checkstyle -->
<dependency>
<groupId>com.puppycrawl.tools</groupId>
<artifactId>checkstyle</artifactId>
<version>8.1</version>
</dependency>
</dependencies>
</plugin>
</plugins>
</build>

Expand Down
26 changes: 26 additions & 0 deletions src/main/java/top/yatt/dfargx/RegexComparator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package top.yatt.dfargx;

import top.yatt.dfargx.automata.DFA;
import top.yatt.dfargx.automata.NFA;
import top.yatt.dfargx.tree.SyntaxTree;

public class RegexComparator {

    /**
     * Checks whether the matching space of {@code regexp1} contains the matching space of
     * {@code regexp2}.
     *
     * <p>By definition, if the matching space of {@code A} is equal to the matching space of
     * {@code A|B}, then {@code B} is contained in {@code A}. The method therefore builds the DFA
     * for {@code regexp1} and the DFA for {@code regexp1|regexp2} and reports whether the two
     * compare equal via {@link DFA#equals(Object)}.
     *
     * @param regexp1 the expression expected to be the containing one; must not be {@code null}
     * @param regexp2 the expression expected to be contained; must not be {@code null}
     * @return {@code true} if the DFA of {@code regexp1} equals the DFA of
     *         {@code regexp1|regexp2}, {@code false} otherwise
     * @throws NullPointerException if either argument is {@code null}
     */
    public static final boolean contains(String regexp1, String regexp2) {
        // Reject null explicitly: string concatenation would otherwise turn a null regexp2 into
        // the literal text "null" and silently compare against the wrong expression.
        if (regexp1 == null) {
            throw new NullPointerException("regexp1 must not be null");
        }
        if (regexp2 == null) {
            throw new NullPointerException("regexp2 must not be null");
        }

        DFA dfa = buildDfa(regexp1);
        DFA unionDfa = buildDfa(regexp1 + "|" + regexp2);

        return dfa.equals(unionDfa);
    }

    /** Builds the DFA for an expression: syntax tree, then NFA, then DFA. */
    private static DFA buildDfa(String regexp) {
        SyntaxTree syntaxTree = new SyntaxTree(regexp);
        NFA nfa = new NFA(syntaxTree.getRoot());
        return new DFA(nfa.getStateList());
    }
}
72 changes: 63 additions & 9 deletions src/main/java/top/yatt/dfargx/automata/DFA.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@
public class DFA {

private int[][] transitionTable;
private int is; // init state
private int rs; // rejected state
private boolean[] fs; // final states
// init state
private int is;
// rejected state
private int rs;
// final states
private boolean[] fs;

public DFA(List<NFAState> nfaStateList) {
transitionTable = null;
Expand All @@ -37,6 +40,52 @@ public boolean[] getFinalStates() {
return fs;
}

/**
 * Compares this DFA with another one by walking both transition tables in lock-step, starting
 * from the two initial states.
 *
 * <p>NOTE(review): no matching {@code hashCode()} override is visible in this part of the file;
 * instances should not be used as hash keys until one is provided.
 *
 * @param o the object to compare with
 * @return {@code true} if {@code o} is a DFA whose start state has the same acceptance flag
 *         as this one and the pairwise walk finds no mismatch; {@code false} otherwise
 */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (!(o instanceof DFA)) {
        return false;
    }
    DFA other = (DFA) o;

    // The pairwise walk only inspects the targets of transitions, so the acceptance of the
    // two start states themselves (i.e. whether the empty input is matched) is compared here.
    if (fs[is] != other.fs[other.is]) {
        return false;
    }

    // checked[a][b] == 1 means the state pair (a, b) has already been visited by the walk.
    int[][] checked = new int[transitionTable.length][other.transitionTable.length];
    return dfaEquivalenceCheck(other, is, other.is, checked);
}

/**
 * Recursively compares the states reachable from a pair of states, one in this DFA and one in
 * {@code other}.
 *
 * <p>For every column of the two transition rows the target pair is compared: both targets
 * must agree on being final and on being the rejected state. Every pair that passes and has
 * not been visited before is then explored in turn, whether or not it is final, because a
 * final state can still have outgoing transitions (for example, the automata for {@code a}
 * and {@code ab*} only differ after their first final state).
 *
 * <p>Assumes both rows use the same column layout and length — TODO confirm against the
 * table construction. Recursion depth is bounded by the number of state pairs, since each
 * nested call corresponds to a newly marked pair.
 *
 * @param other the DFA being compared with this one
 * @param initialState state of this DFA whose outgoing transitions are examined
 * @param otherInitialState state of {@code other} whose outgoing transitions are examined
 * @param checked visited matrix indexed as {@code [state of this][state of other]};
 *        {@code 1} marks a pair that has already been handled
 * @return {@code false} as soon as a mismatching pair is found, {@code true} otherwise
 */
private boolean dfaEquivalenceCheck(DFA other, int initialState, int otherInitialState, int[][] checked) {
    int[] transitions = transitionTable[initialState];
    int[] otherTransitions = other.transitionTable[otherInitialState];

    for (int i = 0; i < transitions.length; i++) {
        int target = transitions[i];
        int otherTarget = otherTransitions[i];

        // Pair already handled in a previous step: nothing new to learn from it.
        if (checked[target][otherTarget] == 1) {
            continue;
        }
        // Mark before descending so cycles terminate.
        checked[target][otherTarget] = 1;

        if (fs[target] != other.fs[otherTarget]) {
            // One transition reaches a final state and the other does not.
            return false;
        }
        if ((target == rs) != (otherTarget == other.rs)) {
            // One transition reaches the rejected state and the other does not.
            return false;
        }
        // Descend regardless of finality: stopping at final states would leave their
        // outgoing transitions uncompared.
        if (!dfaEquivalenceCheck(other, target, otherTarget, checked)) {
            return false;
        }
    }
    // Every transition from this pair was consistent.
    return true;
}

private void convert(List<NFAState> nfaStateList) {
NFAState initState = nfaStateList.get(0);
NFAState finalState = nfaStateList.get(1);
Expand Down Expand Up @@ -143,7 +192,7 @@ private void minimize(Map<Set<NFAState>, Map<Character, Set<NFAState>>> oriDFATr
// rename all states
for (Set<NFAState> nfaState : oriDFATransitionMap.keySet()) {
if (initStateAfterRenaming == -1 && nfaState.equals(initClosure)) {
initStateAfterRenaming = renamingStateID; // record init state id
initStateAfterRenaming = renamingStateID; // preserve init state id
}
stateRenamingMap.put(nfaState, renamingStateID++);
}
Expand All @@ -161,15 +210,20 @@ private void minimize(Map<Set<NFAState>, Map<Character, Set<NFAState>>> oriDFATr
renamedDFATransitionTable.put(renamingStateID, state);
if (entry.getKey().contains(finalNFAState)) {
finalFlags.put(renamingStateID, true);
} else finalFlags.put(renamingStateID, false);
} else {
finalFlags.put(renamingStateID, false);
}
}

// split states to final states and non-final states
// group states to final states and non-final states
Map<Integer, Integer> groupFlags = new HashMap<>();
for (int i = 0; i < finalFlags.size(); i++) {
boolean b = finalFlags.get(i);
if (b) groupFlags.put(i, 0);
else groupFlags.put(i, 1);
if (b) {
groupFlags.put(i, 0);
} else {
groupFlags.put(i, 1);
}
}

int groupTotal = 2;
Expand Down Expand Up @@ -230,7 +284,7 @@ private void minimize(Map<Set<NFAState>, Map<Character, Set<NFAState>>> oriDFATr
fs[i] = finalGroupFlags.contains(i);
}

// construct the final transition table
// construct the output transition table
transitionTable = new int[groupTotal][];

for (int groupID = 0; groupID < groupTotal; groupID++) {
Expand Down
Loading