From 2f3899eb9b7a6e77bbe8b0325e6297f0d6125df0 Mon Sep 17 00:00:00 2001
From: Sven Hertling
Date: Tue, 12 Nov 2019 13:27:51 +0100
Subject: [PATCH] added hearst pattern extraction

updated template class extraction
updated abstract extraction
---
 core/pom.xml                                       |  13 +
 .../extraction/hearst/CustomPattern.java           | 111 +++
 .../hearst/ExtractHearstPatterns.java              | 648 ++++++++++++++++++
 .../dbpedia/extraction/hearst/IsaPattern.java      |  71 ++
 .../dbpedia/extraction/hearst/NounPhrase.java      | 139 ++++
 .../main/resources/datasetdefinitions.json         |  14 +-
 .../config/provenance/DBpediaDatasets.scala        |   2 +
 .../ArticleTemplatesClassExtractor.scala           |  12 +-
 .../extraction/mappings/NifNewExtractor.scala      |  13 +-
 .../extraction/nif/GeneralNifExtractor.scala       |  50 +-
 .../nif/NifExtractionAstVisitor.scala              |  32 +-
 .../org/dbpedia/extraction/nif/NifLink.scala       |   3 +-
 .../relation/RelationExtractionJava.java           |   2 -
 13 files changed, 1079 insertions(+), 31 deletions(-)
 create mode 100644 core/src/main/java/org/dbpedia/extraction/hearst/CustomPattern.java
 create mode 100644 core/src/main/java/org/dbpedia/extraction/hearst/ExtractHearstPatterns.java
 create mode 100644 core/src/main/java/org/dbpedia/extraction/hearst/IsaPattern.java
 create mode 100644 core/src/main/java/org/dbpedia/extraction/hearst/NounPhrase.java

diff --git a/core/pom.xml b/core/pom.xml
index d13af7b6db..e4924d94f0 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -142,6 +142,19 @@
             <version>2.1.0</version>
             <type>pom</type>
         </dependency>
+
+
+        <dependency>
+            <groupId>edu.stanford.nlp</groupId>
+            <artifactId>stanford-corenlp</artifactId>
+            <version>3.9.2</version>
+        </dependency>
+        <dependency>
+            <groupId>edu.stanford.nlp</groupId>
+            <artifactId>stanford-corenlp</artifactId>
+            <version>3.9.2</version>
+            <classifier>models-english</classifier>
+        </dependency>
         <dependency>
             <groupId>org.wikidata.wdtk</groupId>
diff --git a/core/src/main/java/org/dbpedia/extraction/hearst/CustomPattern.java b/core/src/main/java/org/dbpedia/extraction/hearst/CustomPattern.java
new file mode 100644
index 0000000000..df9d9addd3
--- /dev/null
+++ b/core/src/main/java/org/dbpedia/extraction/hearst/CustomPattern.java
@@ -0,0 +1,111 @@
+package org.dbpedia.extraction.hearst;
+
+import java.util.regex.Pattern;
+
+public class CustomPattern {
+
+    private String pid;
+    private String regex;
+    private String type;
+    private Pattern pattern;
+    private String preCondition;
+    private Boolean excludePronouns;
+    private String firstKeyWord;
+    private String secondKeyWord;
+    private Boolean instanceFirst;
+
+    private String surrounderSymbols = "[\\u0027\\u2018\\u2019\\u201A\\u201B\\u201C\\u201D\\u201E\\u201F\\u0022]?";
+    private String endSymbols = "[\"\\u0026\\u0027\\u2018\\u2019\\u201A\\u201B\\u201C\\u201D\\u201E\\u201F\\u00A9\\u00AE]?"; //includes surrounderSymbols as well!
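+    // Illustration (not from the original source): for a "compact" pattern such as
+    // p8a in ExtractHearstPatterns ("\,?\sis\san?\s"), the compiled regex is
+    // prefix + regex + suffix, so in "Gasoline, is a transparent fuel" it matches
+    // "e, is a t" -- one prefix character, the pattern itself, one suffix character.
+    // The extractor later strips this one-character indicator on each side.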
+ private String prefix = "(\\p{L}|\\d)" + endSymbols; + private String suffix = surrounderSymbols + "(\\p{L}|\\d)"; + + public CustomPattern(String pid, String regex, String type, Boolean instanceFirst) { + this.pid = pid; + this.regex = regex; + this.type = type; + this.instanceFirst = instanceFirst; + + //Configure the Prefix and suffix of the regex + if (type.equals("compact") || type.equals("split")) { + this.pattern = Pattern.compile(prefix + regex + suffix); + } + + if (type.equals("split_noPrefix")) { + this.pattern = Pattern.compile("(?>" + regex + suffix + ")"); + } + + if (type.equals("split_noSuffix")) { + this.pattern = Pattern.compile("(?>" + prefix + regex + ")"); + } + } + + public CustomPattern(String pid, String regex, String type, String preCond, Boolean instanceFirst) { + this.pid = pid; + this.regex = regex; + this.type = type; + this.instanceFirst = instanceFirst; + + //Configure the Prefix and suffix of the regex + if (type.equals("compact") || type.equals("split")) { + this.pattern = Pattern.compile(prefix + regex + suffix); + } + + if (type.equals("split_noPrefix")) { + this.pattern = Pattern.compile("(?>" + regex + suffix + ")"); + } + + if (type.equals("split_noSuffix")) { + this.pattern = Pattern.compile("(?>" + prefix + regex + ")"); + } + + this.preCondition = preCond; + } + + public CustomPattern(String pid, String regex, String type, String preCond, String fkw, String skw, Boolean instanceFirst) { + this.pid = pid; + this.regex = regex; + this.type = type; + this.firstKeyWord = fkw; + this.secondKeyWord = skw; + this.instanceFirst = instanceFirst; + + //Configure the Prefix and suffix of the regex + if (type.equals("compact") || type.equals("split")) { + this.pattern = Pattern.compile(prefix + regex + suffix); + } + + if (type.equals("split_noPrefix")) { + this.pattern = Pattern.compile("(?>" + regex + suffix + ")"); + } + + if (type.equals("split_noSuffix")) { + this.pattern = Pattern.compile("(?>" + prefix + regex + ")"); + } + + this.preCondition = preCond; + } + + public Pattern getPattern() { + return pattern; + } + + public String getType() { + return type; + } + + public String getFirstKeyWord() { + return firstKeyWord; + } + + public String getSecondKeyWord() { + return secondKeyWord; + } + + public Boolean getInstanceFirst() { + return instanceFirst; + } + + + + +} diff --git a/core/src/main/java/org/dbpedia/extraction/hearst/ExtractHearstPatterns.java b/core/src/main/java/org/dbpedia/extraction/hearst/ExtractHearstPatterns.java new file mode 100644 index 0000000000..ca9684e7a0 --- /dev/null +++ b/core/src/main/java/org/dbpedia/extraction/hearst/ExtractHearstPatterns.java @@ -0,0 +1,648 @@ +package org.dbpedia.extraction.hearst; + +import edu.stanford.nlp.ling.HasWord; +import edu.stanford.nlp.ling.SentenceUtils; +import edu.stanford.nlp.ling.TaggedWord; +import edu.stanford.nlp.pipeline.CoreNLPProtos.Sentence; + +import edu.stanford.nlp.tagger.maxent.MaxentTagger; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * + * @author shertlin + */ +public class ExtractHearstPatterns { + + private static int maxNpSize = 4; + + private static MaxentTagger tagger = new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger"); + + private static String 
separatorSymbols = "[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\u2043]?"; + private static String surrounderSymbols = "[\\u0027\\u2018\\u2019\\u201A\\u201B\\u201C\\u201D\\u201E\\u201F\\u0022]?"; + private static String endSymbols = "[\"\\u0026\\u0027\\u2018\\u2019\\u201A\\u201B\\u201C\\u201D\\u201E\\u201F\\u00A9\\u00AE]?"; + + private static String npPlaceholder = "("+surrounderSymbols+"" //Quotation mark could be in front + + "(\\p{L}++|\\d++\\p{L}++)" //Word can start with letters or digits but must contain letters + + "("+separatorSymbols+"(\\p{L}++|\\d++))?" //Can be separated by a hyphen + + endSymbols+"\\s)" //Can be followed by quotation mark + + "{1,4}"; //NP can consist of up to 4 words + + private static String npPlaceholderAdjMost = "("+surrounderSymbols+"" //Quotation mark could be in front + + "(\\p{L}++|\\d++\\p{L}++)" //Word can start with letters or digits but must contain letters + + "("+separatorSymbols+"(\\p{L}++|\\d++))?" //Can be separated by a hyphen + + endSymbols+"\\s)" //Can be followed by quotation mark + + "{2,5}"; + + private static List allPatterns = generatePatterns(); + private static List generatePatterns() { + List allPatterns = new ArrayList(); + allPatterns.add(new CustomPattern("p8a", "\\,?\\sis\\san?\\s", "compact", "is\\sa", true)); + allPatterns.add(new CustomPattern("p8b", "\\,?\\swas\\san?\\s", "compact", "was\\sa", true)); + /* + allPatterns.add(new CustomPattern("p3a", "\\,?\\sincluding\\s", "compact", "including", false)); + allPatterns.add(new CustomPattern("p5", "\\,?\\ssuch\\sas\\s", "compact", "such\\sas", false)); + allPatterns.add(new CustomPattern("p1", "\\,?\\sand\\sother\\s", "compact", "and\\sother", true)); + allPatterns.add(new CustomPattern("p4", "\\,?\\sor\\sother\\s", "compact", "or\\sother", true)); + allPatterns.add(new CustomPattern("p2", "\\,?\\sespecially\\s", "compact", "especially", false)); + allPatterns.add(new CustomPattern("p8c", "\\,?\\sare\\san?\\s", "compact", "are\\sa", true)); + allPatterns.add(new CustomPattern("p34", "\\stypes\\s", "compact", "types", false)); + allPatterns.add(new CustomPattern("p25", "\\,?\\sexcept\\s", "compact", "except", false)); + allPatterns.add(new CustomPattern("p23d", "\\,?\\sparticularly\\s", "compact", "particularly", false)); + allPatterns.add(new CustomPattern("p20a", "\\sis\\sthe\\s\\w+est\\s", "compact", "is\\sthe", true)); + allPatterns.add(new CustomPattern("p43", "\\,?\\ssort\\sof\\s", "compact", "sort\\sof", true)); + allPatterns.add(new CustomPattern("p26", "\\,?\\sother\\sthan\\s", "compact", "other\\sthan", false)); + + allPatterns.add(new CustomPattern("p21a", "\\p{L}+est\\s"+npPlaceholder+"is\\s", "split_noPrefix", "est\\s", "est", "is", false)); + allPatterns.add(new CustomPattern("p21b", "\\p{L}+est\\s"+npPlaceholder+"are\\s", "split_noPrefix", "est\\s", "est", "are", false)); + allPatterns.add(new CustomPattern("p21c", "\\s(M|m)ost\\s"+npPlaceholderAdjMost+"is\\s", "split_noPrefix", "most\\s", "most", "is", false)); + allPatterns.add(new CustomPattern("p21d", "\\s(M|m)ost\\s"+npPlaceholderAdjMost+"are\\s", "split_noPrefix", null, "most", "are", false)); + + allPatterns.add(new CustomPattern("p23b", "\\,?\\smostly\\s", "compact", "mostly", false)); + allPatterns.add(new CustomPattern("p23a", "\\,?\\smainly\\s", "compact", "mainly", false)); + allPatterns.add(new CustomPattern("p12a", "\\,\\sone\\sof\\sthe\\s", "compact", "one\\sof\\sthe", true)); + allPatterns.add(new CustomPattern("p20c", "\\sis\\sthe\\smost\\s\\w+\\s", "compact", true)); + allPatterns.add(new 
CustomPattern("p8d", "\\,?\\swere\\san?\\s", "compact", "were\\sa", true)); + allPatterns.add(new CustomPattern("p6", "\\,?\\sand\\sany\\sother\\s", "compact", "and\\sany\\sother", true)); + allPatterns.add(new CustomPattern("p15a", "\\sexamples\\sof\\s", "compact", "examples\\sof", true)); + allPatterns.add(new CustomPattern("p27a", "\\,?\\se\\.g\\.\\s", "compact", "e\\.g\\.", false)); + allPatterns.add(new CustomPattern("p27b", "\\,?\\si\\.e\\.\\s", "compact", "i\\.e\\.", false)); + allPatterns.add(new CustomPattern("p16", "\\,?\\sfor\\sexample\\s", "compact", "for\\sexample", false)); + allPatterns.add(new CustomPattern("p24", "\\,?\\sin\\sparticular\\s", "compact", "in\\sparticular", false)); + allPatterns.add(new CustomPattern("p20b", "\\sare\\sthe\\s\\w+est\\s", "compact", "are\\sthe", true)); + allPatterns.add(new CustomPattern("p20d", "\\sare\\sthe\\smost\\s\\w+\\s", "compact", true)); + allPatterns.add(new CustomPattern("p23c", "\\,?\\snotably\\s", "compact", "notably", false)); + allPatterns.add(new CustomPattern("p39", "\\,?\\samong\\sthem\\s", "compact", "\\samong\\sthem", false)); + allPatterns.add(new CustomPattern("p38", "\\scompared\\sto\\sother\\s", "compact", "compared\\sto", true)); + allPatterns.add(new CustomPattern("p11", "\\,?\\slike\\sother\\s", "compact", "like\\sother", true)); + allPatterns.add(new CustomPattern("p7", "\\,?\\sand\\ssome\\sother\\s", "compact", "and\\some\\sother", true)); + allPatterns.add(new CustomPattern("p23e", "\\,?\\sprincipally\\s", "compact", "principally", false)); + allPatterns.add(new CustomPattern("p15b", "\\sis\\san\\sexample\\sof\\s", "compact", "is\\san\\sexample\\sof", true)); + allPatterns.add(new CustomPattern("p22a", "\\,?\\swhich\\sis\\scalled\\s", "compact", "which\\sis\\scalled", false)); + allPatterns.add(new CustomPattern("p28a", "\\,?\\sa\\skind\\sof\\s", "compact", "a\\skind\\sof", true)); + allPatterns.add(new CustomPattern("p12c", "\\,\\sone\\sof\\sthose\\s", "compact", "one\\sof\\sthose", true)); + allPatterns.add(new CustomPattern("p29a", "\\,?\\swhich\\slooks?\\slike\\s", "compact", "which\\slooks?\\slike", false)); + allPatterns.add(new CustomPattern("p28c", "\\,?\\sa\\sform\\sof\\s", "compact", "a\\sform\\sof", true)); + allPatterns.add(new CustomPattern("p30b", "\\,?\\swhich\\sis\\ssimilar\\sto\\s", "compact", "which\\sis\\ssimilar\\sto", false)); + allPatterns.add(new CustomPattern("p12b", "\\,\\sone\\sof\\sthese\\s", "compact", "one\\sof\\sthese", true)); + allPatterns.add(new CustomPattern("p29c", "\\,?\\swhich\\ssounds?\\slike\\s", "compact", "which\\ssounds?\\slike", false)); + allPatterns.add(new CustomPattern("p28d", "\\,?\\sforms\\sof\\s", "compact", "forms\\sof", true)); + allPatterns.add(new CustomPattern("p30a", "\\,?\\swhich\\sare\\ssimilar\\sto\\s", "compact", "which\\sare\\ssimilar\\sto", false)); + allPatterns.add(new CustomPattern("p22b", "\\,?\\swhich\\sis\\snamed\\s", "compact", "which\\sis\\snamed", false)); + allPatterns.add(new CustomPattern("p42", "\\,?\\sor\\sthe\\smany\\s", "compact", "or\\sthe\\smany", true)); + allPatterns.add(new CustomPattern("p31a", "\\,?\\sexample\\sof\\sthis\\sis\\s", "compact", "example\\sof\\sthis\\sis", false)); + allPatterns.add(new CustomPattern("p28b", "\\,?\\skinds\\sof\\s", "compact", "kinds\\sof", true)); + allPatterns.add(new CustomPattern("p31b", "\\,?\\sexamples\\sof\\sthis\\sare\\s", "compact", "examples\\sof\\sthis\\sare", false)); + + allPatterns.add(new CustomPattern("p10", "(S|s)uch\\s"+npPlaceholder+"as\\s", "split_noPrefix", "(S|s)uch\\s", "such", 
"as", false)); + allPatterns.add(new CustomPattern("p13", "(E|e)xample\\sof\\s"+npPlaceholder+"is\\s", "split_noPrefix", "example\\sof", "example of", "is", false)); + allPatterns.add(new CustomPattern("p14", "(E|e)xamples\\sof\\s"+npPlaceholder+"are\\s", "split_noPrefix", null, "examples of", "are", false)); + allPatterns.add(new CustomPattern("p36", "\\swhether\\s"+npPlaceholder+"or\\s", "split", " whether", "whether", "or", false)); + allPatterns.add(new CustomPattern("p37", "(C|c)ompare\\s"+npPlaceholder+"with\\s", "split_noPrefix", "compare\\s", "compare", "with", true)); + */ + return allPatterns; + } + + + public static NounPhrase extract(String text, String gold_instance_label) { + Set gold_instance_tokens = new HashSet(Arrays.asList(gold_instance_label.toLowerCase().split(" "))); + List patterns = extract(text); + for(IsaPattern p : patterns){ + for(NounPhrase i : p.getInstance()){ + Set instanceTokens = new HashSet(Arrays.asList(i.toString().toLowerCase().split(" "))); + instanceTokens.retainAll(gold_instance_tokens); + if(instanceTokens.size() > 0){ + return p.getClazz().get(0); + } + } + } + return null; + } + + public static List extract(String s) { + List extractedPatterns = new ArrayList(); + + List sentencesClean = splitSentences(s); + for (String sentence : sentencesClean) { + sentence = preprocessSentence(sentence); + + for (CustomPattern customPattern : allPatterns) { + Matcher patternMatcher = customPattern.getPattern().matcher(sentence); + while (patternMatcher.find()) { + + String extractedPattern = patternMatcher.group(); + int onset = patternMatcher.start(); + int offset = patternMatcher.end(); + + // Compact Patterns still contain an Indicator for the leading and the following Nounphrase + // This indicator has to be removed + if (customPattern.getType().equals("compact")){ + onset++; + offset--; + } + + // Check if a leading pronoun can be excluded + String pronounFront = sentence.substring(0, onset); + int lastWhitespace = pronounFront.lastIndexOf(" "); + if (lastWhitespace != -1){ + pronounFront = pronounFront.substring(lastWhitespace+1).toLowerCase(); + } + + String pronounBack = sentence.substring(offset); + int firstWhitespace = pronounBack.indexOf(" "); + if (firstWhitespace != -1){ + pronounBack = pronounBack.substring(0, firstWhitespace).toLowerCase(); + } + + if(allExclusions.contains(pronounFront.toLowerCase()) || allExclusions.contains(pronounBack.toLowerCase())){ + continue; + } + + ArrayList currentNPsBeforePattern = new ArrayList<>(); + ArrayList currentNPsAfterPattern = new ArrayList<>(); + try{ + List fullTaggedList = tagger.tagSentence(SentenceUtils.toWordList(sentence.split(" "))); + + for (TaggedWord tw : fullTaggedList) { + if (tw.word().length() < tw.word().replaceAll("(?<=s)[\\u201A\\u201C\\u201D\\u201E\\u201F\\u0022]", "").length() + || tw.word().length() < tw.word().replaceAll("[\\u201A\\u201C\\u201D\\u201E\\u201F\\u0022](?=s)", "").length()) { + tw.setTag("JJ"); + } + } + + List taggedWordsBeforePattern; + List taggedWordsAfterPattern; + if (customPattern.getType().equals("compact")) { + taggedWordsBeforePattern = getWordListSubset(0, onset + 1, fullTaggedList); + taggedWordsAfterPattern = getWordListSubset(offset, sentence.length(), fullTaggedList); + } else { + taggedWordsBeforePattern = getWordlistBeforeSplittedPattern(customPattern, sentence, onset, fullTaggedList); + taggedWordsAfterPattern = getWordlistAfterSplittedPattern(customPattern, sentence, onset, offset, fullTaggedList); + } + + Collections.reverse(taggedWordsBeforePattern); 
+                        findNextNounPhraseReverse(0, taggedWordsBeforePattern, currentNPsBeforePattern);
+                        findNextNounPhrase(0, taggedWordsAfterPattern, currentNPsAfterPattern);
+
+                        if (currentNPsAfterPattern.isEmpty() || currentNPsBeforePattern.isEmpty()) {
+                            continue;
+                        }
+
+                    } catch (StringIndexOutOfBoundsException e) { }
+
+                    if (customPattern.getInstanceFirst()) {
+                        extractedPatterns.add(new IsaPattern(currentNPsBeforePattern, currentNPsAfterPattern));
+                    } else {
+                        extractedPatterns.add(new IsaPattern(currentNPsAfterPattern, currentNPsBeforePattern));
+                    }
+                }
+            }
+        }
+        return extractedPatterns;
+    }
+
+    private static String[] abbreviations = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
+        "Adj", "Adm", "Adv", "Asst", "Bart", "Bldg", "Brig", "Bros", "Capt", "Cmdr", "Col", "Comdr", "Con", "Corp", "Cpl", "DR", "Dr", "Drs", "Ens", "Fig", "FIG", "fig", "Gen", "Gov", "Hon", "Hr", "Hosp", "Insp", "Lt", "MM",
+        "MR", "MRS", "MS", "Maj", "Messrs", "Mlle", "Mme", "Mr", "Mrs", "Ms", "Msgr", "Op", "Ord", "Pat", "Pfc", "Ph", "Prof", "Pvt", "Rep", "Reps", "Res", "Rev", "Rt", "Sen", "Sens", "Sfc", "Sgt", "Sr", "St",
+        "Supt", "Surg", "v", "vs", "U.S", "u.s", "U.K", "u.k", "i.e", "rev", "e.g", "No", "Nos", "Art", "Nr", "pp"};
+
+    private static Pattern splitRegex = Pattern.compile("(?<=[\\!\\.\\?]" + surrounderSymbols + ")\\s(?=" + surrounderSymbols + "\\p{Lu})");
+
+    private static List<String> splitSentences(String text) {
+        String[] sentencesRaw = splitRegex.split(text);
+
+        Boolean connectWithLatterPart = false;
+        List<String> sentencesClean = new ArrayList<>();
+        sentencesClean.add(sentencesRaw[0]);
+        for (int z = 1; z < sentencesRaw.length; z++) {
+            for (String abb : abbreviations) {
+                if (sentencesClean.get(sentencesClean.size() - 1).endsWith(abb + ".")) {
+                    connectWithLatterPart = true;
+                    break;
+                }
+            }
+            if (connectWithLatterPart) {
+                sentencesClean.set(sentencesClean.size() - 1, sentencesClean.get(sentencesClean.size() - 1) + " " + sentencesRaw[z]);
+            } else {
+                sentencesClean.add(sentencesRaw[z]);
+            }
+            connectWithLatterPart = false;
+        }
+
+        // If no valid sentence is found, the line itself is analyzed
+        if (sentencesClean.size() == 0) {
+            sentencesClean.add(text);
+        }
+
+        return sentencesClean;
+    }
+
+    private static Pattern multipleWhitespacePatternTwo = Pattern.compile("[\\u00A0\\t\\p{Zs}\\n\\cK\\f\\r\\x85\\x{2028}\\x{2029}]+");
+    private static Pattern multipleWhitespacePattern = Pattern.compile("\\s+");
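+    // The declaration of quotationMarkPattern and the preprocessSentence() helper
+    // (called from extract() above) follow as a minimal, assumed sketch: only the
+    // member name, the "(?..." opening of the regex, and the method signature are
+    // taken from the surrounding code; the regex body and method body are guesses.
+    private static Pattern quotationMarkPattern = Pattern.compile("(?<!\\s)[\\u2018\\u2019\\u0027](?=s\\s)"); // assumed regex
+
+    private static String preprocessSentence(String sentence) { // assumed body
+        sentence = multipleWhitespacePatternTwo.matcher(sentence).replaceAll(" ");
+        sentence = multipleWhitespacePattern.matcher(sentence).replaceAll(" ");
+        sentence = quotationMarkPattern.matcher(sentence).replaceAll("'");
+        return sentence.trim();
+    }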
+    public static List<TaggedWord> getWordlistBeforeSplittedPattern(CustomPattern custPat, String sentence, int onset, List<TaggedWord> tw) {
+        // Starts after the first keyword and ends before the second keyword
+        if (custPat.getType().equals("split_noPrefix")) {
+            return getWordListSubset(onset + custPat.getFirstKeyWord().length() + 1, sentence.toLowerCase().indexOf(custPat.getSecondKeyWord(), onset), tw);
+        }
+        // Starts before the first keyword
+        if (custPat.getType().equals("split")) {
+            return getWordListSubset(0, onset, tw);
+        }
+        // The onset is split "normally" in this case (as for compact patterns)
+        if (custPat.getType().equals("split_noSuffix")) {
+            return getWordListSubset(0, onset, tw);
+        }
+        return new ArrayList<>();
+    }
+
+    /**
+     * Depending on the type of custPat and the length of its two parts, determines
+     * the onset and offset of the second NounPhrase search area.
+     * @param custPat
+     * @param sentence
+     * @param onset (of the pattern inside the sentence)
+     * @param offset (of the pattern inside the sentence)
+     * @param tw
+     * @return
+     */
+    public static List<TaggedWord> getWordlistAfterSplittedPattern(CustomPattern custPat, String sentence, int onset, int offset, List<TaggedWord> tw) {
+        // Starts after the second keyword
+        if (custPat.getType().equals("split_noPrefix")) {
+            return getWordListSubset(offset, sentence.length(), tw);
+        }
+        // Starts at the onset and ends with the second keyword
+        if (custPat.getType().equals("split_noSuffix")) {
+            return getWordListSubset(onset, sentence.indexOf(custPat.getSecondKeyWord(), onset), tw);
+        }
+        // Starts after the first keyword
+        if (custPat.getType().equals("split")) {
+            return getWordListSubset(onset + custPat.getFirstKeyWord().length(), sentence.length(), tw);
+        }
+        return new ArrayList<>();
+    }
+
+    /**
+     * @param wordOffset This word offset points to a word in the sentence list
+     * and describes the starting point for the NounPhrase search
+     * @param sentence
+     * @param resultNPs
+     * @return
+     */
+    public static ArrayList<NounPhrase> findNextNounPhrase(int wordOffset, List<TaggedWord> sentence, ArrayList<NounPhrase> resultNPs) {
+        NounPhrase currentNP = new NounPhrase(maxNpSize);
+        int postOffset = 0;
+        for (int i = wordOffset; i < sentence.size(); i++) {
+            if (!sentence.get(i).tag().equals("DT") && !sentence.get(i).tag().startsWith("NN") && !sentence.get(i).tag().startsWith("JJ")
+                    && !sentence.get(i).tag().equals("VBN") && !sentence.get(i).word().toLowerCase().equals("and")
+                    && !sentence.get(i).word().toLowerCase().equals("or") && !sentence.get(i).word().toLowerCase().equals("&")
+                    && resultNPs.size() > 0) {
+                return resultNPs;
+            }
+
+            if (sentence.get(i).tag().startsWith("NN")) {
+                currentNP.setNPCore(sentence.get(i));
+                findPreMod(i, sentence, currentNP);
+                if (sentence.get(i).word().endsWith(",")) {
+                    resultNPs.add(cleanNounPhrase(currentNP, false));
+                    findNextNounPhrase(i + 1, sentence, resultNPs);
+                    return resultNPs;
+                }
+                postOffset = findPostMod(i, sentence, currentNP);
+                if (postOffset != -1) {
+                    resultNPs.add(cleanNounPhrase(currentNP, false));
+                    findNextNounPhrase(postOffset + 1, sentence, resultNPs);
+                    return resultNPs;
+                } else {
+                    resultNPs.add(cleanNounPhrase(currentNP, false));
+                    return resultNPs;
+                }
+            }
+        }
+        return resultNPs;
+    }
+
+    /**
+     * Searches for potential pre-modifying words of the NounPhrase.
+     *
+     * @param nnOffset Is a pointer on the last word analyzed (which has to be a
+     * valid Core-NN)
+     * @param sentence
+     * @param currentNP
+     */
+    public static void findPreMod(int nnOffset, List<TaggedWord> sentence, NounPhrase currentNP) {
+        for (int i = nnOffset - 1; i > nnOffset - currentNP.getMaxNPLength() && i >= 0; i--) {
((sentence.get(i).tag().startsWith("JJ") | sentence.get(i).tag().equals("VBN")) && !sentence.get(i).word().endsWith(",")) { + currentNP.addPreModifier(sentence.get(i)); + } else { + return; + } + } + } + + /** + * Searches and stores potential post-modifying words of the NounPhrase, as + * well as looks for potential coordinations + * + * @param nnOffset Is a pointer on the last word analyzed (which has to be a + * valid Core-NN) + * @param sentence + * @param currentNP + * @return -1, if no potential coordination was found; word-offset of the + * last analyzed word (will be used to search for next NounPhrase) + */ + public static int findPostMod(int nnOffset, List sentence, NounPhrase currentNP) { + for (int i = nnOffset + 1; i < nnOffset + currentNP.getMaxNPLength() && i < sentence.size(); i++) { + if (sentence.get(i).tag().startsWith("JJ") | sentence.get(i).tag().equals("VBN") | sentence.get(i).tag().equals("VBG") | sentence.get(i).tag().startsWith("NN") | sentence.get(i).tag().equals("IN") | sentence.get(i).tag().equals("CD") | sentence.get(i).tag().equals("DT")) { + currentNP.addPostModifier(sentence.get(i)); + } + if (sentence.get(i).word().toLowerCase().equals("and") | sentence.get(i).word().toLowerCase().equals("or") | sentence.get(i).word().toLowerCase().equals("&")) { + return i; + } + if (!(sentence.get(i).tag().startsWith("JJ") | sentence.get(i).tag().equals("VBN") | sentence.get(i).tag().equals("VBG") | sentence.get(i).tag().startsWith("NN") | sentence.get(i).tag().equals("IN") | sentence.get(i).tag().equals("CD") | sentence.get(i).tag().equals("DT") | sentence.get(i).word().toLowerCase().equals("and") | sentence.get(i).word().toLowerCase().equals("or") | sentence.get(i).word().toLowerCase().equals("&"))) { + return -1; + } + if (sentence.get(i).word().endsWith(",")) { + return i; + } + } + if (sentence.size() > nnOffset + currentNP.getMaxNPLength()) { + if (sentence.get(nnOffset + currentNP.getMaxNPLength()).word().toLowerCase().equals("and") | sentence.get(nnOffset + currentNP.getMaxNPLength()).word().toLowerCase().equals("or") | sentence.get(nnOffset + currentNP.getMaxNPLength()).word().toLowerCase().equals("&")) { + return nnOffset + currentNP.getMaxNPLength(); + } + } + return -1; + } + + public static ArrayList findNextNounPhraseReverse(int wordOffset, List sentence, ArrayList resultNPs) { + int status = 0; + NounPhrase currentNP = new NounPhrase(maxNpSize); + for (int i = wordOffset; i < sentence.size(); i++) { + if (!sentence.get(i).tag().startsWith("NN") && !sentence.get(i).tag().equals("VBG") && !sentence.get(i).tag().equals("IN") && !sentence.get(i).tag().equals("CD") && !sentence.get(i).tag().equals("DT") && resultNPs.size() > 0) { + return resultNPs; + } + + if (sentence.get(i).tag().startsWith("NN")) { + currentNP.setNPCore(sentence.get(i)); + if (sentence.get(i).word().endsWith(",")) { + status = findPreModReverse(i, sentence, currentNP); + if (status == -2) { + findNextNounPhraseReverse(i + 1, sentence, resultNPs); + return resultNPs; + } else if (status > 0) { + resultNPs.add(cleanNounPhrase(currentNP, true)); + findNextNounPhraseReverse(status + 1, sentence, resultNPs); + return resultNPs; + } else { + resultNPs.add(cleanNounPhrase(currentNP, true)); + return resultNPs; + } + } else { + status = findPreModReverse(i, sentence, currentNP); + if (status == -2) { + findNextNounPhraseReverse(i + 1, sentence, resultNPs); + return resultNPs; + } + //Neue NounPhrase Entdeckt y=>ist neues Offset + if (status > 0) { + findPostModReverse(i, sentence, currentNP); + 
resultNPs.add(cleanNounPhrase(currentNP, true)); + findNextNounPhraseReverse(status, sentence, resultNPs); + return resultNPs; + } else { + findPostModReverse(i, sentence, currentNP); + resultNPs.add(cleanNounPhrase(currentNP, true)); + return resultNPs; + } + } + } + } + return resultNPs; + } + + /** + * In this stage the post-modifier of NounPHrase is reduced until the Last + * NN is found; The cleaning of NounPhrases with regard to Symbols can be + * done in the Aggregation step; At this stage the risk is too high, that + * valuable information gets lost / Algorithms can't be too precise at this + * stage + * + * @param np + * @param isReverse + * @return + */ + public static NounPhrase cleanNounPhrase(NounPhrase np, Boolean isReverse) { + //If dot is inside a NN, the NN is split: Depending if entity was extracted inFront of or afterPattern + /* + String[] npPartHolder = np.getNPCore().word().split("(?<=(\\p{L}|\\d))\\s?[\\.\\?\\!\\(\\)\\{\\}\\[\\]]+\\s?(?=\\p{L})"); + + if (npPartHolder.length>=2) + { + if (isReverse) + { + np.NPCore.setWord(npPartHolder[npPartHolder.length-1]); + np.clearPreMod(); + } + else + { + np.NPCore.setWord(npPartHolder[0]); + np.clearPostMod(); + } + }*/ + + //Reduce the Post-Modifier or PreModifier until the last NN is found + try { + if (np.getPostModifier() != null && !np.getPostModifier().isEmpty()) { + np.getPostModifier().get(np.getPostModifier().size() - 1).setWord(np.getPostModifier().get(np.getPostModifier().size() - 1).word().trim().replaceAll("(\\.|\\,|\\;|\\:|\\?|\\!)$", "")); + while (!np.getPostModifier().get(np.getPostModifier().size() - 1).tag().startsWith("NN") && !np.getPostModifier().get(np.getPostModifier().size() - 1).tag().equals("CD")) { + np.getPostModifier().remove(np.getPostModifier().size() - 1); + if (np.getPostModifier().isEmpty()) { + break; + } + } + } + + if (np.getPreModifier() != null && !np.getPreModifier().isEmpty()) { + np.getPreModifier().get(np.getPreModifier().size() - 1).setWord(np.getPreModifier().get(np.getPreModifier().size() - 1).word().trim().replaceAll("(\\.|\\,|\\;|\\:|\\?|\\!)$", "")); + while (!np.getPreModifier().get(0).tag().startsWith("JJ") && !np.getPreModifier().get(0).tag().equals("VBN")) { + np.getPreModifier().remove(0); + if (np.getPreModifier().isEmpty()) { + break; + } + } + } + } catch (ArrayIndexOutOfBoundsException e) { } + + //Clean the single word of NounPhrases if there are still symbols in front or behind it; (in case of multiple symbols) + //Problem with abbreviations or websites => Can be filtered in next steps + // KOmma removed, because it intereferes with Coordinations; (Not in Aggregation step ;-) ) + //KOmma added - Sven + np.NPCore.setWord(np.NPCore.word().trim().replaceAll("[\\.\\,\\;\\:\\?\\!\\(\\)\\[\\]\\{\\}]+$", "")); + np.NPCore.setWord(np.NPCore.word().trim().replaceAll("^[\\.\\,\\;\\:\\?\\!\\(\\)\\[\\]\\{\\}]+", "")); + + return np; + } + + public static int findPreModReverse(int nnOffset, List sentence, NounPhrase currentNP) { + boolean premodFinished = false; + for (int i = nnOffset + 1; i < nnOffset + currentNP.getMaxNPLength() && i < sentence.size(); i++) { + if ((sentence.get(i).tag().startsWith("JJ") | sentence.get(i).tag().equals("VBN")) && !sentence.get(i).word().endsWith(",") && !premodFinished) { + currentNP.addPreModifier(sentence.get(i)); + } else if ((sentence.get(i).tag().equals("VBG") | sentence.get(i).tag().equals("IN") | sentence.get(i).tag().equals("CD") | sentence.get(i).tag().equals("DT"))) { + //ignore: It might be that this is pat of a postModifier: if 
another NN is found; + premodFinished = true; + } else if (sentence.get(i).tag().startsWith("NN") && !sentence.get(i).word().endsWith(",")) { + return -2; + } else if (sentence.get(i).word().endsWith(",")) { + return i; + } else if (sentence.get(i).word().toLowerCase().equals("and") | sentence.get(i).word().toLowerCase().equals("or") | sentence.get(i).word().toLowerCase().equals("&")) { + return i + 1; + } else { + return -1; + } + } + + if (sentence.size() > nnOffset + currentNP.getMaxNPLength()) { + if (sentence.get(nnOffset + currentNP.getMaxNPLength()).word().toLowerCase().equals("and") | sentence.get(nnOffset + currentNP.getMaxNPLength()).word().toLowerCase().equals("or") | sentence.get(nnOffset + currentNP.getMaxNPLength()).word().toLowerCase().equals("&")) { + return nnOffset + currentNP.getMaxNPLength() + 1; + } + } + return -1; + } + + + public static void findPostModReverse(int nnOffset, List sentence, NounPhrase currentNP) { + for (int i = nnOffset - 1; i > nnOffset - currentNP.getMaxNPLength() && i >= 0; i--) { + if (sentence.get(i).word().toLowerCase().equals("and") | sentence.get(i).word().toLowerCase().equals("or") | sentence.get(i).word().toLowerCase().equals("&")) { + return; + } + + if (sentence.get(i).tag().startsWith("JJ") | sentence.get(i).tag().equals("VBN") | sentence.get(i).tag().equals("VBG") | sentence.get(i).tag().startsWith("NN") | sentence.get(i).tag().equals("IN") | sentence.get(i).tag().equals("CD") | sentence.get(i).tag().equals("DT")) { + currentNP.addPostModifier(sentence.get(i)); + } + + if (sentence.get(i).word().endsWith(",")) { + return; + } + } + } + + + /** + * The method returns a subList of a tagged-Word List (the entire sentence). + * The Onset and Offset are pointers in the sentence String, which restrict + * the retunred List of words; + * + * @param onset + * @param offset + * @param taggedWords + * @return + */ + public static List getWordListSubset(int onset, int offset, List taggedWords) { + List result = new ArrayList(); + int charCounter = 0; + for (TaggedWord tw : taggedWords) { + charCounter += tw.word().length(); + if (charCounter >= onset && charCounter <= offset) { + result.add(tw); + } + charCounter++; + } + return result; + } + + + //Variables and Constants for false-positive detection + private static Set allExclusions = createAllExclusions(); + private static Set createAllExclusions(){ + Set exclusions = new HashSet<>(); + exclusions.addAll(Arrays.asList("that","this","these","those"));//demonstratives + exclusions.addAll(Arrays.asList("mine","yours","his","hers","its","ours","theirs"));//possessives + exclusions.addAll(Arrays.asList("i","you","he","she","it","we","they"));//personals + exclusions.addAll(Arrays.asList("where", "who", "when", "what", "why", "whose", "which", "how"));//questions + exclusions.addAll(Arrays.asList("there")); + return exclusions; + } + + +} diff --git a/core/src/main/java/org/dbpedia/extraction/hearst/IsaPattern.java b/core/src/main/java/org/dbpedia/extraction/hearst/IsaPattern.java new file mode 100644 index 0000000000..d5b1212660 --- /dev/null +++ b/core/src/main/java/org/dbpedia/extraction/hearst/IsaPattern.java @@ -0,0 +1,71 @@ +package org.dbpedia.extraction.hearst; + +import edu.stanford.nlp.ling.TaggedWord; +import java.util.ArrayList; +import java.util.List; + + +public class IsaPattern { + private ArrayList instance; + private ArrayList clazz; + + public IsaPattern(ArrayList instance, ArrayList clazz) { + this.instance = (ArrayList)instance.clone(); + this.clazz = (ArrayList)clazz.clone(); + } 
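+
+    // Both lists are cloned defensively, so later changes to the caller's working
+    // lists cannot alter this pattern. Illustration (assuming the active pattern
+    // p8a, "X is a Y"): for "Gasoline is a transparent fuel", instance holds the
+    // NounPhrase "Gasoline", clazz holds "transparent fuel", and toString() renders
+    // the pair roughly as {_Gasoline_} --isa--> {transparent _fuel_}.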
+ + public ArrayList getInstance() { + return instance; + } + + public ArrayList getClazz() { + return clazz; + } + + @Override + public String toString() { + return nounPhraseListUnderscoreToString(instance) + " --isa--> " + nounPhraseListUnderscoreToString(clazz); + } + + + private static String nounPhraseListToString(ArrayList nps) { + if (nps.size() == 0) { + return "{}"; + } + StringBuilder result = new StringBuilder(); + result.append("{"); + for (NounPhrase np : nps) { + result.append(np.toString()).append("|"); + } + result.setLength(result.length() - 1); + result.append("}"); + return result.toString(); + } + + private static String nounPhraseListUnderscoreToString(ArrayList nps) { + if (nps.size() == 0) { + return "{}"; + } + StringBuilder result = new StringBuilder(); + result.append("{"); + for (NounPhrase np : nps) { + for (TaggedWord tw : np.getPreModifier()) { + result.append(tw.word()).append(" "); + } + result.append("_"); + result.append(np.getNPCore().word()); + result.append("_"); + for (TaggedWord tw : np.getPostModifier()) { + result.append(tw.word()).append(" "); + } + //if (result.length() > 0) { + // result.setLength(result.length() - 1); + //} + + result.append("|"); + } + result.setLength(result.length() - 1); + result.append("}"); + return result.toString(); + } +} diff --git a/core/src/main/java/org/dbpedia/extraction/hearst/NounPhrase.java b/core/src/main/java/org/dbpedia/extraction/hearst/NounPhrase.java new file mode 100644 index 0000000000..99f0224890 --- /dev/null +++ b/core/src/main/java/org/dbpedia/extraction/hearst/NounPhrase.java @@ -0,0 +1,139 @@ +package org.dbpedia.extraction.hearst; + +import java.util.ArrayList; + +import edu.stanford.nlp.ling.TaggedWord; +import java.util.StringJoiner; + +public class NounPhrase { + + public TaggedWord NPCore; + private ArrayList preModifier, postModifier; + private boolean isComplete; + private boolean coreFound; + private int maxNPLength; + + public NounPhrase(int maxNPLength) { + this.maxNPLength = maxNPLength; + + isComplete = false; + preModifier = new ArrayList(); + postModifier = new ArrayList(); + } + + public void addPreModifier(TaggedWord tw) { + preModifier.add(0, tw); + if (preModifier.size() == maxNPLength) { + preModifier.remove(preModifier.size() - 1); + } + } + + public void addPostModifier(TaggedWord tw) { + postModifier.add(tw); + if (postModifier.size() + 1 + preModifier.size() > maxNPLength) { + if (preModifier.size() > 0) { + preModifier.remove(0); + } else { + isComplete = true; + } + } + } + + public void NPCoreToPost(TaggedWord tw) { + postModifier.add(NPCore); + NPCore = tw; + if (postModifier.size() + 1 == maxNPLength) { + isComplete = true; + } + } + + public void clearPreMod() { + preModifier.clear(); + } + + public void clearPostMod() { + postModifier.clear(); + } + + public void setNPCore(TaggedWord tw) { + NPCore = tw; + if (postModifier.size() + 1 == maxNPLength) { + isComplete = true; + } + } + + public TaggedWord getNPCore() { + return NPCore; + } + + public ArrayList getPreModifier() { + return preModifier; + } + + public ArrayList getPostModifier() { + return postModifier; + } + + public String getPreModifierText() { + StringJoiner joiner = new StringJoiner(" "); + for (TaggedWord tw : preModifier) { + joiner.add(tw.word()); + } + return joiner.toString(); + } + + public String getPostModifierText() { + StringJoiner joiner = new StringJoiner(" "); + for (TaggedWord tw : postModifier) { + joiner.add(tw.word()); + } + return joiner.toString(); + } + + public String 
getNPCoreText() {
+        return NPCore.word();
+    }
+
+    public int getMaxNPLength() {
+        return maxNPLength;
+    }
+
+    public boolean isCoreFound() {
+        return coreFound;
+    }
+
+    public boolean isComplete() {
+        return isComplete;
+    }
+
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        for (TaggedWord tw : preModifier) {
+            sb.append(tw.word()).append(" ");
+        }
+        sb.append(NPCore.word()).append(" ");
+        for (TaggedWord tw : postModifier) {
+            sb.append(tw.word()).append(" ");
+        }
+        if (sb.length() > 0) {
+            sb.setLength(sb.length() - 1);
+        }
+        return sb.toString();
+    }
+
+    public String tagsToString() {
+        StringBuilder sb = new StringBuilder();
+        for (TaggedWord tw : preModifier) {
+            sb.append(tw.tag()).append(" ");
+        }
+        sb.append(NPCore.tag()).append(" ");
+        for (TaggedWord tw : postModifier) {
+            sb.append(tw.tag()).append(" ");
+        }
+        if (sb.length() > 0) {
+            sb.setLength(sb.length() - 1);
+        }
+        return sb.toString();
+    }
+}
diff --git a/core/src/main/resources/datasetdefinitions.json b/core/src/main/resources/datasetdefinitions.json
index 946296ce04..c9ff844289 100644
--- a/core/src/main/resources/datasetdefinitions.json
+++ b/core/src/main/resources/datasetdefinitions.json
@@ -148,7 +148,13 @@
       "traits":"LinkedData, Published",
       "desc": "Dataset linking a DBpedia resource to the same resource in other wikis.",
       "defaultgraph": "dataset"
-    },
+    },
+    "interwiki_links_link_section": {
+      "name": "InterWiki Links in Link Section",
+      "traits":"LinkedData, Published",
+      "desc": "All links which appear in a link section and link to another wiki.",
+      "defaultgraph": "dataset"
+    },
     "interlanguage_links_chapters": {
       "name": "Interlanguage Links between DBpedia Chapters",
       "traits":"LinkedData, Published",
@@ -471,6 +477,12 @@
       "traits":"LinkedData",
       "desc": "This are all equations collected during the NIF extraction, transformed into MathML XML syntax.",
       "defaultgraph": "dataset"
+    },
+    "hearst_patterns": {
+      "name": "Hearst Patterns in the short abstract",
+      "traits":"LinkedData",
+      "desc": "Hearst Patterns extracted from the short abstract.",
+      "defaultgraph": "dataset"
+    }
   }
 },
 "links":{
diff --git a/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala b/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala
index 4e6c4265a0..a2e6e4e427 100644
--- a/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala
@@ -156,6 +156,7 @@ object DBpediaDatasets extends java.io.Serializable
     val PageIds: Dataset = datasets("page_ids")
     val InterLanguageLinks: Dataset = datasets("interlanguage_links") // Since the inter-language links were moved from Wikipedia to Wikidata, we now extract these links from the Wikidata dump, not from Wikipedia pages.")
     val InterWikiLinks: Dataset = datasets("interwiki_links")
+    val InterWikiLinksLinkSection: Dataset = datasets("interwiki_links_link_section")
     val InterLanguageLinksChapter: Dataset = datasets("interlanguage_links_chapters")
     val Genders: Dataset = datasets("genders")
     val TopicalConcepts: Dataset = datasets("topical_concepts")
@@ -229,6 +230,7 @@
     val NifTextLinks: Dataset = datasets("nif_text_links")
     val RawTables: Dataset = datasets("raw_tables")
     val Equations: Dataset = datasets("equations")
+    val HearstPatterns: Dataset = datasets("hearst_patterns")
 
     /**
      * Links
diff --git
a/core/src/main/scala/org/dbpedia/extraction/mappings/ArticleTemplatesClassExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/ArticleTemplatesClassExtractor.scala index 2e4fb5ed0b..1f21c4e0f5 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/ArticleTemplatesClassExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/ArticleTemplatesClassExtractor.scala @@ -58,7 +58,7 @@ class ArticleTemplatesClassExtractor( // title = title.substring(indexInfoBox + 7, title.length) //} titleLower = titleLower.replaceAll("infobox", "") - titleLower = titleLower.replace("/", "") + titleLower = titleLower.replace("/", "_") titleLower = stripAll(titleLower, " _-") if(titleLower.nonEmpty) { var classUri = context.language.dbpediaUri + "/class/" + titleLower @@ -68,7 +68,7 @@ class ArticleTemplatesClassExtractor( infoboxSeenClasses.synchronized { if (!infoboxSeenClasses.contains(classUri)) { - var classLabel = template.title.decoded.replaceAll("(?i)infobox", "") + var classLabel = template.title.decoded.replaceAll("(?i)infobox", "").replace("/", " ") classLabel = stripAll(classLabel, " _-") infoboxSeenClasses += classUri quads += new Quad(context.language, DBpediaDatasets.InfoboxTemplateTypeDefinitions, classUri, typeProperty, owlClass, node.sourceIri) @@ -87,13 +87,17 @@ class ArticleTemplatesClassExtractor( }else{ var selectedTemplateNode = templateNodes.sortBy(_.children.length).last - var templateClassUri = context.language.dbpediaUri + "/class/" + selectedTemplateNode.title.encoded.toLowerCase + var titleLower = selectedTemplateNode.title.encoded.toLowerCase + titleLower = titleLower.replace("/", "_") + titleLower = stripAll(titleLower, " _-") + + var templateClassUri = context.language.dbpediaUri + "/class/" + titleLower quads += new Quad(context.language, DBpediaDatasets.TemplateType, subjectUri, typeProperty, templateClassUri, node.sourceIri) seenClasses.synchronized { if (!seenClasses.contains(templateClassUri)) { - var classLabel = selectedTemplateNode.title.decoded + var classLabel = selectedTemplateNode.title.decoded.replace("/", " ") classLabel = stripAll(classLabel, " _-") seenClasses += templateClassUri quads += new Quad(context.language, DBpediaDatasets.TemplateTypeDefinitions, templateClassUri, typeProperty, owlClass, node.sourceIri) diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/NifNewExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/NifNewExtractor.scala index 8204a32a7f..d3f06f1d48 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/NifNewExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/NifNewExtractor.scala @@ -49,7 +49,8 @@ class NifNewExtractor( DBpediaDatasets.NifContext,DBpediaDatasets.NifPageStructure,DBpediaDatasets.NifTextLinks, DBpediaDatasets.RawTables, DBpediaDatasets.Equations, DBpediaDatasets.LongAbstracts, DBpediaDatasets.ShortAbstracts, - DBpediaDatasets.InterWikiLinks, DBpediaDatasets.ExternalLinks, DBpediaDatasets.PageLinks, DBpediaDatasets.InterLanguageLinks + DBpediaDatasets.InterWikiLinks, DBpediaDatasets.ExternalLinks, DBpediaDatasets.PageLinks, DBpediaDatasets.InterLanguageLinks, DBpediaDatasets.InterWikiLinksLinkSection, + DBpediaDatasets.HearstPatterns ) var config: WikiConfig = getSwebleConfig() @@ -98,15 +99,17 @@ class NifNewExtractor( //new PrintWriter(URLEncoder.encode(pageNode.title.encoded + "_ast_expansion", StandardCharsets.UTF_8.toString)) { write(AstPrinter.print[WtNode](page)); close } var astVisitor = new 
NifExtractionAstVisitor(context.language) astVisitor.go(page) - if(astVisitor.getFullText().trim.length == 0){ + if(astVisitor.getFullText().trim.length <= 5){ //don't expand templates page = engine.postprocess(pageId, source,null).getPage astVisitor = new NifExtractionAstVisitor(context.language) astVisitor.go(page) } - //extractionInfoPrinter(astVisitor, URLEncoder.encode(pageNode.title.encoded + "_nif_extractor", StandardCharsets.UTF_8.toString)) - quads ++= new GeneralNifExtractor(context, pageNode).extractNif(astVisitor.getTocMap(), astVisitor.getFullText()) - + if(astVisitor.getFullText().trim.length >= 5) { + //extractionInfoPrinter(astVisitor, URLEncoder.encode(pageNode.title.encoded + "_nif_extractor", StandardCharsets.UTF_8.toString)) + //new PrintWriter(URLEncoder.encode(pageNode.title.encoded + "_ast_expansion", StandardCharsets.UTF_8.toString)) { write(AstPrinter.print[WtNode](page)); close } + quads ++= new GeneralNifExtractor(context, pageNode).extractNif(astVisitor.getTocMap(), astVisitor.getFullText()) + } //} quads } diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/GeneralNifExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/nif/GeneralNifExtractor.scala index 260fdccce1..4ced408819 100644 --- a/core/src/main/scala/org/dbpedia/extraction/nif/GeneralNifExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/nif/GeneralNifExtractor.scala @@ -3,12 +3,14 @@ package org.dbpedia.extraction.nif import util.control.Breaks._ import org.dbpedia.extraction.config.Config import org.dbpedia.extraction.config.provenance.DBpediaDatasets -import org.dbpedia.extraction.ontology.{Ontology, OntologyProperty, RdfNamespace} +import org.dbpedia.extraction.hearst.ExtractHearstPatterns +import org.dbpedia.extraction.ontology.{DBpediaNamespace, Ontology, OntologyProperty, RdfNamespace} import org.dbpedia.extraction.transform.{Quad, QuadBuilder} import org.dbpedia.extraction.util.Language import org.dbpedia.extraction.wikiparser.WikiPage import org.dbpedia.iri.UriUtils +import scala.collection.mutable import scala.collection.mutable.{ArrayBuffer, ListBuffer} import scala.util.{Failure, Success} import scala.language.reflectiveCalls @@ -29,8 +31,6 @@ class GeneralNifExtractor ( private val nifContextUri = wikiPage.uri + "?dbpv=" + context.configFile.dbPediaVersion + "&nif=context" private val sourceUrl = wikiPage.sourceIri - private val nifcontext = "http://dbkwik.webdatacommons.org/ontology/nifcontext" - private val ontologyLink = "http://dbkwik.webdatacommons.org/ontology/" val wikiPageWikiLinkProperty = context.ontology.properties("wikiPageWikiLink") val wikiPageExternalLinkProperty = context.ontology.properties("wikiPageExternalLink") @@ -38,7 +38,10 @@ class GeneralNifExtractor ( val wikiPageInterLanguageLinkProperty = context.ontology.properties("wikiPageInterLanguageLink") val labelProperty = context.ontology.properties("rdfs:label") - + val hasTypePreModifier = DBpediaNamespace.ONTOLOGY.append("hasTypePreModifier") + val hasTypeHead = DBpediaNamespace.ONTOLOGY.append("hasTypeHead") + val hasTypePostModifier = DBpediaNamespace.ONTOLOGY.append("hasTypePostModifier") + val hasType = DBpediaNamespace.ONTOLOGY.append("hasType") protected lazy val nifContext: (String, String, String, String, String) => Quad = QuadBuilder.dynamicPredicate(context.language.isoCode,DBpediaDatasets.NifContext.encoded) _ @@ -73,12 +76,29 @@ class GeneralNifExtractor ( val describingParagraphs = section.paragraphs//getParagraphsDescribingConcept(section, text) if(describingParagraphs.size 
> 0){
       quads += longQuad(wikiPage.uri, text.substring(describingParagraphs.head.begin.getOrElse(0), describingParagraphs.last.end.getOrElse(0)), sourceUrl) //text.substring(section.begin.getOrElse(0), section.end.getOrElse(0)), sourceUrl)
-      quads += shortQuad(wikiPage.uri, getShortAbstract(describingParagraphs, text), sourceUrl) // getShortAbstract(section.paragraphs, text), sourceUrl)
+      val shortAbstract = getShortAbstract(describingParagraphs, text)
+      quads += shortQuad(wikiPage.uri, shortAbstract, sourceUrl) // getShortAbstract(section.paragraphs, text), sourceUrl)
+      quads ++= extractHearstPattern(wikiPage.uri, shortAbstract, wikiPage.title.decoded)
     }
   }
   quads
 }
 
+  private def extractHearstPattern(subject:String, text:String, title:String): ArrayBuffer[Quad] = {
+    var quads = ArrayBuffer[Quad]()
+    val np = ExtractHearstPatterns.extract(text, title)
+    if (np != null) {
+      if(!np.getPreModifierText.isEmpty)
+        quads += new Quad(context.language, DBpediaDatasets.HearstPatterns, subject, hasTypePreModifier, np.getPreModifierText, wikiPage.sourceIri, context.ontology.datatypes("rdf:langString"))
+      quads += new Quad(context.language, DBpediaDatasets.HearstPatterns, subject, hasTypeHead, np.getNPCoreText(), wikiPage.sourceIri, context.ontology.datatypes("rdf:langString"))
+      if(!np.getPostModifierText.isEmpty)
+        quads += new Quad(context.language, DBpediaDatasets.HearstPatterns, subject, hasTypePostModifier, np.getPostModifierText, wikiPage.sourceIri, context.ontology.datatypes("rdf:langString"))
+
+      quads += new Quad(context.language, DBpediaDatasets.HearstPatterns, subject, hasType, np.toString, wikiPage.sourceIri, context.ontology.datatypes("rdf:langString"))
+    }
+    quads
+  }
+
   private def getParagraphsDescribingConcept(section: NifSection, text:String):Seq[NifParagraph] = {
     val wikiTitleWords = Set() ++ wikiPage.title.decoded.toLowerCase.split("[\\s\\u202F\\u00A0]").map(s=>s.trim)
     //try to find wiki title somewhere in the beginning of a paragraph
@@ -148,7 +168,7 @@
     quads += nifContext(nifContextUri, RdfNamespace.NIF.append("sourceUrl"), sourceUrl, sourceUrl, null)
     quads += nifContext(nifContextUri, RdfNamespace.NIF.append("isString"), text.toString(), sourceUrl, RdfNamespace.XSD.append("string"))
     quads += nifContext(nifContextUri, RdfNamespace.NIF.append("predLang"), "http://lexvo.org/id/iso639-3/" + context.language.iso639_3, sourceUrl, null)
-    quads += nifContext(wikiPage.uri, nifcontext, nifContextUri, sourceUrl, null) //link between resource and nif context
+    quads += nifContext(wikiPage.uri, DBpediaNamespace.ONTOLOGY.append("nifcontext"), nifContextUri, sourceUrl, null) //link between resource and nif context
     quads
   }
@@ -232,6 +252,22 @@
       quads ++= writeLinks(paragraphObject, paragraph, text)
       lastParagraph = Some(paragraph)
     }
+    //write links only occurring in a link section:
+    if(section.id.contains("link")){ //english: link, german: link, spanish: enlace, french: lien
+      val links = mutable.Map[String, mutable.HashSet[NifLink]]()
+      for (paragraphObject <- section.paragraphs) {
+        for (link <- paragraphObject.links) {
+          if(link.begin.nonEmpty && link.end.nonEmpty && link.linkType == NifLinkType.InterWiki && !link.uri.contains("#")) {
+            links.getOrElseUpdate(link.wikiTarget, new mutable.HashSet[NifLink]) += link
+          }
+        }
+      }
+      for ((wiki, linkSet) <- links) {
+        if(linkSet.size == 1){ //check if we have only one link to another wiki
+          quads += new Quad(context.language, DBpediaDatasets.InterWikiLinksLinkSection, wikiPage.uri,
wikiPageInterWikiLinkProperty, linkSet.head.uri, sourceUrl, null) + } + } + } quads } @@ -245,7 +281,7 @@ class GeneralNifExtractor ( val typ = if (linkText.split(" ").length > 1) "Phrase" else "Word" val word = getNifIri(typ.toString.toLowerCase, link.begin.getOrElse(0), link.end.getOrElse(0)) quads += nifLinks(word, RdfNamespace.RDF.append("type"), RdfNamespace.NIF.append(typ), sourceUrl, null) - quads += nifLinks(word, RdfNamespace.RDF.append("type"), ontologyLink + link.linkType.toString + "Link", sourceUrl, null) + quads += nifLinks(word, RdfNamespace.RDF.append("type"), DBpediaNamespace.ONTOLOGY.append(link.linkType.toString + "Link"), sourceUrl, null) quads += nifLinks(word, RdfNamespace.NIF.append("referenceContext"), nifContextUri, sourceUrl, null) quads += nifLinks(word, RdfNamespace.NIF.append("beginIndex"), link.begin.getOrElse(0).toString, sourceUrl, RdfNamespace.XSD.append("nonNegativeInteger")) quads += nifLinks(word, RdfNamespace.NIF.append("endIndex"), link.end.getOrElse(0).toString, sourceUrl, RdfNamespace.XSD.append("nonNegativeInteger")) diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/NifExtractionAstVisitor.scala b/core/src/main/scala/org/dbpedia/extraction/nif/NifExtractionAstVisitor.scala index 7274de55a2..f5bf6b5446 100644 --- a/core/src/main/scala/org/dbpedia/extraction/nif/NifExtractionAstVisitor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/nif/NifExtractionAstVisitor.scala @@ -136,7 +136,8 @@ class NifExtractionAstVisitor(language : Language) begin = None, end = None, uri = link, - linkType = NifLinkType.External + linkType = NifLinkType.External, + wikiTarget = "" ) nifLink.begin = Some(context.length) write(link) @@ -159,7 +160,8 @@ class NifExtractionAstVisitor(language : Language) begin = None, end = None, uri = strLink, - linkType = NifLinkType.External + linkType = NifLinkType.External, + wikiTarget = "" ) nifLink.begin = Some(context.length) if(link.hasTitle) @@ -178,6 +180,7 @@ class NifExtractionAstVisitor(language : Language) def visit(link: WtInternalLink): Unit = { var linkTarget = link.getTarget.getAsString var linkType = NifLinkType.Internal + var wikiTarget = "" try { val destinationTitle = WikiTitle.parse(link.getTarget.getAsString, language) @@ -203,6 +206,7 @@ class NifExtractionAstVisitor(language : Language) linkType = NifLinkType.Internal } } + wikiTarget = destinationTitle.language.dbpediaDomain } catch { case _: Throwable => { write(link.getPrefix) @@ -216,7 +220,8 @@ class NifExtractionAstVisitor(language : Language) begin = None, end = None, uri = linkTarget, - linkType = linkType + linkType = linkType, + wikiTarget = wikiTarget ) nifLink.begin = Some(context.length) write(link.getPrefix) @@ -366,6 +371,7 @@ class NifExtractionAstVisitor(language : Language) def visit(n: WtPageSwitch): Unit = {} def visit(hr: WtHorizontalRule): Unit = {} def visit(n: WtTable): Unit = {}//no table + def visit(e: WtXmlEmptyTag): Unit = {} def visit(e: WtXmlEndTag): Unit = {} def visit(e: WtXmlStartTag): Unit = {} def visit(n: WtXmlComment): Unit = {} @@ -382,18 +388,22 @@ class NifExtractionAstVisitor(language : Language) private def write(s: String): Unit = { - if (s.isEmpty) return - if(context.isEmpty){ - context ++= s.replace("\n", "").replaceAll(" +", " ").replaceAll("^\\s+", "") + val processed = preprocessString(s) + if (processed.isEmpty) return + if(context.isEmpty || context.last == ' '){ + context ++= processed.replaceAll("^[\\s| ]+", "") }else{ - if(context.last == ' '){ - context ++= s.replace("\n", "").replaceAll(" +", " 
").replaceAll("^\\s+", "") - }else{ - context ++= s.replace("\n", "").replaceAll(" +", " ") - } + context ++= processed } } + private def preprocessString(s:String): String = { + s.replace("\n", "").replace("\r", "").replace("\t", "") + .replaceAll("[\\s| ]+", " ") //different whitespace character + } + + + private def write(cs: Array[Char]): Unit = { write(String.valueOf(cs)) } diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/NifLink.scala b/core/src/main/scala/org/dbpedia/extraction/nif/NifLink.scala index e03eddbaee..958e0a3b50 100644 --- a/core/src/main/scala/org/dbpedia/extraction/nif/NifLink.scala +++ b/core/src/main/scala/org/dbpedia/extraction/nif/NifLink.scala @@ -4,7 +4,8 @@ class NifLink( var begin: Option[Int], var end: Option[Int], var uri: String, - var linkType: NifLinkType.Value + var linkType: NifLinkType.Value, + var wikiTarget : String ) { } diff --git a/scripts/src/main/java/org/dbpedia/extraction/relation/RelationExtractionJava.java b/scripts/src/main/java/org/dbpedia/extraction/relation/RelationExtractionJava.java index adde0e924b..687c8db18e 100644 --- a/scripts/src/main/java/org/dbpedia/extraction/relation/RelationExtractionJava.java +++ b/scripts/src/main/java/org/dbpedia/extraction/relation/RelationExtractionJava.java @@ -40,8 +40,6 @@ public static List run(Language language) throws Exception { Map> predictionCandidates = predictionCandidates(models, relations); - Set s = predictionCandidates.get("http://dbkwik.webdatacommons.org/harrypotter/resource/Daphne_Maldon"); - Map predictionSet = getPredictionSet(abstractEnds, predictionCandidates); updateType(predictionSet); computeSentenceFeatures(predictionSet, abstractEnds);