From 2f3899eb9b7a6e77bbe8b0325e6297f0d6125df0 Mon Sep 17 00:00:00 2001
From: Sven Hertling
Date: Tue, 12 Nov 2019 13:27:51 +0100
Subject: [PATCH] added hearst pattern extraction

updated template class extraction
updated abstract extraction
---
 core/pom.xml                                       |  13 +
 .../extraction/hearst/CustomPattern.java           | 111 +++
 .../hearst/ExtractHearstPatterns.java              | 648 ++++++++++++++++++
 .../dbpedia/extraction/hearst/IsaPattern.java      |  71 ++
 .../dbpedia/extraction/hearst/NounPhrase.java      | 139 ++++
 .../main/resources/datasetdefinitions.json         |  14 +-
 .../config/provenance/DBpediaDatasets.scala        |   2 +
 .../ArticleTemplatesClassExtractor.scala           |  12 +-
 .../extraction/mappings/NifNewExtractor.scala      |  13 +-
 .../extraction/nif/GeneralNifExtractor.scala       |  50 +-
 .../nif/NifExtractionAstVisitor.scala              |  32 +-
 .../org/dbpedia/extraction/nif/NifLink.scala       |   3 +-
 .../relation/RelationExtractionJava.java           |   2 -
 13 files changed, 1079 insertions(+), 31 deletions(-)
 create mode 100644 core/src/main/java/org/dbpedia/extraction/hearst/CustomPattern.java
 create mode 100644 core/src/main/java/org/dbpedia/extraction/hearst/ExtractHearstPatterns.java
 create mode 100644 core/src/main/java/org/dbpedia/extraction/hearst/IsaPattern.java
 create mode 100644 core/src/main/java/org/dbpedia/extraction/hearst/NounPhrase.java

diff --git a/core/pom.xml b/core/pom.xml
index d13af7b6db..e4924d94f0 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -142,6 +142,19 @@
             <version>2.1.0</version>
             <type>pom</type>
         </dependency>
+
+
+        <dependency>
+            <groupId>edu.stanford.nlp</groupId>
+            <artifactId>stanford-corenlp</artifactId>
+            <version>3.9.2</version>
+        </dependency>
+        <dependency>
+            <groupId>edu.stanford.nlp</groupId>
+            <artifactId>stanford-corenlp</artifactId>
+            <version>3.9.2</version>
+            <classifier>models-english</classifier>
+        </dependency>
         <dependency>
             <groupId>org.wikidata.wdtk</groupId>
diff --git a/core/src/main/java/org/dbpedia/extraction/hearst/CustomPattern.java b/core/src/main/java/org/dbpedia/extraction/hearst/CustomPattern.java
new file mode 100644
index 0000000000..df9d9addd3
--- /dev/null
+++ b/core/src/main/java/org/dbpedia/extraction/hearst/CustomPattern.java
@@ -0,0 +1,111 @@
+package org.dbpedia.extraction.hearst;
+
+import java.util.regex.Pattern;
+
+public class CustomPattern {
+
+    private String pid;
+    private String regex;
+    private String type;
+    private Pattern pattern;
+    private String preCondition;
+    private Boolean excludePronouns;
+    private String firstKeyWord;
+    private String secondKeyWord;
+    private Boolean instanceFirst;
+
+    private String surrounderSymbols = "[\\u0027\\u2018\\u2019\\u201A\\u201B\\u201C\\u201D\\u201E\\u201F\\u0022]?";
+    private String endSymbols = "[\"\\u0026\\u0027\\u2018\\u2019\\u201A\\u201B\\u201C\\u201D\\u201E\\u201F\\u00A9\\u00AE]?"; //includes surrounderSymbols as well!
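+    // Illustration (not from the original source): for a "compact" pattern such as
+    // p8a in ExtractHearstPatterns ("\,?\sis\san?\s"), the compiled regex is
+    // prefix + regex + suffix, so in "Gasoline, is a transparent fuel" it matches
+    // "e, is a t" -- one prefix character, the pattern itself, one suffix character.
+    // The extractor later strips this one-character indicator on each side.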
+ private String prefix = "(\\p{L}|\\d)" + endSymbols; + private String suffix = surrounderSymbols + "(\\p{L}|\\d)"; + + public CustomPattern(String pid, String regex, String type, Boolean instanceFirst) { + this.pid = pid; + this.regex = regex; + this.type = type; + this.instanceFirst = instanceFirst; + + //Configure the Prefix and suffix of the regex + if (type.equals("compact") || type.equals("split")) { + this.pattern = Pattern.compile(prefix + regex + suffix); + } + + if (type.equals("split_noPrefix")) { + this.pattern = Pattern.compile("(?>" + regex + suffix + ")"); + } + + if (type.equals("split_noSuffix")) { + this.pattern = Pattern.compile("(?>" + prefix + regex + ")"); + } + } + + public CustomPattern(String pid, String regex, String type, String preCond, Boolean instanceFirst) { + this.pid = pid; + this.regex = regex; + this.type = type; + this.instanceFirst = instanceFirst; + + //Configure the Prefix and suffix of the regex + if (type.equals("compact") || type.equals("split")) { + this.pattern = Pattern.compile(prefix + regex + suffix); + } + + if (type.equals("split_noPrefix")) { + this.pattern = Pattern.compile("(?>" + regex + suffix + ")"); + } + + if (type.equals("split_noSuffix")) { + this.pattern = Pattern.compile("(?>" + prefix + regex + ")"); + } + + this.preCondition = preCond; + } + + public CustomPattern(String pid, String regex, String type, String preCond, String fkw, String skw, Boolean instanceFirst) { + this.pid = pid; + this.regex = regex; + this.type = type; + this.firstKeyWord = fkw; + this.secondKeyWord = skw; + this.instanceFirst = instanceFirst; + + //Configure the Prefix and suffix of the regex + if (type.equals("compact") || type.equals("split")) { + this.pattern = Pattern.compile(prefix + regex + suffix); + } + + if (type.equals("split_noPrefix")) { + this.pattern = Pattern.compile("(?>" + regex + suffix + ")"); + } + + if (type.equals("split_noSuffix")) { + this.pattern = Pattern.compile("(?>" + prefix + regex + ")"); + } + + this.preCondition = preCond; + } + + public Pattern getPattern() { + return pattern; + } + + public String getType() { + return type; + } + + public String getFirstKeyWord() { + return firstKeyWord; + } + + public String getSecondKeyWord() { + return secondKeyWord; + } + + public Boolean getInstanceFirst() { + return instanceFirst; + } + + + + +} diff --git a/core/src/main/java/org/dbpedia/extraction/hearst/ExtractHearstPatterns.java b/core/src/main/java/org/dbpedia/extraction/hearst/ExtractHearstPatterns.java new file mode 100644 index 0000000000..ca9684e7a0 --- /dev/null +++ b/core/src/main/java/org/dbpedia/extraction/hearst/ExtractHearstPatterns.java @@ -0,0 +1,648 @@ +package org.dbpedia.extraction.hearst; + +import edu.stanford.nlp.ling.HasWord; +import edu.stanford.nlp.ling.SentenceUtils; +import edu.stanford.nlp.ling.TaggedWord; +import edu.stanford.nlp.pipeline.CoreNLPProtos.Sentence; + +import edu.stanford.nlp.tagger.maxent.MaxentTagger; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * + * @author shertlin + */ +public class ExtractHearstPatterns { + + private static int maxNpSize = 4; + + private static MaxentTagger tagger = new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger"); + + private static String 
separatorSymbols = "[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\u2043]?"; + private static String surrounderSymbols = "[\\u0027\\u2018\\u2019\\u201A\\u201B\\u201C\\u201D\\u201E\\u201F\\u0022]?"; + private static String endSymbols = "[\"\\u0026\\u0027\\u2018\\u2019\\u201A\\u201B\\u201C\\u201D\\u201E\\u201F\\u00A9\\u00AE]?"; + + private static String npPlaceholder = "("+surrounderSymbols+"" //Quotation mark could be in front + + "(\\p{L}++|\\d++\\p{L}++)" //Word can start with letters or digits but must contain letters + + "("+separatorSymbols+"(\\p{L}++|\\d++))?" //Can be separated by a hyphen + + endSymbols+"\\s)" //Can be followed by quotation mark + + "{1,4}"; //NP can consist of up to 4 words + + private static String npPlaceholderAdjMost = "("+surrounderSymbols+"" //Quotation mark could be in front + + "(\\p{L}++|\\d++\\p{L}++)" //Word can start with letters or digits but must contain letters + + "("+separatorSymbols+"(\\p{L}++|\\d++))?" //Can be separated by a hyphen + + endSymbols+"\\s)" //Can be followed by quotation mark + + "{2,5}"; + + private static List allPatterns = generatePatterns(); + private static List generatePatterns() { + List allPatterns = new ArrayList(); + allPatterns.add(new CustomPattern("p8a", "\\,?\\sis\\san?\\s", "compact", "is\\sa", true)); + allPatterns.add(new CustomPattern("p8b", "\\,?\\swas\\san?\\s", "compact", "was\\sa", true)); + /* + allPatterns.add(new CustomPattern("p3a", "\\,?\\sincluding\\s", "compact", "including", false)); + allPatterns.add(new CustomPattern("p5", "\\,?\\ssuch\\sas\\s", "compact", "such\\sas", false)); + allPatterns.add(new CustomPattern("p1", "\\,?\\sand\\sother\\s", "compact", "and\\sother", true)); + allPatterns.add(new CustomPattern("p4", "\\,?\\sor\\sother\\s", "compact", "or\\sother", true)); + allPatterns.add(new CustomPattern("p2", "\\,?\\sespecially\\s", "compact", "especially", false)); + allPatterns.add(new CustomPattern("p8c", "\\,?\\sare\\san?\\s", "compact", "are\\sa", true)); + allPatterns.add(new CustomPattern("p34", "\\stypes\\s", "compact", "types", false)); + allPatterns.add(new CustomPattern("p25", "\\,?\\sexcept\\s", "compact", "except", false)); + allPatterns.add(new CustomPattern("p23d", "\\,?\\sparticularly\\s", "compact", "particularly", false)); + allPatterns.add(new CustomPattern("p20a", "\\sis\\sthe\\s\\w+est\\s", "compact", "is\\sthe", true)); + allPatterns.add(new CustomPattern("p43", "\\,?\\ssort\\sof\\s", "compact", "sort\\sof", true)); + allPatterns.add(new CustomPattern("p26", "\\,?\\sother\\sthan\\s", "compact", "other\\sthan", false)); + + allPatterns.add(new CustomPattern("p21a", "\\p{L}+est\\s"+npPlaceholder+"is\\s", "split_noPrefix", "est\\s", "est", "is", false)); + allPatterns.add(new CustomPattern("p21b", "\\p{L}+est\\s"+npPlaceholder+"are\\s", "split_noPrefix", "est\\s", "est", "are", false)); + allPatterns.add(new CustomPattern("p21c", "\\s(M|m)ost\\s"+npPlaceholderAdjMost+"is\\s", "split_noPrefix", "most\\s", "most", "is", false)); + allPatterns.add(new CustomPattern("p21d", "\\s(M|m)ost\\s"+npPlaceholderAdjMost+"are\\s", "split_noPrefix", null, "most", "are", false)); + + allPatterns.add(new CustomPattern("p23b", "\\,?\\smostly\\s", "compact", "mostly", false)); + allPatterns.add(new CustomPattern("p23a", "\\,?\\smainly\\s", "compact", "mainly", false)); + allPatterns.add(new CustomPattern("p12a", "\\,\\sone\\sof\\sthe\\s", "compact", "one\\sof\\sthe", true)); + allPatterns.add(new CustomPattern("p20c", "\\sis\\sthe\\smost\\s\\w+\\s", "compact", true)); + allPatterns.add(new 
CustomPattern("p8d", "\\,?\\swere\\san?\\s", "compact", "were\\sa", true)); + allPatterns.add(new CustomPattern("p6", "\\,?\\sand\\sany\\sother\\s", "compact", "and\\sany\\sother", true)); + allPatterns.add(new CustomPattern("p15a", "\\sexamples\\sof\\s", "compact", "examples\\sof", true)); + allPatterns.add(new CustomPattern("p27a", "\\,?\\se\\.g\\.\\s", "compact", "e\\.g\\.", false)); + allPatterns.add(new CustomPattern("p27b", "\\,?\\si\\.e\\.\\s", "compact", "i\\.e\\.", false)); + allPatterns.add(new CustomPattern("p16", "\\,?\\sfor\\sexample\\s", "compact", "for\\sexample", false)); + allPatterns.add(new CustomPattern("p24", "\\,?\\sin\\sparticular\\s", "compact", "in\\sparticular", false)); + allPatterns.add(new CustomPattern("p20b", "\\sare\\sthe\\s\\w+est\\s", "compact", "are\\sthe", true)); + allPatterns.add(new CustomPattern("p20d", "\\sare\\sthe\\smost\\s\\w+\\s", "compact", true)); + allPatterns.add(new CustomPattern("p23c", "\\,?\\snotably\\s", "compact", "notably", false)); + allPatterns.add(new CustomPattern("p39", "\\,?\\samong\\sthem\\s", "compact", "\\samong\\sthem", false)); + allPatterns.add(new CustomPattern("p38", "\\scompared\\sto\\sother\\s", "compact", "compared\\sto", true)); + allPatterns.add(new CustomPattern("p11", "\\,?\\slike\\sother\\s", "compact", "like\\sother", true)); + allPatterns.add(new CustomPattern("p7", "\\,?\\sand\\ssome\\sother\\s", "compact", "and\\some\\sother", true)); + allPatterns.add(new CustomPattern("p23e", "\\,?\\sprincipally\\s", "compact", "principally", false)); + allPatterns.add(new CustomPattern("p15b", "\\sis\\san\\sexample\\sof\\s", "compact", "is\\san\\sexample\\sof", true)); + allPatterns.add(new CustomPattern("p22a", "\\,?\\swhich\\sis\\scalled\\s", "compact", "which\\sis\\scalled", false)); + allPatterns.add(new CustomPattern("p28a", "\\,?\\sa\\skind\\sof\\s", "compact", "a\\skind\\sof", true)); + allPatterns.add(new CustomPattern("p12c", "\\,\\sone\\sof\\sthose\\s", "compact", "one\\sof\\sthose", true)); + allPatterns.add(new CustomPattern("p29a", "\\,?\\swhich\\slooks?\\slike\\s", "compact", "which\\slooks?\\slike", false)); + allPatterns.add(new CustomPattern("p28c", "\\,?\\sa\\sform\\sof\\s", "compact", "a\\sform\\sof", true)); + allPatterns.add(new CustomPattern("p30b", "\\,?\\swhich\\sis\\ssimilar\\sto\\s", "compact", "which\\sis\\ssimilar\\sto", false)); + allPatterns.add(new CustomPattern("p12b", "\\,\\sone\\sof\\sthese\\s", "compact", "one\\sof\\sthese", true)); + allPatterns.add(new CustomPattern("p29c", "\\,?\\swhich\\ssounds?\\slike\\s", "compact", "which\\ssounds?\\slike", false)); + allPatterns.add(new CustomPattern("p28d", "\\,?\\sforms\\sof\\s", "compact", "forms\\sof", true)); + allPatterns.add(new CustomPattern("p30a", "\\,?\\swhich\\sare\\ssimilar\\sto\\s", "compact", "which\\sare\\ssimilar\\sto", false)); + allPatterns.add(new CustomPattern("p22b", "\\,?\\swhich\\sis\\snamed\\s", "compact", "which\\sis\\snamed", false)); + allPatterns.add(new CustomPattern("p42", "\\,?\\sor\\sthe\\smany\\s", "compact", "or\\sthe\\smany", true)); + allPatterns.add(new CustomPattern("p31a", "\\,?\\sexample\\sof\\sthis\\sis\\s", "compact", "example\\sof\\sthis\\sis", false)); + allPatterns.add(new CustomPattern("p28b", "\\,?\\skinds\\sof\\s", "compact", "kinds\\sof", true)); + allPatterns.add(new CustomPattern("p31b", "\\,?\\sexamples\\sof\\sthis\\sare\\s", "compact", "examples\\sof\\sthis\\sare", false)); + + allPatterns.add(new CustomPattern("p10", "(S|s)uch\\s"+npPlaceholder+"as\\s", "split_noPrefix", "(S|s)uch\\s", "such", 
"as", false)); + allPatterns.add(new CustomPattern("p13", "(E|e)xample\\sof\\s"+npPlaceholder+"is\\s", "split_noPrefix", "example\\sof", "example of", "is", false)); + allPatterns.add(new CustomPattern("p14", "(E|e)xamples\\sof\\s"+npPlaceholder+"are\\s", "split_noPrefix", null, "examples of", "are", false)); + allPatterns.add(new CustomPattern("p36", "\\swhether\\s"+npPlaceholder+"or\\s", "split", " whether", "whether", "or", false)); + allPatterns.add(new CustomPattern("p37", "(C|c)ompare\\s"+npPlaceholder+"with\\s", "split_noPrefix", "compare\\s", "compare", "with", true)); + */ + return allPatterns; + } + + + public static NounPhrase extract(String text, String gold_instance_label) { + Set gold_instance_tokens = new HashSet(Arrays.asList(gold_instance_label.toLowerCase().split(" "))); + List patterns = extract(text); + for(IsaPattern p : patterns){ + for(NounPhrase i : p.getInstance()){ + Set instanceTokens = new HashSet(Arrays.asList(i.toString().toLowerCase().split(" "))); + instanceTokens.retainAll(gold_instance_tokens); + if(instanceTokens.size() > 0){ + return p.getClazz().get(0); + } + } + } + return null; + } + + public static List extract(String s) { + List extractedPatterns = new ArrayList(); + + List sentencesClean = splitSentences(s); + for (String sentence : sentencesClean) { + sentence = preprocessSentence(sentence); + + for (CustomPattern customPattern : allPatterns) { + Matcher patternMatcher = customPattern.getPattern().matcher(sentence); + while (patternMatcher.find()) { + + String extractedPattern = patternMatcher.group(); + int onset = patternMatcher.start(); + int offset = patternMatcher.end(); + + // Compact Patterns still contain an Indicator for the leading and the following Nounphrase + // This indicator has to be removed + if (customPattern.getType().equals("compact")){ + onset++; + offset--; + } + + // Check if a leading pronoun can be excluded + String pronounFront = sentence.substring(0, onset); + int lastWhitespace = pronounFront.lastIndexOf(" "); + if (lastWhitespace != -1){ + pronounFront = pronounFront.substring(lastWhitespace+1).toLowerCase(); + } + + String pronounBack = sentence.substring(offset); + int firstWhitespace = pronounBack.indexOf(" "); + if (firstWhitespace != -1){ + pronounBack = pronounBack.substring(0, firstWhitespace).toLowerCase(); + } + + if(allExclusions.contains(pronounFront.toLowerCase()) || allExclusions.contains(pronounBack.toLowerCase())){ + continue; + } + + ArrayList currentNPsBeforePattern = new ArrayList<>(); + ArrayList currentNPsAfterPattern = new ArrayList<>(); + try{ + List fullTaggedList = tagger.tagSentence(SentenceUtils.toWordList(sentence.split(" "))); + + for (TaggedWord tw : fullTaggedList) { + if (tw.word().length() < tw.word().replaceAll("(?<=s)[\\u201A\\u201C\\u201D\\u201E\\u201F\\u0022]", "").length() + || tw.word().length() < tw.word().replaceAll("[\\u201A\\u201C\\u201D\\u201E\\u201F\\u0022](?=s)", "").length()) { + tw.setTag("JJ"); + } + } + + List taggedWordsBeforePattern; + List taggedWordsAfterPattern; + if (customPattern.getType().equals("compact")) { + taggedWordsBeforePattern = getWordListSubset(0, onset + 1, fullTaggedList); + taggedWordsAfterPattern = getWordListSubset(offset, sentence.length(), fullTaggedList); + } else { + taggedWordsBeforePattern = getWordlistBeforeSplittedPattern(customPattern, sentence, onset, fullTaggedList); + taggedWordsAfterPattern = getWordlistAfterSplittedPattern(customPattern, sentence, onset, offset, fullTaggedList); + } + + Collections.reverse(taggedWordsBeforePattern); 
+                        findNextNounPhraseReverse(0, taggedWordsBeforePattern, currentNPsBeforePattern);
+                        findNextNounPhrase(0, taggedWordsAfterPattern, currentNPsAfterPattern);
+
+                        if (currentNPsAfterPattern.isEmpty() || currentNPsBeforePattern.isEmpty()) {
+                            continue;
+                        }
+
+                    } catch (StringIndexOutOfBoundsException e) { }
+
+                    if (customPattern.getInstanceFirst()) {
+                        extractedPatterns.add(new IsaPattern(currentNPsBeforePattern, currentNPsAfterPattern));
+                    } else {
+                        extractedPatterns.add(new IsaPattern(currentNPsAfterPattern, currentNPsBeforePattern));
+                    }
+                }
+            }
+        }
+        return extractedPatterns;
+    }
+
+    private static String[] abbreviations = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
+        "Adj", "Adm", "Adv", "Asst", "Bart", "Bldg", "Brig", "Bros", "Capt", "Cmdr", "Col", "Comdr", "Con", "Corp", "Cpl", "DR", "Dr", "Drs", "Ens", "Fig", "FIG", "fig", "Gen", "Gov", "Hon", "Hr", "Hosp", "Insp", "Lt", "MM",
+        "MR", "MRS", "MS", "Maj", "Messrs", "Mlle", "Mme", "Mr", "Mrs", "Ms", "Msgr", "Op", "Ord", "Pat", "Pfc", "Ph", "Prof", "Pvt", "Rep", "Reps", "Res", "Rev", "Rt", "Sen", "Sens", "Sfc", "Sgt", "Sr", "St",
+        "Supt", "Surg", "v", "vs", "U.S", "u.s", "U.K", "u.k", "i.e", "rev", "e.g", "No", "Nos", "Art", "Nr", "pp"};
+
+    private static Pattern splitRegex = Pattern.compile("(?<=[\\!\\.\\?]" + surrounderSymbols + ")\\s(?=" + surrounderSymbols + "\\p{Lu})");
+
+    private static List<String> splitSentences(String text) {
+        String[] sentencesRaw = splitRegex.split(text);
+
+        Boolean connectWithLatterPart = false;
+        List<String> sentencesClean = new ArrayList<>();
+        sentencesClean.add(sentencesRaw[0]);
+        for (int z = 1; z < sentencesRaw.length; z++) {
+            for (String abb : abbreviations) {
+                if (sentencesClean.get(sentencesClean.size() - 1).endsWith(abb + ".")) {
+                    connectWithLatterPart = true;
+                    break;
+                }
+            }
+            if (connectWithLatterPart) {
+                sentencesClean.set(sentencesClean.size() - 1, sentencesClean.get(sentencesClean.size() - 1) + " " + sentencesRaw[z]);
+            } else {
+                sentencesClean.add(sentencesRaw[z]);
+            }
+            connectWithLatterPart = false;
+        }
+
+        // If no valid sentence is found, the line itself is analyzed
+        if (sentencesClean.size() == 0) {
+            sentencesClean.add(text);
+        }
+
+        return sentencesClean;
+    }
+
+    private static Pattern multipleWhitespacePatternTwo = Pattern.compile("[\\u00A0\\t\\p{Zs}\\n\\cK\\f\\r\\x85\\x{2028}\\x{2029}]+");
+    private static Pattern multipleWhitespacePattern = Pattern.compile("\\s+");
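+    // The declaration of quotationMarkPattern and the preprocessSentence() helper
+    // (called from extract() above) follow as a minimal, assumed sketch: only the
+    // member name, the "(?..." opening of the regex, and the method signature are
+    // taken from the surrounding code; the regex body and method body are guesses.
+    private static Pattern quotationMarkPattern = Pattern.compile("(?<!\\s)[\\u2018\\u2019\\u0027](?=s\\s)"); // assumed regex
+
+    private static String preprocessSentence(String sentence) { // assumed body
+        sentence = multipleWhitespacePatternTwo.matcher(sentence).replaceAll(" ");
+        sentence = multipleWhitespacePattern.matcher(sentence).replaceAll(" ");
+        sentence = quotationMarkPattern.matcher(sentence).replaceAll("'");
+        return sentence.trim();
+    }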
+    public static List<TaggedWord> getWordlistBeforeSplittedPattern(CustomPattern custPat, String sentence, int onset, List<TaggedWord> tw) {
+        // Starts after the first keyword and ends before the second keyword
+        if (custPat.getType().equals("split_noPrefix")) {
+            return getWordListSubset(onset + custPat.getFirstKeyWord().length() + 1, sentence.toLowerCase().indexOf(custPat.getSecondKeyWord(), onset), tw);
+        }
+        // Starts before the first keyword
+        if (custPat.getType().equals("split")) {
+            return getWordListSubset(0, onset, tw);
+        }
+        // The onset is split "normally" in this case (as for compact patterns)
+        if (custPat.getType().equals("split_noSuffix")) {
+            return getWordListSubset(0, onset, tw);
+        }
+        return new ArrayList<>();
+    }
+
+    /**
+     * Depending on the type of custPat and the length of its two parts, determines
+     * the onset and offset of the second NounPhrase search area.
+     * @param custPat
+     * @param sentence
+     * @param onset (of the pattern inside the sentence)
+     * @param offset (of the pattern inside the sentence)
+     * @param tw
+     * @return
+     */
+    public static List<TaggedWord> getWordlistAfterSplittedPattern(CustomPattern custPat, String sentence, int onset, int offset, List<TaggedWord> tw) {
+        // Starts after the second keyword
+        if (custPat.getType().equals("split_noPrefix")) {
+            return getWordListSubset(offset, sentence.length(), tw);
+        }
+        // Starts at the onset and ends with the second keyword
+        if (custPat.getType().equals("split_noSuffix")) {
+            return getWordListSubset(onset, sentence.indexOf(custPat.getSecondKeyWord(), onset), tw);
+        }
+        // Starts after the first keyword
+        if (custPat.getType().equals("split")) {
+            return getWordListSubset(onset + custPat.getFirstKeyWord().length(), sentence.length(), tw);
+        }
+        return new ArrayList<>();
+    }
+
+    /**
+     * @param wordOffset This word offset points to a word in the sentence list
+     * and describes the starting point for the NounPhrase search
+     * @param sentence
+     * @param resultNPs
+     * @return
+     */
+    public static ArrayList<NounPhrase> findNextNounPhrase(int wordOffset, List<TaggedWord> sentence, ArrayList<NounPhrase> resultNPs) {
+        NounPhrase currentNP = new NounPhrase(maxNpSize);
+        int postOffset = 0;
+        for (int i = wordOffset; i < sentence.size(); i++) {
+            if (!sentence.get(i).tag().equals("DT") && !sentence.get(i).tag().startsWith("NN") && !sentence.get(i).tag().startsWith("JJ")
+                    && !sentence.get(i).tag().equals("VBN") && !sentence.get(i).word().toLowerCase().equals("and")
+                    && !sentence.get(i).word().toLowerCase().equals("or") && !sentence.get(i).word().toLowerCase().equals("&")
+                    && resultNPs.size() > 0) {
+                return resultNPs;
+            }
+
+            if (sentence.get(i).tag().startsWith("NN")) {
+                currentNP.setNPCore(sentence.get(i));
+                findPreMod(i, sentence, currentNP);
+                if (sentence.get(i).word().endsWith(",")) {
+                    resultNPs.add(cleanNounPhrase(currentNP, false));
+                    findNextNounPhrase(i + 1, sentence, resultNPs);
+                    return resultNPs;
+                }
+                postOffset = findPostMod(i, sentence, currentNP);
+                if (postOffset != -1) {
+                    resultNPs.add(cleanNounPhrase(currentNP, false));
+                    findNextNounPhrase(postOffset + 1, sentence, resultNPs);
+                    return resultNPs;
+                } else {
+                    resultNPs.add(cleanNounPhrase(currentNP, false));
+                    return resultNPs;
+                }
+            }
+        }
+        return resultNPs;
+    }
+
+    /**
+     * Searches for potential pre-modifying words of the NounPhrase.
+     *
+     * @param nnOffset Is a pointer on the last word analyzed (which has to be a
+     * valid Core-NN)
+     * @param sentence
+     * @param currentNP
+     */
+    public static void findPreMod(int nnOffset, List<TaggedWord> sentence, NounPhrase currentNP) {
+        for (int i = nnOffset - 1; i > nnOffset - currentNP.getMaxNPLength() && i >= 0; i--) {
((sentence.get(i).tag().startsWith("JJ") | sentence.get(i).tag().equals("VBN")) && !sentence.get(i).word().endsWith(",")) { + currentNP.addPreModifier(sentence.get(i)); + } else { + return; + } + } + } + + /** + * Searches and stores potential post-modifying words of the NounPhrase, as + * well as looks for potential coordinations + * + * @param nnOffset Is a pointer on the last word analyzed (which has to be a + * valid Core-NN) + * @param sentence + * @param currentNP + * @return -1, if no potential coordination was found; word-offset of the + * last analyzed word (will be used to search for next NounPhrase) + */ + public static int findPostMod(int nnOffset, List sentence, NounPhrase currentNP) { + for (int i = nnOffset + 1; i < nnOffset + currentNP.getMaxNPLength() && i < sentence.size(); i++) { + if (sentence.get(i).tag().startsWith("JJ") | sentence.get(i).tag().equals("VBN") | sentence.get(i).tag().equals("VBG") | sentence.get(i).tag().startsWith("NN") | sentence.get(i).tag().equals("IN") | sentence.get(i).tag().equals("CD") | sentence.get(i).tag().equals("DT")) { + currentNP.addPostModifier(sentence.get(i)); + } + if (sentence.get(i).word().toLowerCase().equals("and") | sentence.get(i).word().toLowerCase().equals("or") | sentence.get(i).word().toLowerCase().equals("&")) { + return i; + } + if (!(sentence.get(i).tag().startsWith("JJ") | sentence.get(i).tag().equals("VBN") | sentence.get(i).tag().equals("VBG") | sentence.get(i).tag().startsWith("NN") | sentence.get(i).tag().equals("IN") | sentence.get(i).tag().equals("CD") | sentence.get(i).tag().equals("DT") | sentence.get(i).word().toLowerCase().equals("and") | sentence.get(i).word().toLowerCase().equals("or") | sentence.get(i).word().toLowerCase().equals("&"))) { + return -1; + } + if (sentence.get(i).word().endsWith(",")) { + return i; + } + } + if (sentence.size() > nnOffset + currentNP.getMaxNPLength()) { + if (sentence.get(nnOffset + currentNP.getMaxNPLength()).word().toLowerCase().equals("and") | sentence.get(nnOffset + currentNP.getMaxNPLength()).word().toLowerCase().equals("or") | sentence.get(nnOffset + currentNP.getMaxNPLength()).word().toLowerCase().equals("&")) { + return nnOffset + currentNP.getMaxNPLength(); + } + } + return -1; + } + + public static ArrayList findNextNounPhraseReverse(int wordOffset, List sentence, ArrayList resultNPs) { + int status = 0; + NounPhrase currentNP = new NounPhrase(maxNpSize); + for (int i = wordOffset; i < sentence.size(); i++) { + if (!sentence.get(i).tag().startsWith("NN") && !sentence.get(i).tag().equals("VBG") && !sentence.get(i).tag().equals("IN") && !sentence.get(i).tag().equals("CD") && !sentence.get(i).tag().equals("DT") && resultNPs.size() > 0) { + return resultNPs; + } + + if (sentence.get(i).tag().startsWith("NN")) { + currentNP.setNPCore(sentence.get(i)); + if (sentence.get(i).word().endsWith(",")) { + status = findPreModReverse(i, sentence, currentNP); + if (status == -2) { + findNextNounPhraseReverse(i + 1, sentence, resultNPs); + return resultNPs; + } else if (status > 0) { + resultNPs.add(cleanNounPhrase(currentNP, true)); + findNextNounPhraseReverse(status + 1, sentence, resultNPs); + return resultNPs; + } else { + resultNPs.add(cleanNounPhrase(currentNP, true)); + return resultNPs; + } + } else { + status = findPreModReverse(i, sentence, currentNP); + if (status == -2) { + findNextNounPhraseReverse(i + 1, sentence, resultNPs); + return resultNPs; + } + //Neue NounPhrase Entdeckt y=>ist neues Offset + if (status > 0) { + findPostModReverse(i, sentence, currentNP); + 
resultNPs.add(cleanNounPhrase(currentNP, true)); + findNextNounPhraseReverse(status, sentence, resultNPs); + return resultNPs; + } else { + findPostModReverse(i, sentence, currentNP); + resultNPs.add(cleanNounPhrase(currentNP, true)); + return resultNPs; + } + } + } + } + return resultNPs; + } + + /** + * In this stage the post-modifier of NounPHrase is reduced until the Last + * NN is found; The cleaning of NounPhrases with regard to Symbols can be + * done in the Aggregation step; At this stage the risk is too high, that + * valuable information gets lost / Algorithms can't be too precise at this + * stage + * + * @param np + * @param isReverse + * @return + */ + public static NounPhrase cleanNounPhrase(NounPhrase np, Boolean isReverse) { + //If dot is inside a NN, the NN is split: Depending if entity was extracted inFront of or afterPattern + /* + String[] npPartHolder = np.getNPCore().word().split("(?<=(\\p{L}|\\d))\\s?[\\.\\?\\!\\(\\)\\{\\}\\[\\]]+\\s?(?=\\p{L})"); + + if (npPartHolder.length>=2) + { + if (isReverse) + { + np.NPCore.setWord(npPartHolder[npPartHolder.length-1]); + np.clearPreMod(); + } + else + { + np.NPCore.setWord(npPartHolder[0]); + np.clearPostMod(); + } + }*/ + + //Reduce the Post-Modifier or PreModifier until the last NN is found + try { + if (np.getPostModifier() != null && !np.getPostModifier().isEmpty()) { + np.getPostModifier().get(np.getPostModifier().size() - 1).setWord(np.getPostModifier().get(np.getPostModifier().size() - 1).word().trim().replaceAll("(\\.|\\,|\\;|\\:|\\?|\\!)$", "")); + while (!np.getPostModifier().get(np.getPostModifier().size() - 1).tag().startsWith("NN") && !np.getPostModifier().get(np.getPostModifier().size() - 1).tag().equals("CD")) { + np.getPostModifier().remove(np.getPostModifier().size() - 1); + if (np.getPostModifier().isEmpty()) { + break; + } + } + } + + if (np.getPreModifier() != null && !np.getPreModifier().isEmpty()) { + np.getPreModifier().get(np.getPreModifier().size() - 1).setWord(np.getPreModifier().get(np.getPreModifier().size() - 1).word().trim().replaceAll("(\\.|\\,|\\;|\\:|\\?|\\!)$", "")); + while (!np.getPreModifier().get(0).tag().startsWith("JJ") && !np.getPreModifier().get(0).tag().equals("VBN")) { + np.getPreModifier().remove(0); + if (np.getPreModifier().isEmpty()) { + break; + } + } + } + } catch (ArrayIndexOutOfBoundsException e) { } + + //Clean the single word of NounPhrases if there are still symbols in front or behind it; (in case of multiple symbols) + //Problem with abbreviations or websites => Can be filtered in next steps + // KOmma removed, because it intereferes with Coordinations; (Not in Aggregation step ;-) ) + //KOmma added - Sven + np.NPCore.setWord(np.NPCore.word().trim().replaceAll("[\\.\\,\\;\\:\\?\\!\\(\\)\\[\\]\\{\\}]+$", "")); + np.NPCore.setWord(np.NPCore.word().trim().replaceAll("^[\\.\\,\\;\\:\\?\\!\\(\\)\\[\\]\\{\\}]+", "")); + + return np; + } + + public static int findPreModReverse(int nnOffset, List sentence, NounPhrase currentNP) { + boolean premodFinished = false; + for (int i = nnOffset + 1; i < nnOffset + currentNP.getMaxNPLength() && i < sentence.size(); i++) { + if ((sentence.get(i).tag().startsWith("JJ") | sentence.get(i).tag().equals("VBN")) && !sentence.get(i).word().endsWith(",") && !premodFinished) { + currentNP.addPreModifier(sentence.get(i)); + } else if ((sentence.get(i).tag().equals("VBG") | sentence.get(i).tag().equals("IN") | sentence.get(i).tag().equals("CD") | sentence.get(i).tag().equals("DT"))) { + //ignore: It might be that this is pat of a postModifier: if 
another NN is found; + premodFinished = true; + } else if (sentence.get(i).tag().startsWith("NN") && !sentence.get(i).word().endsWith(",")) { + return -2; + } else if (sentence.get(i).word().endsWith(",")) { + return i; + } else if (sentence.get(i).word().toLowerCase().equals("and") | sentence.get(i).word().toLowerCase().equals("or") | sentence.get(i).word().toLowerCase().equals("&")) { + return i + 1; + } else { + return -1; + } + } + + if (sentence.size() > nnOffset + currentNP.getMaxNPLength()) { + if (sentence.get(nnOffset + currentNP.getMaxNPLength()).word().toLowerCase().equals("and") | sentence.get(nnOffset + currentNP.getMaxNPLength()).word().toLowerCase().equals("or") | sentence.get(nnOffset + currentNP.getMaxNPLength()).word().toLowerCase().equals("&")) { + return nnOffset + currentNP.getMaxNPLength() + 1; + } + } + return -1; + } + + + public static void findPostModReverse(int nnOffset, List sentence, NounPhrase currentNP) { + for (int i = nnOffset - 1; i > nnOffset - currentNP.getMaxNPLength() && i >= 0; i--) { + if (sentence.get(i).word().toLowerCase().equals("and") | sentence.get(i).word().toLowerCase().equals("or") | sentence.get(i).word().toLowerCase().equals("&")) { + return; + } + + if (sentence.get(i).tag().startsWith("JJ") | sentence.get(i).tag().equals("VBN") | sentence.get(i).tag().equals("VBG") | sentence.get(i).tag().startsWith("NN") | sentence.get(i).tag().equals("IN") | sentence.get(i).tag().equals("CD") | sentence.get(i).tag().equals("DT")) { + currentNP.addPostModifier(sentence.get(i)); + } + + if (sentence.get(i).word().endsWith(",")) { + return; + } + } + } + + + /** + * The method returns a subList of a tagged-Word List (the entire sentence). + * The Onset and Offset are pointers in the sentence String, which restrict + * the retunred List of words; + * + * @param onset + * @param offset + * @param taggedWords + * @return + */ + public static List getWordListSubset(int onset, int offset, List taggedWords) { + List result = new ArrayList(); + int charCounter = 0; + for (TaggedWord tw : taggedWords) { + charCounter += tw.word().length(); + if (charCounter >= onset && charCounter <= offset) { + result.add(tw); + } + charCounter++; + } + return result; + } + + + //Variables and Constants for false-positive detection + private static Set allExclusions = createAllExclusions(); + private static Set createAllExclusions(){ + Set exclusions = new HashSet<>(); + exclusions.addAll(Arrays.asList("that","this","these","those"));//demonstratives + exclusions.addAll(Arrays.asList("mine","yours","his","hers","its","ours","theirs"));//possessives + exclusions.addAll(Arrays.asList("i","you","he","she","it","we","they"));//personals + exclusions.addAll(Arrays.asList("where", "who", "when", "what", "why", "whose", "which", "how"));//questions + exclusions.addAll(Arrays.asList("there")); + return exclusions; + } + + +} diff --git a/core/src/main/java/org/dbpedia/extraction/hearst/IsaPattern.java b/core/src/main/java/org/dbpedia/extraction/hearst/IsaPattern.java new file mode 100644 index 0000000000..d5b1212660 --- /dev/null +++ b/core/src/main/java/org/dbpedia/extraction/hearst/IsaPattern.java @@ -0,0 +1,71 @@ +package org.dbpedia.extraction.hearst; + +import edu.stanford.nlp.ling.TaggedWord; +import java.util.ArrayList; +import java.util.List; + + +public class IsaPattern { + private ArrayList instance; + private ArrayList clazz; + + public IsaPattern(ArrayList instance, ArrayList clazz) { + this.instance = (ArrayList)instance.clone(); + this.clazz = (ArrayList)clazz.clone(); + } 
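+
+    // Both lists are cloned defensively, so later changes to the caller's working
+    // lists cannot alter this pattern. Illustration (assuming the active pattern
+    // p8a, "X is a Y"): for "Gasoline is a transparent fuel", instance holds the
+    // NounPhrase "Gasoline", clazz holds "transparent fuel", and toString() renders
+    // the pair roughly as {_Gasoline_} --isa--> {transparent _fuel_}.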
+ + public ArrayList getInstance() { + return instance; + } + + public ArrayList getClazz() { + return clazz; + } + + @Override + public String toString() { + return nounPhraseListUnderscoreToString(instance) + " --isa--> " + nounPhraseListUnderscoreToString(clazz); + } + + + private static String nounPhraseListToString(ArrayList nps) { + if (nps.size() == 0) { + return "{}"; + } + StringBuilder result = new StringBuilder(); + result.append("{"); + for (NounPhrase np : nps) { + result.append(np.toString()).append("|"); + } + result.setLength(result.length() - 1); + result.append("}"); + return result.toString(); + } + + private static String nounPhraseListUnderscoreToString(ArrayList nps) { + if (nps.size() == 0) { + return "{}"; + } + StringBuilder result = new StringBuilder(); + result.append("{"); + for (NounPhrase np : nps) { + for (TaggedWord tw : np.getPreModifier()) { + result.append(tw.word()).append(" "); + } + result.append("_"); + result.append(np.getNPCore().word()); + result.append("_"); + for (TaggedWord tw : np.getPostModifier()) { + result.append(tw.word()).append(" "); + } + //if (result.length() > 0) { + // result.setLength(result.length() - 1); + //} + + result.append("|"); + } + result.setLength(result.length() - 1); + result.append("}"); + return result.toString(); + } +} diff --git a/core/src/main/java/org/dbpedia/extraction/hearst/NounPhrase.java b/core/src/main/java/org/dbpedia/extraction/hearst/NounPhrase.java new file mode 100644 index 0000000000..99f0224890 --- /dev/null +++ b/core/src/main/java/org/dbpedia/extraction/hearst/NounPhrase.java @@ -0,0 +1,139 @@ +package org.dbpedia.extraction.hearst; + +import java.util.ArrayList; + +import edu.stanford.nlp.ling.TaggedWord; +import java.util.StringJoiner; + +public class NounPhrase { + + public TaggedWord NPCore; + private ArrayList preModifier, postModifier; + private boolean isComplete; + private boolean coreFound; + private int maxNPLength; + + public NounPhrase(int maxNPLength) { + this.maxNPLength = maxNPLength; + + isComplete = false; + preModifier = new ArrayList(); + postModifier = new ArrayList(); + } + + public void addPreModifier(TaggedWord tw) { + preModifier.add(0, tw); + if (preModifier.size() == maxNPLength) { + preModifier.remove(preModifier.size() - 1); + } + } + + public void addPostModifier(TaggedWord tw) { + postModifier.add(tw); + if (postModifier.size() + 1 + preModifier.size() > maxNPLength) { + if (preModifier.size() > 0) { + preModifier.remove(0); + } else { + isComplete = true; + } + } + } + + public void NPCoreToPost(TaggedWord tw) { + postModifier.add(NPCore); + NPCore = tw; + if (postModifier.size() + 1 == maxNPLength) { + isComplete = true; + } + } + + public void clearPreMod() { + preModifier.clear(); + } + + public void clearPostMod() { + postModifier.clear(); + } + + public void setNPCore(TaggedWord tw) { + NPCore = tw; + if (postModifier.size() + 1 == maxNPLength) { + isComplete = true; + } + } + + public TaggedWord getNPCore() { + return NPCore; + } + + public ArrayList getPreModifier() { + return preModifier; + } + + public ArrayList getPostModifier() { + return postModifier; + } + + public String getPreModifierText() { + StringJoiner joiner = new StringJoiner(" "); + for (TaggedWord tw : preModifier) { + joiner.add(tw.word()); + } + return joiner.toString(); + } + + public String getPostModifierText() { + StringJoiner joiner = new StringJoiner(" "); + for (TaggedWord tw : postModifier) { + joiner.add(tw.word()); + } + return joiner.toString(); + } + + public String 
getNPCoreText() {
+        return NPCore.word();
+    }
+
+    public int getMaxNPLength() {
+        return maxNPLength;
+    }
+
+    public boolean isCoreFound() {
+        return coreFound;
+    }
+
+    public boolean isComplete() {
+        return isComplete;
+    }
+
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        for (TaggedWord tw : preModifier) {
+            sb.append(tw.word()).append(" ");
+        }
+        sb.append(NPCore.word()).append(" ");
+        for (TaggedWord tw : postModifier) {
+            sb.append(tw.word()).append(" ");
+        }
+        if (sb.length() > 0) {
+            sb.setLength(sb.length() - 1);
+        }
+        return sb.toString();
+    }
+
+    public String tagsToString() {
+        StringBuilder sb = new StringBuilder();
+        for (TaggedWord tw : preModifier) {
+            sb.append(tw.tag()).append(" ");
+        }
+        sb.append(NPCore.tag()).append(" ");
+        for (TaggedWord tw : postModifier) {
+            sb.append(tw.tag()).append(" ");
+        }
+        if (sb.length() > 0) {
+            sb.setLength(sb.length() - 1);
+        }
+        return sb.toString();
+    }
+}
diff --git a/core/src/main/resources/datasetdefinitions.json b/core/src/main/resources/datasetdefinitions.json
index 946296ce04..c9ff844289 100644
--- a/core/src/main/resources/datasetdefinitions.json
+++ b/core/src/main/resources/datasetdefinitions.json
@@ -148,7 +148,13 @@
       "traits":"LinkedData, Published",
       "desc": "Dataset linking a DBpedia resource to the same resource in other wikis.",
       "defaultgraph": "dataset"
-    },
+    },
+    "interwiki_links_link_section": {
+      "name": "InterWiki Links in Link Section",
+      "traits":"LinkedData, Published",
+      "desc": "All links which appear in a link section and link to another wiki.",
+      "defaultgraph": "dataset"
+    },
     "interlanguage_links_chapters": {
       "name": "Interlanguage Links between DBpedia Chapters",
       "traits":"LinkedData, Published",
@@ -471,6 +477,12 @@
       "traits":"LinkedData",
       "desc": "This are all equations collected during the NIF extraction, transformed into MathML XML syntax.",
       "defaultgraph": "dataset"
+    },
+    "hearst_patterns": {
+      "name": "Hearst Patterns in the short abstract",
+      "traits":"LinkedData",
+      "desc": "Hearst Patterns extracted from the short abstract.",
+      "defaultgraph": "dataset"
+    }
   }
 },
 "links":{
diff --git a/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala b/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala
index 4e6c4265a0..a2e6e4e427 100644
--- a/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala
@@ -156,6 +156,7 @@ object DBpediaDatasets extends java.io.Serializable
     val PageIds: Dataset = datasets("page_ids")
     val InterLanguageLinks: Dataset = datasets("interlanguage_links") // Since the inter-language links were moved from Wikipedia to Wikidata, we now extract these links from the Wikidata dump, not from Wikipedia pages.")
     val InterWikiLinks: Dataset = datasets("interwiki_links")
+    val InterWikiLinksLinkSection: Dataset = datasets("interwiki_links_link_section")
     val InterLanguageLinksChapter: Dataset = datasets("interlanguage_links_chapters")
     val Genders: Dataset = datasets("genders")
     val TopicalConcepts: Dataset = datasets("topical_concepts")
@@ -229,6 +230,7 @@
     val NifTextLinks: Dataset = datasets("nif_text_links")
     val RawTables: Dataset = datasets("raw_tables")
     val Equations: Dataset = datasets("equations")
+    val HearstPatterns: Dataset = datasets("hearst_patterns")
 
     /**
      * Links
diff --git
a/core/src/main/scala/org/dbpedia/extraction/mappings/ArticleTemplatesClassExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/ArticleTemplatesClassExtractor.scala index 2e4fb5ed0b..1f21c4e0f5 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/ArticleTemplatesClassExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/ArticleTemplatesClassExtractor.scala @@ -58,7 +58,7 @@ class ArticleTemplatesClassExtractor( // title = title.substring(indexInfoBox + 7, title.length) //} titleLower = titleLower.replaceAll("infobox", "") - titleLower = titleLower.replace("/", "") + titleLower = titleLower.replace("/", "_") titleLower = stripAll(titleLower, " _-") if(titleLower.nonEmpty) { var classUri = context.language.dbpediaUri + "/class/" + titleLower @@ -68,7 +68,7 @@ class ArticleTemplatesClassExtractor( infoboxSeenClasses.synchronized { if (!infoboxSeenClasses.contains(classUri)) { - var classLabel = template.title.decoded.replaceAll("(?i)infobox", "") + var classLabel = template.title.decoded.replaceAll("(?i)infobox", "").replace("/", " ") classLabel = stripAll(classLabel, " _-") infoboxSeenClasses += classUri quads += new Quad(context.language, DBpediaDatasets.InfoboxTemplateTypeDefinitions, classUri, typeProperty, owlClass, node.sourceIri) @@ -87,13 +87,17 @@ class ArticleTemplatesClassExtractor( }else{ var selectedTemplateNode = templateNodes.sortBy(_.children.length).last - var templateClassUri = context.language.dbpediaUri + "/class/" + selectedTemplateNode.title.encoded.toLowerCase + var titleLower = selectedTemplateNode.title.encoded.toLowerCase + titleLower = titleLower.replace("/", "_") + titleLower = stripAll(titleLower, " _-") + + var templateClassUri = context.language.dbpediaUri + "/class/" + titleLower quads += new Quad(context.language, DBpediaDatasets.TemplateType, subjectUri, typeProperty, templateClassUri, node.sourceIri) seenClasses.synchronized { if (!seenClasses.contains(templateClassUri)) { - var classLabel = selectedTemplateNode.title.decoded + var classLabel = selectedTemplateNode.title.decoded.replace("/", " ") classLabel = stripAll(classLabel, " _-") seenClasses += templateClassUri quads += new Quad(context.language, DBpediaDatasets.TemplateTypeDefinitions, templateClassUri, typeProperty, owlClass, node.sourceIri) diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/NifNewExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/NifNewExtractor.scala index 8204a32a7f..d3f06f1d48 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/NifNewExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/NifNewExtractor.scala @@ -49,7 +49,8 @@ class NifNewExtractor( DBpediaDatasets.NifContext,DBpediaDatasets.NifPageStructure,DBpediaDatasets.NifTextLinks, DBpediaDatasets.RawTables, DBpediaDatasets.Equations, DBpediaDatasets.LongAbstracts, DBpediaDatasets.ShortAbstracts, - DBpediaDatasets.InterWikiLinks, DBpediaDatasets.ExternalLinks, DBpediaDatasets.PageLinks, DBpediaDatasets.InterLanguageLinks + DBpediaDatasets.InterWikiLinks, DBpediaDatasets.ExternalLinks, DBpediaDatasets.PageLinks, DBpediaDatasets.InterLanguageLinks, DBpediaDatasets.InterWikiLinksLinkSection, + DBpediaDatasets.HearstPatterns ) var config: WikiConfig = getSwebleConfig() @@ -98,15 +99,17 @@ class NifNewExtractor( //new PrintWriter(URLEncoder.encode(pageNode.title.encoded + "_ast_expansion", StandardCharsets.UTF_8.toString)) { write(AstPrinter.print[WtNode](page)); close } var astVisitor = new 
NifExtractionAstVisitor(context.language) astVisitor.go(page) - if(astVisitor.getFullText().trim.length == 0){ + if(astVisitor.getFullText().trim.length <= 5){ //don't expand templates page = engine.postprocess(pageId, source,null).getPage astVisitor = new NifExtractionAstVisitor(context.language) astVisitor.go(page) } - //extractionInfoPrinter(astVisitor, URLEncoder.encode(pageNode.title.encoded + "_nif_extractor", StandardCharsets.UTF_8.toString)) - quads ++= new GeneralNifExtractor(context, pageNode).extractNif(astVisitor.getTocMap(), astVisitor.getFullText()) - + if(astVisitor.getFullText().trim.length >= 5) { + //extractionInfoPrinter(astVisitor, URLEncoder.encode(pageNode.title.encoded + "_nif_extractor", StandardCharsets.UTF_8.toString)) + //new PrintWriter(URLEncoder.encode(pageNode.title.encoded + "_ast_expansion", StandardCharsets.UTF_8.toString)) { write(AstPrinter.print[WtNode](page)); close } + quads ++= new GeneralNifExtractor(context, pageNode).extractNif(astVisitor.getTocMap(), astVisitor.getFullText()) + } //} quads } diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/GeneralNifExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/nif/GeneralNifExtractor.scala index 260fdccce1..4ced408819 100644 --- a/core/src/main/scala/org/dbpedia/extraction/nif/GeneralNifExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/nif/GeneralNifExtractor.scala @@ -3,12 +3,14 @@ package org.dbpedia.extraction.nif import util.control.Breaks._ import org.dbpedia.extraction.config.Config import org.dbpedia.extraction.config.provenance.DBpediaDatasets -import org.dbpedia.extraction.ontology.{Ontology, OntologyProperty, RdfNamespace} +import org.dbpedia.extraction.hearst.ExtractHearstPatterns +import org.dbpedia.extraction.ontology.{DBpediaNamespace, Ontology, OntologyProperty, RdfNamespace} import org.dbpedia.extraction.transform.{Quad, QuadBuilder} import org.dbpedia.extraction.util.Language import org.dbpedia.extraction.wikiparser.WikiPage import org.dbpedia.iri.UriUtils +import scala.collection.mutable import scala.collection.mutable.{ArrayBuffer, ListBuffer} import scala.util.{Failure, Success} import scala.language.reflectiveCalls @@ -29,8 +31,6 @@ class GeneralNifExtractor ( private val nifContextUri = wikiPage.uri + "?dbpv=" + context.configFile.dbPediaVersion + "&nif=context" private val sourceUrl = wikiPage.sourceIri - private val nifcontext = "http://dbkwik.webdatacommons.org/ontology/nifcontext" - private val ontologyLink = "http://dbkwik.webdatacommons.org/ontology/" val wikiPageWikiLinkProperty = context.ontology.properties("wikiPageWikiLink") val wikiPageExternalLinkProperty = context.ontology.properties("wikiPageExternalLink") @@ -38,7 +38,10 @@ class GeneralNifExtractor ( val wikiPageInterLanguageLinkProperty = context.ontology.properties("wikiPageInterLanguageLink") val labelProperty = context.ontology.properties("rdfs:label") - + val hasTypePreModifier = DBpediaNamespace.ONTOLOGY.append("hasTypePreModifier") + val hasTypeHead = DBpediaNamespace.ONTOLOGY.append("hasTypeHead") + val hasTypePostModifier = DBpediaNamespace.ONTOLOGY.append("hasTypePostModifier") + val hasType = DBpediaNamespace.ONTOLOGY.append("hasType") protected lazy val nifContext: (String, String, String, String, String) => Quad = QuadBuilder.dynamicPredicate(context.language.isoCode,DBpediaDatasets.NifContext.encoded) _ @@ -73,12 +76,29 @@ class GeneralNifExtractor ( val describingParagraphs = section.paragraphs//getParagraphsDescribingConcept(section, text) if(describingParagraphs.size 
> 0){
       quads += longQuad(wikiPage.uri, text.substring(describingParagraphs.head.begin.getOrElse(0), describingParagraphs.last.end.getOrElse(0)), sourceUrl) //text.substring(section.begin.getOrElse(0), section.end.getOrElse(0)), sourceUrl)
-      quads += shortQuad(wikiPage.uri, getShortAbstract(describingParagraphs, text), sourceUrl) // getShortAbstract(section.paragraphs, text), sourceUrl)
+      val shortAbstract = getShortAbstract(describingParagraphs, text)
+      quads += shortQuad(wikiPage.uri, shortAbstract, sourceUrl) // getShortAbstract(section.paragraphs, text), sourceUrl)
+      quads ++= extractHearstPattern(wikiPage.uri, shortAbstract, wikiPage.title.decoded)
     }
   }
   quads
 }
 
+  private def extractHearstPattern(subject:String, text:String, title:String): ArrayBuffer[Quad] = {
+    var quads = ArrayBuffer[Quad]()
+    val np = ExtractHearstPatterns.extract(text, title)
+    if (np != null) {
+      if(!np.getPreModifierText.isEmpty)
+        quads += new Quad(context.language, DBpediaDatasets.HearstPatterns, subject, hasTypePreModifier, np.getPreModifierText, wikiPage.sourceIri, context.ontology.datatypes("rdf:langString"))
+      quads += new Quad(context.language, DBpediaDatasets.HearstPatterns, subject, hasTypeHead, np.getNPCoreText(), wikiPage.sourceIri, context.ontology.datatypes("rdf:langString"))
+      if(!np.getPostModifierText.isEmpty)
+        quads += new Quad(context.language, DBpediaDatasets.HearstPatterns, subject, hasTypePostModifier, np.getPostModifierText, wikiPage.sourceIri, context.ontology.datatypes("rdf:langString"))
+
+      quads += new Quad(context.language, DBpediaDatasets.HearstPatterns, subject, hasType, np.toString, wikiPage.sourceIri, context.ontology.datatypes("rdf:langString"))
+    }
+    quads
+  }
+
   private def getParagraphsDescribingConcept(section: NifSection, text:String):Seq[NifParagraph] = {
     val wikiTitleWords = Set() ++ wikiPage.title.decoded.toLowerCase.split("[\\s\\u202F\\u00A0]").map(s=>s.trim)
     //try to find wiki title somewhere in the beginning of a paragraph
@@ -148,7 +168,7 @@
     quads += nifContext(nifContextUri, RdfNamespace.NIF.append("sourceUrl"), sourceUrl, sourceUrl, null)
     quads += nifContext(nifContextUri, RdfNamespace.NIF.append("isString"), text.toString(), sourceUrl, RdfNamespace.XSD.append("string"))
     quads += nifContext(nifContextUri, RdfNamespace.NIF.append("predLang"), "http://lexvo.org/id/iso639-3/" + context.language.iso639_3, sourceUrl, null)
-    quads += nifContext(wikiPage.uri, nifcontext, nifContextUri, sourceUrl, null) //link between resource and nif context
+    quads += nifContext(wikiPage.uri, DBpediaNamespace.ONTOLOGY.append("nifcontext"), nifContextUri, sourceUrl, null) //link between resource and nif context
     quads
   }
@@ -232,6 +252,22 @@
       quads ++= writeLinks(paragraphObject, paragraph, text)
       lastParagraph = Some(paragraph)
     }
+    //write links only occurring in a link section:
+    if(section.id.contains("link")){ //english: link, german: link, spanish: enlace, french: lien
+      val links = mutable.Map[String, mutable.HashSet[NifLink]]()
+      for (paragraphObject <- section.paragraphs) {
+        for (link <- paragraphObject.links) {
+          if(link.begin.nonEmpty && link.end.nonEmpty && link.linkType == NifLinkType.InterWiki && !link.uri.contains("#")) {
+            links.getOrElseUpdate(link.wikiTarget, new mutable.HashSet[NifLink]) += link
+          }
+        }
+      }
+      for ((wiki, linkSet) <- links) {
+        if(linkSet.size == 1){ //check if we have only one link to another wiki
+          quads += new Quad(context.language, DBpediaDatasets.InterWikiLinksLinkSection, wikiPage.uri,
wikiPageInterWikiLinkProperty, linkSet.head.uri, sourceUrl, null) + } + } + } quads } @@ -245,7 +281,7 @@ class GeneralNifExtractor ( val typ = if (linkText.split(" ").length > 1) "Phrase" else "Word" val word = getNifIri(typ.toString.toLowerCase, link.begin.getOrElse(0), link.end.getOrElse(0)) quads += nifLinks(word, RdfNamespace.RDF.append("type"), RdfNamespace.NIF.append(typ), sourceUrl, null) - quads += nifLinks(word, RdfNamespace.RDF.append("type"), ontologyLink + link.linkType.toString + "Link", sourceUrl, null) + quads += nifLinks(word, RdfNamespace.RDF.append("type"), DBpediaNamespace.ONTOLOGY.append(link.linkType.toString + "Link"), sourceUrl, null) quads += nifLinks(word, RdfNamespace.NIF.append("referenceContext"), nifContextUri, sourceUrl, null) quads += nifLinks(word, RdfNamespace.NIF.append("beginIndex"), link.begin.getOrElse(0).toString, sourceUrl, RdfNamespace.XSD.append("nonNegativeInteger")) quads += nifLinks(word, RdfNamespace.NIF.append("endIndex"), link.end.getOrElse(0).toString, sourceUrl, RdfNamespace.XSD.append("nonNegativeInteger")) diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/NifExtractionAstVisitor.scala b/core/src/main/scala/org/dbpedia/extraction/nif/NifExtractionAstVisitor.scala index 7274de55a2..f5bf6b5446 100644 --- a/core/src/main/scala/org/dbpedia/extraction/nif/NifExtractionAstVisitor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/nif/NifExtractionAstVisitor.scala @@ -136,7 +136,8 @@ class NifExtractionAstVisitor(language : Language) begin = None, end = None, uri = link, - linkType = NifLinkType.External + linkType = NifLinkType.External, + wikiTarget = "" ) nifLink.begin = Some(context.length) write(link) @@ -159,7 +160,8 @@ class NifExtractionAstVisitor(language : Language) begin = None, end = None, uri = strLink, - linkType = NifLinkType.External + linkType = NifLinkType.External, + wikiTarget = "" ) nifLink.begin = Some(context.length) if(link.hasTitle) @@ -178,6 +180,7 @@ class NifExtractionAstVisitor(language : Language) def visit(link: WtInternalLink): Unit = { var linkTarget = link.getTarget.getAsString var linkType = NifLinkType.Internal + var wikiTarget = "" try { val destinationTitle = WikiTitle.parse(link.getTarget.getAsString, language) @@ -203,6 +206,7 @@ class NifExtractionAstVisitor(language : Language) linkType = NifLinkType.Internal } } + wikiTarget = destinationTitle.language.dbpediaDomain } catch { case _: Throwable => { write(link.getPrefix) @@ -216,7 +220,8 @@ class NifExtractionAstVisitor(language : Language) begin = None, end = None, uri = linkTarget, - linkType = linkType + linkType = linkType, + wikiTarget = wikiTarget ) nifLink.begin = Some(context.length) write(link.getPrefix) @@ -366,6 +371,7 @@ class NifExtractionAstVisitor(language : Language) def visit(n: WtPageSwitch): Unit = {} def visit(hr: WtHorizontalRule): Unit = {} def visit(n: WtTable): Unit = {}//no table + def visit(e: WtXmlEmptyTag): Unit = {} def visit(e: WtXmlEndTag): Unit = {} def visit(e: WtXmlStartTag): Unit = {} def visit(n: WtXmlComment): Unit = {} @@ -382,18 +388,22 @@ class NifExtractionAstVisitor(language : Language) private def write(s: String): Unit = { - if (s.isEmpty) return - if(context.isEmpty){ - context ++= s.replace("\n", "").replaceAll(" +", " ").replaceAll("^\\s+", "") + val processed = preprocessString(s) + if (processed.isEmpty) return + if(context.isEmpty || context.last == ' '){ + context ++= processed.replaceAll("^[\\s| ]+", "") }else{ - if(context.last == ' '){ - context ++= s.replace("\n", "").replaceAll(" +", " 
").replaceAll("^\\s+", "") - }else{ - context ++= s.replace("\n", "").replaceAll(" +", " ") - } + context ++= processed } } + private def preprocessString(s:String): String = { + s.replace("\n", "").replace("\r", "").replace("\t", "") + .replaceAll("[\\s| ]+", " ") //different whitespace character + } + + + private def write(cs: Array[Char]): Unit = { write(String.valueOf(cs)) } diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/NifLink.scala b/core/src/main/scala/org/dbpedia/extraction/nif/NifLink.scala index e03eddbaee..958e0a3b50 100644 --- a/core/src/main/scala/org/dbpedia/extraction/nif/NifLink.scala +++ b/core/src/main/scala/org/dbpedia/extraction/nif/NifLink.scala @@ -4,7 +4,8 @@ class NifLink( var begin: Option[Int], var end: Option[Int], var uri: String, - var linkType: NifLinkType.Value + var linkType: NifLinkType.Value, + var wikiTarget : String ) { } diff --git a/scripts/src/main/java/org/dbpedia/extraction/relation/RelationExtractionJava.java b/scripts/src/main/java/org/dbpedia/extraction/relation/RelationExtractionJava.java index adde0e924b..687c8db18e 100644 --- a/scripts/src/main/java/org/dbpedia/extraction/relation/RelationExtractionJava.java +++ b/scripts/src/main/java/org/dbpedia/extraction/relation/RelationExtractionJava.java @@ -40,8 +40,6 @@ public static List run(Language language) throws Exception { Map> predictionCandidates = predictionCandidates(models, relations); - Set s = predictionCandidates.get("http://dbkwik.webdatacommons.org/harrypotter/resource/Daphne_Maldon"); - Map predictionSet = getPredictionSet(abstractEnds, predictionCandidates); updateType(predictionSet); computeSentenceFeatures(predictionSet, abstractEnds);