Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

DEV: Green Book: optional normalization of dot codes #60

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import gov.uspto.patent.model.Patent;

public abstract class KvParser implements Dom4j {

private static final Logger LOGGER = LoggerFactory.getLogger(KvParser.class);

private final KvReader kvReader;
Expand All @@ -28,6 +29,10 @@ public KvParser() {
kvReader = new KvReader();
}

/**
 * Create a parser that uses the supplied {@link KvReader} instead of
 * constructing its own.
 *
 * @param reader
 *            the reader to use; it is passed through
 *            {@code Preconditions.checkNotNull} before being stored
 */
public KvParser(final KvReader reader) {
kvReader = Preconditions.checkNotNull(reader);
}

public KvParser(Collection<String> maintainSpaceFields, Collection<String> paragraphFields,
Collection<String> headerFields, Collection<String> tableFields) {
kvReader = new KvReader();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ public Document genXml(List<KeyValue> keyValues) {
tCount++;
}

field.setText(kv.getValue());
genXmlValue(field, kv);
currentSection.add(field);
}
}
Expand Down Expand Up @@ -177,7 +177,7 @@ public Document genXml(List<KeyValue> keyValues, Collection<String> sections) {
currentSection = DocumentHelper.createElement(kv.getKey());
} else {
Element field = DocumentHelper.createElement(kv.getKey());
field.setText(kv.getValue());
genXmlValue(field, kv);
currentSection.add(field);
}
}
Expand Down Expand Up @@ -242,11 +242,11 @@ else if (!entry.getFieldGroup().isMultivalued()){
currentSection = DocumentHelper.createElement(currentFieldGroup.getName());

Element field = DocumentHelper.createElement(kv.getKey());
field.setText(kv.getValue());
genXmlValue(field, kv);
currentSection.add(field);
} else {
Element field = DocumentHelper.createElement(kv.getKey());
field.setText(kv.getValue());
genXmlValue(field, kv);
currentSection.add(field);
}
}
Expand All @@ -263,12 +263,12 @@ else if (entry.getField().isAchor()) {
currentSection = DocumentHelper.createElement(currentFieldGroup.getName());

Element field = DocumentHelper.createElement(kv.getKey());
field.setText(kv.getValue());
genXmlValue(field, kv);
currentSection.add(field);

} else if (currentFieldGroup == entry.getFieldGroup() && currentSection != rootNode) {
Element field = DocumentHelper.createElement(kv.getKey());
field.setText(kv.getValue());
genXmlValue(field, kv);
currentSection.add(field);
} else {
if (currentSection != rootNode) {
Expand All @@ -278,7 +278,7 @@ else if (entry.getField().isAchor()) {
currentSection = rootNode;

Element field = DocumentHelper.createElement(kv.getKey());
field.setText(kv.getValue());
genXmlValue(field, kv);
currentSection.add(field);
}
}
Expand Down Expand Up @@ -307,7 +307,7 @@ public List<KeyValue> parse(Reader reader) throws PatentReaderException {
// String[] parts = processLineRegex(currentLine);
String[] parts = processLineLeadingWhiteSpace(currentLine);
if (parts.length == 2) {
keyValues.add(new KeyValue(parts[0], parts[1]));
keyValues.add(new KeyValue(parts[0], normalizeValue(parts[1])));
// } else if (sections != null && parts.length == 1 &&
// sections.contains(parts[0])) {
// keyValues.add(new KeyValue(parts[0], ""));
Expand All @@ -319,7 +319,7 @@ public List<KeyValue> parse(Reader reader) throws PatentReaderException {
}
int lastLoc = keyValues.size() - 1;
KeyValue lastKv = keyValues.get(lastLoc);
lastKv.appendValue(parts[0]);
lastKv.appendValue(normalizeValue(parts[0]));
currentFieldName = lastKv.getKey().toUpperCase();
}
}
Expand All @@ -331,6 +331,48 @@ public List<KeyValue> parse(Reader reader) throws PatentReaderException {
return keyValues;
}

/**
 * Generate the XML representation of the given value, and add it to the given
 * XML element.
 *
 * <p>
 * This default implementation sets the value as the element's text. It is
 * {@code protected} so that subclasses can override it to emit richer content.
 *
 * @param element
 *            the element to which the XML representation should be added
 * @param value
 *            the value to transform into XML
 */
protected void genXmlValue(final Element element, final String value) {
element.setText(value);
}

/**
 * Normalize a {@link KeyValue}'s value.
 *
 * <p>
 * The default implementation merely returns its argument. This method should be
 * overridden to customize the reader's behavior.
 *
 * <p>
 * {@code parse(Reader)} applies it both to the value of a freshly split
 * key/value line and to a continuation line before that line is appended to the
 * previous {@link KeyValue}.
 *
 * @param string
 *            the string to normalize
 * @return the normalized string
 */
protected String normalizeValue(final String string) {
return string;
}

/**
 * Generate the XML representation of the given {@link KeyValue}'s value, and
 * add it to the given XML element.
 *
 * <p>
 * Convenience overload: extracts {@link KeyValue#getValue()} and delegates to
 * the overridable {@code genXmlValue(Element, String)}.
 *
 * @param element
 *            the element to which the XML representation should be added
 * @param keyValue
 *            the {@link KeyValue} to transform into XML
 * @see KvReader#genXmlValue(Element, String)
 */
private void genXmlValue(final Element element, final KeyValue keyValue) {
genXmlValue(element, keyValue.getValue());
}

/**
* Process Line, split line by key value, else return line.
*
Expand Down
53 changes: 40 additions & 13 deletions PatentDocument/src/main/java/gov/uspto/patent/PatentReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@

import com.google.common.base.Preconditions;

import gov.uspto.parser.dom4j.Dom4JParser;
import gov.uspto.parser.dom4j.keyvalue.KvParser;
import gov.uspto.patent.doc.greenbook.DotCodes;
import gov.uspto.patent.doc.greenbook.Greenbook;
import gov.uspto.patent.doc.pap.PatentAppPubParser;
import gov.uspto.patent.doc.sgml.Sgml;
Expand All @@ -29,9 +32,13 @@
*
*/
public class PatentReader implements PatentDocReader<Patent> {

private static final long DEFAULT_MAX_BYTES = 100000000; // 100 MB.

private PatentDocFormat patentDocFormat;
private boolean normalize = false;

private final PatentDocFormat patentDocFormat;

private long maxByteSize = DEFAULT_MAX_BYTES;

/**
Expand All @@ -49,6 +56,18 @@ public void setMaxByteSize(long maxByteSize){
this.maxByteSize = maxByteSize;
}

/**
 * Set whether {@link DotCodes} should be replaced by their Unicode or XML
 * equivalents.
 *
 * <p>
 * The flag defaults to {@code false}. In {@code read(Reader)} it is only handed
 * to the {@link Greenbook} parser; the other document formats do not receive
 * it.
 *
 * @param normalize
 *            whether {@link DotCodes} should be replaced by their Unicode or
 *            XML equivalents
 */
public void setNormalize(final boolean normalize) {
this.normalize = normalize;
}

/**
* Parse Dom4j Document
*
Expand All @@ -75,22 +94,30 @@ public Patent read(Reader reader) throws PatentReaderException, IOException {
throw new PatentReaderException("Patent too Large");
}

switch (patentDocFormat) {
case Greenbook:
return new Greenbook().parse(reader);
case RedbookApplication:
return new ApplicationParser().parse(getJDOM(reader));
case RedbookGrant:
return new GrantParser().parse(getJDOM(reader));
case Sgml:
return new Sgml().parse(getJDOM(reader));
case Pap:
return new PatentAppPubParser().parse(getJDOM(reader));
switch (patentDocFormat) {
case Greenbook:
return read(new Greenbook(normalize), reader);
case RedbookApplication:
return read(new ApplicationParser(), reader);
case RedbookGrant:
return read(new GrantParser(), reader);
case Sgml:
return read(new Sgml(), reader);
case Pap:
return read(new PatentAppPubParser(), reader);
default:
throw new PatentReaderException("Invalid or Unknown Document Type");
}
}

/**
 * Parse a document with a DOM-based parser: the reader's content is first
 * loaded into a document via {@code getJDOM(Reader)} and the result is handed
 * to the parser.
 *
 * @param parser
 *            the DOM-based parser to use
 * @param reader
 *            the source of the document
 * @return the parsed {@link Patent}
 * @throws PatentReaderException
 *             if loading or parsing the document fails
 */
private Patent read(final Dom4JParser parser, final Reader reader) throws PatentReaderException {
return parser.parse(getJDOM(reader));
}

/**
 * Parse a document with a key-value parser, which consumes the reader
 * directly (no intermediate DOM is built here).
 *
 * @param parser
 *            the key-value parser to use
 * @param reader
 *            the source of the document
 * @return the parsed {@link Patent}
 * @throws PatentReaderException
 *             if parsing the document fails
 */
private Patent read(final KvParser parser, final Reader reader) throws PatentReaderException {
return parser.parse(reader);
}

/**
* Load XML Document
*
Expand Down Expand Up @@ -144,7 +171,7 @@ public static Document fixTagsJDOM(String badXml) throws IOException, PatentRead
sax.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
return sax.read(new StringReader(doc));
} catch (DocumentException | SAXException e) {
throw new PatentReaderException("Failed to Fix and Parse Docuemnt", e);
throw new PatentReaderException("Failed to Fix and Parse Document", e);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,18 @@ public class Greenbook extends KvParser {
private static List<String> HEADER_FIELDS = Arrays.asList(new String[] { "PAC" });
private static List<String> TABLE_FIELDS = Arrays.asList(new String[] { "TBL" });

public Greenbook() {
super(MAINTAIN_SPACE_FIELDS, PARAGRAPH_FIELDS, HEADER_FIELDS, TABLE_FIELDS);
}
/**
 * Create a Greenbook parser with {@link DotCodes} normalization disabled;
 * equivalent to {@code new Greenbook(false)}.
 */
public Greenbook() {
this(false);
}

/**
 * Create a Greenbook parser backed by a {@link GreenbookKvReader} built by
 * {@code newKvReader(boolean)}.
 *
 * @param normalize
 *            whether {@link DotCodes} should be replaced by their Unicode or
 *            XML equivalents
 */
public Greenbook(final boolean normalize) {
super(newKvReader(normalize));
}

/*
* private static final Set<String> SECTIONS = new HashSet<String>(20);
Expand Down Expand Up @@ -226,4 +235,11 @@ public static void main(String[] args) throws PatentReaderException, IOException
}
}

/**
 * Build the {@link GreenbookKvReader} handed to the superclass constructor,
 * configured with this class's field lists.
 *
 * @param normalize
 *            forwarded to the {@link GreenbookKvReader} constructor
 * @return the configured reader
 */
private static GreenbookKvReader newKvReader(final boolean normalize) {
    final GreenbookKvReader configured = new GreenbookKvReader(normalize);
    // Same configuration order as before: space-preserving fields first, then id fields.
    configured.setMaintainSpaceFields(MAINTAIN_SPACE_FIELDS);
    configured.setFieldsForId(PARAGRAPH_FIELDS, HEADER_FIELDS, TABLE_FIELDS);
    return configured;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package gov.uspto.patent.doc.greenbook;

import java.io.StringReader;

import org.dom4j.Document;
import org.dom4j.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import gov.uspto.parser.dom4j.keyvalue.KvReader;
import gov.uspto.patent.PatentReader;
import gov.uspto.patent.PatentReaderException;

/**
* A {@link KvReader} implementation providing optional normalization of {@link DotCodes}.
*
* @see <a href="https://github.com/USPTO/PatentPublicData/issues/59">https://github.com/USPTO/PatentPublicData/issues/59</a>
*
* @author Luc Boruta (luc@thunken.com)
*/
public class GreenbookKvReader extends KvReader {

    private static final Logger LOGGER = LoggerFactory.getLogger(GreenbookKvReader.class);

    /** Whether {@link DotCodes} replacement is applied by this reader. */
    private final boolean normalize;

    /**
     * Create a reader with normalization disabled.
     */
    public GreenbookKvReader() {
        this(false);
    }

    /**
     * @param normalize
     *            whether {@link DotCodes} should be replaced by their Unicode or
     *            XML equivalents
     */
    public GreenbookKvReader(final boolean normalize) {
        this.normalize = normalize;
    }

    /**
     * Add the given value to the element.
     *
     * <p>
     * When normalization is enabled and {@link DotCodes#replaceSubSupHTML(String)}
     * changes the value, the rewritten value is parsed as XML and the parsed
     * content is appended to the element. In every other case (normalization
     * disabled, {@code null} value, nothing to rewrite, or the rewritten value
     * failing to parse) the superclass implementation is used with the original
     * value.
     *
     * @param element
     *            the element to which the XML representation should be added
     * @param value
     *            the value to transform into XML
     */
    @Override
    protected void genXmlValue(final Element element, final String value) {
        final Element markup = normalize && value != null ? parseNormalized(value) : null;
        if (markup == null) {
            super.genXmlValue(element, value);
        } else {
            element.appendContent(markup);
        }
    }

    /**
     * Replace {@link DotCodes} in the given string when normalization is enabled.
     *
     * @param string
     *            the string to normalize; may be {@code null}
     * @return the result of {@link DotCodes#replace(String)} when normalization is
     *         enabled and the string is not {@code null}, otherwise the argument
     *         unchanged
     */
    @Override
    protected String normalizeValue(final String string) {
        return normalize && string != null ? DotCodes.replace(string) : string;
    }

    /**
     * Rewrite the value with {@link DotCodes#replaceSubSupHTML(String)} and parse
     * the result.
     *
     * @param value
     *            the non-null raw value
     * @return the root element wrapping the parsed content, or {@code null} when
     *         the rewrite left the value unchanged or the result could not be
     *         parsed
     */
    private static Element parseNormalized(final String value) {
        final String normalized = DotCodes.replaceSubSupHTML(value);
        if (normalized.equals(value)) {
            return null;
        }
        /*
         * Wrap a dummy <span> around the value, to avoid
         * "Content is not allowed in prolog." errors.
         */
        try {
            final Document document = PatentReader.getJDOM(new StringReader("<span>" + normalized + "</span>"));
            // The wrapper <span> is the document's root element; no cast needed.
            return document.getRootElement();
        } catch (final PatentReaderException e) {
            // Keep the cause so parse failures can be diagnosed; the caller falls back to plain text.
            LOGGER.warn("Failed to parse and normalize value", e);
            return null;
        }
    }

}