Skip to content

Commit

Permalink
[mw] EPO/Espacenet: Acquire "abstract" text from OPS API
Browse files Browse the repository at this point in the history
  • Loading branch information
amotl committed May 26, 2022
1 parent 56da01e commit 346ae45
Show file tree
Hide file tree
Showing 6 changed files with 197 additions and 42 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Development
- [ui] EPO/Espacenet: Repair "external sources" links
- [mw] EPO/Espacenet: Acquire "abstract" text
- [mw] EPO/Espacenet: Make access layer production ready
- [mw] EPO/Espacenet: Acquire "abstract" text from OPS API


2019-11-01 0.169.3
Expand Down
38 changes: 38 additions & 0 deletions patzilla/access/epo/espacenet/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
# (c) 2015-2022 Andreas Motl <andreas.motl@ip-tools.org>
from patzilla.boot.cache import configure_cache_backend
from .client_api import espacenet_abstract
from .client_html import espacenet_description, espacenet_claims


if __name__ == '__main__':
    # Demo program: print abstract, claims and description of a few
    # sample documents to stdout.
    #
    # Synopsis:
    #
    #   python -m patzilla.access.epo.espacenet.api

    configure_cache_backend()

    sample_documents = (
        "US5770123A",
        "US6269530B1",
        "DE19814298A1",
        "DE29624638U1",
        "EP0666666B1",
    )

    for sample in sample_documents:
        print("\n")
        print("## {}".format(sample))
        print("")

        # Only the abstract lookup is guarded: a failure is reported inline
        # and the remaining sections of the document are still printed.
        print("### Abstract")
        try:
            print(espacenet_abstract(sample))
        except Exception as ex:
            print("ERROR: {}".format(ex))
        print("")

        # Errors from the two fulltext sections propagate and stop the run.
        print("### Claims")
        print(espacenet_claims(sample))
        print("")

        print("### Description")
        print(espacenet_description(sample))
        print("")
133 changes: 133 additions & 0 deletions patzilla/access/epo/espacenet/client_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# -*- coding: utf-8 -*-
# (c) 2015-2022 Andreas Motl <andreas.motl@ip-tools.org>
"""
Data access for EPO/Espacenet, via HTTP API.
https://worldwide.espacenet.com/
"""
import logging

import requests
from beaker.cache import cache_region

from patzilla.util.config import to_list
from patzilla.util.data.container import jpath
from patzilla.util.network.browser import regular_user_agent
from patzilla.util.numbers.normalize import normalize_patent


logger = logging.getLogger(__name__)

http = requests.Session()


@cache_region('medium')
def espacenet_fetch_json(document_number, section):
    """
    Acquire one section of a patent document in JSON format from the
    OPS-like API of Espacenet.

    The document number is normalized for Espacenet before the request URL
    is built, so error messages carry the normalized number. The function
    is decorated with the 'medium' cache region.

    Example:
    https://worldwide.espacenet.com/3.2/rest-services/published-data/publication/docdb/EP0666666B1/biblio.json

    :param document_number: Patent document number, e.g. ``EP0666666B1``.
    :param section: Name of the section to fetch, e.g. ``biblio`` or ``description``.
    :return: The HTTP response object, when the server answered with status 200.
    :raises KeyError: When the server answers with status 404, or when the
        body of another unsuccessful response contains ``SERVER.EntityNotFound``.
    :raises ValueError: On any other unsuccessful response.
    """

    document_number = normalize_patent(document_number, as_string=True, provider='espacenet')

    message_404 = 'No section "{section}" at Espacenet for "{document_number}"'.format(
        section=section, document_number=document_number)
    message_fail = 'Fetching section "{section}" from Espacenet for "{document_number}" failed'.format(
        section=section, document_number=document_number)

    url_template = "https://worldwide.espacenet.com/3.2/rest-services/published-data/publication/docdb/{}/{}.json"
    url = url_template.format(document_number, section)

    logger.info('Accessing Espacenet: %s', url)
    # NOTE(review): No timeout is passed here, so a stalled server blocks
    # this call. Consider adding one once callers handle timeout errors.
    response = http.get(url, headers={'User-Agent': regular_user_agent})

    if response.status_code == 200:
        return response

    if response.status_code == 404:
        raise KeyError(message_404)

    # Inspect the decoded body (`response.text`) rather than the raw bytes
    # (`response.content`): on Python 3, testing a text string for membership
    # in a bytes object raises `TypeError` instead of classifying the error.
    if "SERVER.EntityNotFound" in response.text:
        raise KeyError(message_404)

    raise ValueError(message_fail)


def espacenet_abstract(document_number):
    """
    Acquire the "abstract" text of a patent document from the OPS API at
    worldwide.espacenet.com, by decoding its bibliographic data.

    Related resources:
    https://worldwide.espacenet.com/data/publicationDetails/biblio?CC=US&NR=5770123A&DB=worldwide.espacenet.com&FT=D
    https://worldwide.espacenet.com/data/publicationDetails/biblio?CC=DE&NR=19814298A1&DB=worldwide.espacenet.com&FT=D
    https://worldwide.espacenet.com/3.2/rest-services/published-data/publication/docdb/EP0666666B1/biblio.json

    TODO: Impossible to get abstract for document EP0666666B1.

    :param document_number: Patent document number, e.g. ``EP0666666A2``.
    :return: Dictionary with the keys ``xml`` (paragraphs joined by newlines),
        ``lang`` and ``source``, describing the first abstract found.
    :raises KeyError: When the bibliographic data has no "abstract" section.
    """

    message_fail = 'Bibliographic data of "{document_number}" at Espacenet lacks "abstract" section'.format(
        document_number=document_number)

    biblio = espacenet_fetch_json(document_number, 'biblio').json()

    # TODO: Is it sane to only process the first result?
    exchange_documents = to_list(jpath("/ops:world-patent-data/exchange-documents/exchange-document", biblio))
    exchange_document = exchange_documents[0]

    # Decoder logic taken from `patzilla.access.epo.ops.api._format_abstract`.
    try:
        abstract_nodes = to_list(exchange_document["abstract"])
    except KeyError:
        raise KeyError(message_fail)

    # One item per abstract node.
    items = []
    for node in abstract_nodes:
        paragraphs = [paragraph['$'] for paragraph in to_list(node['p'])]
        items.append({
            'xml': "\n".join(paragraphs),
            'lang': node.get('@lang'),
            'source': 'espacenet',
        })

    if not items:
        raise KeyError(message_fail)

    # TODO: Propagate all languages.
    return items[0]


def espacenet_description_json(document_number):
    """
    Acquire the "description" fulltext of a patent document from the OPS API
    at worldwide.espacenet.com.

    Examples:
    https://worldwide.espacenet.com/3.2/rest-services/published-data/publication/docdb/EP0666666B1/description.json
    https://worldwide.espacenet.com/3.2/rest-services/published-data/publication/docdb/US5770123A/description.json

    ATTENTION: Does not work for US documents and friends. For those, the
    service responds with a fault like this:

    <fault xmlns="http://ops.epo.org">
    <code>CLIENT.InvalidCountryCode</code>
    <message>At least one reference in the request has a unsupported country code: Request for fulltext for FulltextRetrievalType[format=text-only,locale=&lt;null&gt;,reference=OpsPublicationReference[country=&lt;null&gt;,docNumber=US5770123A,kind=&lt;null&gt;,regExKind=&lt;null&gt;,date=&lt;null&gt;,format=epodoc,sequence=0,status=&lt;null&gt;], system [null]] was deemed invalid.</message>
    </fault>

    :param document_number: Patent document number, e.g. ``EP0666666B1``.
    :return: Dictionary with the keys ``xml`` (paragraphs joined by newlines),
        ``lang`` and ``source``.
    """

    payload = espacenet_fetch_json(document_number, 'description').json()

    # Decoder logic taken from `patzilla.access.epo.ops.api.analytics_family`.
    node = jpath("/ops:world-patent-data/ftxt:fulltext-documents/ftxt:fulltext-document/description", payload)
    paragraphs = [paragraph['$'] for paragraph in to_list(node['p'])]

    return {
        'xml': "\n".join(paragraphs),
        'lang': node['@lang'],
        'source': 'espacenet',
    }
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# (c) 2015-2022 Andreas Motl <andreas.motl@ip-tools.org>
"""
Screenscraper for Espacenet
Data access for EPO/Espacenet, via Old Espacenet HTML.
https://worldwide.espacenet.com/
"""
import logging
Expand All @@ -20,8 +20,16 @@


@cache_region('medium')
def espacenet_fetch(document_number, section, element_id=None, element_class=None):
def espacenet_fetch_html(document_number, section, element_id=None, element_class=None):
"""
Acquire data through regular HTML page of Old Espacenet.
Examples:
- https://worldwide.espacenet.com/data/publicationDetails/biblio?CC=EP&NR=0666666B1&KC=B1&FT=D
- https://worldwide.espacenet.com/data/publicationDetails/claims?CC=EP&NR=0666666B1&KC=B1&FT=D
"""

document_number = normalize_patent(document_number, as_string=True, provider='espacenet')
patent = normalize_patent(document_number, as_dict=True, provider='espacenet')

message_404 = 'No section "{section}" at Espacenet for "{document_number}"'.format(**locals())
Expand Down Expand Up @@ -95,49 +103,22 @@ def espacenet_fetch(document_number, section, element_id=None, element_class=Non
raise ValueError(message_fail)


def espacenet_abstract(document_number):
"""
Return Espacenet abstract text
https://worldwide.espacenet.com/data/publicationDetails/biblio?CC=US&NR=5770123A&DB=worldwide.espacenet.com&FT=D
https://worldwide.espacenet.com/data/publicationDetails/biblio?CC=DE&NR=19814298A1&DB=worldwide.espacenet.com&FT=D
"""
return espacenet_fetch(document_number, 'biblio', element_class='printAbstract')


def espacenet_description(document_number):
"""
Return Espacenet description fulltext
Acquire Espacenet "description" fulltext from HTML page of old Espacenet.
https://worldwide.espacenet.com/data/publicationDetails/description?CC=US&NR=5770123A&DB=worldwide.espacenet.com&FT=D
https://worldwide.espacenet.com/data/publicationDetails/description?CC=DE&NR=19814298A1&DB=worldwide.espacenet.com&FT=D
https://worldwide.espacenet.com/3.2/rest-services/published-data/publication/docdb/EP0666666B1/description.json
"""
return espacenet_fetch(document_number, 'description', 'description')
return espacenet_fetch_html(document_number, 'description', 'description')


def espacenet_claims(document_number):
"""
Return Espacenet claims fulltext
Acquire Espacenet "claims" fulltext from HTML page of old Espacenet.
https://worldwide.espacenet.com/data/publicationDetails/claims?CC=US&NR=5770123A&FT=D&DB=worldwide.espacenet.com
https://worldwide.espacenet.com/data/publicationDetails/claims?CC=DE&NR=19814298A1&DB=worldwide.espacenet.com&FT=D
"""
return espacenet_fetch(document_number, 'claims', 'claims')


if __name__ == '__main__':
"""
python -m patzilla.access.epo.espacenet.client
"""
numbers = [
"US5770123A",
"US6269530B1",
"DE19814298A1",
"DE29624638U1",
]
for number in numbers:
print("## {}".format(number))
print("")
print("### Claims")
print(espacenet_claims(number))
print("")
print("### Description")
print(espacenet_description(number))
print("\n")
return espacenet_fetch_html(document_number, 'claims', 'claims')
2 changes: 1 addition & 1 deletion patzilla/access/epo/espacenet/pyramid.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import logging
from cornice.service import Service
from pyramid.httpexceptions import HTTPBadRequest, HTTPNotFound
from patzilla.access.epo.espacenet.client import espacenet_claims, espacenet_description
from patzilla.access.epo.espacenet.api import espacenet_claims, espacenet_description

logger = logging.getLogger(__name__)

Expand Down
12 changes: 7 additions & 5 deletions tests/access/test_epo_espacenet.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,16 @@
# (c) 2022 Andreas Motl <andreas.motl@ip-tools.org>
import pytest

from patzilla.access.epo.espacenet.client import espacenet_description, espacenet_claims, espacenet_abstract
from patzilla.access.epo.espacenet.api import espacenet_description, espacenet_claims, espacenet_abstract


def test_espacenet_abstract_success(app_request):
"""
Acquire "abstract" section of valid patent document from Espacenet.
TODO: Impossible to get abstract for document EP0666666B1.
"""
result = espacenet_abstract(document_number="EP0666666B1")
result = espacenet_abstract(document_number="EP0666666A2")
assert result["source"] == "espacenet"
assert result["lang"] == "en"
assert "A non-quota access indicator is circulated among nodes" in result["xml"]
Expand All @@ -21,7 +23,7 @@ def test_espacenet_abstract_failure(app_request):
"""
with pytest.raises(KeyError) as ex:
espacenet_abstract(document_number="EP123A2")
assert ex.match('No section "biblio" at Espacenet for "EP123A2"')
assert ex.match('No section "biblio" at Espacenet for "EP0000123A2"')


def test_espacenet_description_success(app_request):
Expand All @@ -40,7 +42,7 @@ def test_espacenet_description_failure(app_request):
"""
with pytest.raises(KeyError) as ex:
espacenet_description(document_number="EP123A2")
assert ex.match('No section "description" at Espacenet for "EP123A2"')
assert ex.match('No section "description" at Espacenet for "EP0000123A2"')


def test_espacenet_claims_success(app_request):
Expand All @@ -59,4 +61,4 @@ def test_espacenet_claims_failure(app_request):
"""
with pytest.raises(KeyError) as ex:
espacenet_claims(document_number="EP123A2")
assert ex.match('No section "claims" at Espacenet for "EP123A2"')
assert ex.match('No section "claims" at Espacenet for "EP0000123A2"')

0 comments on commit 346ae45

Please sign in to comment.