From 346ae45e7992cfd0f9055e9f6bb05a086e9edd8f Mon Sep 17 00:00:00 2001
From: Andreas Motl
Date: Thu, 26 May 2022 04:59:05 +0200
Subject: [PATCH] [mw] EPO/Espacenet: Acquire "abstract" text from OPS API

Example:
https://worldwide.espacenet.com/3.2/rest-services/published-data/publication/docdb/EP0666666A2/biblio.json
---
 CHANGES.rst                                   |   1 +
 patzilla/access/epo/espacenet/api.py          |  43 +++++
 patzilla/access/epo/espacenet/client_api.py   | 141 ++++++++++++++++++
 .../espacenet/{client.py => client_html.py}   |  53 +++----
 patzilla/access/epo/espacenet/pyramid.py      |   2 +-
 tests/access/test_epo_espacenet.py            |  12 +-
 6 files changed, 210 insertions(+), 42 deletions(-)
 create mode 100644 patzilla/access/epo/espacenet/api.py
 create mode 100644 patzilla/access/epo/espacenet/client_api.py
 rename patzilla/access/epo/espacenet/{client.py => client_html.py} (75%)

diff --git a/CHANGES.rst b/CHANGES.rst
index fccdd8b1..87abe646 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -37,6 +37,7 @@ Development
 - [ui] EPO/Espacenet: Repair "external sources" links
 - [mw] EPO/Espacenet: Acquire "abstract" text
 - [mw] EPO/Espacenet: Make access layer production ready
+- [mw] EPO/Espacenet: Acquire "abstract" text from OPS API
 
 
 2019-11-01 0.169.3
diff --git a/patzilla/access/epo/espacenet/api.py b/patzilla/access/epo/espacenet/api.py
new file mode 100644
index 00000000..d0a27255
--- /dev/null
+++ b/patzilla/access/epo/espacenet/api.py
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+# (c) 2015-2022 Andreas Motl
+"""
+Public entry points of the EPO/Espacenet data access layer.
+
+Re-exports the acquisition functions of the API- and HTML-based clients.
+"""
+from patzilla.boot.cache import configure_cache_backend
+from .client_api import espacenet_abstract
+from .client_html import espacenet_description, espacenet_claims
+
+
+if __name__ == '__main__':
+    """
+    python -m patzilla.access.epo.espacenet.api
+    """
+    configure_cache_backend()
+    numbers = [
+        "US5770123A",
+        "US6269530B1",
+        "DE19814298A1",
+        "DE29624638U1",
+        "EP0666666B1",
+    ]
+    sections = [
+        ("Abstract", espacenet_abstract),
+        ("Claims", espacenet_claims),
+        ("Description", espacenet_description),
+    ]
+    for number in numbers:
+        print("\n")
+        print("## {}".format(number))
+        print("")
+
+        for title, acquire in sections:
+            print("### {}".format(title))
+            # Report failures per section, so a single unavailable section
+            # does not abort the whole run.
+            try:
+                print(acquire(number))
+            except Exception as ex:
+                print("ERROR: {}".format(ex))
+            print("")
diff --git a/patzilla/access/epo/espacenet/client_api.py b/patzilla/access/epo/espacenet/client_api.py
new file mode 100644
index 00000000..3215eca8
--- /dev/null
+++ b/patzilla/access/epo/espacenet/client_api.py
@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+# (c) 2015-2022 Andreas Motl
+"""
+Data access for EPO/Espacenet, via HTTP API.
+https://worldwide.espacenet.com/
+"""
+import logging
+
+import requests
+from beaker.cache import cache_region
+
+from patzilla.util.config import to_list
+from patzilla.util.data.container import jpath
+from patzilla.util.network.browser import regular_user_agent
+from patzilla.util.numbers.normalize import normalize_patent
+
+
+logger = logging.getLogger(__name__)
+
+http = requests.Session()
+
+
+@cache_region('medium')
+def espacenet_fetch_json(document_number, section):
+    """
+    Acquire data in JSON format from OPS-like API.
+
+    Example:
+    https://worldwide.espacenet.com/3.2/rest-services/published-data/publication/docdb/EP0666666B1/biblio.json
+
+    :param document_number: Patent document number. It is normalized for Espacenet before use.
+    :param section: Name of the section to fetch, e.g. "biblio" or "description".
+    :return: Response object of the successful HTTP request.
+    :raises KeyError: If the section is not available for the document.
+    :raises ValueError: If the request failed for any other reason.
+    """
+
+    document_number = normalize_patent(document_number, as_string=True, provider='espacenet')
+
+    message_404 = 'No section "{section}" at Espacenet for "{document_number}"'.format(**locals())
+    message_fail = 'Fetching section "{section}" from Espacenet for "{document_number}" failed'.format(**locals())
+
+    url_template = "https://worldwide.espacenet.com/3.2/rest-services/published-data/publication/docdb/{}/{}.json"
+    url = url_template.format(document_number, section)
+
+    logger.info('Accessing Espacenet: {}'.format(url))
+    response = http.get(url, headers={'User-Agent': regular_user_agent})
+
+    if response.status_code == 200:
+        return response
+
+    # The API also signals missing entities through an error code within the
+    # response body. Probe the decoded `text`, because `content` is bytes.
+    if response.status_code == 404 or "SERVER.EntityNotFound" in response.text:
+        raise KeyError(message_404)
+
+    raise ValueError(message_fail)
+
+
+def espacenet_abstract(document_number):
+    """
+    Acquire Espacenet "abstract" text from OPS API at worldwide.espacenet.com.
+
+    https://worldwide.espacenet.com/data/publicationDetails/biblio?CC=US&NR=5770123A&DB=worldwide.espacenet.com&FT=D
+    https://worldwide.espacenet.com/data/publicationDetails/biblio?CC=DE&NR=19814298A1&DB=worldwide.espacenet.com&FT=D
+    https://worldwide.espacenet.com/3.2/rest-services/published-data/publication/docdb/EP0666666B1/biblio.json
+
+    TODO: Impossible to get abstract for document EP0666666B1.
+
+    :param document_number: Patent document number.
+    :return: Dictionary with keys "xml" (the abstract text), "lang" and "source".
+    :raises KeyError: If the document or its "abstract" section is not available.
+    :raises ValueError: If fetching the bibliographic data failed otherwise.
+    """
+
+    message_fail = 'Bibliographic data of "{document_number}" at Espacenet lacks "abstract" section'.format(**locals())
+
+    response = espacenet_fetch_json(document_number, 'biblio')
+    data = response.json()
+
+    documents = to_list(jpath("/ops:world-patent-data/exchange-documents/exchange-document", data))
+    if not documents:
+        raise KeyError(message_fail)
+
+    # TODO: Is it sane to only process the first result?
+    document = documents[0]
+
+    # Decoder logic taken from `patzilla.access.epo.ops.api._format_abstract`.
+    try:
+        abstracts = to_list(document["abstract"])
+    except KeyError:
+        raise KeyError(message_fail)
+
+    results = []
+    for abstract in abstracts:
+        lines = [line['$'] for line in to_list(abstract['p'])]
+        item = {
+            'xml': "\n".join(lines),
+            'lang': abstract.get('@lang'),
+            'source': 'espacenet',
+        }
+        results.append(item)
+
+    # TODO: Propagate all languages.
+    if not results:
+        raise KeyError(message_fail)
+    return results[0]
+
+
+def espacenet_description_json(document_number):
+    """
+    Acquire Espacenet "description" fulltext from OPS API at worldwide.espacenet.com.
+    ATTENTION: Does not work for US documents and friends.
+
+    https://worldwide.espacenet.com/3.2/rest-services/published-data/publication/docdb/EP0666666B1/description.json
+    https://worldwide.espacenet.com/3.2/rest-services/published-data/publication/docdb/US5770123A/description.json
+
+    For US documents, the API responds with an error like:
+
+        CLIENT.InvalidCountryCode
+        At least one reference in the request has a unsupported country code: Request for fulltext for FulltextRetrievalType[format=text-only,locale=<null>,reference=OpsPublicationReference[country=<null>,docNumber=US5770123A,kind=<null>,regExKind=<null>,date=<null>,format=epodoc,sequence=0,status=<null>], system [null]] was deemed invalid.
+
+    :param document_number: Patent document number.
+    :return: Dictionary with keys "xml" (the description text), "lang" and "source".
+    :raises KeyError: If the "description" section is not available for the document.
+    :raises ValueError: If fetching the description failed otherwise.
+    """
+
+    response = espacenet_fetch_json(document_number, 'description')
+    data = response.json()
+
+    # Decoder logic taken from `patzilla.access.epo.ops.api.analytics_family`.
+    description = jpath("/ops:world-patent-data/ftxt:fulltext-documents/ftxt:fulltext-document/description", data)
+    lines = [line['$'] for line in to_list(description['p'])]
+
+    item = {
+        'xml': "\n".join(lines),
+        'lang': description['@lang'],
+        'source': 'espacenet',
+    }
+    return item
diff --git a/patzilla/access/epo/espacenet/client.py b/patzilla/access/epo/espacenet/client_html.py
similarity index 75%
rename from patzilla/access/epo/espacenet/client.py
rename to patzilla/access/epo/espacenet/client_html.py
index 45d1aa57..d5202db4 100644
--- a/patzilla/access/epo/espacenet/client.py
+++ b/patzilla/access/epo/espacenet/client_html.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # (c) 2015-2022 Andreas Motl
 """
-Screenscraper for Espacenet
+Data access for EPO/Espacenet, via Old Espacenet HTML.
 https://worldwide.espacenet.com/
 """
 import logging
@@ -20,8 +20,16 @@
 
 
 @cache_region('medium')
-def espacenet_fetch(document_number, section, element_id=None, element_class=None):
+def espacenet_fetch_html(document_number, section, element_id=None, element_class=None):
+    """
+    Acquire data through regular HTML page of Old Espacenet.
+
+    Examples:
+    - https://worldwide.espacenet.com/data/publicationDetails/biblio?CC=EP&NR=0666666B1&KC=B1&FT=D
+    - https://worldwide.espacenet.com/data/publicationDetails/claims?CC=EP&NR=0666666B1&KC=B1&FT=D
+    """
 
+    document_number = normalize_patent(document_number, as_string=True, provider='espacenet')
     patent = normalize_patent(document_number, as_dict=True, provider='espacenet')
 
     message_404 = 'No section "{section}" at Espacenet for "{document_number}"'.format(**locals())
@@ -95,49 +103,22 @@ def espacenet_fetch(document_number, section, element_id=None, element_class=Non
             raise ValueError(message_fail)
 
 
-def espacenet_abstract(document_number):
-    """
-    Return Espacenet abstract text
-    https://worldwide.espacenet.com/data/publicationDetails/biblio?CC=US&NR=5770123A&DB=worldwide.espacenet.com&FT=D
-    https://worldwide.espacenet.com/data/publicationDetails/biblio?CC=DE&NR=19814298A1&DB=worldwide.espacenet.com&FT=D
-    """
-    return espacenet_fetch(document_number, 'biblio', element_class='printAbstract')
-
-
 def espacenet_description(document_number):
     """
-    Return Espacenet description fulltext
+    Acquire Espacenet "description" fulltext from HTML page of old Espacenet.
+
     https://worldwide.espacenet.com/data/publicationDetails/description?CC=US&NR=5770123A&DB=worldwide.espacenet.com&FT=D
     https://worldwide.espacenet.com/data/publicationDetails/description?CC=DE&NR=19814298A1&DB=worldwide.espacenet.com&FT=D
+    https://worldwide.espacenet.com/3.2/rest-services/published-data/publication/docdb/EP0666666B1/description.json
     """
-    return espacenet_fetch(document_number, 'description', 'description')
+    return espacenet_fetch_html(document_number, 'description', 'description')
 
 
 def espacenet_claims(document_number):
     """
-    Return Espacenet claims fulltext
+    Acquire Espacenet "claims" fulltext from HTML page of old Espacenet.
+
     https://worldwide.espacenet.com/data/publicationDetails/claims?CC=US&NR=5770123A&FT=D&DB=worldwide.espacenet.com
     https://worldwide.espacenet.com/data/publicationDetails/claims?CC=DE&NR=19814298A1&DB=worldwide.espacenet.com&FT=D
     """
-    return espacenet_fetch(document_number, 'claims', 'claims')
-
-
-if __name__ == '__main__':
-    """
-    python -m patzilla.access.epo.espacenet.client
-    """
-    numbers = [
-        "US5770123A",
-        "US6269530B1",
-        "DE19814298A1",
-        "DE29624638U1",
-    ]
-    for number in numbers:
-        print("## {}".format(number))
-        print("")
-        print("### Claims")
-        print(espacenet_claims(number))
-        print("")
-        print("### Description")
-        print(espacenet_description(number))
-        print("\n")
+    return espacenet_fetch_html(document_number, 'claims', 'claims')
diff --git a/patzilla/access/epo/espacenet/pyramid.py b/patzilla/access/epo/espacenet/pyramid.py
index 84db7205..db3038bb 100644
--- a/patzilla/access/epo/espacenet/pyramid.py
+++ b/patzilla/access/epo/espacenet/pyramid.py
@@ -4,7 +4,7 @@
 import logging
 from cornice.service import Service
 from pyramid.httpexceptions import HTTPBadRequest, HTTPNotFound
-from patzilla.access.epo.espacenet.client import espacenet_claims, espacenet_description
+from patzilla.access.epo.espacenet.api import espacenet_claims, espacenet_description
 
 
 logger = logging.getLogger(__name__)
diff --git a/tests/access/test_epo_espacenet.py b/tests/access/test_epo_espacenet.py
index 73a49997..e2c41baa 100644
--- a/tests/access/test_epo_espacenet.py
+++ b/tests/access/test_epo_espacenet.py
@@ -2,14 +2,16 @@
 # (c) 2022 Andreas Motl
 import pytest
 
-from patzilla.access.epo.espacenet.client import espacenet_description, espacenet_claims, espacenet_abstract
+from patzilla.access.epo.espacenet.api import espacenet_description, espacenet_claims, espacenet_abstract
 
 
 def test_espacenet_abstract_success(app_request):
     """
     Acquire "abstract" section of valid patent document from Espacenet.
+
+    TODO: Impossible to get abstract for document EP0666666B1.
     """
-    result = espacenet_abstract(document_number="EP0666666B1")
+    result = espacenet_abstract(document_number="EP0666666A2")
     assert result["source"] == "espacenet"
     assert result["lang"] == "en"
     assert "A non-quota access indicator is circulated among nodes" in result["xml"]
@@ -21,7 +23,7 @@ def test_espacenet_abstract_failure(app_request):
     """
     with pytest.raises(KeyError) as ex:
         espacenet_abstract(document_number="EP123A2")
-    assert ex.match('No section "biblio" at Espacenet for "EP123A2"')
+    assert ex.match('No section "biblio" at Espacenet for "EP0000123A2"')
 
 
 def test_espacenet_description_success(app_request):
@@ -40,7 +42,7 @@ def test_espacenet_description_failure(app_request):
     """
     with pytest.raises(KeyError) as ex:
         espacenet_description(document_number="EP123A2")
-    assert ex.match('No section "description" at Espacenet for "EP123A2"')
+    assert ex.match('No section "description" at Espacenet for "EP0000123A2"')
 
 
 def test_espacenet_claims_success(app_request):
@@ -59,4 +61,4 @@ def test_espacenet_claims_failure(app_request):
     """
     with pytest.raises(KeyError) as ex:
         espacenet_claims(document_number="EP123A2")
-    assert ex.match('No section "claims" at Espacenet for "EP123A2"')
+    assert ex.match('No section "claims" at Espacenet for "EP0000123A2"')