Skip to content

Commit

Permalink
[mw] EPO/Espacenet: Make access layer production ready
Browse files Browse the repository at this point in the history
- Use a module-wide HTTP session instead of creating a new HTTP connection
  for each request.
- Add content caching.
- Upgrade to Beautiful Soup 4.x.
  • Loading branch information
amotl committed May 26, 2022
1 parent 425ac23 commit 56da01e
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ Development
- [mw] EPO/Espacenet: Repair access layer
- [ui] EPO/Espacenet: Repair "external sources" links
- [mw] EPO/Espacenet: Acquire "abstract" text
- [mw] EPO/Espacenet: Make access layer production ready


2019-11-01 0.169.3
Expand Down
12 changes: 8 additions & 4 deletions patzilla/access/epo/espacenet/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,19 @@
import logging

import requests
from BeautifulSoup import BeautifulSoup
from beaker.cache import cache_region
from bs4 import BeautifulSoup

from patzilla.util.network.browser import regular_user_agent
from patzilla.util.numbers.normalize import normalize_patent


logger = logging.getLogger(__name__)

http = requests.Session()


@cache_region('medium')
def espacenet_fetch(document_number, section, element_id=None, element_class=None):

patent = normalize_patent(document_number, as_dict=True, provider='espacenet')
Expand All @@ -35,11 +39,11 @@ def espacenet_fetch(document_number, section, element_id=None, element_class=Non
url = url_tpl.format(section=section, **patent)

logger.info('Accessing Espacenet: {}'.format(url))
response = requests.get(url, headers={'User-Agent': regular_user_agent})
response = http.get(url, headers={'User-Agent': regular_user_agent})

# https://worldwide.espacenet.com/errorpages/error403.htm?reason=RobotAbuse&ip=89.247.174.135
if "errorpages" in response.url:
soup = BeautifulSoup(response.content)
soup = BeautifulSoup(response.content, features="lxml")
details = soup.find("h1").text
details += ", see " + response.url
message = "{}. Reason: {}".format(message_fail, details)
Expand All @@ -50,7 +54,7 @@ def espacenet_fetch(document_number, section, element_id=None, element_class=Non

if response.status_code == 200:
# TODO: when no result, "Claims not available" appears in response body
soup = BeautifulSoup(response.content)
soup = BeautifulSoup(response.content, features="lxml")

# Probe element by id.
element = None
Expand Down

0 comments on commit 56da01e

Please sign in to comment.