Skip to content

Commit

Permalink
[mw] EPO/Espacenet: Make access layer production ready
Browse files Browse the repository at this point in the history
- Use a module-wide HTTP session instead of creating a new HTTP connection
  for each request.
- Add content caching.
- Upgrade to Beautiful Soup 4.x.
  • Loading branch information
amotl committed May 26, 2022
1 parent 425ac23 commit 56da01e
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ Development
- [mw] EPO/Espacenet: Repair access layer
- [ui] EPO/Espacenet: Repair "external sources" links
- [mw] EPO/Espacenet: Acquire "abstract" text
- [mw] EPO/Espacenet: Make access layer production ready


2019-11-01 0.169.3
Expand Down
12 changes: 8 additions & 4 deletions patzilla/access/epo/espacenet/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,19 @@
import logging

import requests
from BeautifulSoup import BeautifulSoup
from beaker.cache import cache_region
from bs4 import BeautifulSoup

from patzilla.util.network.browser import regular_user_agent
from patzilla.util.numbers.normalize import normalize_patent


logger = logging.getLogger(__name__)

http = requests.Session()


@cache_region('medium')
def espacenet_fetch(document_number, section, element_id=None, element_class=None):

patent = normalize_patent(document_number, as_dict=True, provider='espacenet')
Expand All @@ -35,11 +39,11 @@ def espacenet_fetch(document_number, section, element_id=None, element_class=Non
url = url_tpl.format(section=section, **patent)

logger.info('Accessing Espacenet: {}'.format(url))
response = requests.get(url, headers={'User-Agent': regular_user_agent})
response = http.get(url, headers={'User-Agent': regular_user_agent})

# https://worldwide.espacenet.com/errorpages/error403.htm?reason=RobotAbuse&ip=89.247.174.135
if "errorpages" in response.url:
soup = BeautifulSoup(response.content)
soup = BeautifulSoup(response.content, features="lxml")
details = soup.find("h1").text
details += ", see " + response.url
message = "{}. Reason: {}".format(message_fail, details)
Expand All @@ -50,7 +54,7 @@ def espacenet_fetch(document_number, section, element_id=None, element_class=Non

if response.status_code == 200:
# TODO: when no result, "Claims not available" appears in response body
soup = BeautifulSoup(response.content)
soup = BeautifulSoup(response.content, features="lxml")

# Probe element by id.
element = None
Expand Down

0 comments on commit 56da01e

Please sign in to comment.