From e673e907505a2f7e10978e06539f46ca04f462f7 Mon Sep 17 00:00:00 2001
From: runtarm
Date: Fri, 25 Mar 2016 01:13:06 -0700
Subject: [PATCH 1/2] Detect [PDF] link on the right hand side

---
 scholar.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/scholar.py b/scholar.py
index 21f26aa..1cf3c87 100755
--- a/scholar.py
+++ b/scholar.py
@@ -539,10 +539,15 @@ def _parse_article(self, div):
         for tag in div:
             if not hasattr(tag, 'name'):
                 continue
+
             if str(tag).lower().find('.pdf'):
                 if tag.find('div', {'class': 'gs_ttss'}):
                     self._parse_links(tag.find('div', {'class': 'gs_ttss'}))
 
+            sidetag = tag.find('div', {'class': 'gs_ttss'})
+            if sidetag and sidetag.a and str(sidetag).lower().find('[pdf]') >= 0:
+                self.article['url_pdf'] = self._path2url(sidetag.a['href'])
+
             if tag.name == 'div' and self._tag_has_class(tag, 'gs_ri'):
                 # There are (at least) two formats here. In the first
                 # one, we have a link, e.g.:

From b3b3da8f0f495a8ff682b65d315b6ba203ab64f1 Mon Sep 17 00:00:00 2001
From: runtarm
Date: Tue, 29 Mar 2016 00:23:19 -0700
Subject: [PATCH 2/2] Also consider https:// as full URL

---
 scholar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scholar.py b/scholar.py
index 1cf3c87..f2333e2 100755
--- a/scholar.py
+++ b/scholar.py
@@ -484,7 +484,7 @@ def _as_int(obj):
 
     def _path2url(self, path):
         """Helper, returns full URL in case path isn't one."""
-        if path.startswith('http://'):
+        if path.startswith(('http://', 'https://')):
             return path
         if not path.startswith('/'):
             path = '/' + path