From 282cecb8800be93d700302b48115326e47783991 Mon Sep 17 00:00:00 2001 From: P M D Scully Date: Thu, 27 Apr 2017 00:44:21 +0700 Subject: [PATCH] Added extraction of url_pdf from the right-hand-side [PDF] link. --- scholar.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scholar.py b/scholar.py index 13ccd43..5499e94 100755 --- a/scholar.py +++ b/scholar.py @@ -621,6 +621,11 @@ def _parse_article(self, div): raw_text = raw_text.replace('\n', '') self.article['excerpt'] = raw_text + if self.article['url_pdf'] is None and tag.name == 'div' and self._tag_has_class(tag, 'gs_ggs') \ + and tag.div and tag.div.div and tag.div.div.a and tag.div.div.a.span \ + and tag.div.div.a.span.get_text() == "[PDF]": + self.article['url_pdf'] = self._path2url(tag.div.div.a['href']) + class ScholarQuery(object): """