From d578d7958adc57511f268e3802ee7c1734204dc7 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Fri, 5 Dec 2014 15:43:07 -0800 Subject: [PATCH] Ability to extract content excerpts as reported in search results. Also a fix to -s|--some and -n|--none: these did not yet support passing lists of phrases. This now works correctly if you provide separate phrases via commas. --- README.md | 9 +++++++-- scholar.py | 48 +++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 48 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 6c1608d..72b9924 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Christian Features -------- -* Extracts publication title, most relevant web link, PDF link, number of citations, number of online versions, link to Google Scholar's article cluster for the work, Google Scholar's cluster of all works referencing the publication. +* Extracts publication title, most relevant web link, PDF link, number of citations, number of online versions, link to Google Scholar's article cluster for the work, Google Scholar's cluster of all works referencing the publication, and excerpt of content. * Extracts total number of hits as reported by Scholar (new in version 2.5) * Supports the full range of advanced query options provided by Google Scholar, such as title-only search, publication date timeframes, and inclusion/exclusion of patents and citations. * Supports article cluster IDs, i.e., information relating to the variants of an article already identified by Google Scholar @@ -44,6 +44,8 @@ Retrieve one article written by Einstein on quantum theory: PDF link http://icole.mut-es.ac.ir/downloads/Sci_Sec/W1/Einstein%201917.pdf Citations list http://scholar.google.com/scholar?cites=17749203648027613321&as_sdt=2005&sciodt=0,5&hl=en Versions list http://scholar.google.com/scholar?cluster=17749203648027613321&hl=en&as_sdt=0,5 + Excerpt The formal similarity between the chromatic distribution curve for thermal radiation [...] 
+ Note the cluster ID in the above. Using this ID, you can directly access the cluster of articles Google Scholar has already determined to be variants of the same paper. So, let's see the versions: @@ -55,18 +57,21 @@ Note the cluster ID in the above. Using this ID, you can directly access the clu Cluster ID 17749203648027613321 PDF link http://icole.mut-es.ac.ir/downloads/Sci_Sec/W1/Einstein%201917.pdf Citations list http://scholar.google.com/scholar?cites=17749203648027613321&as_sdt=2005&sciodt=0,5&hl=en - + Excerpt The formal similarity between the chromatic distribution curve for thermal radiation [...] + Title ON THE QUANTUM THEORY OF RADIATION URL http://www.informationphilosopher.com/solutions/scientists/einstein/1917_Radiation.pdf Citations 0 Versions 0 PDF link http://www.informationphilosopher.com/solutions/scientists/einstein/1917_Radiation.pdf + Excerpt The formal similarity between the chromatic distribution curve for thermal radiation [...] Title The Quantum Theory of Radiation URL http://web.ihep.su/dbserv/compas/src/einstein17/eng.pdf Citations 0 Versions 0 PDF link http://web.ihep.su/dbserv/compas/src/einstein17/eng.pdf + Excerpt 1 on the assumption that there are discrete elements of energy, from which quantum [...] Let's retrieve a BibTeX entry for that quantum theory paper. The best BibTeX often seems to be the one linked from search results, not those in the article cluster, so let's do a search again: diff --git a/scholar.py b/scholar.py index 9395fca..8905723 100755 --- a/scholar.py +++ b/scholar.py @@ -7,6 +7,11 @@ # ChangeLog # --------- # +# 2.7 Ability to extract content excerpts as reported in search results. +# Also a fix to -s|--some and -n|--none: these did not yet support +# passing lists of phrases. This now works correctly if you provide +# separate phrases via commas. +# # 2.6 Ability to disable inclusion of patents and citations. 
This # has the same effect as unchecking the two patents/citations # checkboxes in the Scholar UI, which are checked by default. @@ -191,7 +196,7 @@ class QueryArgumentError(Error): class ScholarConf(object): """Helper class for global settings.""" - VERSION = '2.6' + VERSION = '2.7' LOG_LEVEL = 1 MAX_PAGE_RESULTS = 20 # Current maximum for per-page results SCHOLAR_SITE = 'http://scholar.google.com' @@ -249,6 +254,7 @@ def __init__(self): 'url_citations': [None, 'Citations list', 7], 'url_versions': [None, 'Versions list', 8], 'url_citation': [None, 'Citation link', 9], + 'excerpt': [None, 'Excerpt', 10], } # The citation data in one of the standard export formats, @@ -376,7 +382,6 @@ def _parse_globals(self): except (IndexError, ValueError): pass - def _parse_article(self, div): self.article = ScholarArticle() @@ -566,6 +571,14 @@ def _parse_article(self, div): if tag.find('div', {'class': 'gs_fl'}): self._parse_links(tag.find('div', {'class': 'gs_fl'})) + if tag.find('div', {'class': 'gs_rs'}): + # These are the content excerpts rendered into the results. + raw_text = tag.find('div', {'class': 'gs_rs'}).findAll(text=True) + if len(raw_text) > 0: + raw_text = ''.join(raw_text) + raw_text = raw_text.replace('\n', '') + self.article['excerpt'] = raw_text + class ScholarQuery(object): """ @@ -671,7 +684,7 @@ class SearchScholarQuery(ScholarQuery): + '&as_publication=%(pub)s' \ + '&as_ylo=%(ylo)s' \ + '&as_yhi=%(yhi)s' \ - + '&as_sdt=%(patents)s,5' \ + + '&as_sdt=%(patents)s%%2C5' \ + '&as_vis=%(citations)s' \ + '&btnG=&hl=en' \ + '&num=%(num)s' @@ -745,9 +758,30 @@ def get_url(self): and self.timeframe[0] is None and self.timeframe[1] is None: raise QueryArgumentError('search query needs more parameters') + # If we have some-words or none-words lists, we need to + # process them so GS understands them. For simple + # space-separated word lists, there's nothing to do. 
For lists + # of phrases we have to ensure quotations around the phrases, + # separating them by whitespace. + words_some = None + words_none = None + + if self.words_some: + if self.words_some.find(',') >= 0: + phrases = self.words_some.split(',') + words_some = ' '.join(['"' + phrase.strip() + '"' for phrase in phrases]) + else: + words_some = self.words_some + if self.words_none: + if self.words_none.find(',') >= 0: + phrases = self.words_none.split(',') + words_none = ' '.join(['"' + phrase.strip() + '"' for phrase in phrases]) + else: + words_none = self.words_none + urlargs = {'words': self.words or '', - 'words_some': self.words_some or '', - 'words_none': self.words_none or '', + 'words_some': words_some or '', + 'words_none': words_none or '', 'phrase': self.phrase or '', 'scope': 'title' if self.scope_title else 'any', 'authors': self.author or '', @@ -1071,9 +1105,9 @@ def main(): group.add_option('-A', '--all', metavar='WORDS', default=None, dest='allw', help='Results must contain all of these words') group.add_option('-s', '--some', metavar='WORDS', default=None, - help='Results must contain at least one of these words') + help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases') group.add_option('-n', '--none', metavar='WORDS', default=None, - help='Results must contain none of these words') + help='Results must contain none of these words. See -s|--some re. formatting') group.add_option('-p', '--phrase', metavar='PHRASE', default=None, help='Results must contain exact phrase') group.add_option('-t', '--title-only', action='store_true', default=False,