Ability to extract content excerpts as reported in search results.

Also a fix to -s|--some and -n|--none: these did not yet support passing lists of phrases. This now works correctly if you provide separate phrases via commas.
ckreibich · Dec 5, 2014 · d578d79 · d578d79
1 parent 9699738
commit d578d79
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ Christian
 Features
 --------
 
-* Extracts publication title, most relevant web link, PDF link, number of citations, number of online versions, link to Google Scholar's article cluster for the work, Google Scholar's cluster of all works referencing the publication.
+* Extracts publication title, most relevant web link, PDF link, number of citations, number of online versions, link to Google Scholar's article cluster for the work, Google Scholar's cluster of all works referencing the publication, and an excerpt of the content.
 * Extracts total number of hits as reported by Scholar (new in version 2.5)
 * Supports the full range of advanced query options provided by Google Scholar, such as title-only search, publication date timeframes, and inclusion/exclusion of patents and citations.
 * Supports article cluster IDs, i.e., information relating to the variants of an article already identified by Google Scholar
@@ -44,6 +44,8 @@ Retrieve one article written by Einstein on quantum theory:
           PDF link http://icole.mut-es.ac.ir/downloads/Sci_Sec/W1/Einstein%201917.pdf
     Citations list http://scholar.google.com/scholar?cites=17749203648027613321&as_sdt=2005&sciodt=0,5&hl=en
      Versions list http://scholar.google.com/scholar?cluster=17749203648027613321&hl=en&as_sdt=0,5
+           Excerpt The formal similarity between the chromatic distribution curve for thermal radiation [...]
+
 
 Note the cluster ID in the above. Using this ID, you can directly access the cluster of articles Google Scholar has already determined to be variants of the same paper. So, let's see the versions:
 
@@ -55,18 +57,21 @@ Note the cluster ID in the above. Using this ID, you can directly access the clu
         Cluster ID 17749203648027613321
           PDF link http://icole.mut-es.ac.ir/downloads/Sci_Sec/W1/Einstein%201917.pdf
     Citations list http://scholar.google.com/scholar?cites=17749203648027613321&as_sdt=2005&sciodt=0,5&hl=en
-
+           Excerpt The formal similarity between the chromatic distribution curve for thermal radiation [...]
+
              Title ON THE QUANTUM THEORY OF RADIATION
                URL http://www.informationphilosopher.com/solutions/scientists/einstein/1917_Radiation.pdf
          Citations 0
           Versions 0
           PDF link http://www.informationphilosopher.com/solutions/scientists/einstein/1917_Radiation.pdf
+           Excerpt The formal similarity between the chromatic distribution curve for thermal radiation [...]
 
              Title The Quantum Theory of Radiation
                URL http://web.ihep.su/dbserv/compas/src/einstein17/eng.pdf
          Citations 0
           Versions 0
           PDF link http://web.ihep.su/dbserv/compas/src/einstein17/eng.pdf
+           Excerpt 1 on the assumption that there are discrete elements of energy, from which quantum [...]
 
 
 Let's retrieve a BibTeX entry for that quantum theory paper. The best BibTeX often seems to be the one linked from search results, not those in the article cluster, so let's do a search again:

diff --git a/scholar.py b/scholar.py
@@ -7,6 +7,11 @@
 # ChangeLog
 # ---------
 #
+# 2.7   Ability to extract content excerpts as reported in search results.
+#       Also a fix to -s|--some and -n|--none: these did not yet support
+#       passing lists of phrases. This now works correctly if you provide
+#       separate phrases via commas.
+#
 # 2.6   Ability to disable inclusion of patents and citations. This
 #       has the same effect as unchecking the two patents/citations
 #       checkboxes in the Scholar UI, which are checked by default.
@@ -191,7 +196,7 @@ class QueryArgumentError(Error):
 class ScholarConf(object):
     """Helper class for global settings."""
 
-    VERSION = '2.6'
+    VERSION = '2.7'
     LOG_LEVEL = 1
     MAX_PAGE_RESULTS = 20 # Current maximum for per-page results
     SCHOLAR_SITE = 'http://scholar.google.com'
@@ -249,6 +254,7 @@ def __init__(self):
             'url_citations': [None, 'Citations list', 7],
             'url_versions':  [None, 'Versions list',  8],
             'url_citation':  [None, 'Citation link',  9],
+            'excerpt':       [None, 'Excerpt',       10],
         }
 
         # The citation data in one of the standard export formats,
@@ -376,7 +382,6 @@ def _parse_globals(self):
                 except (IndexError, ValueError):
                     pass
 
-
     def _parse_article(self, div):
         self.article = ScholarArticle()
 
@@ -566,6 +571,14 @@ def _parse_article(self, div):
                 if tag.find('div', {'class': 'gs_fl'}):
                     self._parse_links(tag.find('div', {'class': 'gs_fl'}))
 
+                if tag.find('div', {'class': 'gs_rs'}):
+                    # These are the content excerpts rendered into the results.
+                    raw_text = tag.find('div', {'class': 'gs_rs'}).findAll(text=True)
+                    if len(raw_text) > 0:
+                        raw_text = ''.join(raw_text)
+                        raw_text = raw_text.replace('\n', '')
+                        self.article['excerpt'] = raw_text
+
 
 class ScholarQuery(object):
     """
@@ -671,7 +684,7 @@ class SearchScholarQuery(ScholarQuery):
         + '&as_publication=%(pub)s' \
         + '&as_ylo=%(ylo)s' \
         + '&as_yhi=%(yhi)s' \
-        + '&as_sdt=%(patents)s,5' \
+        + '&as_sdt=%(patents)s%%2C5' \
         + '&as_vis=%(citations)s' \
         + '&btnG=&hl=en' \
         + '&num=%(num)s'
@@ -745,9 +758,30 @@ def get_url(self):
            and self.timeframe[0] is None and self.timeframe[1] is None:
             raise QueryArgumentError('search query needs more parameters')
 
+        # If we have some-words or none-words lists, we need to
+        # process them so GS understands them. For simple
+        # space-separated word lists, there's nothing to do. For lists
+        # of phrases we have to ensure quotations around the phrases,
+        # separating them by whitespace.
+        words_some = None
+        words_none = None
+
+        if self.words_some:
+            if self.words_some.find(',') >= 0:
+                phrases = self.words_some.split(',')
+                words_some = ' '.join(['"' + phrase.strip() + '"' for phrase in phrases])
+            else:
+                words_some = self.words_some
+        if self.words_none:
+            if self.words_none.find(',') >= 0:
+                phrases = self.words_none.split(',')
+                words_none = ' '.join(['"' + phrase.strip() + '"' for phrase in phrases])
+            else:
+                words_none = self.words_none
+
         urlargs = {'words': self.words or '',
-                   'words_some': self.words_some or '',
-                   'words_none': self.words_none or '',
+                   'words_some': words_some or '',
+                   'words_none': words_none or '',
                    'phrase': self.phrase or '',
                    'scope': 'title' if self.scope_title else 'any',
                    'authors': self.author or '',
@@ -1071,9 +1105,9 @@ def main():
     group.add_option('-A', '--all', metavar='WORDS', default=None, dest='allw',
                      help='Results must contain all of these words')
     group.add_option('-s', '--some', metavar='WORDS', default=None,
-                     help='Results must contain at least one of these words')
+                     help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases')
     group.add_option('-n', '--none', metavar='WORDS', default=None,
-                     help='Results must contain none of these words')
+                     help='Results must contain none of these words. See -s|--some re. formatting')
     group.add_option('-p', '--phrase', metavar='PHRASE', default=None,
                      help='Results must contain exact phrase')
     group.add_option('-t', '--title-only', action='store_true', default=False,