diff --git a/README.md b/README.md index 59542c1..6c1608d 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Features * Extracts publication title, most relevant web link, PDF link, number of citations, number of online versions, link to Google Scholar's article cluster for the work, Google Scholar's cluster of all works referencing the publication. * Extracts total number of hits as reported by Scholar (new in version 2.5) -* Supports the full range of advanced query options provided by Google Scholar, such as title-only search or publication date timeframes. +* Supports the full range of advanced query options provided by Google Scholar, such as title-only search, publication date timeframes, and inclusion/exclusion of patents and citations. * Supports article cluster IDs, i.e., information relating to the variants of an article already identified by Google Scholar * Supports retrieval of citation details in standard external formats as provided by Google Scholar, including BibTeX and EndNote. * Command-line tool prints entries in CSV format, simple plain text, or in the citation export format. diff --git a/scholar.py b/scholar.py index 39e30c1..9395fca 100755 --- a/scholar.py +++ b/scholar.py @@ -7,6 +7,12 @@ # ChangeLog # --------- # +# 2.6: Ability to disable inclusion of patents and citations. This +# has the same effect as unchecking the two patents/citations +# checkboxes in the Scholar UI, which are checked by default. +# Accordingly, the command-line options are --no-patents and +# --no-citations. +# # 2.5: Ability to parse global result attributes. This right now means # only the total number of results as reported by Scholar at the # top of the results pages (e.g. "About 31 results").
Such @@ -185,7 +191,7 @@ class QueryArgumentError(Error): class ScholarConf(object): """Helper class for global settings.""" - VERSION = '2.5' + VERSION = '2.6' LOG_LEVEL = 1 MAX_PAGE_RESULTS = 20 # Current maximum for per-page results SCHOLAR_SITE = 'http://scholar.google.com' @@ -665,7 +671,10 @@ class SearchScholarQuery(ScholarQuery): + '&as_publication=%(pub)s' \ + '&as_ylo=%(ylo)s' \ + '&as_yhi=%(yhi)s' \ - + '&btnG=&hl=en&as_sdt=0,5&num=%(num)s' + + '&as_sdt=%(patents)s,5' \ + + '&as_vis=%(citations)s' \ + + '&btnG=&hl=en' \ + + '&num=%(num)s' def __init__(self): ScholarQuery.__init__(self) @@ -678,6 +687,8 @@ def __init__(self): self.author = None self.pub = None self.timeframe = [None, None] + self.include_patents = True + self.include_citations = True def set_words(self, words): """Sets words that *all* must be found in the result.""" @@ -721,6 +732,12 @@ def set_timeframe(self, start=None, end=None): end = ScholarUtils.ensure_int(end) self.timeframe = [start, end] + def set_include_citations(self, yesorno): + self.include_citations = yesorno + + def set_include_patents(self, yesorno): + self.include_patents = yesorno + def get_url(self): if self.words is None and self.words_some is None \ and self.words_none is None and self.phrase is None \ @@ -737,6 +754,8 @@ def get_url(self): 'pub': self.pub or '', 'ylo': self.timeframe[0] or '', 'yhi': self.timeframe[1] or '', + 'patents': '0' if self.include_patents else '1', + 'citations': '0' if self.include_citations else '1', 'num': self.num_results or ScholarConf.MAX_PAGE_RESULTS} for key, val in urlargs.items(): @@ -1065,6 +1084,10 @@ def main(): help='Results must have appeared in or after given year') group.add_option('--before', metavar='YEAR', default=None, help='Results must have appeared in or before given year') + group.add_option('--no-patents', action='store_true', default=False, + help='Do not include patents in results') + group.add_option('--no-citations', action='store_true', default=False, + 
help='Do not include citations in results') group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None, help='Do not search, just use articles in given cluster ID') group.add_option('-c', '--count', type='int', default=None, @@ -1159,6 +1182,10 @@ def main(): query.set_pub(options.pub) if options.after or options.before: query.set_timeframe(options.after, options.before) + if options.no_patents: + query.set_include_patents(False) + if options.no_citations: + query.set_include_citations(False) if options.count is not None: options.count = min(options.count, ScholarConf.MAX_PAGE_RESULTS)