Skip to content

Commit

Permalink
Ability to disable inclusion of patents and citations. This has the
Browse files Browse the repository at this point in the history
same effect as unchecking the two patents/citations checkboxes in the
Scholar UI, which are checked by default. Accordingly, the new
command-line options are --no-patents and --no-citations.
  • Loading branch information
ckreibich committed Dec 5, 2014
1 parent 3eaaf27 commit 9699738
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 3 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Features

* Extracts publication title, most relevant web link, PDF link, number of citations, number of online versions, link to Google Scholar's article cluster for the work, Google Scholar's cluster of all works referencing the publication.
* Extracts total number of hits as reported by Scholar (new in version 2.5)
* Supports the full range of advanced query options provided by Google Scholar, such as title-only search or publication date timeframes.
* Supports the full range of advanced query options provided by Google Scholar, such as title-only search, publication date timeframes, and inclusion/exclusion of patents and citations.
* Supports article cluster IDs, i.e., information relating to the variants of an article already identified by Google Scholar
* Supports retrieval of citation details in standard external formats as provided by Google Scholar, including BibTeX and EndNote.
* Command-line tool prints entries in CSV format, simple plain text, or in the citation export format.
Expand Down
31 changes: 29 additions & 2 deletions scholar.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@
# ChangeLog
# ---------
#
# 2.6: Ability to disable inclusion of patents and citations. This
# has the same effect as unchecking the two patents/citations
# checkboxes in the Scholar UI, which are checked by default.
# Accordingly, the command-line options are --no-patents and
# --no-citations.
#
# 2.5: Ability to parse global result attributes. This right now means
# only the total number of results as reported by Scholar at the
# top of the results pages (e.g. "About 31 results"). Such
Expand Down Expand Up @@ -185,7 +191,7 @@ class QueryArgumentError(Error):
class ScholarConf(object):
"""Helper class for global settings."""

VERSION = '2.5'
VERSION = '2.6'
LOG_LEVEL = 1
MAX_PAGE_RESULTS = 20 # Current maximum for per-page results
SCHOLAR_SITE = 'http://scholar.google.com'
Expand Down Expand Up @@ -665,7 +671,10 @@ class SearchScholarQuery(ScholarQuery):
+ '&as_publication=%(pub)s' \
+ '&as_ylo=%(ylo)s' \
+ '&as_yhi=%(yhi)s' \
+ '&btnG=&hl=en&as_sdt=0,5&num=%(num)s'
+ '&as_sdt=%(patents)s,5' \
+ '&as_vis=%(citations)s' \
+ '&btnG=&hl=en' \
+ '&num=%(num)s'

def __init__(self):
ScholarQuery.__init__(self)
Expand All @@ -678,6 +687,8 @@ def __init__(self):
self.author = None
self.pub = None
self.timeframe = [None, None]
self.include_patents = True
self.include_citations = True

def set_words(self, words):
"""Sets words that *all* must be found in the result."""
Expand Down Expand Up @@ -721,6 +732,12 @@ def set_timeframe(self, start=None, end=None):
end = ScholarUtils.ensure_int(end)
self.timeframe = [start, end]

def set_include_citations(self, yesorno):
    """Sets whether citations are included in the results.

    The flag is on by default. get_url() only tests its truth value:
    a true value yields 'as_vis=0' in the query URL, a false value
    yields 'as_vis=1'.
    """
    self.include_citations = yesorno

def set_include_patents(self, yesorno):
    """Sets whether patents are included in the results.

    The flag is on by default. get_url() only tests its truth value:
    a true value yields 'as_sdt=0,5' in the query URL, a false value
    yields 'as_sdt=1,5'.
    """
    self.include_patents = yesorno

def get_url(self):
if self.words is None and self.words_some is None \
and self.words_none is None and self.phrase is None \
Expand All @@ -737,6 +754,8 @@ def get_url(self):
'pub': self.pub or '',
'ylo': self.timeframe[0] or '',
'yhi': self.timeframe[1] or '',
'patents': '0' if self.include_patents else '1',
'citations': '0' if self.include_citations else '1',
'num': self.num_results or ScholarConf.MAX_PAGE_RESULTS}

for key, val in urlargs.items():
Expand Down Expand Up @@ -1065,6 +1084,10 @@ def main():
help='Results must have appeared in or after given year')
group.add_option('--before', metavar='YEAR', default=None,
help='Results must have appeared in or before given year')
group.add_option('--no-patents', action='store_true', default=False,
help='Do not include patents in results')
group.add_option('--no-citations', action='store_true', default=False,
help='Do not include citations in results')
group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None,
help='Do not search, just use articles in given cluster ID')
group.add_option('-c', '--count', type='int', default=None,
Expand Down Expand Up @@ -1159,6 +1182,10 @@ def main():
query.set_pub(options.pub)
if options.after or options.before:
query.set_timeframe(options.after, options.before)
if options.no_patents:
query.set_include_patents(False)
if options.no_citations:
query.set_include_citations(False)

if options.count is not None:
options.count = min(options.count, ScholarConf.MAX_PAGE_RESULTS)
Expand Down

0 comments on commit 9699738

Please sign in to comment.