Skip to content

Commit

Permalink
Less aggressive use of num= requests; version bump to 2.11
Browse files Browse the repository at this point in the history
  • Loading branch information
ckreibich committed Feb 1, 2017
1 parent 74c5065 commit 7e6efb4
Showing 1 changed file with 34 additions and 21 deletions.
55 changes: 34 additions & 21 deletions scholar.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@
# ChangeLog
# ---------
#
# 2.11 The Scholar site seems to have become more picky about the
# number of results requested. The default of 20 in scholar.py
# could cause HTTP 503 responses. scholar.py now doesn't request
# a maximum unless you provide it at the command line. (For the
# time being, you still cannot request more than 20 results.)
#
# 2.10 Merged a fix for the "TypeError: quote_from_bytes()" problem on
# Python 3.x from hinnefe2.
#
Expand Down Expand Up @@ -129,7 +135,7 @@
#
# Don't complain about missing docstrings: pylint: disable-msg=C0111
#
# Copyright 2010--2014 Christian Kreibich. All rights reserved.
# Copyright 2010--2017 Christian Kreibich. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
Expand Down Expand Up @@ -234,7 +240,7 @@ class ScholarConf(object):

VERSION = '2.10'
LOG_LEVEL = 1
MAX_PAGE_RESULTS = 20 # Current maximum for per-page results
MAX_PAGE_RESULTS = 10 # Current default for per-page results
SCHOLAR_SITE = 'http://scholar.google.com'

# USER_AGENT = 'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.2.9) Gecko/20100913 Firefox/3.6.9'
Expand Down Expand Up @@ -626,7 +632,7 @@ def __init__(self):
# The number of results requested from Scholar -- not the
# total number of results it reports (the latter gets stored
# in attrs, see below).
self.num_results = ScholarConf.MAX_PAGE_RESULTS
self.num_results = None

# Queries may have global result attributes, similar to
# per-article attributes in ScholarArticle. The exact set of
Expand All @@ -635,8 +641,9 @@ def __init__(self):
self.attrs = {}

def set_num_page_results(self, num_page_results):
msg = 'maximum number of results on page must be numeric'
self.num_results = ScholarUtils.ensure_int(num_page_results, msg)
self.num_results = ScholarUtils.ensure_int(
num_page_results,
'maximum number of results on page must be numeric')

def get_url(self):
"""
Expand Down Expand Up @@ -701,7 +708,7 @@ class ClusterScholarQuery(ScholarQuery):
"""
SCHOLAR_CLUSTER_URL = ScholarConf.SCHOLAR_SITE + '/scholar?' \
+ 'cluster=%(cluster)s' \
+ '&num=%(num)s'
+ '%(num)s'

def __init__(self, cluster=None):
ScholarQuery.__init__(self)
Expand All @@ -720,12 +727,16 @@ def get_url(self):
if self.cluster is None:
raise QueryArgumentError('cluster query needs cluster ID')

urlargs = {'cluster': self.cluster,
'num': self.num_results or ScholarConf.MAX_PAGE_RESULTS}
urlargs = {'cluster': self.cluster }

for key, val in urlargs.items():
urlargs[key] = quote(encode(val))

# The following URL arguments must not be quoted, or the
# server will not recognize them:
urlargs['num'] = ('&num=%d' % self.num_results
if self.num_results is not None else '')

return self.SCHOLAR_CLUSTER_URL % urlargs


Expand All @@ -744,10 +755,10 @@ class SearchScholarQuery(ScholarQuery):
+ '&as_publication=%(pub)s' \
+ '&as_ylo=%(ylo)s' \
+ '&as_yhi=%(yhi)s' \
+ '&as_sdt=%(patents)s%%2C5' \
+ '&as_vis=%(citations)s' \
+ '&btnG=&hl=en' \
+ '&num=%(num)s'
+ '%(num)s' \
+ '&as_sdt=%(patents)s%%2C5'

def __init__(self):
ScholarQuery.__init__(self)
Expand All @@ -757,7 +768,7 @@ def __init__(self):
self.words_none = None # None of these words
self.phrase = None
self.scope_title = False # If True, search in title only
self.author = None
self.author = None
self.pub = None
self.timeframe = [None, None]
self.include_patents = True
Expand Down Expand Up @@ -841,17 +852,20 @@ def get_url(self):
'ylo': self.timeframe[0] or '',
'yhi': self.timeframe[1] or '',
'patents': '0' if self.include_patents else '1',
'citations': '0' if self.include_citations else '1',
'num': self.num_results or ScholarConf.MAX_PAGE_RESULTS}
'citations': '0' if self.include_citations else '1'}

for key, val in urlargs.items():
urlargs[key] = quote(encode(val))

# The following URL arguments must not be quoted, or the
# server will not recognize them:
urlargs['num'] = ('&num=%d' % self.num_results
if self.num_results is not None else '')

return self.SCHOLAR_QUERY_URL % urlargs


class ScholarSettings(object):

"""
This class lets you adjust the Scholar settings for your
session. It's intended to mirror the features tunable in the
Expand All @@ -865,30 +879,29 @@ class ScholarSettings(object):

def __init__(self):
self.citform = 0 # Citation format, default none
self.per_page_results = ScholarConf.MAX_PAGE_RESULTS
self.per_page_results = None
self._is_configured = False

def set_citation_format(self, citform):
citform = ScholarUtils.ensure_int(citform)
if citform < 0 or citform > self.CITFORM_BIBTEX:
raise FormatError('citation format invalid, is "%s"' \
raise FormatError('citation format invalid, is "%s"'
% citform)
self.citform = citform
self._is_configured = True

def set_per_page_results(self, per_page_results):
msg = 'page results must be integer'
self.per_page_results = ScholarUtils.ensure_int(per_page_results, msg)
self.per_page_results = min(self.per_page_results,
ScholarConf.MAX_PAGE_RESULTS)
self.per_page_results = ScholarUtils.ensure_int(
per_page_results, 'page results must be integer')
self.per_page_results = min(
self.per_page_results, ScholarConf.MAX_PAGE_RESULTS)
self._is_configured = True

def is_configured(self):
return self._is_configured


class ScholarQuerier(object):

"""
ScholarQuerier instances can conduct a search on Google Scholar
with subsequent parsing of the resulting HTML content. The
Expand Down

0 comments on commit 7e6efb4

Please sign in to comment.