-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathScholarUtils.py
98 lines (84 loc) · 4.94 KB
/
ScholarUtils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# ScholarUtils.py
# Utilities to manipulate GoogleScholarSearch results.
#
from serpapi import GoogleScholarSearch
import pandas as pd
import re
#!pip install PyYAML
import yaml
# Keep the API Key in an external YAML file. Format is:
# SERP_API_KEY: 92340184908390218409819aslkjfk13948
# (without the leading hash)
def InitScholar(configFile):
with open(configFile, 'r') as stream:
GoogleScholarSearch.SERP_API_KEY=yaml.safe_load(stream)['SERP_API_KEY']
def GetPapers( searchParams, maxPapers=100 ):
# Answers a list of (up to maxPapers) Google Scholar 'organic_result' structures for papers corresponding to the search params passed.
# E.g. u=GetPapers({"cites": "7379463099867128855"})
search=GoogleScholarSearch(searchParams.copy()) # Sigh! The class takes ownership of the parameters dict.
# search.params_dict['lr']='lang_en' # No - Scholar doesn't have the classifications consistent.
# Clumsy loop, because we don't know the loop invariant until after the first request:
results=[]
while len(results) < maxPapers:
search.params_dict['start']=len(results)
searchResult = search.get_json()
if searchResult.get('error'):
if searchResult['error'] == "Google hasn't returned any results for this query.":
print('No papers found for {}'.format(searchParams))
break
raise Exception(searchResult)
results += searchResult['organic_results']
# Three cases: limited by passed in maxPapers; limited by Google's 'Total results' for the query, or it's a single page result:
maxPapers = min(maxPapers, searchResult['search_information'].get('total_results',len(results)))
if search.params_dict['start'] == 0: # First request?
print('Retrieving {} paper{} for {}'.format(maxPapers, 's' if maxPapers>1 else '', searchParams))
return results[:maxPapers] # Truncate, else requesting 3 results could return 10.
def GetPaper( searchParams ):
# Answers the first 'organic_result' paper corresponding to the search params passed.
# If not found, answers an empty map. Note - might only find the citing paper.
# E.g. u=GetPaper({"q": "Search string"})
results=GetPapers( searchParams, 1 )
return results[0] if results else {}
yearRE=re.compile('(?:198|199|200|201|202)\d') # Matches 1980 through 2029
def YearFor(paper, **args):
# Look for publication date in the summary or perhaps the paper link:
# Keyword args:
# defaultYear: Year to use if none found.
defaultYear=args.get('defaultYear', 2022)
possibleDate=paper['publication_info']['summary'] + paper.get('resources',[{}])[0].get('link','')
yearsFound=yearRE.findall(possibleDate)
return int(yearsFound[0] if yearsFound else defaultYear) # Assume recent if not found, and check manually later if necessary.
def NumCitations(paper):
# Cited_by may be missing, or total may be either None (Google error, probably) or 0.
return (paper['inline_links']['cited_by']['total'] or 0) if paper['inline_links'].get('cited_by') else 0
def HasEnoughCitations(paper, **args):
# True if the paper has enough citations for inclusion in the survey.
# Keyword args:
# requiredAnnualCount: citations per year required for inclusion.
# finalYear: Last full year of survey.
requiredAnnualCount=args.get('requiredAnnualCount', 8)
finalYear=args.get('finalYear', 2020)
year=YearFor(paper)
citationsRequired=requiredAnnualCount / 2 if year >= finalYear else (finalYear - year) * requiredAnnualCount
return NumCitations(paper) > citationsRequired
def AuthorsFor(paper):
# Returns the (possibly truncated) list of authors.
#return (', '.join([author['name'] for author in paper['publication_info']['authors']])
# if paper['publication_info'].get('authors') else '') # only authors with Google Scholar entries
return paper['publication_info'].get('summary','').split(' -')[0]
def RelatedRef(paper):
# Answers the unique reference to get papers related to this one.
#return re.sub(r'.*q=related:(.*):scholar.google.com/.*', r'\1', paper['inline_links']['related_pages_link'])
return paper['result_id']
def RelatedQuery(id):
# Query to pass to GetPapers to get the related papers corresponding to the given related ID:
return {'q': 'related:{}:scholar.google.com/'.format(id)}
def WellCitedPapers(paperList, **args):
# Answers a dataframe of the well-cited papers from paperList
# Keyword arguments are passed to HasEnoughCitations.
return (pd.DataFrame([(paper['inline_links']['cited_by']['cites_id'], NumCitations(paper),
YearFor(paper, **args), paper['title'], AuthorsFor(paper), paper.get('link',''),
RelatedRef(paper), paper.get('snippet',''))
for paper in paperList if HasEnoughCitations(paper, **args)],
columns=['Key','Citations','Year','Title', 'Authors','Link','Related','Snippet'])
)