api_wrapper.py (forked from PatentsView/PatentsView-APIWrapper)
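"""Query the PatentsView API for each value listed in an input file, saving
raw JSON pages and writing one merged, cleaned CSV per query.

Usage:
    python api_wrapper.py config_file

Each section of the config file describes one query; every value is
JSON-encoded. A minimal sketch of a section (the entity, field, and file
names below are illustrative, not prescriptive):

    [Q1]
    entity = "patents"
    input_file = "patent_numbers.txt"
    directory = "./data"
    input_type = "patent_number"
    fields = ["patent_number", "patent_date", "patent_title"]
    sort = [{"patent_date": "desc"}]
    criteria1 = {"_gte": {"patent_date": "2010-01-01"}}

'sort' is optional, and every option whose name starts with 'criteria'
is ANDed into the query.
"""
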
from __future__ import print_function
import configparser
import json
import os
import sys

import pandas as pd
import requests

import json_to_csv


def query(configfile):
    """Query the PatentsView database using the parameters in configfile."""
    parser = configparser.ConfigParser()
    parser.read(configfile)
    # Loop through the separate queries listed in the config file.
    for q in parser.sections():
        print("Running query:", q)
        # Parse the parameters for this query from the config file.
        entity = json.loads(parser.get(q, 'entity'))
        url = 'https://api.patentsview.org/' + entity + '/query?'
        input_file = json.loads(parser.get(q, 'input_file'))
        directory = json.loads(parser.get(q, 'directory'))
        input_type = json.loads(parser.get(q, 'input_type'))
        fields = json.loads(parser.get(q, 'fields'))
        try:
            # If specified, 'sort' should be a list of dictionaries giving
            # the ordering of keys and the direction for each key.
            sort = json.loads(parser.get(q, 'sort'))
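            # For illustration (field names hypothetical), a value such as
            #   [{"patent_date": "desc"}, {"patent_number": "asc"}]
            # sorts by descending date, breaking ties by ascending number.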
            sort_fields, sort_directions = [], []
            for dct in sort:
                for field in dct:
                    # We can only sort by fields that are in the data.
                    if field in fields:
                        sort_fields.append(field)
                        sort_directions.append(dct[field])
            if len(sort_fields) == 0:
                sort_fields = [fields[0]]
                sort_directions = ["asc"]
        except Exception:
            # No usable 'sort' option: default to the first field, ascending.
            sort_fields = [fields[0]]
            sort_directions = ["asc"]
        criteria = {"_and": [json.loads(parser.get(q, option)) for option in
                             parser.options(q) if option.startswith('criteria')]}
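        # An illustrative clause (field name hypothetical) that would be
        # ANDed in here:
        #   criteria1 = {"_gte": {"patent_date": "2010-01-01"}}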
        # Strip the trailing newline so the split does not produce an empty
        # item, and deduplicate the input values.
        item_list = list(set(open(os.path.join(directory, input_file)).read()
                             .rstrip('\n').split('\n')))
        results_found = 0
        # Request the maximum of 10,000 matches per page and keep paging
        # forward as long as each response comes back full.
        per_page = 10000
        for item in item_list:
            count = per_page
            page = 1
            while count == per_page:
                params = {
                    'q': {"_and": [{input_type: item}, criteria]},
                    'f': fields,
                    'o': {"per_page": per_page, "page": page}
                }
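                # For a concrete sense of the payload: with input_type
                # "patent_number" and item "7654321" (both hypothetical),
                # this serializes to
                #   {"q": {"_and": [{"patent_number": "7654321"}, {...}]},
                #    "f": [...], "o": {"per_page": 10000, "page": 1}}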
                r = requests.post(url, data=json.dumps(params))
                page += 1
                count = 0
                if 400 <= r.status_code <= 499:
                    print("Client error when querying for value {}".format(item))
                elif r.status_code >= 500:
                    print("Server error when querying for value {}. "
                          "You may be exceeding the maximum API request size "
                          "(1GB).".format(item))
                else:
                    count = json.loads(r.text)['count']
                    if count != 0:
                        # Save this page of raw results for json_to_csv to merge.
                        outp = open(os.path.join(directory, q + '_' +
                                                 str(results_found) + '.json'), 'w')
                        print(r.text, end='', file=outp)
                        outp.close()
                        results_found += 1
        if results_found == 0:
            print("Query {} returned no results".format(q))
        else:
            # Output a merged CSV of the formatted results.
            json_to_csv.main(directory, q, results_found)
            # Clean the CSV: reorder columns, drop duplicates, sort, then save.
            output_filename = os.path.join(directory, q + '.csv')
            df = pd.read_csv(output_filename, dtype=object, encoding='Latin-1')
            df = df[fields].drop_duplicates().sort_values(
                by=sort_fields,
                ascending=[direction != 'desc' for direction in sort_directions])
            df.to_csv(output_filename, index=False)
            print('({} rows returned)'.format(len(df)))


if __name__ == '__main__':
    if sys.version_info[0] != 3:
        print("Please use Python version 3; you are using version:", sys.version)
        sys.exit(1)
    if len(sys.argv) < 2:
        print("USAGE: python api_wrapper.py config_file")
        sys.exit(1)
    if not os.path.isfile(sys.argv[1]):
        print("File not found:", sys.argv[1])
        sys.exit(1)
    query(sys.argv[1])