polish_paper_entries.py
import argparse
import difflib
import os

import arxiv
import requests
import yaml
from dotenv import load_dotenv
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from requests.sessions import Session
from tqdm import tqdm

load_dotenv()
S2_API_KEY = os.environ.get("S2_API_KEY")
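# Environment sketch (of these names, only S2_API_KEY is defined by this file;
# OPENAI_API_KEY is the default variable langchain_openai's ChatOpenAI reads):
#
#   .env example:
#     S2_API_KEY=...
#     OPENAI_API_KEY=...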
def get_paper(
    session: Session, paper_id: str, fields: str = "paperId,title,abstract", **kwargs
) -> dict:
    # Fetch a single paper record from the Semantic Scholar Graph API by ID.
    params = {
        "fields": fields,
        **kwargs,
    }
    headers = {
        "X-API-KEY": S2_API_KEY,
    }
    with session.get(
        f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}",
        params=params,
        headers=headers,
    ) as response:
        response.raise_for_status()
        return response.json()

def search_paper_by_title(session: Session, title: str, limit: int = 1) -> dict:
    # Search Semantic Scholar by title and return the top hit, or None.
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    query_params = {"query": title, "limit": limit, "fields": "paperId,title,abstract"}
    headers = {
        "X-API-KEY": S2_API_KEY,
    }
    with session.get(url, params=query_params, headers=headers) as response:
        response.raise_for_status()
        results = response.json()
        if results.get("data"):
            return results["data"][0]  # Return the first result
        return None

def search_arxiv(title):
    # Query arXiv via the `arxiv` package (the backend that langchain's
    # ArxivAPIWrapper wraps, so it is already a dependency) and return the
    # abstract-page URL of the closest match (https://arxiv.org/abs/<id>),
    # matching how the result is stored as arxiv_url downstream.
    try:
        search = arxiv.Search(query=title, max_results=1)
        result = next(arxiv.Client().results(search), None)
        if result is not None:
            return result.entry_id
    except Exception as e:
        print(f"Error searching for '{title}': {e}")
    return None

def get_paper_info(session, s2_id, title):
    try:
        if s2_id:
            try:
                paper = get_paper(session, s2_id)
                return paper
            except requests.exceptions.RequestException:
                # If s2_id fails, fall through to search by title
                pass
        # Search by title, accepting only near-exact matches
        paper = search_paper_by_title(session, title)
        if (
            paper
            and difflib.SequenceMatcher(
                None, paper["title"].lower(), title.lower()
            ).ratio()
            >= 0.95
        ):
            return paper
        # If no match found
        print(f"No matching paper found for title: {title}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching paper info for {s2_id or title}: {e}")
        return None

def get_categories(title, abstract):
    chat = ChatOpenAI(model="gpt-4o-mini")
    prompt = ChatPromptTemplate.from_template(
        "Given the following paper title and abstract, select up to 3 most relevant categories from this list: "
        "[ foundation models, evaluation, training methods, interpretability, code generation, document understanding, specialized domains, LM agents ]\n"
        "Do not explain. One category per line. Just output the categories.\n\n"
        "Title: {title}\n\n"
        "Abstract: {abstract}\n\n"
        "Categories:"
    )
    response = chat.invoke(prompt.format(title=title, abstract=abstract))
    # Drop blank lines before taking the first three categories
    categories = [cat.strip() for cat in response.content.split("\n") if cat.strip()]
    return categories[:3]

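# For reference, a minimal sketch of the publist entry shape process_file()
# expects (field names are taken from the accesses below; the values here are
# made up):
#
#   - url: https://www.semanticscholar.org/paper/<paperId>
#     title: '"Some Paper Title"'
#     venue: Some Venue
#     year: 2024
#     authors: [Jane Doe, John Doe]
#     project: some-project
#     display: true
#     category: [evaluation]      # optional
#     paper_link: https://...     # optional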
def process_file(file_path):
    with open(file_path, "r") as file:
        data = yaml.safe_load(file)

    updated_data = []
    with requests.Session() as session:
        for entry in tqdm(data, desc=f"Processing {os.path.basename(file_path)}"):
            # Fetch paper info from Semantic Scholar
            s2_id = entry["url"].split("/")[-1] if entry["url"] is not None else None
            paper_info = get_paper_info(session, s2_id, entry["title"])
            if paper_info:
                # Get categories (the abstract can be null in the API response)
                categories = get_categories(
                    paper_info["title"], paper_info["abstract"] or ""
                )
                # Update entry
                updated_entry = {
                    "url": entry["url"]
                    or (
                        f"https://www.semanticscholar.org/paper/{paper_info['paperId']}"
                        if paper_info.get("paperId")
                        else None
                    ),
                    # Embed literal quotes so the title stays quoted in the YAML output
                    "title": f'"{paper_info["title"]}"',
                    "venue": entry["venue"],
                    "year": entry["year"],
                    "authors": (
                        entry["authors"].split(", ")
                        if isinstance(entry["authors"], str)
                        else entry["authors"]
                    ),
                    "project": entry["project"],
                    "display": entry["display"],
                }
                # Add categories if not present in the original entry
                if "category" not in entry:
                    updated_entry["category"] = categories
                else:
                    updated_entry["category"] = entry["category"]
                # Add paper_link if it exists in the original entry
                if "paper_link" in entry:
                    updated_entry["paper_link"] = entry["paper_link"]
                # Add an arXiv URL if the entry does not already point at arXiv
                if entry["url"] is not None and "arxiv.org" not in entry["url"]:
                    arxiv_url = search_arxiv(paper_info["title"])
                    if arxiv_url:
                        updated_entry["arxiv_url"] = arxiv_url
                updated_data.append(updated_entry)
            else:
                # If paper_info is None, keep the original entry
                updated_data.append(entry)
                print(f"Skipping entry with URL: {entry['url']}")

    new_file_path = file_path.replace(".yml", "_updated.yml")
    with open(new_file_path, "w") as file:
        yaml.dump(
            updated_data,
            file,
            sort_keys=False,
            default_flow_style=False,
            indent=2,
            width=float("inf"),
        )
    print(f"Updated file saved as: {new_file_path}")

def main():
    directory = "_data/publications"
    parser = argparse.ArgumentParser()
    parser.add_argument("--year", type=str, help="Specify the year to process")
    args = parser.parse_args()

    if args.year:
        files = [f"{args.year}_publist.yml"]
    else:
        files = [f for f in os.listdir(directory) if f.endswith("_publist.yml")]
    files = sorted(files, reverse=True)
    print(f"Processing the following files: {files}")

    for filename in files:
        file_path = os.path.join(directory, filename)
        process_file(file_path)


if __name__ == "__main__":
    main()
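# Example invocations (assuming the script runs from the repo root so that
# _data/publications resolves):
#   python polish_paper_entries.py --year 2024   # just 2024_publist.yml
#   python polish_paper_entries.py               # every *_publist.yml file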