-
-
Notifications
You must be signed in to change notification settings - Fork 528
/
Copy pathpaper.py
224 lines (204 loc) · 10.2 KB
/
paper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import ast
import re
import tarfile
from contextlib import ExitStack
from functools import cached_property
from tempfile import TemporaryDirectory
from typing import Optional

import arxiv
import requests
import tiktoken
from loguru import logger
from requests.adapters import HTTPAdapter, Retry

from llm import get_llm
class ArxivPaper:
    """Wrap an ``arxiv.Result`` and lazily derive extra metadata from it.

    The plain properties forward fields of the wrapped result. The
    ``cached_property`` members do network, archive or LLM work on first
    access and keep the outcome on the instance afterwards.
    """

    # Prompts are cut to this many tokens (gpt-4o tokenizer) before being sent.
    _MAX_PROMPT_TOKENS = 4000
    # Seconds to wait on each HTTP request so a stalled server cannot hang us.
    _HTTP_TIMEOUT = 10
    # Placeholders (__TITLE__ etc.) are substituted in ``tldr``.
    _TLDR_PROMPT_TEMPLATE = (
        "Given the title, abstract, introduction and the conclusion (if any) of a paper in latex format, generate a one-sentence TLDR summary:\n"
        "\\title{__TITLE__}\n"
        "\\begin{abstract}__ABSTRACT__\\end{abstract}\n"
        "__INTRODUCTION__\n"
        "__CONCLUSION__\n"
    )

    def __init__(self, paper: "arxiv.Result"):
        """Store the wrapped result.

        Args:
            paper: The search result this object decorates.
        """
        self._paper = paper
        # Not computed anywhere in this class; left for outside code to fill in.
        self.score = None

    @property
    def title(self) -> str:
        """Title of the wrapped result."""
        return self._paper.title

    @property
    def summary(self) -> str:
        """Summary (abstract) of the wrapped result."""
        return self._paper.summary

    @property
    def authors(self) -> list[str]:
        """Authors exactly as exposed by the wrapped result.

        NOTE(review): the ``arxiv`` package appears to return author objects
        rather than plain strings here -- confirm before relying on the
        ``list[str]`` annotation.
        """
        return self._paper.authors

    @cached_property
    def arxiv_id(self) -> str:
        """Short arXiv id with any trailing version suffix (``v<digits>``) removed."""
        return re.sub(r'v\d+$', '', self._paper.get_short_id())

    @property
    def pdf_url(self) -> str:
        """PDF URL of the wrapped result."""
        return self._paper.pdf_url

    def _get_json(self, session, url: str) -> Optional[dict]:
        """GET ``url`` and return the decoded JSON object, or ``None`` on any failure."""
        try:
            data = session.get(url, timeout=self._HTTP_TIMEOUT).json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            logger.debug(f'Error when searching {self.arxiv_id}: {e}')
            return None
        # Callers use dict access, so treat any other JSON shape as "no data".
        return data if isinstance(data, dict) else None

    @cached_property
    def code_url(self) -> Optional[str]:
        """URL of the first repository the paperswithcode API lists for this paper.

        Returns:
            The repository URL, or ``None`` when the lookup fails, the paper is
            unknown, or no repository is listed.
        """
        with requests.Session() as session:
            retries = Retry(total=5, backoff_factor=0.1)
            session.mount('https://', HTTPAdapter(max_retries=retries))

            paper_list = self._get_json(
                session,
                f'https://paperswithcode.com/api/v1/papers/?arxiv_id={self.arxiv_id}',
            )
            if paper_list is None or paper_list.get('count', 0) == 0:
                return None
            try:
                paper_id = paper_list['results'][0]['id']
            except (KeyError, IndexError, TypeError) as e:
                logger.debug(f'Error when searching {self.arxiv_id}: {e}')
                return None

            repo_list = self._get_json(
                session,
                f'https://paperswithcode.com/api/v1/papers/{paper_id}/repositories/',
            )
            if repo_list is None or repo_list.get('count', 0) == 0:
                return None
            try:
                return repo_list['results'][0]['url']
            except (KeyError, IndexError, TypeError) as e:
                logger.debug(f'Error when searching {self.arxiv_id}: {e}')
                return None

    def _main_tex_from_bbl(self, tex_files: list[str], bbl_files: list[str]) -> Optional[str]:
        """Pick the main tex file from the archive listing alone, or ``None`` if ambiguous."""
        if not bbl_files:
            if len(tex_files) > 1:
                logger.debug(f"Cannot find main tex file of {self.arxiv_id} from bbl: There are multiple tex files while no bbl file.")
                return None
            return tex_files[0]
        if len(bbl_files) > 1:
            logger.debug(f"Cannot find main tex file of {self.arxiv_id} from bbl: There are multiple bbl files.")
            return None
        # Strip only the extension; ".bbl" elsewhere in the path must survive.
        candidate = f"{bbl_files[0].removesuffix('.bbl')}.tex"
        if candidate not in tex_files:
            logger.debug(f"Cannot find main tex file of {self.arxiv_id} from bbl: The bbl file does not match any tex file.")
            return None
        return candidate

    @staticmethod
    def _clean_tex(content: str) -> str:
        """Strip comments and redundant whitespace from raw LaTeX text."""
        # Remove line comments. A '%' preceded by an odd number of backslashes
        # is an escaped percent sign, not a comment, and is kept.
        content = re.sub(r'(?<!\\)((?:\\\\)*)%.*', r'\1', content)
        content = re.sub(r'\\begin{comment}.*?\\end{comment}', '', content, flags=re.DOTALL)
        content = re.sub(r'\\iffalse.*?\\fi', '', content, flags=re.DOTALL)
        # Collapse runs of newlines
        content = re.sub(r'\n+', '\n', content)
        # Drop LaTeX line breaks (double backslash)
        content = re.sub(r'\\\\', '', content)
        # Collapse runs of three or more blanks
        content = re.sub(r'[ \t\r\f]{3,}', ' ', content)
        return content

    @staticmethod
    def _expand_includes(main_source: str, file_contents: dict) -> str:
        """Inline the files referenced by input/include commands in ``main_source``.

        References whose file is not in ``file_contents`` are replaced by an
        empty string.
        """
        include_files = re.findall(r'\\input\{(.+?)\}', main_source) + re.findall(r'\\include\{(.+?)\}', main_source)
        for f in include_files:
            file_name = f if f.endswith('.tex') else f + '.tex'
            replacement = file_contents.get(file_name, '')
            # Both spellings were collected above, so both must be substituted.
            main_source = main_source.replace(f'\\input{{{f}}}', replacement)
            main_source = main_source.replace(f'\\include{{{f}}}', replacement)
        return main_source

    @cached_property
    def tex(self) -> Optional[dict[str, Optional[str]]]:
        """Download the paper source and return its cleaned tex files.

        Returns:
            ``None`` when the download is not a tar archive or contains no
            ``.tex`` member. Otherwise a dict mapping each readable ``.tex``
            member name to its cleaned text, plus the key ``"all"`` holding the
            main file with its input/include references inlined, or ``None``
            under ``"all"`` when no main file could be determined.
        """
        with ExitStack() as stack:
            tmpdirname = stack.enter_context(TemporaryDirectory())
            file = self._paper.download_source(dirpath=tmpdirname)
            try:
                tar = stack.enter_context(tarfile.open(file))
            except tarfile.ReadError:
                logger.debug(f"Failed to find main tex file of {self.arxiv_id}: Not a tar file.")
                return None

            names = tar.getnames()
            tex_files = [n for n in names if n.endswith('.tex')]
            if not tex_files:
                logger.debug(f"Failed to find main tex file of {self.arxiv_id}: No tex file.")
                return None
            bbl_files = [n for n in names if n.endswith('.bbl')]

            main_tex = self._main_tex_from_bbl(tex_files, bbl_files)
            if main_tex is None:
                logger.debug(f"Trying to choose tex file containing the document block as main tex file of {self.arxiv_id}")

            # Read all tex files
            file_contents: dict[str, Optional[str]] = {}
            for t in tex_files:
                member = tar.extractfile(t)
                if member is None:
                    # extractfile() yields None for members that are not
                    # regular files (e.g. a directory named "x.tex").
                    continue
                with member:
                    content = member.read().decode('utf-8', errors='ignore')
                content = self._clean_tex(content)
                if main_tex is None and re.search(r'\\begin\{document\}', content):
                    main_tex = t
                    logger.debug(f"Choose {t} as main tex file of {self.arxiv_id}")
                file_contents[t] = content

            main_source = file_contents.get(main_tex) if main_tex is not None else None
            if main_source is not None:
                file_contents["all"] = self._expand_includes(main_source, file_contents)
            else:
                logger.debug(f"Failed to find main tex file of {self.arxiv_id}: No tex file containing the document block.")
                file_contents["all"] = None
            return file_contents

    def _tex_source(self) -> Optional[str]:
        """Return the best available full LaTeX text, or ``None`` without tex data.

        Prefers the merged ``"all"`` entry and falls back to concatenating
        every individual file when no main file was found.
        """
        tex = self.tex
        if tex is None:
            return None
        content = tex.get("all")
        if content is None:
            # "all" itself is stored as None in this case; skip it so join()
            # only ever sees strings.
            content = "\n".join(text for text in tex.values() if text is not None)
        return content

    @classmethod
    def _truncate_prompt(cls, prompt: str) -> str:
        """Cut ``prompt`` to at most ``_MAX_PROMPT_TOKENS`` tokens."""
        # Use the gpt-4o tokenizer for estimation
        enc = tiktoken.encoding_for_model("gpt-4o")
        # Paper text may literally contain special-token strings such as
        # "<|endoftext|>"; encode them as ordinary text instead of raising.
        tokens = enc.encode(prompt, disallowed_special=())
        return enc.decode(tokens[:cls._MAX_PROMPT_TOKENS])

    @cached_property
    def tldr(self) -> str:
        """Ask the LLM for a one-sentence summary of the paper.

        The prompt contains the title, the abstract and, when they can be
        located in the LaTeX source, the introduction and conclusion sections.

        Returns:
            Whatever ``llm.generate`` returns (presumably the summary text --
            confirm against the ``llm`` module).
        """
        introduction = ""
        conclusion = ""
        content = self._tex_source()
        if content is not None:
            # Remove citations
            content = re.sub(r'~?\\cite.?\{.*?\}', '', content)
            # Remove figures
            content = re.sub(r'\\begin\{figure\}.*?\\end\{figure\}', '', content, flags=re.DOTALL)
            # Remove tables
            content = re.sub(r'\\begin\{table\}.*?\\end\{table\}', '', content, flags=re.DOTALL)
            # A section ends at the next \section, \end{document},
            # \bibliography, \appendix, or the end of the text.
            match = re.search(r'\\section\{Introduction\}.*?(\\section|\\end\{document\}|\\bibliography|\\appendix|$)', content, flags=re.DOTALL)
            if match:
                introduction = match.group(0)
            match = re.search(r'\\section\{Conclusion\}.*?(\\section|\\end\{document\}|\\bibliography|\\appendix|$)', content, flags=re.DOTALL)
            if match:
                conclusion = match.group(0)

        prompt = self._TLDR_PROMPT_TEMPLATE
        prompt = prompt.replace('__TITLE__', self.title)
        prompt = prompt.replace('__ABSTRACT__', self.summary)
        prompt = prompt.replace('__INTRODUCTION__', introduction)
        prompt = prompt.replace('__CONCLUSION__', conclusion)
        prompt = self._truncate_prompt(prompt)

        llm = get_llm()
        tldr = llm.generate(
            messages=[
                {
                    "role": "system",
                    "content": "You are an assistant who perfectly summarizes scientific paper, and gives the core idea of the paper to the user.",
                },
                {"role": "user", "content": prompt},
            ]
        )
        return tldr

    @staticmethod
    def _parse_affiliations(raw: str) -> list[str]:
        """Extract the first bracketed list literal from ``raw`` as a list of strings.

        Duplicates are dropped while keeping first-seen order.

        Raises:
            ValueError: If no bracketed list is present or it is not a plain literal.
            SyntaxError: If the bracketed text is not valid Python syntax.
            TypeError: If ``raw`` is not text or the list holds unhashable items.
        """
        found = re.search(r'\[.*?\]', raw, flags=re.DOTALL)
        if found is None:
            raise ValueError("no list found in LLM response")
        # SECURITY: the text comes from an LLM and must not be executed.
        # literal_eval accepts only Python literals, unlike eval().
        parsed = ast.literal_eval(found.group(0))
        # dict.fromkeys de-duplicates but, unlike set(), keeps the author order
        # that the prompt asks the model to produce.
        return [str(a) for a in dict.fromkeys(parsed)]

    @cached_property
    def affiliations(self) -> Optional[list[str]]:
        """Ask the LLM to extract the authors' affiliations from the LaTeX source.

        Returns:
            De-duplicated affiliations in the order the model listed them, or
            ``None`` when there is no tex source, no author block is found, or
            the model's answer cannot be parsed.
        """
        content = self._tex_source()
        if content is None:
            return None
        # Search for the author block
        match = re.search(r'\\author.*?\\maketitle', content, flags=re.DOTALL)
        if not match:
            logger.debug(f"Failed to extract affiliations of {self.arxiv_id}: No author information found.")
            return None
        information_region = match.group(0)
        prompt = f"Given the author information of a paper in latex format, extract the affiliations of the authors in a python list format, which is sorted by the author order. If there is no affiliation found, return an empty list '[]'. Following is the author information:\n{information_region}"
        prompt = self._truncate_prompt(prompt)
        llm = get_llm()
        affiliations = llm.generate(
            messages=[
                {
                    "role": "system",
                    "content": "You are an assistant who perfectly extracts affiliations of authors from the author information of a paper. You should return a python list of affiliations sorted by the author order, like ['TsingHua University','Peking University']. If an affiliation is consisted of multi-level affiliations, like 'Department of Computer Science, TsingHua University', you should return the top-level affiliation 'TsingHua University' only. Do not contain duplicated affiliations. If there is no affiliation found, you should return an empty list [ ]. You should only return the final list of affiliations, and do not return any intermediate results.",
                },
                {"role": "user", "content": prompt},
            ]
        )
        try:
            return self._parse_affiliations(affiliations)
        except Exception as e:
            # Best effort: an unparseable model answer means "unknown".
            logger.debug(f"Failed to extract affiliations of {self.arxiv_id}: {e}")
            return None