-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy patharxiv-search.py
216 lines (176 loc) · 7.75 KB
/
arxiv-search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
import arxiv
import time
from datetime import datetime, timedelta
import textwrap
import tkinter as tk
from tkinter import ttk
import requests
from threading import Thread, Event
from config import restrict_to_most_recent, max_results, categories, search_terms_exclude, search_terms_include
import os
import re
import csv
if not os.path.exists("pdfs"):
os.makedirs("pdfs")
# Most Recent Days Checker. Sometimes arxiv posts papers w multiple dates on one day so we really just want to make sure we're checking whatever came out since we last queried
with open('most_recent_day_searched.txt', 'r') as file:
most_recent = file.read()
print(f'most recent day searched: {most_recent}')
try:
most_recent_check = datetime.strptime(most_recent, '%Y-%m-%d').date()
except ValueError:
most_recent_check = datetime.now().date() - timedelta(days=2)
with open('most_recent_day_searched.txt', 'w') as file:
file.write(most_recent_check.strftime('%Y-%m-%d'))
print("No date listed in most_recent_day_searched.txt. Two days ago's date inserted, but arXiv frequently publishes papers with older dates on a given day (yes it's confusing) so you may want to edit the text file to an even earlier date. If today is a weekend then the script may return nothing.")
# search terms
include = 'all:"' + '" OR all:"'.join(search_terms_include) + '"' if len(search_terms_include) > 0 else ""
exclude = 'all:"' + '" OR all:"'.join(search_terms_exclude) + '"' if len(search_terms_exclude) > 0 else ""
print("\nIncluded Terms:\n", include)
print("\nExcluded Terms:\n", exclude)
if len(search_terms_include) > 0 & len(search_terms_exclude) > 0:
query = f'({categories}) AND ({include}) ANDNOT ({exclude})'
elif len(search_terms_include) > 0:
query = f'({categories}) AND ({include})'
elif len(search_terms_exclude) > 0:
query = f'({categories}) ANDNOT ({exclude})'
else:
query = categories
print("\nQuery:\n", query)
client = arxiv.Client(
page_size = 1,
delay_seconds = 5.0,
num_retries = 100
)
# Define the search parameters
search = arxiv.Search(
query = query,
max_results=max_results,
sort_by=arxiv.SortCriterion.SubmittedDate, # Sort by submission date
sort_order=arxiv.SortOrder.Descending # In descending order, so the most recent articles come first
)
results = client.results(search)
papers = []
def safe_iterator(iterable):
it = iter(iterable) # Get an iterator object from the iterable
while True:
try:
yield next(it) # Yield the next item from the iterator
except StopIteration:
print("got to StopIteration")
break # StopIteration means there are no more items
except Exception as e:
# Handle the exception or simply pass to skip to the next item
pass
i = 0
for result in safe_iterator(results):
if i == 0:
new_most_recent = result.published.date()#.strftime('%Y-%m-%d')
if restrict_to_most_recent & (result.published.date() <= most_recent_check):
print(f"GOT TO MOST RECENT DATE RESET: {result.published.date()} <= {most_recent_check}")
# Write new_most_recent to .txt file
# we only want to do that here bc if restrict_to_most_recent=False then we don't want to change the value
with open('most_recent_day_searched.txt', 'w') as file:
file.write(new_most_recent.strftime('%Y-%m-%d'))
# In case you need to run it back
print(f"If you need to run back the most recent check, then edit the date in most_recent_day_searched.txt to be '{most_recent_check.strftime('%Y-%m-%d')}'.")
# bc we've hit files we likely already downloaded before we'll end here
break
try:
papers.append({"i": i, "title": result.title, "url": result.pdf_url, "published_date": result.published.date()})
print(f'{result.title}\nPublish date: {result.published.date()}, PDF URL: {result.pdf_url}')
print(datetime.now().time())
#print(result.categories)
#print('Abstract: ', textwrap.fill(result.summary, width=220))
#print('DOI ', result.doi)
print()
except UnexpectedEmptyPageError:
continue
# Sleep for 5 seconds to avoid overloading the server
time.sleep(5)
i += 1
print(f"Total papers: {i}")
# Initialize CSV files with headers if they don't exist
csv_file_seen = "papers_seen.csv"
if not os.path.isfile(csv_file_seen):
with open(csv_file_seen, mode='w', newline='') as file:
writer = csv.writer(file)
writer.writerow(["Title", "ArXiv Link", "Paper Date", "Date Added"])
csv_file_downloaded = "papers_downloaded.csv"
if not os.path.isfile(csv_file_downloaded):
with open(csv_file_downloaded, mode='w', newline='') as file:
writer = csv.writer(file)
writer.writerow(["Title", "ArXiv Link", "Paper Date", "Date Added"])
# Function to download PDF from arXiv
def download_pdf(url, filename, event):
response = requests.get(url)
with open(filename, "wb") as f:
f.write(response.content)
event.set()
def on_button_click(url, filename):
arxiv_id = re.sub(r'v\d+$', '', url.split('/')[-1])
arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
line = f'{filename[5:-4]} | {arxiv_url}'
# Duplicate check:
try:
with open('links.txt', 'r') as file:
existing_lines = file.readlines()
if any(l.strip() == line for l in existing_lines):
print(f'Line already exists in links.txt - Skipping')
return # Skip if the URL already exists
except FileNotFoundError:
pass # Ignore if the file doesn't exist yet
# Write the title & URL to a text file
with open('links.txt', 'a') as file:
file.write(line + '\n')
# Write to papers_downloaded.csv
today_date = datetime.now().strftime('%Y-%m-%d')
with open(csv_file_downloaded, mode='a', newline='') as file:
writer = csv.writer(file)
writer.writerow([filename[5:-4], arxiv_url, new_most_recent, today_date])
# Download the PDF in a new thread
event = Event()
thread = Thread(target=download_pdf, args=(url, filename, event))
thread.daemon = True # Ensure the thread exits when the main program exits
thread.start()
def check_thread():
if event.is_set():
button.config(state=tk.NORMAL)
else:
root.after(100, check_thread)
button.config(state=tk.DISABLED)
check_thread()
# Create the main window
root = tk.Tk()
root.title("arXiv Paper Downloader")
# Create a canvas and a vertical scrollbar
canvas = tk.Canvas(root)
scrollbar = ttk.Scrollbar(root, orient="vertical", command=canvas.yview)
frame = ttk.Frame(canvas)
# Configure canvas and add the frame to it
canvas.create_window((0, 0), window=frame, anchor="nw")
canvas.configure(yscrollcommand=scrollbar.set)
today_date = datetime.now().strftime('%Y-%m-%d')
for i, paper in enumerate(papers):
# Write to CSV
arxiv_id = re.sub(r'v\d+$', '', paper['url'].split('/')[-1])
arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
with open(csv_file_seen, mode='a', newline='') as file:
writer = csv.writer(file)
writer.writerow([paper['title'].replace(":", " -"), arxiv_url, paper['published_date'], today_date])
button = ttk.Button(
frame,
text=f"{paper['i']}: {paper['title']}",
command=lambda url=paper['url'],
fn=f"pdfs/{paper['title']}.pdf".replace(":", " -"): on_button_click(url, fn))
button.grid(row=i, column=1)
# Update frame size and set canvas scroll region
frame.update_idletasks()
canvas.config(scrollregion=canvas.bbox("all"))
# Place canvas and scrollbar in the GUI
canvas.grid(row=0, column=0, sticky="nsew")
scrollbar.grid(row=0, column=1, sticky="ns")
# Enable resizing
root.grid_rowconfigure(0, weight=1)
root.grid_columnconfigure(0, weight=1)
root.mainloop()