-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
91 lines (73 loc) · 3.03 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from lxml import html
import requests
import urllib
import urllib.request
import os
import json
# Upper bound on the number of PDFs fetched in a single run
NUM_TO_DOWNLOAD = 1000

# License URLs under which a paper may be redistributed non-commercially
ALLOW_REDISTRIBUTION = [
    "http://creativecommons.org/licenses/by/4.0/",
    "http://creativecommons.org/licenses/by-sa/4.0/",
    "http://creativecommons.org/licenses/by-nc-sa/4.0/",
    "http://creativecommons.org/publicdomain/zero/1.0/",
]

# License URLs under which redistribution is not permitted
DISALLOW_REDISTRIBUTION = [
    "http://arxiv.org/licenses/nonexclusive-distrib/1.0/",
]
def does_licence_allow_redistribution(url):
    """
    Decide whether the article at an ArXiv abstract URL may be redistributed
    for non-commercial purposes.

    The abstract page is fetched, the first ``div`` with class
    ``abs-license`` is located, and the ``href`` of its first child is taken
    as the license URL.

    Returns a ``(allowed, license_url)`` tuple:

    * ``(True, license_url)`` if the license is in ALLOW_REDISTRIBUTION.
    * ``(False, license_url)`` if it is in DISALLOW_REDISTRIBUTION, or if it
      is not recognised (a message is printed in that case).
    * ``(False, "")`` if the page could not be fetched or parsed (the error
      is printed).

    This function never raises for an unrecognised license; the caller's
    loop is expected to skip the paper and carry on.
    """
    license_url = ""
    try:
        # The timeout (30s is an arbitrary but generous bound) stops a single
        # unresponsive request from stalling the whole scrape.
        page = requests.get(url, timeout=30)
        tree = html.fromstring(page.content)
        abs_div = tree.xpath('//div[@class="abs-license"]')[0]
        abs_a = abs_div.getchildren()[0]
        license_url = abs_a.get('href')
    except Exception as e:
        # Deliberately broad: any network or page-structure problem means
        # "cannot confirm the license", which is treated as not allowed.
        print(e)
        return False, license_url

    if license_url in ALLOW_REDISTRIBUTION:
        return True, license_url
    if license_url not in DISALLOW_REDISTRIBUTION:
        # Previously this path raised BaseException, which an
        # ``except Exception`` handler does not catch, so one unknown
        # license aborted the run. Report it and fall through instead.
        print("Did not recognise license at: " + str(license_url))
    return False, license_url
def scrape_redistributable_pdfs():
    """
    Loop over papers in the Etymo paper metadata JSON file and download the
    PDFs of papers which are redistributable. Starts with the newest entries
    (end of metadata) and stops after NUM_TO_DOWNLOAD successful downloads or
    when the metadata is exhausted.

    For each redistributable paper a folder ``papers/<paper_id>/`` is created
    containing ``fulltext.pdf`` and a short ``license.txt`` naming the
    license URL and the source link.

    A paper whose PDF cannot be downloaded is reported and skipped; it does
    not count towards the total and does not stop the run.
    """
    # Open the Etymo metadata json file; the context manager closes the
    # handle, and JSON interchange files are UTF-8 regardless of locale.
    filename = 'OpenData/etymo-10k/papers.json'
    with open(filename, encoding="utf-8") as metadata_file:
        papers = json.load(metadata_file)

    os.makedirs("papers", mode=0o777, exist_ok=True)

    num_downloaded = 0
    # Walk the metadata from the last entry back to the first
    for i in range(len(papers) - 1, -1, -1):
        if num_downloaded >= NUM_TO_DOWNLOAD:
            break
        paper = papers[i]

        print("")
        print("Paper: ", i)
        distributable, license_url = does_licence_allow_redistribution(paper["link"])
        if not distributable:
            print("Not distributable, skipping")
            continue

        print("Distributable, downloading...", end='', flush=True)
        paper_folder = "papers/" + str(paper["paper_id"]) + "/"
        os.makedirs(paper_folder, mode=0o777, exist_ok=True)

        # Download PDF and place in its own folder. URLError, HTTPError and
        # ContentTooShortError are all OSError subclasses, so one bad link
        # is reported and skipped rather than ending the scrape.
        try:
            urllib.request.urlretrieve(paper["pdf_link"], paper_folder + "fulltext.pdf")
        except OSError as e:
            print(" failed: ", e)
            continue

        # Add a short text file describing the license and source
        with open(paper_folder + 'license.txt', 'w', encoding="utf-8") as f:
            print("License can be found at:", file=f)
            print(license_url, file=f)
            print("", file=f)
            print("Authors can be found at source", file=f)
            print(paper["link"], file=f)

        num_downloaded += 1
        print(" done. Total downloaded: ", num_downloaded)