-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_wiki_image_urls.py
158 lines (128 loc) · 5.17 KB
/
get_wiki_image_urls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# Copyright 2020, University of Freiburg
# Author: Natalie Prange <[email protected]>
import argparse
import logging
import os
import time
from collections import defaultdict
from operator import itemgetter
from urllib import parse

import requests
# Configure the root logger (timestamped messages at INFO level) and create
# the module-level logger used by all functions in this file.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%H:%M:%S",
    format="%(asctime)s: %(message)s",
)
logger = logging.getLogger(__name__)
def read_qid_title_list(input_file):
    """Read (QID, Wikipedia page title) pairs from a tab-separated file.

    The first column of each line is taken as the QID and the second column
    as the page title. Any further columns are ignored. Blank lines are
    skipped.

    Arguments:
    input_file - path to the mappings file

    Returns:
    A list of (qid, title) tuples in the order they appear in the file.

    Raises:
    ValueError - if a non-blank line has fewer than two columns
    """
    logger.info("Read wikipedia mapping file %s" % input_file)
    pairs = []
    with open(input_file, "r", encoding="utf8") as file:
        for line_no, line in enumerate(file, start=1):
            # Strip only the line terminator so that a title in the last
            # column does not end up with a trailing newline.
            line = line.rstrip("\r\n")
            if not line.strip():
                # A blank (e.g. trailing) line must not abort the whole read.
                continue
            columns = line.split("\t")
            if len(columns) < 2:
                raise ValueError(
                    "%s, line %d: expected at least 2 tab-separated columns,"
                    " got %d" % (input_file, line_no, len(columns)))
            pairs.append((columns[0], columns[1]))
    return pairs
def _map_pages_to_qids(payload, batch):
    """Map the pages of a parsed API response back to the QIDs in batch.

    Arguments:
    payload - the decoded JSON response of the Wikipedia API
    batch - mapping from Wikipedia page title to list of QIDs

    Returns:
    A tuple (urls, errors): urls is a list of (qid, thumbnail_url) tuples
    (thumbnail_url is "" if the page has no thumbnail), errors is a list of
    titles that could not be matched between query and response.
    """
    query = payload.get("query", {}) if isinstance(payload, dict) else {}

    # The API reports the titles it rewrote before the lookup in the
    # "normalized" list. Result pages carry the rewritten title, so remember
    # which query titles each of them came from.
    aliases = {}
    for entry in query.get("normalized", []):
        aliases.setdefault(entry.get("to"), []).append(entry.get("from"))

    # Work on a shallow copy so the caller's batch is left untouched.
    pending = dict(batch)
    urls = []
    errors = []
    for page in query.get("pages", []):
        # Get thumbnail url
        thumbnail = page.get("thumbnail", {}).get("source", "")
        # Retrieve QIDs for the result title (or the query titles it was
        # normalized from) and add QID + image url to the result
        title = page["title"]
        matched = False
        for candidate in [title] + aliases.get(title, []):
            qids = pending.pop(candidate, None)
            if qids is None:
                continue
            matched = True
            urls.extend((qid, thumbnail) for qid in qids)
        if not matched:
            logger.warning("Result title could not be mapped to query: %s"
                           % title)
            errors.append(title)

    if pending:
        logger.warning("Not all query titles could be mapped to result: %s"
                       % (pending.items(),))
        errors += pending.keys()
    return urls, errors


def retrieve_wiki_image_url(batch, img_size=500, timeout=30):
    """Retrieve an image url for the given titles using the Wikipedia API.

    Connection problems are retried indefinitely with a linearly growing
    pause between attempts. The given batch is not modified.

    Arguments:
    batch - mapping from Wikipedia page title to list of QIDs
    img_size - requested thumbnail size, sent as "pithumbsize"
    timeout - seconds to wait for the server before the attempt counts as
              failed and is retried

    Returns:
    A tuple (urls, errors): urls is a list of (qid, thumbnail_url) tuples
    (thumbnail_url is "" if the page has no thumbnail), errors is a list of
    titles for which no result could be obtained or mapped.
    """
    # Nothing to look up, so do not bother the API with an empty query.
    if not batch:
        return [], []

    # Prepare request
    titles_str = "|".join(batch.keys())
    host = "https://en.wikipedia.org/w/api.php"
    data = {"action": "query", "prop": "pageimages", "titles": titles_str,
            "pithumbsize": img_size, "format": "json", "formatversion": 2}

    # Try to send request to API
    counter = 0
    while True:
        counter += 1
        try:
            # Without a timeout a stalled connection would block forever
            # instead of ending up in the retry branch below.
            response = requests.get(host, params=data, timeout=timeout)
            break
        except requests.exceptions.RequestException:
            time.sleep(2 * counter)
            if counter % 100 == 1:
                logger.warning("Cannot reach host. Trial no %d" % counter)

    # Process server response. On failure report every title of the batch as
    # an error so the caller can record it instead of silently losing it.
    if not response.ok:
        logger.warning("Request failed with status %d for titles: %s"
                       % (response.status_code, titles_str))
        return [], list(batch)
    try:
        payload = response.json()
    except ValueError:
        logger.warning("Response is not valid JSON for titles: %s"
                       % titles_str)
        return [], list(batch)
    return _map_pages_to_qids(payload, batch)
def _write_batch(batch, mapping_file, error_file):
    """Query the image urls for one batch and write results and errors.

    Arguments:
    batch - mapping from Wikipedia page title to list of QIDs
    mapping_file - open file to which "QID<TAB>url" lines are written
    error_file - open file to which one failed title per line is written
    """
    urls, errors = retrieve_wiki_image_url(batch)
    # Log errors in separate error file
    for e in errors:
        error_file.write("%s\n" % e)
    # Write each QID and url in the batch to the output file
    for q, url in urls:
        mapping_file.write("%s\t%s\n" % (q, url))


def write_qid_to_image_mapping(lst, outfile, batch_size=20):
    """For each QID-title pair in the given list, retrieve the Wikipedia image
    url and write the QID to url mapping to the output file.

    Titles for which no result could be obtained are written, one per line,
    to an error file next to the output file. Its name is the output file
    name with the extension replaced by ".err" (or with ".err" appended if
    the output file has no extension).

    Arguments:
    lst - a list containing tuples (QID, title)
    outfile - the output file for the mapping
    batch_size - number of distinct titles sent to the API per request
    """
    # splitext only looks at the last path component, so a dot in a directory
    # name or a missing extension cannot produce a bogus error file path.
    error_file_name = os.path.splitext(outfile)[0] + ".err"
    if error_file_name == outfile:
        # Never let the error log overwrite the mapping itself.
        error_file_name = outfile + ".err"

    logger.info("Query Wikipedia API")
    logger.info("Write resulting qid to image url mapping to %s" % outfile)
    # Both files are managed by the with-statement so they are flushed and
    # closed even if a request or write fails half-way.
    with open(error_file_name, "w", encoding="utf8") as error_file, \
            open(outfile, "w", encoding="utf8") as file:
        start = time.time()
        batch = defaultdict(list)
        counter = 0
        for qid, title in lst:
            batch[title].append(qid)
            # If batch is full, retrieve Wikipedia image url for each element.
            # ">=" keeps batching functional for batch_size values below 1.
            if len(batch) >= batch_size:
                _write_batch(batch, file, error_file)
                # Reset batch
                batch = defaultdict(list)
            counter += 1
            if counter % 1000 == 0:
                logger.info("Processed %d qids in %fs" %
                            (counter, time.time() - start))
        # Retrieve urls for batch remainder
        if batch:
            _write_batch(batch, file, error_file)
    logger.info("Done.")
if __name__ == "__main__":
    # Input file used when no -i/--input option is given on the command line.
    default_infile = "/nfs/students/natalie-prange/wikidata_mappings/qid_to_wikipedia_info.tsv"

    # Command line: one positional output path, one optional input path.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "output",
        help="File to which to write the results.",
    )
    cli.add_argument(
        "-i", "--input",
        default=default_infile,
        help="QID to (title, image, abstract) mapping file.",
    )
    options = cli.parse_args()

    # Read the (QID, title) pairs, then query the API and write the mapping.
    pairs = read_qid_title_list(options.input)
    write_qid_to_image_mapping(pairs, options.output)