
Commit 3f8504f

outsourced wikidata endpoint as external dependency (submodule)

1 parent c331904 commit 3f8504f

7 files changed, +43 −37 lines changed

.gitmodules (+3)

@@ -0,0 +1,3 @@
+[submodule "wikidata_endpoint"]
+	path = wikidata_endpoint
+	url = https://github.com/mpss2019fn1/wikidata_endpoint.git
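
With the submodule checked out at the repository root (for example via git submodule update --init), the endpoint code is importable like any in-tree package; the two names below are the ones used later in cluster_annotator.py.

# Sketch: once the submodule is initialized, these imports resolve against
# the checked-out wikidata_endpoint package.
from wikidata_endpoint import WikidataEndpoint, WikidataEndpointConfiguration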

__init__.py

Whitespace-only changes.

cluster.py (+9)

@@ -1,4 +1,6 @@
 import queue
+import time
+import logging
 from pathlib import Path
 from resources import constant
 
@@ -51,10 +53,15 @@ def _create_cluster(cluster_id, line):
     def fetch_wikidata_ids(self, mysql_connection):
         db_cursor = mysql_connection.cursor()
         entity_ids_list = ",".join([str(x.wikipedia_page_id) for x in self.entities])
+        start_time = time.perf_counter()
         db_cursor.execute(f"SELECT pp_page, pp_value "
                           f"FROM page_props "
                           f"WHERE pp_propname LIKE 'wikibase_item' "
                           f"AND pp_page IN ({entity_ids_list});")
+        end_time = time.perf_counter()
+        logging.info(f"MySQL query execution took {end_time - start_time} seconds for {len(self.entities)} entities")
+
+        start_time = time.perf_counter()
         mapping = {}
         for record in db_cursor:
             if not record[1]:
@@ -72,3 +79,5 @@ def fetch_wikidata_ids(self, mysql_connection):
                 continue
 
             entity.wikidata_id = mapping[entity.wikipedia_page_id]
+        end_time = time.perf_counter()
+        logging.info(f"Mapping WikiData ids took {end_time - start_time} seconds for {len(self.entities)} entities")

cluster_annotator.py (+25 −17)

@@ -2,10 +2,11 @@
 from collections import Counter
 import queue
 import logging
-import requests
+from pathlib import Path
 
 from resources import constant
-from sparql_endpoint import SparqlEndpoint
+from wikidata_endpoint import WikidataEndpoint
+from wikidata_endpoint import WikidataEndpointConfiguration
 
 
 class Relation:
@@ -67,7 +68,9 @@ def __init__(self, thread_id, work_queue):
         self._cluster_annotations = {"relations": Counter()}
 
         self._mysql_connection = constant.create_mysql_connection()
-        self._sparql_endpoint = SparqlEndpoint(constant.WIKIDATA_API_URL)
+        self._wikidata_endpoint_config = WikidataEndpointConfiguration(Path("resources/wikidata_endpoint_config.ini"))
+        self._wikidata_endpoint = WikidataEndpoint(self._wikidata_endpoint_config)
+        self._chunk_size = constant.STANDARD_CHUNK_SIZE
 
     def run(self):
         while self._analyze_cluster():
@@ -85,28 +88,33 @@ def _analyze_cluster(self):
         return False
 
     def _analyze_entities(self, cluster):
-        chunk_size = constant.STANDARD_CHUNK_SIZE
         index = 0
         metrics = RelationMetrics(len(cluster.entities))
 
         while index < len(cluster.entities):
-            chunk = cluster.entities[index:index + chunk_size]
+            chunk = cluster.entities[index:index + self._chunk_size]
             query = constant.named_entity_relations_sparql_query(chunk)
 
-            try:
-                logging.info(
-                    f"Executing SPARQL query for batch [{index},{index + len(chunk)}] on Thread #{self._thread_id}")
-                relations = [Relation.from_wikidata_record(record) for record in self._sparql_endpoint.query(query)]
-                logging.info(
-                    f"Finished executing SPARQL query on Thread #{self._thread_id}")
-                ClusterAnnotator._count_relations(relations, metrics)
-                index += len(chunk)
-
-            except requests.exceptions.Timeout:
-                chunk_size //= 2
-                pass
+            logging.info(
+                f"Executing SPARQL query for batch [{index},{index + len(chunk)}] on Thread #{self._thread_id}")
+            with self._wikidata_endpoint.request() as request:
+                relations = [Relation.from_wikidata_record(record) for record in
+                             request.post(query,
+                                          on_timeout=self._on_timeout_wikidata_endpoint,
+                                          on_error=self._on_error_wikidata_endpoint)]
+                index += self._chunk_size if len(relations) > 0 else 0
+                logging.info(
+                    f"Finished executing SPARQL query on Thread #{self._thread_id}")
+                ClusterAnnotator._count_relations(relations, metrics)
 
         ClusterAnnotator._print_relations(cluster, metrics)
 
+    def _on_timeout_wikidata_endpoint(self, request):
+        self._chunk_size = int(self._chunk_size * (3/4))
+
+    def _on_error_wikidata_endpoint(self, request, error):
+        pass
+
     @staticmethod
     def _count_relations(relations, metrics):
         for relation in relations:
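
The failure handling changes shape here: instead of catching requests.exceptions.Timeout and halving a local chunk_size, the endpoint invokes _on_timeout_wikidata_endpoint, which shrinks the shared self._chunk_size to three quarters, and index only advances when the request returned relations, so a timed-out batch is retried with the smaller chunk. Stripped of the endpoint plumbing, the back-off loop works roughly like this (a standalone sketch that assumes a timed-out request yields an empty result list).

def annotate_with_backoff(entities, run_query, chunk_size):
    # Standalone sketch of the adaptive batching in _analyze_entities.
    # Assumption: run_query returns [] when the request times out.
    index = 0
    relations = []
    while index < len(entities):
        chunk = entities[index:index + chunk_size]
        batch = run_query(chunk)
        if batch:
            relations.extend(batch)
            index += chunk_size  # advance only on success
        else:
            # 3/4 back-off as in _on_timeout_wikidata_endpoint, floored at 1 for the sketch
            chunk_size = max(1, int(chunk_size * (3 / 4)))
    return relations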
resources/wikidata_endpoint_config.ini (+5)

@@ -0,0 +1,5 @@
+[REMOTE]
+url = https://query.wikidata.org/sparql
+
+[LIMITING]
+concurrent_requests = 5
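
The WikidataEndpointConfiguration class itself lives in the submodule and is not part of this diff; presumably it parses the two sections above. A minimal configparser sketch of such a reader follows (the class and attribute names here are assumptions; only the file contents come from the commit).

import configparser
from pathlib import Path


class EndpointConfigSketch:
    # Hypothetical stand-in for the submodule's WikidataEndpointConfiguration.
    def __init__(self, path: Path):
        parser = configparser.ConfigParser()
        parser.read(path)
        self.url = parser.get("REMOTE", "url")
        self.concurrent_requests = parser.getint("LIMITING", "concurrent_requests")


# config = EndpointConfigSketch(Path("resources/wikidata_endpoint_config.ini"))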

sparql_endpoint.py (−20)

This file was deleted.

wikidata_endpoint

Submodule wikidata_endpoint added at 467a769
