
Commit eacf344

Committed May 13, 2019

replace MySQL with wiki_index implementation in order to access the wikipedia -> wikidata mapping

1 parent 0be1201

7 files changed: +28 -40 lines

.gitmodules (+3)

```diff
@@ -1,3 +1,6 @@
 [submodule "wikidata_endpoint"]
 	path = wikidata_endpoint
 	url = https://github.com/mpss2019fn1/wikidata_endpoint.git
+[submodule "wiki_index"]
+	path = wiki_index
+	url = https://github.com/mpss2019fn1/wiki_index.git
```
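Because wiki_index is pulled in as a git submodule, existing checkouts need `git submodule update --init wiki_index` (or a fresh clone with `--recurse-submodules`) before the new import in main.py resolves.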

cluster.py (+3 -22)

```diff
@@ -50,34 +50,15 @@ def _create_cluster(cluster_id, line):
         cluster.name = line.split(" ")[-1]
         return cluster
 
-    def fetch_wikidata_ids(self, mysql_connection):
-        db_cursor = mysql_connection.cursor()
-        entity_ids_list = ",".join([str(x.wikipedia_page_id) for x in self.entities])
+    def fetch_wikidata_ids(self, wikipedia_wikidata_mapping):
         start_time = time.perf_counter()
-        db_cursor.execute(f"SELECT pp_page, pp_value "
-                          f"FROM page_props "
-                          f"WHERE pp_propname LIKE 'wikibase_item' "
-                          f"AND pp_page IN ({entity_ids_list});")
-        end_time = time.perf_counter()
-        logging.info(f"MySQL query execution took {end_time - start_time} seconds for {len(self.entities)} entities")
-
-        start_time = time.perf_counter()
-        mapping = {}
-        for record in db_cursor:
-            if not record[1]:
-                print(record[0])
-                continue
-
-            mapping[record[0]] = record[1]
-
-        db_cursor.close()
 
         for entity in self.entities:
-            if entity.wikipedia_page_id not in mapping:
+            if entity.wikipedia_page_id not in wikipedia_wikidata_mapping:
                 # entity is not present in wikidata
                 self.entities.remove(entity)
                 continue
 
-            entity.wikidata_id = mapping[entity.wikipedia_page_id]
+            entity.wikidata_id = wikipedia_wikidata_mapping.wikidata_id(entity.wikipedia_page_id)
         end_time = time.perf_counter()
         logging.info(f"Mapping WikiData ids took {end_time - start_time} seconds for {len(self.entities)} entities")
```

cluster_annotator.py (+3 -3)

```diff
@@ -10,7 +10,7 @@
 
 class ClusterAnnotator(threading.Thread):
 
-    def __init__(self, thread_id, work_queue, wikidata_endpoint, output_directory):
+    def __init__(self, thread_id, work_queue, wikidata_endpoint, output_directory, wikipedia_wikidata_mapping):
         threading.Thread.__init__(self)
 
         self._thread_id = thread_id
@@ -19,9 +19,9 @@ def __init__(self, thread_id, work_queue, wikidata_endpoint, output_directory):
         self._cluster_annotations = {"relations": Counter()}
 
         self._wikidata_endpoint = wikidata_endpoint
-        self._mysql_connection = constant.create_mysql_connection()
         self._chunk_size = constant.STANDARD_CHUNK_SIZE
         self._output_directory = output_directory
+        self._wikipedia_wikidata_mapping = wikipedia_wikidata_mapping
 
     def run(self):
         while self._analyze_cluster():
@@ -31,7 +31,7 @@ def _analyze_cluster(self):
         try:
             cluster = self._work_queue.get_nowait()
             logging.info(f"Start analyzing cluster {cluster.name}")
-            cluster.fetch_wikidata_ids(self._mysql_connection)
+            cluster.fetch_wikidata_ids(self._wikipedia_wikidata_mapping)
             logging.info(f"Finished fetching wikidata ids from MySQL")
             self._analyze_entities(cluster)
             return True
```
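A small follow-up this change leaves open: the retained log line `Finished fetching wikidata ids from MySQL` still mentions MySQL, although the lookup now goes through the in-memory wiki_index mapping.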

main.py (+17 -3)

```diff
@@ -1,10 +1,12 @@
 import argparse
 import logging
 import queue
+import time
 from pathlib import Path
 
 from cluster import Cluster
 from cluster_annotator import ClusterAnnotator
+from wiki_index import InMemoryCsv, WikidataPageProps
 from util.filesystem_validators import AccessibleDirectory, AccessibleTextFile
 from wikidata_endpoint import WikidataEndpointConfiguration, WikidataEndpoint
 
@@ -18,9 +20,14 @@ def main():
 
     args = parser.parse_args()
 
+    start_time = time.perf_counter()
+    wikipedia_wikidata_mapping = _create_wikidata_wikipedia_mapping(args.mapping_file)
+    end_time = time.perf_counter()
+    logging.info(f"Creating in-memory mapping index took {end_time - start_time} seconds")
+
     global working_queue
     working_queue = Cluster.create_clusters(args.input)
-    _initialize_threads(args.workers)
+    _initialize_threads(args.workers, args.output, wikipedia_wikidata_mapping)
 
     for thread in thread_pool:
         thread.join()
@@ -31,17 +38,24 @@ def _initialize_parser():
         description='Enrich clusters with further information by utilizing external sources')
     general_parser.add_argument("--input", help='File containing clustered entities', action=AccessibleTextFile,
                                 required=True)
+    general_parser.add_argument("--mapping_file", help='CSV file containing wikipedia page id to wikidata mappings',
+                                action=AccessibleTextFile,
+                                required=True)
     general_parser.add_argument("--output", help='Desired location for enriched clusters', action=AccessibleDirectory,
                                 required=True)
     general_parser.add_argument("--workers", help='Number of workers to start in parallel', type=int,
                                 required=False, default=16)
     return general_parser
 
 
-def _initialize_threads(number_of_workers, output_directory):
+def _create_wikidata_wikipedia_mapping(mapping_file):
+    return WikidataPageProps.initialize_instance_from_csv(mapping_file)
+
+
+def _initialize_threads(number_of_workers, output_directory, wikipedia_wikidata_mapping):
     endpoint = _create_wikidata_endpoint()
     for x in range(number_of_workers):
-        _thread = ClusterAnnotator(x, working_queue, endpoint, output_directory)
+        _thread = ClusterAnnotator(x, working_queue, endpoint, output_directory, wikipedia_wikidata_mapping)
         _thread.start()
         thread_pool.append(_thread)
```

resources/constant.py (-11)

```diff
@@ -5,17 +5,6 @@
 MAX_NUMBER_OF_VALUES_PER_RELATION = 20
 
 
-def create_mysql_connection():
-    import mysql.connector
-    return mysql.connector.connect(
-        host="localhost",
-        user="root",
-        passwd="toor",
-        database="mpss2019",
-        auth_plugin="caching_sha2_password",
-    )
-
-
 def named_entity_relations_sparql_query(wikidata_ids):
     query = """
     SELECT DISTINCT ?person ?wdLabel ?ps_Label WHERE {
```

wiki_index

Submodule wiki_index added at ba58200
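The submodule's contents are not part of this diff, so the actual implementation of `WikidataPageProps` is not shown. From the call sites above it must offer an `initialize_instance_from_csv` constructor, membership testing on wikipedia page ids, and a `wikidata_id` lookup. A minimal sketch of that assumed interface (the CSV column layout is a guess):

```python
import csv


class WikidataPageProps:
    """In-memory wikipedia page id -> wikidata id index (assumed shape)."""

    def __init__(self, mapping):
        self._mapping = mapping

    @classmethod
    def initialize_instance_from_csv(cls, csv_file):
        # Assumed CSV layout: one "wikipedia_page_id,wikidata_id" pair per row.
        with open(csv_file, newline="") as f:
            mapping = {int(row[0]): row[1] for row in csv.reader(f)}
        return cls(mapping)

    def __contains__(self, wikipedia_page_id):
        # Enables `entity.wikipedia_page_id in mapping`, as used in cluster.py.
        return wikipedia_page_id in self._mapping

    def wikidata_id(self, wikipedia_page_id):
        # Lookup used by Cluster.fetch_wikidata_ids.
        return self._mapping[wikipedia_page_id]
```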
