
Commit eacf344

Committed May 13, 2019

replace MySQL with wiki_index implementation in order to access the wikipedia -> wikidata mapping

1 parent 0be1201

7 files changed: +28 -40 lines

.gitmodules (+3)

```diff
@@ -1,3 +1,6 @@
 [submodule "wikidata_endpoint"]
 	path = wikidata_endpoint
 	url = https://github.com/mpss2019fn1/wikidata_endpoint.git
+[submodule "wiki_index"]
+	path = wiki_index
+	url = https://github.com/mpss2019fn1/wiki_index.git
```
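Because wiki_index is pulled in as a git submodule, existing checkouts need `git submodule update --init wiki_index` (or a fresh clone with `--recurse-submodules`) before the new import in main.py resolves.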

cluster.py (+3 -22)

```diff
@@ -50,34 +50,15 @@ def _create_cluster(cluster_id, line):
         cluster.name = line.split(" ")[-1]
         return cluster
 
-    def fetch_wikidata_ids(self, mysql_connection):
-        db_cursor = mysql_connection.cursor()
-        entity_ids_list = ",".join([str(x.wikipedia_page_id) for x in self.entities])
+    def fetch_wikidata_ids(self, wikipedia_wikidata_mapping):
         start_time = time.perf_counter()
-        db_cursor.execute(f"SELECT pp_page, pp_value "
-                          f"FROM page_props "
-                          f"WHERE pp_propname LIKE 'wikibase_item' "
-                          f"AND pp_page IN ({entity_ids_list});")
-        end_time = time.perf_counter()
-        logging.info(f"MySQL query execution took {end_time - start_time} seconds for {len(self.entities)} entities")
-
-        start_time = time.perf_counter()
-        mapping = {}
-        for record in db_cursor:
-            if not record[1]:
-                print(record[0])
-                continue
-
-            mapping[record[0]] = record[1]
-
-        db_cursor.close()
 
         for entity in self.entities:
-            if entity.wikipedia_page_id not in mapping:
+            if entity.wikipedia_page_id not in wikipedia_wikidata_mapping:
                 # entity is not present in wikidata
                 self.entities.remove(entity)
                 continue
 
-            entity.wikidata_id = mapping[entity.wikipedia_page_id]
+            entity.wikidata_id = wikipedia_wikidata_mapping.wikidata_id(entity.wikipedia_page_id)
         end_time = time.perf_counter()
         logging.info(f"Mapping WikiData ids took {end_time - start_time} seconds for {len(self.entities)} entities")
```

cluster_annotator.py (+3 -3)

```diff
@@ -10,7 +10,7 @@
 
 class ClusterAnnotator(threading.Thread):
 
-    def __init__(self, thread_id, work_queue, wikidata_endpoint, output_directory):
+    def __init__(self, thread_id, work_queue, wikidata_endpoint, output_directory, wikipedia_wikidata_mapping):
         threading.Thread.__init__(self)
 
         self._thread_id = thread_id
@@ -19,9 +19,9 @@ def __init__(self, thread_id, work_queue, wikidata_endpoint, output_directory):
         self._cluster_annotations = {"relations": Counter()}
 
         self._wikidata_endpoint = wikidata_endpoint
-        self._mysql_connection = constant.create_mysql_connection()
         self._chunk_size = constant.STANDARD_CHUNK_SIZE
         self._output_directory = output_directory
+        self._wikipedia_wikidata_mapping = wikipedia_wikidata_mapping
 
     def run(self):
         while self._analyze_cluster():
@@ -31,7 +31,7 @@ def _analyze_cluster(self):
         try:
             cluster = self._work_queue.get_nowait()
             logging.info(f"Start analyzing cluster {cluster.name}")
-            cluster.fetch_wikidata_ids(self._mysql_connection)
+            cluster.fetch_wikidata_ids(self._wikipedia_wikidata_mapping)
             logging.info(f"Finished fetching wikidata ids from MySQL")
             self._analyze_entities(cluster)
             return True
```
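A small follow-up this change leaves open: the retained log line `Finished fetching wikidata ids from MySQL` still mentions MySQL, although the lookup now goes through the in-memory wiki_index mapping.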

main.py (+17 -3)

```diff
@@ -1,10 +1,12 @@
 import argparse
 import logging
 import queue
+import time
 from pathlib import Path
 
 from cluster import Cluster
 from cluster_annotator import ClusterAnnotator
+from wiki_index import InMemoryCsv, WikidataPageProps
 from util.filesystem_validators import AccessibleDirectory, AccessibleTextFile
 from wikidata_endpoint import WikidataEndpointConfiguration, WikidataEndpoint
 
@@ -18,9 +20,14 @@ def main():
 
     args = parser.parse_args()
 
+    start_time = time.perf_counter()
+    wikipedia_wikidata_mapping = _create_wikidata_wikipedia_mapping(args.mapping_file)
+    end_time = time.perf_counter()
+    logging.info(f"Creating in-memory mapping index took {end_time - start_time} seconds")
+
     global working_queue
     working_queue = Cluster.create_clusters(args.input)
-    _initialize_threads(args.workers)
+    _initialize_threads(args.workers, args.output, wikipedia_wikidata_mapping)
 
     for thread in thread_pool:
         thread.join()
@@ -31,17 +38,24 @@ def _initialize_parser():
         description='Enrich clusters with further information by utilizing external sources')
     general_parser.add_argument("--input", help='File containing clustered entities', action=AccessibleTextFile,
                                 required=True)
+    general_parser.add_argument("--mapping_file", help='CSV file containing wikipedia page id to wikidata mappings',
+                                action=AccessibleTextFile,
+                                required=True)
     general_parser.add_argument("--output", help='Desired location for enriched clusters', action=AccessibleDirectory,
                                 required=True)
     general_parser.add_argument("--workers", help='Number of workers to start in parallel', type=int,
                                 required=False, default=16)
     return general_parser
 
 
-def _initialize_threads(number_of_workers, output_directory):
+def _create_wikidata_wikipedia_mapping(mapping_file):
+    return WikidataPageProps.initialize_instance_from_csv(mapping_file)
+
+
+def _initialize_threads(number_of_workers, output_directory, wikipedia_wikidata_mapping):
     endpoint = _create_wikidata_endpoint()
     for x in range(number_of_workers):
-        _thread = ClusterAnnotator(x, working_queue, endpoint, output_directory)
+        _thread = ClusterAnnotator(x, working_queue, endpoint, output_directory, wikipedia_wikidata_mapping)
         _thread.start()
         thread_pool.append(_thread)
```

resources/constant.py (-11)

```diff
@@ -5,17 +5,6 @@
 MAX_NUMBER_OF_VALUES_PER_RELATION = 20
 
 
-def create_mysql_connection():
-    import mysql.connector
-    return mysql.connector.connect(
-        host="localhost",
-        user="root",
-        passwd="toor",
-        database="mpss2019",
-        auth_plugin="caching_sha2_password",
-    )
-
-
 def named_entity_relations_sparql_query(wikidata_ids):
     query = """
     SELECT DISTINCT ?person ?wdLabel ?ps_Label WHERE {
```

wiki_index

Submodule wiki_index added at ba58200
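The submodule's contents are not part of this diff, so the actual implementation of `WikidataPageProps` is not shown. From the call sites above it must offer an `initialize_instance_from_csv` constructor, membership testing on wikipedia page ids, and a `wikidata_id` lookup. A minimal sketch of that assumed interface (the CSV column layout is a guess):

```python
import csv


class WikidataPageProps:
    """In-memory wikipedia page id -> wikidata id index (assumed shape)."""

    def __init__(self, mapping):
        self._mapping = mapping

    @classmethod
    def initialize_instance_from_csv(cls, csv_file):
        # Assumed CSV layout: one "wikipedia_page_id,wikidata_id" pair per row.
        with open(csv_file, newline="") as f:
            mapping = {int(row[0]): row[1] for row in csv.reader(f)}
        return cls(mapping)

    def __contains__(self, wikipedia_page_id):
        # Enables `entity.wikipedia_page_id in mapping`, as used in cluster.py.
        return wikipedia_page_id in self._mapping

    def wikidata_id(self, wikipedia_page_id):
        # Lookup used by Cluster.fetch_wikidata_ids.
        return self._mapping[wikipedia_page_id]
```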
