Skip to content

Commit 89817ff

Browse files
committed
Initial commit
0 parents  commit 89817ff

File tree

6 files changed

+258
-0
lines changed

6 files changed

+258
-0
lines changed

.gitignore

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
2+
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
3+
4+
# User-specific stuff
5+
.idea/**/workspace.xml
6+
.idea/**/tasks.xml
7+
.idea/**/usage.statistics.xml
8+
.idea/**/dictionaries
9+
.idea/**/shelf
10+
11+
# Generated files
12+
.idea/**/contentModel.xml
13+
14+
# Sensitive or high-churn files
15+
.idea/**/dataSources/
16+
.idea/**/dataSources.ids
17+
.idea/**/dataSources.local.xml
18+
.idea/**/sqlDataSources.xml
19+
.idea/**/dynamic.xml
20+
.idea/**/uiDesigner.xml
21+
.idea/**/dbnavigator.xml
22+
23+
# Gradle
24+
.idea/**/gradle.xml
25+
.idea/**/libraries
26+
27+
# Gradle and Maven with auto-import
28+
# When using Gradle or Maven with auto-import, you should exclude module files,
29+
# since they will be recreated, and may cause churn. Uncomment if using
30+
# auto-import.
31+
# .idea/modules.xml
32+
# .idea/*.iml
33+
# .idea/modules
34+
35+
# CMake
36+
cmake-build-*/
37+
38+
# Mongo Explorer plugin
39+
.idea/**/mongoSettings.xml
40+
41+
# File-based project format
42+
*.iws
43+
44+
# IntelliJ
45+
out/
46+
47+
# mpeltonen/sbt-idea plugin
48+
.idea_modules/
49+
50+
# JIRA plugin
51+
atlassian-ide-plugin.xml
52+
53+
# Cursive Clojure plugin
54+
.idea/replstate.xml
55+
56+
# Crashlytics plugin (for Android Studio and IntelliJ)
57+
com_crashlytics_export_strings.xml
58+
crashlytics.properties
59+
crashlytics-build.properties
60+
fabric.properties
61+
62+
# Editor-based Rest Client
63+
.idea/httpRequests
64+
65+
# Android studio 3.1+ serialized cache file
66+
.idea/caches/build_file_checksums.ser
67+
68+
__pycache__
69+
venv/
70+
.idea/

cluster.py

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import queue
2+
from pathlib import Path
3+
from resources import constant
4+
5+
6+
class Entity:
    """A single clustered Wikipedia entity, identified by its page name.

    The page name is expected to end in the numeric Wikipedia page id,
    separated by underscores (e.g. "Barack_Obama_534366").
    """

    def __init__(self, name):
        # BUG FIX: names arrive straight from file lines and usually carry a
        # trailing newline, which corrupted the extracted page id. Strip it.
        self.name = name.strip()
        # The Wikipedia page id is encoded as the last "_"-separated token.
        self.wikipedia_page_id = self.name.split("_")[-1]
        # Resolved later by Cluster.fetch_wikidata_ids(); None until then.
        self.wikidata_id = None
12+
13+
14+
class Cluster:
    """A named group of entities parsed from a cluster file.

    Attributes:
        name     -- cluster label taken from its headline (last token)
        id       -- sequential integer assigned in file order
        entities -- list of Entity objects belonging to the cluster
    """

    def __init__(self):
        self.name = ""
        self.id = 0
        self.entities = []

    @staticmethod
    def create_clusters(cluster_file):
        """Parse *cluster_file* and return a queue.Queue of Cluster objects.

        A line starting with constant.CLUSTER_HEADLINE opens a new cluster;
        every following non-blank line is treated as one entity name.

        Raises ValueError if an entity line appears before any headline
        (the original crashed with AttributeError on None in that case).
        """
        input_file = Path(cluster_file)
        cluster_queue = queue.Queue()
        cluster_id = 0
        last_cluster = None
        with input_file.open() as file:
            for raw_line in file:
                # BUG FIX: strip the trailing newline / stray whitespace so
                # entity names and cluster names are not stored with '\n'.
                line = raw_line.strip()
                if not line:
                    continue  # skip blank lines instead of creating empty entities
                if line.startswith(constant.CLUSTER_HEADLINE):
                    last_cluster = Cluster._create_cluster(cluster_id, line)
                    cluster_queue.put(last_cluster)
                    cluster_id += 1
                    continue
                if last_cluster is None:
                    raise ValueError("Entity line found before the first cluster headline")
                last_cluster.entities.append(Entity(line))

        return cluster_queue

    @staticmethod
    def _create_cluster(cluster_id, line):
        """Build an empty Cluster from a headline; its name is the last token."""
        cluster = Cluster()
        cluster.id = cluster_id
        cluster.name = line.split(" ")[-1]
        return cluster

    def fetch_wikidata_ids(self, mysql_connection):
        """Resolve each entity's Wikidata id via the page_props table.

        Entities with no 'wikibase_item' row keep wikidata_id = None
        (the original raised KeyError for them).
        """
        if not self.entities:
            return  # nothing to resolve; also avoids an invalid empty IN () clause

        db_cursor = mysql_connection.cursor()
        try:
            # BUG FIX (security/robustness): bind the page ids as query
            # parameters instead of interpolating them into the SQL string.
            placeholders = ",".join(["%s"] * len(self.entities))
            db_cursor.execute(
                "SELECT pp_page, pp_value "
                "FROM page_props "
                "WHERE pp_propname = 'wikibase_item' "
                f"AND pp_page IN ({placeholders});",
                tuple(entity.wikipedia_page_id for entity in self.entities),
            )
            # BUG FIX: MySQL cursors provide fetchall(), not fetch_all().
            records = db_cursor.fetchall()
        finally:
            # Always release the cursor, even when the query fails.
            db_cursor.close()

        # pp_page is typically returned as an int while wikipedia_page_id is a
        # string slice of the entity name, so normalise both sides to str.
        wikipedia_to_wikidata = {str(page): value for page, value in records}
        for entity in self.entities:
            # .get keeps wikidata_id = None when the page has no wikibase item.
            entity.wikidata_id = wikipedia_to_wikidata.get(str(entity.wikipedia_page_id))
61+
62+

cluster_annotator.py

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import threading
2+
from collections import Counter
3+
import queue
4+
5+
from resources import constant
6+
7+
8+
class ClusterAnnotator(threading.Thread):
    """Worker thread that drains clusters from a shared queue and annotates them."""

    def __init__(self, thread_id, work_queue):
        threading.Thread.__init__(self)

        self._thread_id = thread_id
        self._work_queue = work_queue

        # Aggregated annotation results for every cluster handled by this worker.
        self._cluster_annotations = {"relations": Counter()}

        # Each worker opens its own connection (NOTE(review): presumably because
        # a single connection is not safe to share across threads — confirm).
        self._mysql_connection = constant.create_mysql_connection()

    def run(self):
        # BUG FIX: the original tested the bound method itself
        # (`while self._analyze_cluster:`), which is always truthy and spun
        # forever without doing any work. It must be called.
        while self._analyze_cluster():
            pass

    def _analyze_cluster(self):
        """Process one cluster from the queue; return False once it is empty."""
        try:
            # Keep the try body minimal: only the dequeue can raise queue.Empty.
            cluster = self._work_queue.get_nowait()
        except queue.Empty:
            return False
        cluster.fetch_wikidata_ids(self._mysql_connection)
        for entity in cluster.entities:
            self._analyze_entity(entity)
        return True

    def _analyze_entity(self, entity):
        # TODO: entity-level annotation not implemented yet.
        pass

    def _analyze_relations(self, entity):
        # TODO: relation extraction not implemented yet.
        pass

main.py

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import argparse
import logging
import queue

from cluster import Cluster
from cluster_annotator import ClusterAnnotator
from util.filesystem_validators import AccessibleDirectory, AccessibleTextFile
8+
9+
# Shared state for the worker pool: spawned threads and the cluster queue.
thread_pool = []
working_queue = queue.Queue()


def main():
    """Entry point: parse CLI args, build the cluster queue, run the workers."""
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    args = _initialize_parser().parse_args()

    # Replace the placeholder queue with the clusters parsed from the input file.
    global working_queue
    working_queue = Cluster.create_clusters(args.input)
    _initialize_threads(args.workers)

    # Block until every worker has drained the queue.
    for thread in thread_pool:
        thread.join()
25+
26+
27+
def _initialize_parser():
    """Build the CLI argument parser.

    --input:   file containing the clustered entities (validated as readable file)
    --output:  directory for the enriched clusters (validated as readable directory)
    --workers: number of worker threads to start (int, default 16)
    """
    general_parser = argparse.ArgumentParser(
        description='Enrich clusters with further information by utilizing external sources')
    # BUG FIX: --input is a file, not a directory — validate with AccessibleTextFile.
    general_parser.add_argument("--input", help='File containing clustered entities', action=AccessibleTextFile,
                                required=True)
    general_parser.add_argument("--output", help='Desired location for enriched clusters', action=AccessibleDirectory,
                                required=True)
    # BUG FIX: --workers is a count, not a path; the directory-validating action
    # rejected any numeric value. Parse as int. Dropping required=True lets the
    # default of 16 actually apply (required + default was contradictory).
    general_parser.add_argument("--workers", help='Number of workers to start in parallel', type=int,
                                default=16)
    return general_parser
37+
38+
39+
def _initialize_threads(number_of_workers):
    """Start *number_of_workers* ClusterAnnotator threads on the shared queue."""
    for worker_id in range(number_of_workers):
        # BUG FIX: pass the queue object itself, not its bound .get method —
        # ClusterAnnotator calls get_nowait() on whatever it receives, so the
        # bound method crashed every worker with AttributeError.
        _thread = ClusterAnnotator(worker_id, working_queue)
        _thread.start()
        thread_pool.append(_thread)


if __name__ == "__main__":
    main()

resources/constant.py

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Marker prefix that opens a new cluster in the input file.
CLUSTER_HEADLINE = "[[CLUSTER"


def create_mysql_connection():
    """Open and return a fresh connection to the local MySQL database."""
    # Deferred import so this module can be loaded without the driver installed.
    import mysql.connector

    # NOTE(review): credentials are hard-coded; consider moving them to
    # environment variables or a config file before deploying.
    connection_settings = {
        "host": "localhost",
        "user": "root",
        "passwd": "toor",
        "database": "mpss2019",
    }
    return mysql.connector.connect(**connection_settings)

util/filesystem_validators.py

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import argparse
2+
import os
3+
4+
5+
class AccessibleTextFile(argparse.Action):
    """argparse action validating its value as an existing, readable file.

    Stores the absolute, user-expanded path in the namespace; raises
    argparse.ArgumentError when the path is missing or unreadable.
    """

    def __call__(self, parser, parser_namespace, values, option_string=None):
        resolved = os.path.abspath(os.path.expanduser(values))

        if not os.path.isfile(resolved):
            raise argparse.ArgumentError(self, "{0} is not a valid file".format(resolved))
        if not os.access(resolved, os.R_OK):
            raise argparse.ArgumentError(self, "Permission denied to read from {0}".format(resolved))

        setattr(parser_namespace, self.dest, resolved)
17+
18+
19+
class AccessibleDirectory(argparse.Action):
    """argparse action validating its value as an existing, readable directory.

    Stores the absolute, user-expanded path in the namespace; raises
    argparse.ArgumentError when the path is missing or unreadable.
    """

    def __call__(self, parser, parser_namespace, values, option_string=None):
        resolved = os.path.abspath(os.path.expanduser(values))

        if not os.path.isdir(resolved):
            raise argparse.ArgumentError(self, "{0} is not a valid directory".format(resolved))
        if not os.access(resolved, os.R_OK):
            raise argparse.ArgumentError(self, "Permission denied to read from {0}".format(resolved))

        setattr(parser_namespace, self.dest, resolved)

0 commit comments

Comments
 (0)