Skip to content

Commit 89817ff

Browse files
committed
Initial commit
0 parents  commit 89817ff

File tree

6 files changed

+258
-0
lines changed

6 files changed

+258
-0
lines changed

.gitignore

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
2+
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
3+
4+
# User-specific stuff
5+
.idea/**/workspace.xml
6+
.idea/**/tasks.xml
7+
.idea/**/usage.statistics.xml
8+
.idea/**/dictionaries
9+
.idea/**/shelf
10+
11+
# Generated files
12+
.idea/**/contentModel.xml
13+
14+
# Sensitive or high-churn files
15+
.idea/**/dataSources/
16+
.idea/**/dataSources.ids
17+
.idea/**/dataSources.local.xml
18+
.idea/**/sqlDataSources.xml
19+
.idea/**/dynamic.xml
20+
.idea/**/uiDesigner.xml
21+
.idea/**/dbnavigator.xml
22+
23+
# Gradle
24+
.idea/**/gradle.xml
25+
.idea/**/libraries
26+
27+
# Gradle and Maven with auto-import
28+
# When using Gradle or Maven with auto-import, you should exclude module files,
29+
# since they will be recreated, and may cause churn. Uncomment if using
30+
# auto-import.
31+
# .idea/modules.xml
32+
# .idea/*.iml
33+
# .idea/modules
34+
35+
# CMake
36+
cmake-build-*/
37+
38+
# Mongo Explorer plugin
39+
.idea/**/mongoSettings.xml
40+
41+
# File-based project format
42+
*.iws
43+
44+
# IntelliJ
45+
out/
46+
47+
# mpeltonen/sbt-idea plugin
48+
.idea_modules/
49+
50+
# JIRA plugin
51+
atlassian-ide-plugin.xml
52+
53+
# Cursive Clojure plugin
54+
.idea/replstate.xml
55+
56+
# Crashlytics plugin (for Android Studio and IntelliJ)
57+
com_crashlytics_export_strings.xml
58+
crashlytics.properties
59+
crashlytics-build.properties
60+
fabric.properties
61+
62+
# Editor-based Rest Client
63+
.idea/httpRequests
64+
65+
# Android studio 3.1+ serialized cache file
66+
.idea/caches/build_file_checksums.ser
67+
68+
__pycache__
69+
venv/
70+
.idea/

cluster.py

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import queue
2+
from pathlib import Path
3+
from resources import constant
4+
5+
6+
class Entity:
    """A single clustered Wikipedia entity, identified by its page name.

    The page name is expected to end in the numeric Wikipedia page id,
    separated by underscores (e.g. "Barack_Obama_534366").
    """

    def __init__(self, name):
        # BUG FIX: names arrive straight from file lines and usually carry a
        # trailing newline, which corrupted the extracted page id. Strip it.
        self.name = name.strip()
        # The Wikipedia page id is encoded as the last "_"-separated token.
        self.wikipedia_page_id = self.name.split("_")[-1]
        # Resolved later by Cluster.fetch_wikidata_ids(); None until then.
        self.wikidata_id = None
12+
13+
14+
class Cluster:
    """A named group of entities parsed from a cluster file.

    Attributes:
        name     -- cluster label taken from its headline (last token)
        id       -- sequential integer assigned in file order
        entities -- list of Entity objects belonging to the cluster
    """

    def __init__(self):
        self.name = ""
        self.id = 0
        self.entities = []

    @staticmethod
    def create_clusters(cluster_file):
        """Parse *cluster_file* and return a queue.Queue of Cluster objects.

        A line starting with constant.CLUSTER_HEADLINE opens a new cluster;
        every following non-blank line is treated as one entity name.

        Raises ValueError if an entity line appears before any headline
        (the original crashed with AttributeError on None in that case).
        """
        input_file = Path(cluster_file)
        cluster_queue = queue.Queue()
        cluster_id = 0
        last_cluster = None
        with input_file.open() as file:
            for raw_line in file:
                # BUG FIX: strip the trailing newline / stray whitespace so
                # entity names and cluster names are not stored with '\n'.
                line = raw_line.strip()
                if not line:
                    continue  # skip blank lines instead of creating empty entities
                if line.startswith(constant.CLUSTER_HEADLINE):
                    last_cluster = Cluster._create_cluster(cluster_id, line)
                    cluster_queue.put(last_cluster)
                    cluster_id += 1
                    continue
                if last_cluster is None:
                    raise ValueError("Entity line found before the first cluster headline")
                last_cluster.entities.append(Entity(line))

        return cluster_queue

    @staticmethod
    def _create_cluster(cluster_id, line):
        """Build an empty Cluster from a headline; its name is the last token."""
        cluster = Cluster()
        cluster.id = cluster_id
        cluster.name = line.split(" ")[-1]
        return cluster

    def fetch_wikidata_ids(self, mysql_connection):
        """Resolve each entity's Wikidata id via the page_props table.

        Entities with no 'wikibase_item' row keep wikidata_id = None
        (the original raised KeyError for them).
        """
        if not self.entities:
            return  # nothing to resolve; also avoids an invalid empty IN () clause

        db_cursor = mysql_connection.cursor()
        try:
            # BUG FIX (security/robustness): bind the page ids as query
            # parameters instead of interpolating them into the SQL string.
            placeholders = ",".join(["%s"] * len(self.entities))
            db_cursor.execute(
                "SELECT pp_page, pp_value "
                "FROM page_props "
                "WHERE pp_propname = 'wikibase_item' "
                f"AND pp_page IN ({placeholders});",
                tuple(entity.wikipedia_page_id for entity in self.entities),
            )
            # BUG FIX: MySQL cursors provide fetchall(), not fetch_all().
            records = db_cursor.fetchall()
        finally:
            # Always release the cursor, even when the query fails.
            db_cursor.close()

        # pp_page is typically returned as an int while wikipedia_page_id is a
        # string slice of the entity name, so normalise both sides to str.
        wikipedia_to_wikidata = {str(page): value for page, value in records}
        for entity in self.entities:
            # .get keeps wikidata_id = None when the page has no wikibase item.
            entity.wikidata_id = wikipedia_to_wikidata.get(str(entity.wikipedia_page_id))
61+
62+

cluster_annotator.py

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import threading
2+
from collections import Counter
3+
import queue
4+
5+
from resources import constant
6+
7+
8+
class ClusterAnnotator(threading.Thread):
    """Worker thread that drains clusters from a shared queue and annotates them."""

    def __init__(self, thread_id, work_queue):
        threading.Thread.__init__(self)

        self._thread_id = thread_id
        self._work_queue = work_queue

        # Aggregated annotation results for every cluster handled by this worker.
        self._cluster_annotations = {"relations": Counter()}

        # Each worker opens its own connection (NOTE(review): presumably because
        # a single connection is not safe to share across threads — confirm).
        self._mysql_connection = constant.create_mysql_connection()

    def run(self):
        # BUG FIX: the original tested the bound method itself
        # (`while self._analyze_cluster:`), which is always truthy and spun
        # forever without doing any work. It must be called.
        while self._analyze_cluster():
            pass

    def _analyze_cluster(self):
        """Process one cluster from the queue; return False once it is empty."""
        try:
            # Keep the try body minimal: only the dequeue can raise queue.Empty.
            cluster = self._work_queue.get_nowait()
        except queue.Empty:
            return False
        cluster.fetch_wikidata_ids(self._mysql_connection)
        for entity in cluster.entities:
            self._analyze_entity(entity)
        return True

    def _analyze_entity(self, entity):
        # TODO: entity-level annotation not implemented yet.
        pass

    def _analyze_relations(self, entity):
        # TODO: relation extraction not implemented yet.
        pass

main.py

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import argparse
import logging
import queue

from cluster import Cluster
from cluster_annotator import ClusterAnnotator
from util.filesystem_validators import AccessibleDirectory, AccessibleTextFile
8+
9+
# Shared state for the worker pool: spawned threads and the cluster queue.
thread_pool = []
working_queue = queue.Queue()


def main():
    """Entry point: parse CLI args, build the cluster queue, run the workers."""
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    args = _initialize_parser().parse_args()

    # Replace the placeholder queue with the clusters parsed from the input file.
    global working_queue
    working_queue = Cluster.create_clusters(args.input)
    _initialize_threads(args.workers)

    # Block until every worker has drained the queue.
    for thread in thread_pool:
        thread.join()
25+
26+
27+
def _initialize_parser():
    """Build the CLI argument parser.

    --input:   file containing the clustered entities (validated as readable file)
    --output:  directory for the enriched clusters (validated as readable directory)
    --workers: number of worker threads to start (int, default 16)
    """
    general_parser = argparse.ArgumentParser(
        description='Enrich clusters with further information by utilizing external sources')
    # BUG FIX: --input is a file, not a directory — validate with AccessibleTextFile.
    general_parser.add_argument("--input", help='File containing clustered entities', action=AccessibleTextFile,
                                required=True)
    general_parser.add_argument("--output", help='Desired location for enriched clusters', action=AccessibleDirectory,
                                required=True)
    # BUG FIX: --workers is a count, not a path; the directory-validating action
    # rejected any numeric value. Parse as int. Dropping required=True lets the
    # default of 16 actually apply (required + default was contradictory).
    general_parser.add_argument("--workers", help='Number of workers to start in parallel', type=int,
                                default=16)
    return general_parser
37+
38+
39+
def _initialize_threads(number_of_workers):
    """Start *number_of_workers* ClusterAnnotator threads on the shared queue."""
    for worker_id in range(number_of_workers):
        # BUG FIX: pass the queue object itself, not its bound .get method —
        # ClusterAnnotator calls get_nowait() on whatever it receives, so the
        # bound method crashed every worker with AttributeError.
        _thread = ClusterAnnotator(worker_id, working_queue)
        _thread.start()
        thread_pool.append(_thread)


if __name__ == "__main__":
    main()

resources/constant.py

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Marker prefix that opens a new cluster in the input file.
CLUSTER_HEADLINE = "[[CLUSTER"


def create_mysql_connection():
    """Open and return a fresh connection to the local MySQL database."""
    # Deferred import so this module can be loaded without the driver installed.
    import mysql.connector

    # NOTE(review): credentials are hard-coded; consider moving them to
    # environment variables or a config file before deploying.
    connection_settings = {
        "host": "localhost",
        "user": "root",
        "passwd": "toor",
        "database": "mpss2019",
    }
    return mysql.connector.connect(**connection_settings)

util/filesystem_validators.py

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import argparse
2+
import os
3+
4+
5+
class AccessibleTextFile(argparse.Action):
    """argparse action validating its value as an existing, readable file.

    Stores the absolute, user-expanded path in the namespace; raises
    argparse.ArgumentError when the path is missing or unreadable.
    """

    def __call__(self, parser, parser_namespace, values, option_string=None):
        resolved = os.path.abspath(os.path.expanduser(values))

        if not os.path.isfile(resolved):
            raise argparse.ArgumentError(self, "{0} is not a valid file".format(resolved))
        if not os.access(resolved, os.R_OK):
            raise argparse.ArgumentError(self, "Permission denied to read from {0}".format(resolved))

        setattr(parser_namespace, self.dest, resolved)
17+
18+
19+
class AccessibleDirectory(argparse.Action):
    """argparse action validating its value as an existing, readable directory.

    Stores the absolute, user-expanded path in the namespace; raises
    argparse.ArgumentError when the path is missing or unreadable.
    """

    def __call__(self, parser, parser_namespace, values, option_string=None):
        resolved = os.path.abspath(os.path.expanduser(values))

        if not os.path.isdir(resolved):
            raise argparse.ArgumentError(self, "{0} is not a valid directory".format(resolved))
        if not os.access(resolved, os.R_OK):
            raise argparse.ArgumentError(self, "Permission denied to read from {0}".format(resolved))

        setattr(parser_namespace, self.dest, resolved)

0 commit comments

Comments
 (0)