import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import os
from numpy import loadtxt
import json
import math
import string
import sys
import re
JLPT_READING_LEVELS = 5
# helper for cosine to read the file
def read_file(filename):
    try:
        with open(filename, encoding="utf8") as f:
            data = f.read()
            return data
    except IOError:
        print("Error opening or reading input file: ", filename)
        sys.exit()
# for cosine: map punctuation to spaces and uppercase to lowercase
translation_table = str.maketrans(string.punctuation + string.ascii_uppercase,
                                  " " * len(string.punctuation) + string.ascii_lowercase)
# helper for cosine to get the list of words per line
def get_words_from_line_list(text):
    text = text.translate(translation_table)
    word_list = text.split()
    return word_list
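# Illustrative only: the translation table above turns punctuation into spaces
# and lowercases everything, so a hypothetical call such as
# get_words_from_line_list("Hello, WORLD!") would return ["hello", "world"].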
# helper for cosine: count how often each word appears
def count_frequency(word_list):
    D = {}
    for new_word in word_list:
        if new_word in D:
            D[new_word] = D[new_word] + 1
        else:
            D[new_word] = 1
    return D
# helper for cosine to compute word frequencies
def word_frequencies_for_file(filename):
    line_list = read_file(filename)
    word_list = get_words_from_line_list(line_list)
    freq_mapping = count_frequency(word_list)
    return freq_mapping
# helper for cosine to compute dot product
def dotProduct(D1, D2):
    Sum = 0.0
    for key in D1:
        if key in D2:
            Sum += (D1[key] * D2[key])
    return Sum
# helper for cosine to compute angle
def vector_angle(D1, D2):
    numerator = dotProduct(D1, D2)
    denominator = math.sqrt(dotProduct(D1, D1) * dotProduct(D2, D2))
    return math.acos(numerator / denominator)
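def _example_vector_angle():
    # Hedged sketch, not called by the pipeline: a tiny sanity check for
    # vector_angle() on two hypothetical word-frequency dictionaries.
    d1 = {"cat": 2, "sat": 1}
    d2 = {"cat": 1, "sat": 1}
    # dot(d1, d2) = 3, ||d1|| = sqrt(5), ||d2|| = sqrt(2), so
    # cos(angle) = 3 / sqrt(10) ≈ 0.949 and the angle is ≈ 0.32 radians.
    # Identical documents give an angle of 0.0.
    return vector_angle(d1, d2)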
# cosine similarity: angle between the word-frequency vectors of two files
def data_cosine_similarity(input, data_file):
    sorted_word_list_1 = word_frequencies_for_file(input)
    sorted_word_list_2 = word_frequencies_for_file(data_file)
    distance = vector_angle(sorted_word_list_1, sorted_word_list_2)
    return distance
# returns the Jaccard similarity between 2 documents
def compute_Jaccard_similarity(doc1, doc2):
    # extracts the vocabulary of each document by tokenizing, omitting
    # non-alphanumeric characters, and ignoring repeats
    doc1_vocab = set()
    doc2_vocab = set()
    with open(doc1, encoding="utf8") as a_file:
        for line in a_file:
            stripped_line = line.strip()
            word_list = stripped_line.split(" ")
            for word in range(len(word_list)):
                word_list[word] = re.sub(r'\W+', '', word_list[word])
                if not word_list[word] == "":
                    doc1_vocab.add(word_list[word].lower())
    with open(doc2, encoding="utf8") as a_file:
        for line in a_file:
            stripped_line = line.strip()
            word_list = stripped_line.split(" ")
            for word in range(len(word_list)):
                word_list[word] = re.sub(r'\W+', '', word_list[word])
                if not word_list[word] == "":
                    doc2_vocab.add(word_list[word].lower())
    intersect = doc1_vocab.intersection(doc2_vocab)
    union = doc1_vocab.union(doc2_vocab)
    return (len(intersect) / len(union))
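def _example_jaccard():
    # Hedged sketch, not called by the pipeline: mirrors the set arithmetic
    # used above on two hypothetical vocabularies instead of files on disk.
    doc1_vocab = {"the", "cat", "sat"}
    doc2_vocab = {"the", "cat", "ran"}
    # |intersection| = 2 and |union| = 4, so the similarity is 0.5.
    return len(doc1_vocab & doc2_vocab) / len(doc1_vocab | doc2_vocab)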
# finds list of best matches in a category
def find_best_match(input, assignments, src, cosine_similarity):
    docs = []
    highest_similarity = 0
    best_match = ""
    similarities = {}
    for filename in os.listdir(src):
        docs.append(filename)
    # computes the similarity between the input and the rest of the texts and
    # finds the filename that is the best match
    if not cosine_similarity:
        # higher score indicates higher similarity for Jaccard
        for doc in docs:
            similarity = compute_Jaccard_similarity(
                input, src + "/" + doc)
            similarities[doc] = similarity
            if similarity > highest_similarity:
                highest_similarity = similarity
                best_match = doc
    else:
        # lower angle between vectors indicates higher similarity for cosine
        highest_similarity = float('inf')
        for doc in docs:
            similarity = data_cosine_similarity(input, src + "/" + doc)
            similarities[doc] = similarity
            if similarity < highest_similarity:
                highest_similarity = similarity
                best_match = doc
    # finds all the members of the same category as the best match and maps
    # them to their similarity scores
    group_members = {}
    ranking = []
    category = assignments[best_match]
    for key in assignments:
        if assignments[key] == category:
            group_members[key] = similarities[key]
    if not cosine_similarity:
        # sort the group members by score (descending order)
        while len(group_members) != 0:
            best_score = -1
            best_file = ""
            for member in group_members:
                if group_members[member] > best_score:
                    best_score = group_members[member]
                    best_file = member
            ranking.append((best_file, best_score))
            group_members.pop(best_file)
    else:
        # sort the group members by score (ascending order)
        while len(group_members) != 0:
            best_score = float('inf')
            best_file = ""
            for member in group_members:
                if group_members[member] < best_score:
                    best_score = group_members[member]
                    best_file = member
            ranking.append((best_file, best_score))
            group_members.pop(best_file)
    return ranking
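# Example of the returned structure (hypothetical values): a list of
# (filename, score) tuples ordered from most to least similar, e.g.
# [("text_12.txt", 0.41), ("text_07.txt", 0.33)] for Jaccard scores, or
# ascending angles when cosine similarity is used.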
def run_kmeans(data, input, docs, cosine_similarity):
    # sets the number of clusters
    n_digits = 5
    reduced_data = PCA(n_components=2).fit_transform(data)
    kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4)
    kmeans.fit(reduced_data)
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation="nearest",
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect="auto", origin="lower")
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=169, linewidths=3,
                color="w", zorder=10)
    plt.title("K-means clustering on the documents dataset (PCA-reduced data)\n"
              "Centroids are marked with white cross")
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    # plt.show()
    # dictionary mapping filenames to their category assignments
    assignments = {}
    for i in range(len(kmeans.labels_)):
        assignments[docs[i]] = kmeans.labels_[i]
    return assignments
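# Example of the returned mapping (hypothetical values): each corpus filename
# keyed to the integer id of the k-means cluster it was assigned to, e.g.
# {"text_01.txt": 3, "text_02.txt": 0}.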
def print_recommendation_list(match_rankings, reading_level):
    # reading the difficulty data from the file
    with open('difficulty-assessment/difficulty_dict.txt', encoding="utf8") as f:
        reading_difficulty_dict = f.read()
    # reconstructing the data as a dictionary
    reading_difficulty_dict = reading_difficulty_dict.replace('\'', '\"')
    reading_difficulty_dict = json.loads(reading_difficulty_dict)
    print("\n---------------------------------------")
    print("Same cluster and in order of similarity:")
    print("---------------------------------------")
    print("Similarity to input, Corpus text")
    for rankings in match_rankings:
        print(str(round(rankings[1], 3)) + ", " + rankings[0])
    print("\n------------------------------------------------------------")
    print("Same cluster, same reading level, and in order of similarity:")
    print("------------------------------------------------------------")
    print("Similarity to input, Corpus text")
    for rankings in match_rankings:
        if reading_difficulty_dict[rankings[0]] == int(reading_level):
            print(str(round(rankings[1], 3)) + ", " + rankings[0])
    return
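# The difficulty file is expected to contain a dict literal mapping corpus
# filenames to JLPT levels 1-5, e.g. {'text_01.txt': 3, 'text_02.txt': 5};
# the single quotes are rewritten to double quotes above so json.loads can
# parse it.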
def main():
    # prompts user for their JLPT (Japanese-Language Proficiency Test) reading level
    print("Please enter your estimated JLPT reading level on a 1-5 scale, where 5 is beginner and 1 is advanced.")
    reading_level = int(input("Estimated JLPT reading level: "))
    while reading_level not in range(1, JLPT_READING_LEVELS + 1):
        print("Invalid input. Please enter a number between 1 and 5.")
        reading_level = int(input("Estimated JLPT reading level: "))
    # Get input txt
    print("Please enter a path to a .txt file to find similar texts.")
    input_text_path = input("Path to input text: ")
    input_text = ""
    with open(input_text_path, "rb") as f:
        input_text = f.read().decode("UTF-8")
    src = "translated-texts"
    docs = []
    for filename in os.listdir(src):
        docs.append(filename)
    NUM_TEXTS = len(docs)
    # can set to True or False to compute k-means using either cosine or Jaccard similarity
    COSINE_SIMILARITY = False
    kmeans_matrix = np.zeros((NUM_TEXTS, NUM_TEXTS))
    if COSINE_SIMILARITY:
        kmeans_matrix = loadtxt('cosine_similarities.csv', delimiter=',')
    else:
        kmeans_matrix = loadtxt('kmeans-matrix.csv', delimiter=',')
    # runs k-means on the matrix of similarity scores
    cluster_assignments = run_kmeans(
        kmeans_matrix, input_text_path, docs, COSINE_SIMILARITY)
    # returns list of tuples of best matches with their similarity scores
    match_rankings = find_best_match(
        input_text_path, cluster_assignments, src, COSINE_SIMILARITY)
    print("\nHere are your recommendations:")
    print_recommendation_list(match_rankings, str(reading_level))
if __name__ == "__main__":
    main()