# TrendPositions.py
"""Outputs the distribution of positions of trending words within their job ad description"""
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import csv, sys, os
from TrendDetectionPipeline import get_countries, cross_domain_filtering, get_pre_processed_entries
import numpy as np


def get_trending_words(filename):
    """
    Gets the trending words for the given file
    :param filename: string
    :return: dict mapping each trending word to its TF-IDF delta
    """
    trend_dir = "TF IDF Delta"
    file_path = os.path.join(trend_dir, "{}.txt".format(filename[:-4]))
    with open(file_path, 'r') as file:
        lines = file.readlines()
    if len(lines) <= 3:
        return {}
    else:
        trending_NLP = {}
        # Skip the three header lines; each remaining line holds a word and
        # its delta separated by a tab. Only non-negative deltas are kept.
        for line in lines[3:]:
            elts = line[:-1].split("\t")
            if len(elts) == 2:
                delta = float(elts[1])
                if delta >= 0:
                    trending_NLP[elts[0]] = delta
        return trending_NLP
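
# Sketch of the "TF IDF Delta" input layout assumed by get_trending_words
# (inferred from the parsing above; the header contents are an assumption,
# only the word<TAB>delta rows are actually read):
#
#   <header line 1>
#   <header line 2>
#   <header line 3>
#   word_1<TAB>0.042
#   word_2<TAB>0.013
#
# Lines with exactly two tab-separated fields and a non-negative delta end up
# in the returned dict.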


def get_trend_positions(descriptions, trending_words):
    """
    Gets the number of times a trending word occurs at each word position
    :param descriptions: list of lists of strings (tokenised descriptions)
    :param trending_words: dict mapping each trending word to its delta
    :return: numpy array of counts, indexed by word position
    """
    # The longest description determines how many positions there are.
    max_len = 0
    for desc in descriptions:
        l = len(desc)
        if l > max_len:
            max_len = l
    positions_count = np.full(max_len, 0)
    # Count, for every position, how many descriptions have a trending word there.
    for desc in descriptions:
        for word_index in range(len(desc)):
            if desc[word_index] in trending_words:
                positions_count[word_index] += 1
    return positions_count
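
# Hedged usage sketch (toy data, not part of the pipeline):
#   descriptions = [["python", "developer"], ["senior", "python", "engineer"]]
#   trending_words = {"python": 0.4}
#   get_trend_positions(descriptions, trending_words)  # -> array([1, 1, 0])
# "python" occurs once at position 0 (first description) and once at
# position 1 (second description); no trending word occurs at position 2.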


def get_string_of_array(array):
    """
    Gets a tab-joined string for an iterable of values
    :param array: iterable of values (each converted with str)
    :return: string
    """
    return '\t'.join(str(x) for x in array)
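
# Example: get_string_of_array([0, 1, 2]) returns "0\t1\t2".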


def get_trend_position_graphs():
    """
    Plots and prints the positions of trending words for every country and file
    :return: None
    """
    countries = get_countries(DESC_DIR)
    # keys.npy stores a dict of keywords per country (saved with np.save).
    keywords_per_country = np.load('keys.npy').item()
    for country in countries:
        print(country)
        filenames = sorted([filename for filename in os.listdir(DESC_DIR)
                            if filename.startswith(country) and filename.endswith(".csv")])
        to_delete = cross_domain_filtering(keywords_per_country[country])
        for filename in filenames:
            trending_NLP = get_trending_words(filename)
            if trending_NLP:
                print(filename)
                file_path = os.path.join(DESC_DIR, filename)
                try:
                    # Reuse cached pre-processed descriptions if they exist.
                    desc_2017 = np.load("Trend Positions/descriptions_2017_{}.npy".format(filename[:-4])).item()
                except IOError:
                    # Change date to new time period
                    desc_2017 = get_pre_processed_entries(file_path, to_delete, 1, 6, 2017)
                    np.save("Trend Positions/descriptions_2017_{}.npy".format(filename[:-4]), desc_2017)
                positions_count = get_trend_positions(desc_2017, trending_NLP)
                # Create the figure before setting the title so the title is
                # attached to this figure rather than the previous one.
                plt.figure(figsize=(100, 40))
                plt.title(filename)
                plt.xlabel('Positions of Trending Words in Text')
                plt.ylabel('Frequency of Position')
                plt.plot(np.arange(len(positions_count)), positions_count, 'bo-')
                plt.savefig('{}/{}.png'.format(DEST_DIR, filename[:-4]))
                plt.close()
                with open("{}/{}.txt".format(DEST_DIR, filename[:-4]), "w") as out_file:
                    print(get_string_of_array(np.arange(len(positions_count))), file=out_file)
                    print(get_string_of_array(positions_count), file=out_file)
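
# For each processed file this writes, under DEST_DIR, a <name>.png line plot
# and a <name>.txt file with two tab-separated rows: the word positions and
# the corresponding trending-word counts.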


if __name__ == "__main__":
    DESC_DIR = "Lemmatised Text"
    DEST_DIR = "Trend Positions"
    csv.field_size_limit(sys.maxsize)
    get_trend_position_graphs()
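
# Running the script assumes the working directory contains keys.npy and the
# "Lemmatised Text", "TF IDF Delta" and "Trend Positions" folders. On
# NumPy >= 1.16.3 the np.load calls may additionally need allow_pickle=True
# (an environment assumption, not part of the original code).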