-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvectorization.py
117 lines (96 loc) · 3.36 KB
/
vectorization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from scipy.io import mmwrite as matrix_write
import csv
from multiprocessing.dummy import Pool as ThreadPool
import gzip
import timeit
from os import listdir
import os
import pickle as pk
# Thread pool used to parallelise the per-comment validity check in the
# driver below.  NOTE(review): 60 threads for CPU-bound string work — under
# the GIL this mostly adds overhead; confirm it actually helps.
pool = ThreadPool(60)
# file_pool = ThreadPool(1)
# writing_pool = ThreadPool(1)
# Wall-clock start time; paired with the final "Time taken" print at the
# bottom of the script.
start = timeit.default_timer()
#Your statements here
#path = "karthik-gcloud-data/2015_08/"
# path = ""
# Directory holding the gzipped monthly comment dumps.
path ="data/"
#files_list = [f for f in listdir(path)]
# Process a single hard-coded dump file (directory listing is commented out).
files_list = ["2015_08_000000000029.gz"]
# files_list = ["sample_filtering_text.csv"]
class vectorization():
    """Load comment bodies from a gzipped CSV and write their TF-IDF matrix.

    Workflow: construct with a filename, populate ``self.filtered`` with
    cleaned comment strings (the driver does this via ``checkValid``), then
    call ``filewriting()`` to persist the TF-IDF matrix in Matrix Market
    format at ``<filename>_out``.
    """

    def __init__(self, filename, limit=100):
        """Read the gzipped CSV at *filename*.

        filename -- path to a gzip-compressed CSV with a "body" column.
        limit    -- keep only the first *limit* comment bodies; pass None
                    to keep all rows.  (The original hard-coded 100; kept
                    as the default for backward compatibility.)
        """
        df = pd.read_csv(filename, compression="gzip")
        self.filename = filename
        # Populated externally (see the driver) before filewriting() runs.
        self.filtered = []
        self.comments = df["body"] if limit is None else df["body"][:limit]
        # Only used by the (commented-out) chunked CSV writing path.
        self.chunk_size = 1000

    def checkValid(self, i):
        """Return the stripped comment text, or None when unusable.

        A comment is usable when it is a plain str longer than 2 chars.
        The original rebuilt the string character by character to handle
        unicode chars, but that branch could never fire (the value is
        already guaranteed to be str at that point), so the loop was an
        O(n^2) no-op that also used the Python-2-only names
        xrange/unicode; reduced to the equivalent strip().
        """
        if not isinstance(i, str):
            return None
        if len(i) > 2:
            return str(i).strip()
        return None

    def save(self, line):
        """Write *line* row-wise through the CSV writer.

        NOTE(review): self.writer is only assigned inside commented-out
        code in filewriting(); calling this as-is raises AttributeError.
        Confirm before re-enabling the threaded CSV output path.
        """
        self.writer.writerows(np.array(line))

    def filewriting(self):
        """Fit TF-IDF on self.filtered and write the sparse matrix.

        Output goes to '<filename>_out' in Matrix Market (.mtx) format
        via scipy's mmwrite (imported at file top as matrix_write).
        """
        vectorizer = TfidfVectorizer(stop_words='english', norm='l2',
                                     sublinear_tf=True)
        tfidf_matrix = vectorizer.fit_transform(self.filtered)
        matrix_write(self.filename + "_out", tfidf_matrix)
# file_pool.map(vectorization,files_list)
fname = "".join([path, files_list[0]])
v = vectorization(fname)
print "processing data: (count): ", len(v.comments)
if os._exists(fname+"_filtered.pk"):
filtered = pk.load(fname + "_filtered.pk")
else:
filtered = pool.map(v.checkValid, v.comments)
pk.dump(filtered, open(fname+"_filtered.pk", "wb"))
print "filtered data"
v.filtered = [i for i in filtered if i is not "" and i is not None]
print "set to v..writing now"
# print "\n".join([str((i, self.filtered[i])) for i in xrange(len(self.filtered)) if self.filtered[i] is not "" or self.filtered is not None])
v.filewriting()
print "written"
pool.close()
pool.join()
stop = timeit.default_timer()
print "Time taken: ", stop - start