#List of reusable functions that may be called in BMedScDOC_Scripts
#imports
import re
import os
import sys
import csv
import progressbar
import shutil
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from langdetect import detect
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import word_tokenize
import spacy
from tika import parser
from pathlib import Path
downloads_path = str(Path.home() / "Downloads") + "/" #Find Downloads folder path
os.environ['KMP_DUPLICATE_LIB_OK']='True'
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
extra_stopwords = ["said", "say", "...", "like", "et", "al", "doi", "use", "https", "http"]
#functions
class outputs:
    #Action: Creates a Barchart from a list of words and saves it to computer
    #Input: A list of strings ie. [word1, word2]
    #Input: Path to save location ie. "usr/Documents/Folder", default is the Downloads folder
    #Output: prints the list of tuples [total], displays the barchart and saves a Bar.png to save_location
    def barchart(list, save_location = downloads_path):
        total = Counter(list).most_common()
        print("Total>>>", total)
        chart = outputs.plotwords(total)
        chart.savefig('{}Bar.png'.format(save_location))
        plt.show()
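    #Example usage (sketch, not executed; the token list below is hypothetical):
    #   tokens = ["brain", "brain", "cortex", "neuron", "neuron", "neuron"]
    #   outputs.barchart(tokens)                       #prints counts, shows the chart and saves Bar.png to Downloads
    #   outputs.barchart(tokens, "/path/to/folder/")   #note the trailing slash: the filename is appended directly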
    #Action: Creates a Barchart from a list of words and displays it
    #Input: A list of tuples ie. [(word1, 2), (word2, 3)]
    #Output: Displays barchart
    def plotwords(list_of_tuples):
        plotdata = ([a for a, b in list_of_tuples], [b for a, b in list_of_tuples])
        #split x and y axis values into separate lists
        words = plotdata[0]
        occurrences = plotdata[1]
        #use pyplot to graph
        plt.figure(figsize=(20, 5))
        plt.bar(words, occurrences, width=0.8, align='center')
        plt.xticks(rotation=90)
        #title and labels for graph
        plt.title("BMedScDOC words in corpus")
        plt.ylabel("Number of occurrences")
        plt.xlabel("Words from dictionary")
        #all done!
        return plt
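    #Example usage (sketch, not executed):
    #   chart = outputs.plotwords([("brain", 5), ("neuron", 3)])   #returns the pyplot module with the bar chart drawn
    #   chart.show()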
    #Action: Creates a wordcloud from a list of words (with repeats)
    #Input: A list of strings ie. [word1, word2]
    #Input: Path to image mask ie. "usr/Documents/Folder", default is head.png
    #Input: Path to save location ie. "usr/Documents/Folder", default is downloads folder
    #Output: Saves a freq_WordCloud.png to save_location
    def wc(list, shape_path = "/Users/manojarachige/Documents/Coding/BMedScDOC1/Assets/head.png", save_location = downloads_path):
        total = Counter(list).most_common()
        brain_mask = np.array(Image.open(shape_path))
        dict_for_wc = dict(total)
        wordcloud = WordCloud(background_color="white", colormap="plasma", mask=brain_mask, contour_width=3, contour_color='firebrick').generate_from_frequencies(dict_for_wc)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        wordcloud.to_file('{}freq_WordCloud.png'.format(save_location))
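    #Example usage (sketch, not executed; assumes the default head.png mask exists at shape_path):
    #   tokens = ["hippocampus", "memory", "memory", "cognition"]
    #   outputs.wc(tokens)                                                               #saves freq_WordCloud.png to Downloads
    #   outputs.wc(tokens, shape_path="mask.png", save_location="/path/to/folder/")      #mask.png is a hypothetical image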
    #Action: Converts a list of text strings into a df tdm
    #Input: list of strings, with each string being a "document" for the term-document matrix
    #Output: pandas df of a tdm
    def tdm_make(docs):
        vec = CountVectorizer()
        X = vec.fit_transform(docs)
        df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
        return df
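    #Example usage (sketch, not executed):
    #   docs = ["the cat sat", "the cat ran"]
    #   tdm = outputs.tdm_make(docs)   #rows = documents, columns = terms, values = counts
    #Note: scikit-learn 1.2+ removed CountVectorizer.get_feature_names(); newer versions use get_feature_names_out()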
    #Action: Make a csv file from a folder of txts, with 2 columns: [Name] and [Text]
    #Input: path to folder of txt files and the file extension to look for
    #Output: csv with 2 columns, [Name] and [Text], saved to the current directory
    def make_csv(folder_path, extension):
        txt_lst = os_level.get_corpus(folder_path, extension) #list of txt filenames
        headers = ["name", "text"]
        with open('corpus.csv', 'w', encoding='UTF8', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(headers)
            for i in range(len(txt_lst)):
                filepath = folder_path + "/" + txt_lst[i]
                with open(filepath, 'r') as contents:
                    string = contents.read()
                string = string.replace("\n", " ") #assign the result, str.replace does not modify in place
                data = [txt_lst[i], string]
                writer.writerow(data)
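    #Example usage (sketch, not executed; the folder path is hypothetical):
    #   outputs.make_csv("/path/to/txt_folder", "txt")   #writes corpus.csv to the current working directory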
class processing:
    #Action: Splits the string following the last instance of "references" in a txt file
    #Input: path to txt file with a references section ie. "usr/Documents/File"
    #Output: modifies the txt file by lowercasing it and removing the references section
    def remove_ref(text):
        f = open(text, "r")
        openfile = f.read()
        newfile = openfile.lower() #lowers the file just in case
        if "references" in newfile:
            finder = newfile.rfind("references")
            newfile = newfile[:finder]
        f.close()
        n = open(text, "w")
        n.write(newfile)
        n.close()
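    #Example usage (sketch, not executed; paper.txt is a hypothetical file):
    #   processing.remove_ref("paper.txt")   #rewrites paper.txt in place, lowered and truncated at the last "references"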
    #Action: Performs preprocessing of the string and returns an empty list if non-english.
    #String should already be lowered and references removed
    #Input: string to be modified and tokenised
    #Output: list of preprocessed words ie. [preprocessed, words]
    def preprocess(string, csv_path = "/Users/manojarachige/Documents/Coding/BMedScDOC1/Assets/Inputs/replacement.csv"):
        #lower
        string = string.lower()
        #Remove weird characters that aren't usual ASCII 0-127 chars
        encoded_string = string.encode("ascii", "ignore")
        string = encoded_string.decode()
        #remove line breaks and hyphenated words (assign the result, str.replace does not modify in place)
        string = string.replace("-\n", "").replace("\n", " ").replace("\r", " ")
        string = re.sub(r'[^A-Za-z ]+', '', string)
        #check if english
        if detect(string) != "en":
            return []
        #remove stopwords
        string = remove_stopwords(string)
        #lemmatise
        lemmatised = nlp(string)
        lemmatised = " ".join([token.lemma_ for token in lemmatised])
        #replace terms according to the replacement csv (columns: Location, Replacement)
        df = pd.read_csv(csv_path)
        replaced = lemmatised
        for i in range(len(df)):
            replaced = replaced.replace(df["Location"][i], df["Replacement"][i])
        #tokenise
        tokenised = word_tokenize(replaced)
        #remove single chars and remove stopwords
        cleaned = [word for word in tokenised if len(word) > 2 and word not in extra_stopwords]
        #returns a list
        return cleaned
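    #Example usage (sketch, not executed; assumes the replacement csv at csv_path exists with Location/Replacement columns):
    #   text = "The hippocampus is critical for memory consolidation (Smith et al., 2020)."
    #   tokens = processing.preprocess(text)   #returns a list of lemmatised tokens, or [] if the text is not English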
    #Action: Converts a pdf file to a txt file, lowers and removes references
    #Input: a single PDF file
    #Output: a single txt file of the same filename
    def pdftotxt(text):
        with open("{}.txt".format(text[:-4]), "w") as output:
            raw = parser.from_file(text)
            string = raw["content"]
            if string is None:
                string = "None"
            output.write(string)
        processing.remove_ref("{}.txt".format(text[:-4])) #removes references in this step AND lowers the file
        return "{}.txt".format(text[:-4])
class os_level:
    #Action: Returns a list of files in a folder of a specified file type
    #Input: path to folder of files
    #Input: the extension of the file type you want
    #Output: list of the names of files with that extension
    def get_corpus(basepath, extension):
        lst = os.listdir(basepath)
        lst.sort()
        newlst = []
        for file in lst:
            ext = os.path.splitext(file)[-1].lower()
            if ext == "." + extension:
                newlst.append(file)
        return newlst
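    #Example usage (sketch, not executed; the folder path is hypothetical):
    #   pdf_names = os_level.get_corpus("/path/to/folder", "pdf")   #sorted filenames ending in .pdf (no leading dot in extension)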
    #Action: Creates a list of words that are in a txt file, removing line breaks
    #Input: path to txt file
    #Output: list of words in txt file
    def makedict(textfile):
        with open(textfile, 'r') as file:
            dictionary = file.read().replace('\n', ' ')
        dictionary = dictionary.lower().split()
        return dictionary
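    #Example usage (sketch, not executed; dictionary.txt is a hypothetical file):
    #   words = os_level.makedict("dictionary.txt")   #lowercased, whitespace-split list of words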
    #Action: Checks if a file is a valid and working pdf
    #Input: path to pdf
    #Output: Boolean True if valid PDF
    def isFullPdf(f):
        end_content = ''
        start_content = ''
        size = os.path.getsize(f)
        if size < 1024:
            return False
        with open(f, 'rb') as fin:
            #start content
            fin.seek(0, 0)
            start_content = fin.read(1024)
            start_content = start_content.decode("ascii", 'ignore')
            fin.seek(-1024, 2)
            end_content = fin.read()
            end_content = end_content.decode("ascii", 'ignore')
        start_flag = False
        #%PDF header must appear near the start of the file
        if start_content.count('%PDF') > 0:
            start_flag = True
        #%%EOF marker near the end of the file
        if end_content.count('%%EOF') > 0 and start_flag:
            return True
        eof = bytes([0])
        eof = eof.decode("ascii")
        if end_content.endswith(eof) and start_flag:
            return True
        return False
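    #Example usage (sketch, not executed; paper.pdf is a hypothetical file):
    #   if os_level.isFullPdf("paper.pdf"):
    #       print("valid pdf")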
    #Action: Iterates over a source folder of pdfs and separates the valid ones to output_location
    #Input: source is a string path to folder of pdfs
    #Input: output_location is a string path to output location of successfully separated pdfs
    #Output: Copies valid pdfs to a specified output_location
    def pdf_separate(source, output_location, extension = "pdf"):
        separated_counter = 0
        time_counter = 0
        pdf_list = os_level.get_corpus(source, extension)
        length = len(pdf_list)
        with progressbar.ProgressBar(max_value=length, redirect_stdout=True) as p:
            for i in pdf_list:
                if os_level.isFullPdf(source + "/" + i):
                    shutil.copy2(source + "/" + i, output_location)
                    separated_counter += 1
                p.update(time_counter)
                time_counter += 1
        print(separated_counter, "files separated out of", len(pdf_list), "source files")
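#Example end-to-end sketch (hypothetical paths; only runs when this file is executed directly):
if __name__ == "__main__":
    source_folder = "/path/to/raw_pdfs"    #hypothetical folder of downloaded pdfs
    valid_folder = "/path/to/valid_pdfs"   #hypothetical folder to receive valid pdfs
    os_level.pdf_separate(source_folder, valid_folder)             #copy only valid pdfs across
    for name in os_level.get_corpus(valid_folder, "pdf"):
        txt_file = processing.pdftotxt(valid_folder + "/" + name)  #convert each pdf to a lowered, reference-free txt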