-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdocxcleaner.py
40 lines (38 loc) · 1.65 KB
/
docxcleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from docx import Document
import json
# Open the source Word document at import time; it is passed to getText() further down.
doc = Document('movies3.docx')
# Use this logic for the initial parse of the original Word file, and adjust it until it returns a reasonable line-by-line output.
def getText(filename, delimiters=(" ", "\t", "\n")):
    """Return the cleaned, non-empty text fragments of a document.

    Every paragraph is broken apart on each delimiter in turn, so a single
    paragraph may contribute several entries. Paragraphs containing "___"
    are skipped entirely.

    Args:
        filename: Despite the name, an already-loaded document object rather
            than a path. Only its ``paragraphs`` attribute is read, and only
            the ``text`` attribute of each paragraph.
        delimiters: Substrings on which paragraph text is split. The default
            holds the three delimiters this function has always checked for.

    Returns:
        A list of strings in document order, each stripped of surrounding
        whitespace, with empty fragments dropped.

    Raises:
        ValueError: If a delimiter is the empty string (raised by
            ``str.split``).
    """
    # NOTE(review): the first default delimiter is a single space, which also
    # breaks multi-word titles into separate words. Confirm that is intended;
    # pass a different ``delimiters`` tuple if it is not.
    moviesCleanList = []
    for paragraph in filename.paragraphs:
        text = paragraph.text
        # Paragraphs containing a run of underscores are skipped, not parsed.
        if "___" in text:
            continue
        # Apply every delimiter to every fragment. Independent checks with a
        # single trailing ``else`` would append a paragraph both split and
        # whole, and once per delimiter kind, producing duplicate entries.
        fragments = [text]
        for delimiter in delimiters:
            fragments = [
                piece
                for fragment in fragments
                for piece in fragment.split(delimiter)
            ]
        for fragment in fragments:
            # Remove leading/trailing whitespace and keep only what still has text.
            cleaned = fragment.strip()
            if cleaned:
                moviesCleanList.append(cleaned)
    return moviesCleanList
# Serialize the cleaned list as prettified JSON and save it to its own file to parse later.
# To inspect the result on the console instead, print(cleanedMovies).
cleanedMovies = json.dumps(getText(doc), indent=2)
# Write the already-serialized text directly. Passing it through json.dump
# again would encode it a second time, leaving one quoted JSON string in the
# file instead of an array.
with open('cleanedmovies.json', 'w') as outfile:
    outfile.write(cleanedMovies)