-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtask4.py
115 lines (62 loc) · 2.54 KB
/
task4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import argparse
import math
def _parse_titles(header):
    """Return the document titles found in the header line of the input file.

    The first tab-separated field is skipped (presumably the label of the
    term column -- confirm against the file producer). A header written with
    a trailing tab yields one empty last field, which is dropped; a header
    without a trailing tab keeps its last title.
    """
    fields = header.rstrip("\r\n").split("\t")[1:]
    if fields and fields[-1] == "":
        fields.pop()
    return fields


def _read_column_pair(rows, index1, index2):
    """Return the weights of two columns as a pair of lists of floats.

    ``index1`` and ``index2`` count columns after the first field of each
    row, matching the positions returned by ``_parse_titles``.
    Raises ``ValueError`` if a selected cell is not a number and
    ``IndexError`` if a non-blank row is too short.
    """
    document1 = []
    document2 = []
    for row in rows:
        # Skip blank lines (e.g. an empty line at the end of the file)
        # instead of failing on the missing columns.
        if not row.strip():
            continue
        cells = row.rstrip("\r\n").split("\t")[1:]
        document1.append(float(cells[index1]))
        document2.append(float(cells[index2]))
    return document1, document2


def _cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two equally indexed vectors.

    Returns 0.0 when either vector has zero length, because the cosine is
    undefined there and a plain division would raise ``ZeroDivisionError``.
    """
    dot_product = sum(x * y for x, y in zip(vector1, vector2))
    norm1 = math.sqrt(sum(x * x for x in vector1))
    norm2 = math.sqrt(sum(y * y for y in vector2))
    denominator = norm1 * norm2
    if denominator == 0:
        return 0.0
    return dot_product / denominator


def main():
    """Print the cosine similarity of two document columns of a matrix file.

    Command-line arguments (all required):
        -i  path to the tab-separated input file
        -a  title of the first document
        -b  title of the second document (may equal the first)

    The first line of the file is the header holding the document titles;
    every following non-blank line holds one weight per document after its
    first field.

    Returns None after printing the cosine value. If a requested title is
    not in the header, the corresponding message is printed and returned.
    Errors raised while opening the file are not caught.
    """
    # construct the argument parser and parse the arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", type=str, required=True,
                    help="path to inputfile for the fourth task")
    ap.add_argument("-a", type=str, required=True,
                    help="first document")
    ap.add_argument("-b", type=str, required=True,
                    help="second document")
    args = vars(ap.parse_args())
    doc1 = args["a"]
    doc2 = args["b"]

    # The context manager closes the file on every path, including the
    # early returns below.
    with open(args["i"], "r") as input_file:
        contents = input_file.readlines()

    # An empty file has no header, hence no titles at all.
    titles = _parse_titles(contents[0]) if contents else []

    # Two independent tests (not if/elif) so that asking for the same
    # document twice resolves both indexes.
    index1 = None
    index2 = None
    for position, title in enumerate(titles):
        if title == doc1:
            index1 = position
        if title == doc2:
            index2 = position

    # Stopping the script if the documents do not exist in the input file
    if index1 is None:
        message = "The first document does not exists in the input file"
        print(message)
        return message
    if index2 is None:
        message = "The second document does not exists in the input file"
        print(message)
        return message

    document1, document2 = _read_column_pair(contents[1:], index1, index2)
    print(_cosine_similarity(document1, document2))
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()