-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfrequency_term_validator.py
93 lines (71 loc) · 2.79 KB
/
frequency_term_validator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python3
# -*- coding: utf8 -*-
"""
TEST
The idea is to filter our terms by their frequency in the language (FR only,
for testing purposes).
If a term has a higher frequency in the mails than in the French language,
that means this term is probably specific.
TESTED:
load_reference: loads the reference file (frequency table for each word
on Wikipedia FR in 2008)
filter_by_frequency: for a given term, compares our frequency to the
general frequency in the French language
TODO:
try to use Pierre's work for real testing (score function already up-to-
date)
Maybe use this filter before stemming etc., or stem the reference too
NOTES:
The reference file is noisy (it contains punctuation, etc.), but we don't
care: the huge number of terms in it compensates by far
TEST
"""
import re
import math
def load_reference(freq_file):
    """
    Load the reference corpus frequency table.

    Each non-empty line of ``freq_file`` must hold a count followed by a
    term, separated by whitespace (e.g. ``"12345 le"``).

    :param freq_file: path to the reference frequency file (utf-8 text)
    :return: dict mapping each term to its relative frequency in the
             corpus (term count / total count), as a float
    """
    freq = dict()
    global_term_count = 0
    # First pass: accumulate raw counts and the corpus-wide total,
    # streaming line by line instead of loading the whole file.
    with open(freq_file, 'r', encoding='utf-8') as infile:
        for line in infile:
            if len(line) > 1:  # skip blank / newline-only lines
                # maxsplit=1 so a "term" containing spaces stays intact
                n, w = re.split(r"\s+", line.rstrip(), maxsplit=1)
                global_term_count += int(n)
                # Sum counts so a term listed twice is not silently
                # overwritten (the total already counts both entries).
                freq[w] = freq.get(w, 0) + int(n)
    # Second pass: normalize raw counts into relative frequencies.
    # The highest-frequency term maps to the value closest to 1.
    for key in freq.keys():
        freq[key] = freq[key] / global_term_count
    return freq
def filter_by_frequency(query_dict, reference_dict, score_modifier=1):
    """
    Remove from ``query_dict`` every term that is not more frequent in the
    query corpus than in the reference corpus.

    A term is kept when its score in ``query_dict`` is strictly greater
    than ``score_modifier`` times its frequency in ``reference_dict`` —
    such a term is assumed to be specific to the corpus's field. Terms
    absent from ``reference_dict`` are always kept.

    :param query_dict: term -> score mapping; MUTATED in place
    :param reference_dict: term -> reference frequency mapping
    :param score_modifier: widens the reference range, e.g. 1.05 for +5%
    :return: ``query_dict`` itself, after removal
    """
    # Collect doomed keys first: we must not mutate the dict while
    # iterating over it.
    keys_to_pop = [
        key
        for key, score in query_dict.items()
        if key in reference_dict
        and score <= score_modifier * reference_dict[key]
    ]
    for key in keys_to_pop:
        query_dict.pop(key, None)
    return query_dict
# freq=load_reference("/home/jerome/Desktop/tableFrequenceWiki_2008_utf8.txt")
# print(freq["le"])
# print(freq["bioinformatique"])
# query_dict={"bioinformatique":120, "le":4.2}
# print(filter_by_frequency(query_dict, freq, 1.05))