forked from lawlesst/fast-reconcile
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext.py
118 lines (108 loc) · 3.23 KB
/
text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
Taken from the Helmut project.
https://github.com/okfn/helmut/blob/master/helmut/text.py
"""
from unicodedata import normalize as ucnorm, category
def normalize(text):
    """Simplify a piece of text to generate a more canonical
    representation.

    This involves lowercasing, stripping leading/trailing spaces,
    removing symbols and diacritical marks (umlauts), converting
    hyphens and all newlines/control characters to single spaces,
    and collapsing runs of whitespace.

    :param text: any value; non-strings are converted with ``str()``.
    :return: the normalized text, recomposed via NFKC.
    """
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    # NFKD decomposition separates base characters from combining marks
    # so the marks can be dropped individually below.
    decomposed = ucnorm('NFKD', text)
    filtered = []
    for char in decomposed:
        cat = category(char)
        if cat.startswith('C'):
            # control characters (incl. newlines/tabs) become spaces
            filtered.append(' ')
        elif cat.startswith('M'):
            # combining marks, such as umlauts: drop entirely
            continue
        elif cat.startswith('Z'):
            # separators: newlines, non-breaking spaces etc.
            filtered.append(' ')
        elif cat.startswith('S'):
            # symbols, such as currency signs: drop entirely
            continue
        else:
            filtered.append(char)
    text = ''.join(filtered)
    # Replace hyphens BEFORE collapsing whitespace, so "a - b" ends up
    # as "a b" rather than containing consecutive spaces.
    text = text.replace('-', ' ')
    # Collapse every run of whitespace to a single space and strip the
    # ends.  (The scraped original looped on replace(' ', ' '), which
    # never terminates for any input containing a space.)
    text = ' '.join(text.split())
    return ucnorm('NFKC', text)
def url_slug(text):
    """Turn *text* into a URL-friendly slug.

    The text is first normalized, then spaces become hyphens and
    periods become underscores.
    """
    slug = normalize(text)
    slug = slug.replace(' ', '-')
    slug = slug.replace('.', '_')
    return slug
def tokenize(text, splits='COPZ'):
    """Yield tokens from *text*, splitting on any character whose major
    Unicode category letter is in *splits* (default: Control, Other
    punctuation, Punctuation, and separator/Zpace categories).

    Non-string input is converted with ``str()``.  Empty tokens are
    never yielded.
    """
    buf = []
    for ch in str(text):
        is_separator = category(ch)[0] in splits
        if not is_separator:
            buf.append(ch)
        elif buf:
            yield ''.join(buf)
            buf = []
    if buf:
        yield ''.join(buf)
"""
Taken from the Helmut project.
https://github.com/okfn/helmut/blob/master/helmut/text.py
"""
from unicodedata import normalize as ucnorm, category
def normalize(text):
    """Simplify a piece of text to generate a more canonical
    representation.

    This involves lowercasing, stripping leading/trailing spaces,
    removing symbols and diacritical marks (umlauts), converting
    hyphens and all newlines/control characters to single spaces,
    and collapsing runs of whitespace.

    :param text: any value; non-strings are converted with ``str()``.
    :return: the normalized text, recomposed via NFKC.
    """
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    # NFKD decomposition separates base characters from combining marks
    # so the marks can be dropped individually below.
    decomposed = ucnorm('NFKD', text)
    filtered = []
    for char in decomposed:
        cat = category(char)
        if cat.startswith('C'):
            # control characters (incl. newlines/tabs) become spaces
            filtered.append(' ')
        elif cat.startswith('M'):
            # combining marks, such as umlauts: drop entirely
            continue
        elif cat.startswith('Z'):
            # separators: newlines, non-breaking spaces etc.
            filtered.append(' ')
        elif cat.startswith('S'):
            # symbols, such as currency signs: drop entirely
            continue
        else:
            filtered.append(char)
    text = ''.join(filtered)
    # Replace hyphens BEFORE collapsing whitespace, so "a - b" ends up
    # as "a b" rather than containing consecutive spaces.
    text = text.replace('-', ' ')
    # Collapse every run of whitespace to a single space and strip the
    # ends.  (The scraped original looped on replace(' ', ' '), which
    # never terminates for any input containing a space.)
    text = ' '.join(text.split())
    return ucnorm('NFKC', text)
def url_slug(text):
    """Build a URL slug: normalize, then map ' ' -> '-' and '.' -> '_'."""
    return normalize(text).replace(' ', '-').replace('.', '_')
def tokenize(text, splits='COPZ'):
    """Generate tokens from *text*.

    Characters whose top-level Unicode category (first letter of the
    category code) appears in *splits* act as delimiters; maximal runs
    of the remaining characters are yielded as tokens.  Non-string
    input is converted with ``str()``; empty tokens are suppressed.
    """
    current = []
    for character in str(text):
        if category(character)[0] not in splits:
            current.append(character)
            continue
        if current:
            yield ''.join(current)
            current = []
    if current:
        yield ''.join(current)