Commit c2a27b2

First port to Cython of custom Arabic tokenizer.
1 parent 9e1bf93 commit c2a27b2

File tree

4 files changed: +350, -0 lines changed


.gitignore

+1

@@ -1,6 +1,7 @@
 *~
 .*.sw[opn]
 *.py[cod]
+*.cpp
 *.egg-info
 .DS_Store
 .cache

nlp/spacy_custom/ar/msatokenizer.pxd

+16

@@ -0,0 +1,16 @@
from cymem.cymem cimport Pool
from libcpp.vector cimport vector
from preshed.maps cimport PreshMap

from spacy.matcher.phrasematcher cimport PhraseMatcher
from spacy.strings cimport StringStore
from spacy.structs cimport LexemeC, SpanC, TokenC
from spacy.tokens.doc cimport Doc
from spacy.typedefs cimport hash_t
from spacy.vocab cimport LexemesOrTokens, Vocab, _Cached


cdef class MsaTokenizer:
    cdef readonly Vocab vocab
    cdef int count
    cdef object nlp
    cdef object atb_tokenizer
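
This .pxd header exposes the cdef class to other Cython modules at compile time. As a minimal sketch (not part of the commit, assuming the package path above is on the Cython include path), another .pyx module could declare a typed reference to it:

# hypothetical consumer module, e.g. nlp/spacy_custom/ar/pipeline_helpers.pyx
from nlp.spacy_custom.ar.msatokenizer cimport MsaTokenizer

cdef MsaTokenizer typed_tok  # typed declaration; vocab and count become C-level attribute accesses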

nlp/spacy_custom/ar/msatokenizer.pyx

+316

@@ -0,0 +1,316 @@
# cython: embedsignature=True, binding=True
# distutils: language=c++
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION

cimport cython

import os
import re
import spacy
# from spacy.tokens import Doc
from spacy.vocab import Vocab
# from spacy.tokenizer import Tokenizer

from .camel_tools.utils.charsets import UNICODE_LETTER_CHARSET
from .camel_tools.utils.dediac import dediac_ar
from .camel_tools.disambig.mle import MLEDisambiguator
from .camel_tools.tokenizers.morphological import MorphologicalTokenizer

from cymem.cymem cimport Pool
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from libc.string cimport memcpy, memset
from libcpp.set cimport set as stdset
from preshed.maps cimport PreshMap

# import re
from spacy.lexeme cimport EMPTY_LEXEME
from spacy.strings cimport hash_string
from spacy.tokens.doc cimport Doc

from spacy import util
from spacy.attrs import intify_attrs
from spacy.errors import Errors
from spacy.scorer import Scorer
from spacy.symbols import NORM, ORTH
from spacy.tokens import Span
from spacy.training import validate_examples
from spacy.util import get_words_and_spaces

# define and replace the Arabic tokenizer

TATWEEL = u'\u0640' # 'ـ' Tatweel/Kashida character (esthetic character elongation for improved layout)
ALEF_SUPER = u'\u0670' # ' ' Arabic Letter superscript Alef

# class cdef MsaTokenizer(Tokenizer):
cdef class MsaTokenizer:

    def __init__(self, Vocab vocab):
        self.vocab = vocab
        self.count = 0
        self.nlp = spacy.blank("ar")
        mle_msa = MLEDisambiguator.pretrained('calima-msa-r13')
        self.atb_tokenizer = MorphologicalTokenizer(disambiguator=mle_msa, scheme='atbtok', split=True)

    def __call__(self, text):
        self.count += 1
        doc = self.nlp(text)
        raw_tokens = [t.text for t in doc if t.text]
        n_raw_tokens = len(raw_tokens)
        raw_tokens_text = ''.join(raw_tokens)
        words = []
        spaces = []
        morphos = self.atb_tokenizer.tokenize(raw_tokens)
        n_morphos = len(morphos)
        tat_weel_spans = []
        i_raw = 0 # index of token in simple tokenization
        raw_token = doc[i_raw]
        raw_text = raw_token.text
        raw_idx = raw_token.idx
        raw_len = len(raw_text)
        raw_space = raw_token.whitespace_

        morphos_chars = 0
        i_morpho = 0 # morpho index
        l_morphos = 0
        for morpho in morphos:
            assert len(morpho) > 0
            if morpho and len(morpho) > 1:
                if morpho[0] == '+' and not raw_text[l_morphos] == '+':
                    morpho = morpho[1:]
                elif morpho[-1] == '+' and not raw_text[l_morphos+len(morpho)-1] == '+':
                    morpho = morpho[:-1]
            l_morpho = len(morpho)
            try:
                assert l_morpho <= raw_len
            except:
                print('!', morphos_chars, l_morphos, raw_len, i_raw, raw_text, morpho)
            morpho_source = raw_tokens_text[morphos_chars : morphos_chars+l_morpho]
            assert l_morpho > 0
            words.append(morpho_source)
            morphos_chars += l_morpho
            l_morphos += l_morpho
            i_morpho += 1
            if l_morphos == raw_len:
                spaces.append(raw_space)
            else:
                spaces.append('')

            if l_morphos > raw_len:
                print('!!!', morphos_chars, l_morphos, raw_len, i_raw, raw_text, morpho)
                break

            if l_morphos == raw_len:
                l_morphos = 0
                i_raw += 1
                if i_raw < n_raw_tokens:
                    raw_token = doc[i_raw]
                    raw_text = raw_token.text
                    raw_idx = raw_token.idx
                    raw_len = len(raw_text)
                    raw_space = raw_token.whitespace_
        if False: # self.count == 6221:
            tokens_chars = 0
            token_list = []
            for token in doc:
                token_list.append([tokens_chars, len(token.text), token.text])
                tokens_chars += len(token.text)
            print(token_list)
            morphos_chars = 0
            morpho_list = []
            for morpho in morphos:
                morpho_list.append([morphos_chars, len(morpho), morpho])
                morphos_chars += len(morpho)
            print(morpho_list)
            words_chars = 0
            word_list = []
            for word in words:
                word_list.append([words_chars, len(word), word])
                words_chars += len(word)
            print(word_list)
        morpho_doc = Doc(Vocab(), words=words, spaces=spaces)
        if False: # self.count == 6221:
            print([[token.idx, len(token.text), token.text] for token in morpho_doc])
        doc_text = doc.text
        morpho_doc_text = morpho_doc.text
        # print('---', self.count, len(text), len(doc_text), len(morpho_doc_text))
        if morpho_doc_text != text:
            print(text)
            print(doc_text)
            print(morpho_doc_text)
        return morpho_doc

    def pipe(self, texts, batch_size=1000):
        """Tokenize a stream of texts.

        texts: A sequence of unicode texts.
        batch_size (int): Number of texts to accumulate in an internal buffer.
            Defaults to 1000.
        YIELDS (Doc): A sequence of Doc objects, in order.

        DOCS: https://spacy.io/api/tokenizer#pipe
        """
        for text in texts:
            yield self(text)

    def score(self, examples, **kwargs):
        validate_examples(examples, "Tokenizer.score")
        return Scorer.score_tokenization(examples)

    def to_disk(self, path, **kwargs):
        """Save the current state to a directory.

        path (str / Path): A path to a directory, which will be created if
            it doesn't exist.
        exclude (list): String names of serialization fields to exclude.

        DOCS: https://spacy.io/api/tokenizer#to_disk
        """
        path = util.ensure_path(path)
        with path.open("wb") as file_:
            file_.write(self.to_bytes(**kwargs))

    def from_disk(self, path, *, exclude=tuple()):
        """Loads state from a directory. Modifies the object in place and
        returns it.

        path (str / Path): A path to a directory.
        exclude (list): String names of serialization fields to exclude.
        RETURNS (Tokenizer): The modified `Tokenizer` object.

        DOCS: https://spacy.io/api/tokenizer#from_disk
        """
        path = util.ensure_path(path)
        with path.open("rb") as file_:
            bytes_data = file_.read()
        self.from_bytes(bytes_data, exclude=exclude)
        return self

    def to_bytes(self, *, exclude=tuple()):
        """Serialize the current state to a binary string.

        exclude (list): String names of serialization fields to exclude.
        RETURNS (bytes): The serialized form of the `Tokenizer` object.

        DOCS: https://spacy.io/api/tokenizer#to_bytes
        """
        """
        serializers = {
            "vocab": lambda: self.vocab.to_bytes(exclude=exclude),
            "prefix_search": lambda: _get_regex_pattern(self.prefix_search),
            "suffix_search": lambda: _get_regex_pattern(self.suffix_search),
            "infix_finditer": lambda: _get_regex_pattern(self.infix_finditer),
            "token_match": lambda: _get_regex_pattern(self.token_match),
            "url_match": lambda: _get_regex_pattern(self.url_match),
            "exceptions": lambda: dict(sorted(self._rules.items())),
            "faster_heuristics": lambda: self.faster_heuristics,
        }
        """
        serializers = {
            "vocab": lambda: self.vocab.to_bytes(exclude=exclude),
        }
        return util.to_bytes(serializers, exclude)

    def from_bytes(self, bytes_data, *, exclude=tuple()):
        """Load state from a binary string.

        bytes_data (bytes): The data to load from.
        exclude (list): String names of serialization fields to exclude.
        RETURNS (Tokenizer): The `Tokenizer` object.

        DOCS: https://spacy.io/api/tokenizer#from_bytes
        """
        data = {}
        """
        deserializers = {
            "vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude),
            "prefix_search": lambda b: data.setdefault("prefix_search", b),
            "suffix_search": lambda b: data.setdefault("suffix_search", b),
            "infix_finditer": lambda b: data.setdefault("infix_finditer", b),
            "token_match": lambda b: data.setdefault("token_match", b),
            "url_match": lambda b: data.setdefault("url_match", b),
            "exceptions": lambda b: data.setdefault("rules", b),
            "faster_heuristics": lambda b: data.setdefault("faster_heuristics", b),
        }
        """
        deserializers = {
            "vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude),
        }
        # reset all properties and flush all caches (through rules),
        # reset rules first so that _reload_special_cases is trivial/fast as
        # the other properties are reset
        self.rules = {}
        self.prefix_search = None
        self.suffix_search = None
        self.infix_finditer = None
        self.token_match = None
        self.url_match = None
        util.from_bytes(bytes_data, deserializers, exclude)
        """
        if "prefix_search" in data and isinstance(data["prefix_search"], str):
            self.prefix_search = re.compile(data["prefix_search"]).search
        if "suffix_search" in data and isinstance(data["suffix_search"], str):
            self.suffix_search = re.compile(data["suffix_search"]).search
        if "infix_finditer" in data and isinstance(data["infix_finditer"], str):
            self.infix_finditer = re.compile(data["infix_finditer"]).finditer
        if "token_match" in data and isinstance(data["token_match"], str):
            self.token_match = re.compile(data["token_match"]).match
        if "url_match" in data and isinstance(data["url_match"], str):
            self.url_match = re.compile(data["url_match"]).match
        if "faster_heuristics" in data:
            self.faster_heuristics = data["faster_heuristics"]
        # always load rules last so that all other settings are set before the
        # internal tokenization for the phrase matcher
        if "rules" in data and isinstance(data["rules"], dict):
            self.rules = data["rules"]
        """
        return self

@spacy.registry.tokenizers("msa_tokenizer")
def make_msa_tokenizer():

    def create_msa_tokenizer(nlp):
        return MsaTokenizer(nlp.vocab)

    return create_msa_tokenizer

def msa_filter_pattern(in_file, out_file, pattern):
    assert in_file and pattern
    char = None
    n_matches = n_removed = 0
    while 1:
        prev = char
        char = in_file.read(1)
        if not char:
            break
        if char==pattern:
            n_matches += 1
            if prev in UNICODE_LETTER_CHARSET or pattern==ALEF_SUPER:
                n_removed += 1
                continue
        if out_file:
            out_file.write(char)
    return n_matches, n_removed

def msa_filter(folder='/_Tecnica/AI/CL/spacy/training/ar', filename='ar_padt-ud-train.conllu', remove=False):
    in_path = os.path.join(folder, filename)
    in_file = open(in_path, 'r', encoding='utf-8')
    i = 1
    for pattern, pat_name in ((TATWEEL, 'TATWEEL'), (ALEF_SUPER, 'ALEF_SUPER')):
        if remove:
            out_path = os.path.join(folder, filename+'.'+str(i))
            out_file = open(out_path, 'w', encoding='utf-8')
        n_matches, n_removed = msa_filter_pattern(in_file, remove and out_file, pattern)
        print(pat_name, '- found:', n_matches, '- removed:', n_removed)
        if pat_name != 'ALEF_SUPER': # check it wasn't the last iteration
            if remove:
                out_file.close()
                in_path = out_path
                in_file = open(in_path, 'r', encoding='utf-8')
                i += 1
            else:
                in_file.seek(0)
    in_file.close()
    if remove:
        out_file.close()
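
For context, a minimal usage sketch (not part of this commit) of how the "msa_tokenizer" factory registered above could replace the tokenizer of a blank Arabic pipeline. It assumes the compiled extension is importable under the path shown, and that camel_tools plus the pretrained calima-msa-r13 data used by MLEDisambiguator are installed:

import spacy

# importing the compiled module runs the @spacy.registry.tokenizers decorator above
from nlp.spacy_custom.ar import msatokenizer  # assumed import path

nlp = spacy.blank("ar")
# look up the registered factory and swap in the morphological tokenizer
make_tokenizer = spacy.registry.tokenizers.get("msa_tokenizer")
nlp.tokenizer = make_tokenizer()(nlp)

doc = nlp("ذهب الولد إلى المدرسة")  # tokens follow the ATB-style morphological segmentation
print([t.text for t in doc])

The same factory name could also be referenced from a spaCy v3 training config through an [nlp.tokenizer] block with @tokenizers = "msa_tokenizer". In either case, __call__ above rebuilds the Doc from the morphological segments and prints a warning when the reassembled text no longer matches the input string.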

pyx_setup.py

+17

@@ -0,0 +1,17 @@
import os
from setuptools import setup
from Cython.Build import cythonize
import numpy

#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#define_macros=[['NPY_NO_DEPRECATED_API',None], ['NPY_1_7_API_VERSION',None]]

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
pyx_path = os.path.join(BASE_DIR, "nlp", "spacy_custom", "ar", "msatokenizer.pyx")
print(pyx_path)

setup(
    name='MsaTokenizer class',
    ext_modules=cythonize(pyx_path),
    include_dirs=[numpy.get_include()]
)
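
For reference, this script would typically be used with the standard setuptools build command (not spelled out in the commit). A small Python-level sketch of that invocation, assuming Cython and numpy are installed:

import subprocess

# equivalent to running "python pyx_setup.py build_ext --inplace" from the repository root;
# cythonize() emits msatokenizer.cpp next to the .pyx (hence the new *.cpp entry in .gitignore)
# and the command then compiles it into an importable extension module
subprocess.run(["python", "pyx_setup.py", "build_ext", "--inplace"], check=True)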

0 commit comments
