# cython: embedsignature=True, binding=True
# distutils: language=c++
# define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION

cimport cython

import os
import re
import spacy
# from spacy.tokens import Doc
from spacy.vocab import Vocab
# from spacy.tokenizer import Tokenizer

from .camel_tools.utils.charsets import UNICODE_LETTER_CHARSET
from .camel_tools.utils.dediac import dediac_ar
from .camel_tools.disambig.mle import MLEDisambiguator
from .camel_tools.tokenizers.morphological import MorphologicalTokenizer

from cymem.cymem cimport Pool
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from libc.string cimport memcpy, memset
from libcpp.set cimport set as stdset
from preshed.maps cimport PreshMap

# import re
from spacy.lexeme cimport EMPTY_LEXEME
from spacy.strings cimport hash_string
from spacy.tokens.doc cimport Doc

from spacy import util
from spacy.attrs import intify_attrs
from spacy.errors import Errors
from spacy.scorer import Scorer
from spacy.symbols import NORM, ORTH
from spacy.tokens import Span
from spacy.training import validate_examples
from spacy.util import get_words_and_spaces

# define and replace the Arabic tokenizer

TATWEEL = u'\u0640'     # 'ـ' tatweel/kashida character (aesthetic character elongation for improved layout)
ALEF_SUPER = u'\u0670'  # 'ٰ' Arabic letter superscript alef

# class cdef MsaTokenizer(Tokenizer):
cdef class MsaTokenizer:
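    """MSA (Modern Standard Arabic) tokenizer producing ATB morphological segments.

    The text is first tokenized with spaCy's blank Arabic tokenizer; each token
    is then split into Arabic Treebank (ATB) segments with the CAMeL Tools
    morphological tokenizer, and the segments are aligned back to the original
    characters so that the resulting Doc reproduces the input text.
    """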

    def __init__(self, Vocab vocab):
        self.vocab = vocab
        self.count = 0
        self.nlp = spacy.blank("ar")
        mle_msa = MLEDisambiguator.pretrained('calima-msa-r13')
        self.atb_tokenizer = MorphologicalTokenizer(disambiguator=mle_msa, scheme='atbtok', split=True)
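        # Note: MLEDisambiguator.pretrained('calima-msa-r13') loads a pretrained
        # CAMeL Tools model, so the corresponding CAMeL Tools data package must
        # already be installed on this machine.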

    def __call__(self, text):
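        """Tokenize the text and return a Doc of ATB morphological segments.

        text: The unicode string to tokenize.
        RETURNS (Doc): A Doc whose words are the morphological segments and
            whose text matches the input text.
        """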
        self.count += 1
        doc = self.nlp(text)
        raw_tokens = [t.text for t in doc if t.text]
        n_raw_tokens = len(raw_tokens)
        raw_tokens_text = ''.join(raw_tokens)
        words = []
        spaces = []
        morphos = self.atb_tokenizer.tokenize(raw_tokens)
        n_morphos = len(morphos)
        tat_weel_spans = []
        i_raw = 0  # index of token in simple tokenization
        raw_token = doc[i_raw]
        raw_text = raw_token.text
        raw_idx = raw_token.idx
        raw_len = len(raw_text)
        raw_space = raw_token.whitespace_

        morphos_chars = 0
        i_morpho = 0  # morpho index
        l_morphos = 0
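        # Align each morphological segment with the raw character stream:
        # leading/trailing '+' markers added by the ATB scheme are dropped when
        # they are not literally present in the raw token, each segment consumes
        # its own length in characters, and the raw token's trailing whitespace
        # is attached only to the segment that completes that token.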
        for morpho in morphos:
            assert len(morpho) > 0
            if morpho and len(morpho) > 1:
                if morpho[0] == '+' and not raw_text[l_morphos] == '+':
                    morpho = morpho[1:]
                elif morpho[-1] == '+' and not raw_text[l_morphos + len(morpho) - 1] == '+':
                    morpho = morpho[:-1]
            l_morpho = len(morpho)
            try:
                assert l_morpho <= raw_len
            except AssertionError:
                print('!', morphos_chars, l_morphos, raw_len, i_raw, raw_text, morpho)
            morpho_source = raw_tokens_text[morphos_chars:morphos_chars + l_morpho]
            assert l_morpho > 0
            words.append(morpho_source)
            morphos_chars += l_morpho
            l_morphos += l_morpho
            i_morpho += 1
            if l_morphos == raw_len:
                spaces.append(raw_space)
            else:
                spaces.append('')

            if l_morphos > raw_len:
                print('!!!', morphos_chars, l_morphos, raw_len, i_raw, raw_text, morpho)
                break

            if l_morphos == raw_len:
                l_morphos = 0
                i_raw += 1
                if i_raw < n_raw_tokens:
                    raw_token = doc[i_raw]
                    raw_text = raw_token.text
                    raw_idx = raw_token.idx
                    raw_len = len(raw_text)
                    raw_space = raw_token.whitespace_
        if False:  # self.count == 6221:
            tokens_chars = 0
            token_list = []
            for token in doc:
                token_list.append([tokens_chars, len(token.text), token.text])
                tokens_chars += len(token.text)
            print(token_list)
            morphos_chars = 0
            morpho_list = []
            for morpho in morphos:
                morpho_list.append([morphos_chars, len(morpho), morpho])
                morphos_chars += len(morpho)
            print(morpho_list)
            words_chars = 0
            word_list = []
            for word in words:
                word_list.append([words_chars, len(word), word])
                words_chars += len(word)
            print(word_list)
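        # Note: the new Doc is built on a fresh Vocab() rather than self.vocab.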
        morpho_doc = Doc(Vocab(), words=words, spaces=spaces)
        if False:  # self.count == 6221:
            print([[token.idx, len(token.text), token.text] for token in morpho_doc])
        doc_text = doc.text
        morpho_doc_text = morpho_doc.text
        # print('---', self.count, len(text), len(doc_text), len(morpho_doc_text))
        if morpho_doc_text != text:
            print(text)
            print(doc_text)
            print(morpho_doc_text)
        return morpho_doc

    def pipe(self, texts, batch_size=1000):
        """Tokenize a stream of texts.

        texts: A sequence of unicode texts.
        batch_size (int): Number of texts to accumulate in an internal buffer.
            Defaults to 1000.
        YIELDS (Doc): A sequence of Doc objects, in order.

        DOCS: https://spacy.io/api/tokenizer#pipe
        """
        for text in texts:
            yield self(text)

    def score(self, examples, **kwargs):
        validate_examples(examples, "Tokenizer.score")
        return Scorer.score_tokenization(examples)

    def to_disk(self, path, **kwargs):
        """Save the current state to a directory.

        path (str / Path): A path to a directory, which will be created if
            it doesn't exist.
        exclude (list): String names of serialization fields to exclude.

        DOCS: https://spacy.io/api/tokenizer#to_disk
        """
        path = util.ensure_path(path)
        with path.open("wb") as file_:
            file_.write(self.to_bytes(**kwargs))

    def from_disk(self, path, *, exclude=tuple()):
        """Loads state from a directory. Modifies the object in place and
        returns it.

        path (str / Path): A path to a directory.
        exclude (list): String names of serialization fields to exclude.
        RETURNS (Tokenizer): The modified `Tokenizer` object.

        DOCS: https://spacy.io/api/tokenizer#from_disk
        """
        path = util.ensure_path(path)
        with path.open("rb") as file_:
            bytes_data = file_.read()
        self.from_bytes(bytes_data, exclude=exclude)
        return self

    def to_bytes(self, *, exclude=tuple()):
        """Serialize the current state to a binary string.

        exclude (list): String names of serialization fields to exclude.
        RETURNS (bytes): The serialized form of the `Tokenizer` object.

        DOCS: https://spacy.io/api/tokenizer#to_bytes
        """
        """
        serializers = {
            "vocab": lambda: self.vocab.to_bytes(exclude=exclude),
            "prefix_search": lambda: _get_regex_pattern(self.prefix_search),
            "suffix_search": lambda: _get_regex_pattern(self.suffix_search),
            "infix_finditer": lambda: _get_regex_pattern(self.infix_finditer),
            "token_match": lambda: _get_regex_pattern(self.token_match),
            "url_match": lambda: _get_regex_pattern(self.url_match),
            "exceptions": lambda: dict(sorted(self._rules.items())),
            "faster_heuristics": lambda: self.faster_heuristics,
        }
        """
        serializers = {
            "vocab": lambda: self.vocab.to_bytes(exclude=exclude),
        }
        return util.to_bytes(serializers, exclude)

    def from_bytes(self, bytes_data, *, exclude=tuple()):
        """Load state from a binary string.

        bytes_data (bytes): The data to load from.
        exclude (list): String names of serialization fields to exclude.
        RETURNS (Tokenizer): The `Tokenizer` object.

        DOCS: https://spacy.io/api/tokenizer#from_bytes
        """
        data = {}
        """
        deserializers = {
            "vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude),
            "prefix_search": lambda b: data.setdefault("prefix_search", b),
            "suffix_search": lambda b: data.setdefault("suffix_search", b),
            "infix_finditer": lambda b: data.setdefault("infix_finditer", b),
            "token_match": lambda b: data.setdefault("token_match", b),
            "url_match": lambda b: data.setdefault("url_match", b),
            "exceptions": lambda b: data.setdefault("rules", b),
            "faster_heuristics": lambda b: data.setdefault("faster_heuristics", b),
        }
        """
        deserializers = {
            "vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude),
        }
        # reset all properties and flush all caches (through rules),
        # reset rules first so that _reload_special_cases is trivial/fast as
        # the other properties are reset
        self.rules = {}
        self.prefix_search = None
        self.suffix_search = None
        self.infix_finditer = None
        self.token_match = None
        self.url_match = None
        util.from_bytes(bytes_data, deserializers, exclude)
        """
        if "prefix_search" in data and isinstance(data["prefix_search"], str):
            self.prefix_search = re.compile(data["prefix_search"]).search
        if "suffix_search" in data and isinstance(data["suffix_search"], str):
            self.suffix_search = re.compile(data["suffix_search"]).search
        if "infix_finditer" in data and isinstance(data["infix_finditer"], str):
            self.infix_finditer = re.compile(data["infix_finditer"]).finditer
        if "token_match" in data and isinstance(data["token_match"], str):
            self.token_match = re.compile(data["token_match"]).match
        if "url_match" in data and isinstance(data["url_match"], str):
            self.url_match = re.compile(data["url_match"]).match
        if "faster_heuristics" in data:
            self.faster_heuristics = data["faster_heuristics"]
        # always load rules last so that all other settings are set before the
        # internal tokenization for the phrase matcher
        if "rules" in data and isinstance(data["rules"], dict):
            self.rules = data["rules"]
        """
        return self

@spacy.registry.tokenizers("msa_tokenizer")
def make_msa_tokenizer():

    def create_msa_tokenizer(nlp):
        return MsaTokenizer(nlp.vocab)

    return create_msa_tokenizer
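
# Usage sketch (assumes this module is imported so the registration above has
# run before the config is resolved); the custom tokenizer can then be selected
# in a spaCy training config with:
#
#   [nlp.tokenizer]
#   @tokenizers = "msa_tokenizer"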
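# msa_filter_pattern streams in_file one character at a time, counting the
# occurrences of `pattern`; when out_file is given it copies every character
# through except the occurrences being removed (a tatweel following a letter,
# or any superscript alef).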
def msa_filter_pattern(in_file, out_file, pattern):
    assert in_file and pattern
    char = None
    n_matches = n_removed = 0
    while 1:
        prev = char
        char = in_file.read(1)
        if not char:
            break
        if char == pattern:
            n_matches += 1
            if prev in UNICODE_LETTER_CHARSET or pattern == ALEF_SUPER:
                n_removed += 1
                continue
        if out_file:
            out_file.write(char)
    return n_matches, n_removed
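
# Usage sketch (hypothetical file names): count and strip tatweel characters.
#
#   with open('in.conllu', encoding='utf-8') as fin, \
#           open('out.conllu', 'w', encoding='utf-8') as fout:
#       n_found, n_removed = msa_filter_pattern(fin, fout, TATWEEL)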

def msa_filter(folder='/_Tecnica/AI/CL/spacy/training/ar', filename='ar_padt-ud-train.conllu', remove=False):
    in_path = os.path.join(folder, filename)
    in_file = open(in_path, 'r', encoding='utf-8')
    i = 1
    for pattern, pat_name in ((TATWEEL, 'TATWEEL'), (ALEF_SUPER, 'ALEF_SUPER')):
        if remove:
            out_path = os.path.join(folder, filename + '.' + str(i))
            out_file = open(out_path, 'w', encoding='utf-8')
        n_matches, n_removed = msa_filter_pattern(in_file, remove and out_file, pattern)
        print(pat_name, '- found:', n_matches, '- removed:', n_removed)
        if pat_name != 'ALEF_SUPER':  # check it wasn't the last iteration
            if remove:
                out_file.close()
                in_path = out_path
                in_file = open(in_path, 'r', encoding='utf-8')
                i += 1
            else:
                in_file.seek(0)
    in_file.close()
    if remove:
        out_file.close()
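
# Usage sketch (assumes the default folder/filename point at an existing PADT
# training file): report tatweel and superscript-alef counts without rewriting,
#   msa_filter(remove=False)
# or write filtered copies with '.1' and '.2' suffixes:
#   msa_filter(remove=True)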