5
5
import spacy
6
6
from spacy .tokens import Doc
7
7
from spacy .vocab import Vocab
8
+ from spacy .tokenizer import Tokenizer
8
9
9
10
from .camel_tools .utils .charsets import UNICODE_LETTER_CHARSET
10
11
from .camel_tools .utils .dediac import dediac_ar
11
12
12
13
TATWEEL = u'\u0640 ' # 'ـ' Tatweel/Kashida character (esthetic character elongation for improved layout)
13
14
ALEF_SUPER = u'\u0670 ' # ' ' Arabic Letter superscript Alef
14
15
15
- class MsaTokenizer :
16
+ # class MsaTokenizer:
17
+ class MsaTokenizer (Tokenizer ):
16
18
17
- def __init__ (self ):
18
- print ( '--- init MsaTokenizer ---' )
19
+ def __init__ (self , vocab ):
20
+ super ( MsaTokenizer , self ). __init__ ( vocab )
19
21
self .count = 0
20
22
from .camel_tools .disambig .mle import MLEDisambiguator
21
23
from .camel_tools .tokenizers .morphological import MorphologicalTokenizer
@@ -104,7 +106,7 @@ def __call__(self, text):
104
106
print ([[token .idx , len (token .text ), token .text ] for token in morpho_doc ])
105
107
doc_text = doc .text
106
108
morpho_doc_text = morpho_doc .text
107
- print ('---' , self .count , len (text ), len (doc_text ), len (morpho_doc_text ))
109
+ # print('---', self.count, len(text), len(doc_text), len(morpho_doc_text))
108
110
if morpho_doc_text != text :
109
111
print (text )
110
112
print (doc_text )
@@ -115,12 +117,10 @@ def __call__(self, text):
115
117
def make_msa_tokenizer():
    """Return a factory that builds an ``MsaTokenizer`` for a pipeline.

    The returned callable takes the pipeline object (``nlp``) and constructs
    the tokenizer with ``nlp.vocab``, which is the argument required by
    ``MsaTokenizer.__init__(self, vocab)``.
    """

    def create_msa_tokenizer(nlp):
        """Build an ``MsaTokenizer`` bound to the vocabulary of ``nlp``."""
        # MsaTokenizer.__init__ requires a vocab; calling it with no
        # arguments would raise TypeError.
        return MsaTokenizer(nlp.vocab)

    return create_msa_tokenizer
121
123
122
- print (spacy .registry .tokenizers .get ("msa_tokenizer" ))
123
-
124
124
def msa_filter_pattern (in_file , out_file , pattern ):
125
125
assert in_file and pattern
126
126
char = None
0 commit comments