Skip to content

Commit 9e1bf93

Browse files
committed
MsaTokenizer subclasses spaCy's Tokenizer.
1 parent e88b4a4 commit 9e1bf93

File tree

1 file changed

+7
-7
lines changed

1 file changed

+7
-7
lines changed

nlp/spacy_custom/ar/tokenizer.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,19 @@
55
import spacy
66
from spacy.tokens import Doc
77
from spacy.vocab import Vocab
8+
from spacy.tokenizer import Tokenizer
89

910
from .camel_tools.utils.charsets import UNICODE_LETTER_CHARSET
1011
from .camel_tools.utils.dediac import dediac_ar
1112

1213
TATWEEL = u'\u0640' # 'ـ' Tatweel/Kashida character (esthetic character elongation for improved layout)
1314
ALEF_SUPER = u'\u0670' # ' ٰ' Arabic Letter Superscript Alef (U+0670, a combining mark, so it may render as blank on its own)
1415

15-
class MsaTokenizer:
16+
# class MsaTokenizer:
17+
class MsaTokenizer(Tokenizer):
1618

17-
def __init__(self):
18-
print('--- init MsaTokenizer ---')
19+
def __init__(self, vocab):
20+
super(MsaTokenizer, self).__init__(vocab)
1921
self.count = 0
2022
from .camel_tools.disambig.mle import MLEDisambiguator
2123
from .camel_tools.tokenizers.morphological import MorphologicalTokenizer
@@ -104,7 +106,7 @@ def __call__(self, text):
104106
print([[token.idx, len(token.text), token.text] for token in morpho_doc])
105107
doc_text = doc.text
106108
morpho_doc_text = morpho_doc.text
107-
print('---', self.count, len(text), len(doc_text), len(morpho_doc_text))
109+
# print('---', self.count, len(text), len(doc_text), len(morpho_doc_text))
108110
if morpho_doc_text != text:
109111
print(text)
110112
print(doc_text)
@@ -115,12 +117,10 @@ def __call__(self, text):
115117
def make_msa_tokenizer():
116118

117119
def create_msa_tokenizer(nlp):
118-
return MsaTokenizer()
120+
return MsaTokenizer(nlp.vocab)
119121

120122
return create_msa_tokenizer
121123

122-
print(spacy.registry.tokenizers.get("msa_tokenizer"))
123-
124124
def msa_filter_pattern(in_file, out_file, pattern):
125125
assert in_file and pattern
126126
char = None

0 commit comments

Comments
 (0)