tokenizer.py
#!/usr/bin/python3
import tensorflow as tf
import pandas as pd


class Tokenizer(object):
    def __init__(self, vocab_file='vocab.pkl'):
        # The vocabulary is a pickled pandas DataFrame with a 'character'
        # column and a matching integer 'token' column.
        self.vocab = pd.read_pickle(vocab_file)

    def tokenize(self, text):
        # Map each character of the input string to its integer token id.
        tokens = list()
        for ch in text:
            token = self.vocab[self.vocab['character'] == ch]['token'].iloc[0]
            tokens.append(token)
        return tf.constant(tokens, dtype=tf.int64)

    def translate(self, tokens):
        # Map token ids back to characters; ids outside the vocabulary
        # range are rendered as '-'.
        chars = list()
        for token in tokens:
            if 0 <= token < len(self.vocab):
                ch = self.vocab[self.vocab['token'] == int(token)]['character'].iloc[0]
            else:
                ch = '-'
            chars.append(ch)
        return ''.join(chars)

    def size(self):
        # Number of entries in the vocabulary.
        return len(self.vocab)
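
# A minimal sketch of how a compatible vocab.pkl could be produced, assuming
# the pickle holds a pandas DataFrame with 'character' and 'token' columns,
# as the lookups in Tokenizer imply. build_vocab is a hypothetical helper
# (not part of the original file) and the corpus argument is illustrative.
def build_vocab(corpus, vocab_file='vocab.pkl'):
    # Assign consecutive integer tokens to the sorted unique characters.
    chars = sorted(set(corpus))
    vocab = pd.DataFrame({'character': chars, 'token': range(len(chars))})
    vocab.to_pickle(vocab_file)
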
if __name__ == "__main__":
    tokenizer = Tokenizer()
    print(tokenizer.translate([0, 1, 2, 3]))
    print(tokenizer.tokenize('你好世界'))
    print(tokenizer.size())