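"""Apply a learned subword model (SentencePiece, subword-nmt, YouTokenToMe, or a
HuggingFace tokenizer) to one or more corpus files in parallel and write the
tokenized (or id-encoded) sentences to a single output file."""
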
import os
import argparse
import textwrap
from multiprocessing import Pool
import numpy as np
import time
from gluonnlp.data import tokenizers
from gluonnlp.data.tokenizers.huggingface import is_new_version_model_file


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent('''
            Encode a corpus with different subword implementations.

            We support the following models:

            "nlp_process apply_subword --model spm": Encode with a SentencePiece model;
            "nlp_process apply_subword --model subword_nmt": Encode with the subword-nmt package;
            "nlp_process apply_subword --model yttm": Encode with YouTokenToMe;
            "nlp_process apply_subword --model hf_bytebpe": Encode with the byte-level BPE tokenizer implemented by HuggingFace;
            "nlp_process apply_subword --model hf_wordpiece": Encode with the WordPiece tokenizer implemented by HuggingFace;
            "nlp_process apply_subword --model hf_bpe": Encode with the BPE tokenizer implemented by HuggingFace.
        ''')
    )
    parser.add_argument('--corpus', type=str, nargs='+', required=True,
                        help='Path of the corpus. '
                             'You may input multiple corpus files separated by spaces.')
    parser.add_argument('--save-path', type=str, required=True,
                        help='Path of the output file')
    parser.add_argument('--model-path', type=str, default=None,
                        help='Path of the model file')
    parser.add_argument('--vocab-path', type=str, default=None,
                        help='Path of the vocabulary file')
    parser.add_argument('--model', type=str, required=True,
                        choices=['spm', 'subword_nmt', 'yttm',
                                 'hf_bytebpe', 'hf_wordpiece', 'hf_bpe'],
                        help='Subword model type')
    parser.add_argument('--num-process', type=int, default=os.cpu_count(),
                        help='Number of worker processes')
    parser.add_argument('--lowercase', action='store_true', default=False,
                        help='Lowercase the input; '
                             'only applicable to hf_bpe, hf_bytebpe and hf_wordpiece')
    parser.add_argument('--strip-accents', action='store_true', default=False,
                        help='Strip accents; only applicable to hf_wordpiece')
    parser.add_argument('--output-type', type=str, choices=['subword', 'id'], default='subword',
                        help='Whether to output subwords or ids')
    parser.add_argument('--bpe-dropout', type=float, default=None,
                        help='BPE dropout; applicable to subword_nmt, yttm, hf_bpe and hf_bytebpe')
    return parser
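
# Example invocation (the corpus and model paths below are hypothetical
# placeholders, shown only to illustrate how the arguments fit together):
#
#   nlp_process apply_subword --model spm \
#       --model-path spm.model --vocab-path spm.vocab \
#       --corpus train.en --save-path train.tok.en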


class ParallelCorpusApplyer:
    """Apply a subword tokenizer to a corpus chunk by chunk, so that the work
    can be distributed across multiple processes."""

    def __init__(self, corpus, tokenizer_model, output_type):
        self.chunk_size = 1024 * 1024  # target chunk size in bytes
        self.corpus = corpus
        self.tokenizer_model = tokenizer_model
        self.output_type = output_type

    def chunk_iter(self, step=10):
        """Yield (corpus_path, byte_offset, num_bytes) tuples that cover each corpus
        file in chunks of roughly ``self.chunk_size`` bytes, aligned to line boundaries."""
        for corpus_path in self.corpus:
            # Record the byte offset of every line so that chunks never split a line.
            line_pos = [0]
            with open(corpus_path, 'rb') as fcb:
                pos = 0
                for line in fcb:
                    pos += len(line)
                    line_pos.append(pos)
            line_pos = np.array(line_pos, dtype=np.int64)
            line_size = line_pos[1:] - line_pos[:-1]
            num_lines = line_pos.shape[0] - 1
            budget = self.chunk_size
            chunk_start = 0
            cur_chunk_size = 0
            # Accumulate lines in batches of ``step`` until the byte budget is spent,
            # then emit the chunk and start a new one.
            for i in range(0, num_lines, step):
                line_batch_num = min(num_lines - i, step)
                batch_line_size = line_size[i:(i + line_batch_num)].sum()
                budget -= batch_line_size
                cur_chunk_size += batch_line_size
                if budget <= 0 or i + step >= num_lines:
                    yield corpus_path, chunk_start, cur_chunk_size
                    chunk_start += cur_chunk_size
                    budget = self.chunk_size
                    cur_chunk_size = 0

    def process_chunk(self, args):
        """Tokenize one chunk. ``args`` is a (corpus_path, byte_offset, num_bytes)
        tuple as produced by ``chunk_iter``."""
        corpus_path, chunk_start, cur_chunk_size = args
        with open(corpus_path, 'rb') as fcb:
            fcb.seek(chunk_start)
            lines_byte = fcb.read(cur_chunk_size)
        sentences = [line_byte.decode('utf-8').strip() for line_byte in lines_byte.splitlines()]
        all_tokens = self.tokenizer_model.encode(sentences, self.output_type)
        tokenized_sentences = []
        for ele_tokens in all_tokens:
            if self.output_type == int:
                ele_tokens = [str(token) for token in ele_tokens]
            tokenized_sentences.append(' '.join(ele_tokens))
        sentence_num = len(tokenized_sentences)
        # Count tokens (not characters) so that the average-tokens statistic is meaningful.
        token_num = sum(len(ele_tokens) for ele_tokens in all_tokens)
        unk = self.tokenizer_model.vocab.unk_token
        unk_num = sum(sentence.count(unk) for sentence in tokenized_sentences)
        return tokenized_sentences, sentence_num, token_num, unk_num
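
# A minimal single-process sketch of how the applyer above can be driven; the
# entry point below parallelizes the same calls with multiprocessing.Pool.
# The corpus path and tokenizer settings are hypothetical placeholders.
#
#   tokenizer = tokenizers.create('spm', model_path='spm.model', vocab='spm.vocab')
#   applyer = ParallelCorpusApplyer(['corpus.txt'], tokenizer, str)
#   for chunk in applyer.chunk_iter():
#       sentences, n_sent, n_tok, n_unk = applyer.process_chunk(chunk)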


def main(args):
    start = time.time()
    if args.model == 'spm':
        assert args.model_path is not None, \
            'Must specify --model-path when using the "spm" model.'
        tokenizer_model = tokenizers.create('spm',
                                            model_path=args.model_path,
                                            vocab=args.vocab_path)
    elif args.model == 'subword_nmt':
        assert args.model_path is not None, \
            'Must specify --model-path when using the "subword_nmt" model.'
        assert args.vocab_path is not None, \
            'Must specify --vocab-path when using the "subword_nmt" model.'
        tokenizer_model = tokenizers.create('subword_nmt',
                                            model_path=args.model_path,
                                            vocab=args.vocab_path,
                                            bpe_dropout=args.bpe_dropout)
    elif args.model == 'yttm':
        assert args.model_path is not None, \
            'Must specify --model-path when using the "yttm" model.'
        args.bpe_dropout = 0.0 if not args.bpe_dropout else args.bpe_dropout
        tokenizer_model = tokenizers.create('yttm',
                                            model_path=args.model_path,
                                            vocab=args.vocab_path,
                                            bpe_dropout=args.bpe_dropout,
                                            n_threads=1)
    elif args.model in ('hf_bytebpe', 'hf_bpe', 'hf_wordpiece'):
        if is_new_version_model_file(args.model_path):
            assert args.model_path is not None, \
                'Must specify --model-path when using the "{}" model.'.format(args.model)
            assert args.vocab_path is not None, \
                'Must specify --vocab-path when using the "{}" model.'.format(args.model)
            tokenizer_model = tokenizers.create('hf_tokenizer',
                                                model_path=args.model_path,
                                                vocab=args.vocab_path)
        else:
            if args.model == 'hf_bytebpe':
                tokenizer_model = tokenizers.create('hf_bytebpe',
                                                    merges_file=args.model_path,
                                                    vocab_file=args.vocab_path,
                                                    dropout=args.bpe_dropout,
                                                    lowercase=args.lowercase)
            elif args.model == 'hf_wordpiece':
                tokenizer_model = tokenizers.create('hf_wordpiece',
                                                    vocab_file=args.vocab_path,
                                                    lowercase=args.lowercase,
                                                    strip_accents=args.strip_accents)
            elif args.model == 'hf_bpe':
                tokenizer_model = tokenizers.create('hf_bpe',
                                                    merges_file=args.model_path,
                                                    vocab_file=args.vocab_path,
                                                    dropout=args.bpe_dropout,
                                                    lowercase=args.lowercase)
    else:
        raise NotImplementedError
    print('Applying "{}" to "{}" and saving to "{}"'
          .format(tokenizer_model.__class__.__name__,
                  ', '.join(args.corpus),
                  args.save_path))
    output_type = {'subword': str, 'id': int}[args.output_type]
    applyer = ParallelCorpusApplyer(args.corpus, tokenizer_model, output_type)
    with open(args.save_path, 'w', encoding='utf-8', newline='\n') as fo:
        with Pool(args.num_process) as pool:
            sentence_count = token_count = unk_count = 0
            # Tokenize chunks in parallel; imap preserves the original chunk order.
            for i, (tokenized_sentences, sentence_num, token_num, unk_num) in \
                    enumerate(pool.imap(applyer.process_chunk, applyer.chunk_iter())):
                fo.write('\n'.join(tokenized_sentences))
                fo.write('\n')
                sentence_count += sentence_num
                token_count += token_num
                unk_count += unk_num
                if (i + 1) % 100 == 0:
                    print('Chunk {}, #Lines processed: {}'
                          .format(i + 1, sentence_count))
    end = time.time()
    print('Done, #Lines processed: {}, Avg tokens per sentence: {:.1f}, '
          'Unknown rate: {:.1f}%, Time spent: {:.1f}s'
          .format(sentence_count, token_count / sentence_count,
                  unk_count * 100 / token_count, end - start))


def cli_main():
    parser = get_parser()
    args = parser.parse_args()
    main(args)


if __name__ == '__main__':
    cli_main()