Commit 9b11eea

fix for tokenizers (#549)
Signed-off-by: Evelina Bakhturina <[email protected]>
1 parent: b0d17ee

2 files changed: +2 -2 lines changed

nemo/collections/nlp/data/datasets/punctuation_capitalization_dataset.py (+1 -1)
@@ -135,7 +135,7 @@ def get_features(
             capit_all_labels[i] = [pad_id] + capit_all_labels[i][-max_seq_length + 1 :]
             too_long_count += 1
 
-        all_input_ids.append([tokenizer.tokens_to_ids(t) for t in subtokens])
+        all_input_ids.append(tokenizer.tokens_to_ids(subtokens))
 
         if len(subtokens) < max_seq_length:
             extra = max_seq_length - len(subtokens)

nemo/collections/nlp/data/datasets/token_classification_dataset.py (+1 -1)
@@ -128,7 +128,7 @@ def get_features(
             all_labels[i] = [pad_id] + all_labels[i][-max_seq_length + 1 :]
             too_long_count += 1
 
-        all_input_ids.append([tokenizer.tokens_to_ids(t) for t in subtokens])
+        all_input_ids.append(tokenizer.tokens_to_ids(subtokens))
 
         if len(subtokens) < max_seq_length:
             extra = max_seq_length - len(subtokens)
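
Both files make the same one-line change: the per-token list comprehension around tokenizer.tokens_to_ids is replaced by a single call on the whole subtoken list. The sketch below illustrates the call pattern only; ToyTokenizer, its vocabulary, and the assumption that tokens_to_ids takes a list of tokens and returns a list of ids are illustrative and not taken from the NeMo source.

# Minimal sketch, not NeMo code: ToyTokenizer and its vocabulary are invented
# to show the call pattern, assuming tokens_to_ids expects a list of tokens
# and returns a list of ids.

class ToyTokenizer:
    def __init__(self):
        self.vocab = {"[CLS]": 0, "[SEP]": 1, "hello": 2, "world": 3}

    def tokens_to_ids(self, tokens):
        # Takes a list of token strings, returns one integer id per token.
        return [self.vocab[t] for t in tokens]


tokenizer = ToyTokenizer()
subtokens = ["[CLS]", "hello", "world", "[SEP]"]

# Old pattern (removed by this commit): one call per token. With a list-based
# tokens_to_ids this hands the method a bare string rather than a list of
# tokens, so it is not treated as a single token.
# old_ids = [tokenizer.tokens_to_ids(t) for t in subtokens]

# New pattern: a single call over the whole subtoken list yields the flat
# list of ids that all_input_ids expects.
input_ids = tokenizer.tokens_to_ids(subtokens)
print(input_ids)  # [0, 2, 3, 1]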
