Skip to content

Commit 09d220c

Browse files
committed
Handle empty documents in preprocess_data.
1 parent 1b8e289 commit 09d220c

File tree

1 file changed

+3
-1
lines changed

1 file changed

+3
-1
lines changed

tools/preprocess_data.py

+3 -1
Original file line number | Diff line number | Diff line change
@@ -85,7 +85,7 @@ def encode(self, json_line):
85 85                   sentence_ids = Encoder.tokenizer.tokenize(sentence)
86 86                   if len(sentence_ids) > 0:
87 87                       doc_ids.append(sentence_ids)
88    -             if self.args.append_eod:
   88 +             if len(doc_ids) > 0 and self.args.append_eod:
89 89                   doc_ids[-1].append(Encoder.tokenizer.eod)
90 90               ids[key] = doc_ids
91 91           return ids, len(json_line)
@@ -182,6 +182,8 @@ def main():
182 182       for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
183 183           total_bytes_processed += bytes_processed
184 184           for key, sentences in doc.items():
    185 +             if len(sentences) == 0:
    186 +                 continue
185 187               for sentence in sentences:
186 188                   builders[key].add_item(torch.IntTensor(sentence))
187 189               builders[key].end_document()

0 commit comments

Comments
 (0)