Skip to content

Commit bf60566

Browse files
fixed emptydoc issue in preprocessing and updated test
1 parent d36a7ef commit bf60566

File tree

2 files changed

+4
-2
lines changed

2 files changed

+4
-2
lines changed

cuBERT_topic_modelling/tests/test_data_preprocess.py

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ def input_data_docs_df():
1111
"This document is the second document.",
1212
"And this is the third one.",
1313
"Is this the first document?",
14+
""
1415
]
1516

1617
docs_df = pd.DataFrame(data_trivial, columns=["Document"])

cuBERT_topic_modelling/vectorizer/vectorizer.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,14 @@ def preprocess_text_gpu(self, doc):
2525
doc = doc.str.filter_characters(
2626
{"a": "z", "0": "9", " ": " ", "A": "Z"}, True, ""
2727
)
28-
doc[doc == ""] = "emptydoc"
29-
28+
3029
# TODO: check if it's required
3130
# sklearn by default removes tokens of
3231
# length 1, if its remove alphanumerics
3332
# if remove_single_token_len:
3433
doc = doc.str.filter_tokens(2)
34+
doc = doc.str.strip()
35+
doc[doc == ""] = "emptydoc"
3536

3637
return doc
3738

0 commit comments

Comments
 (0)