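Fixed emptydoc issue in preprocessing (the "emptydoc" placeholder is now assigned after token filtering and stripping, so documents that become empty during filtering are also caught) and updated the test with an empty document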

mayankanand007 · mayankanand007 · commit bf60566c15bc · 2021-12-21T18:39:30.000Z
diff --git a/cuBERT_topic_modelling/tests/test_data_preprocess.py b/cuBERT_topic_modelling/tests/test_data_preprocess.py
@@ -11,6 +11,7 @@ def input_data_docs_df():
         "This document is the second document.",
         "And this is the third one.",
         "Is this the first document?",
+        ""
     ]
 
     docs_df = pd.DataFrame(data_trivial, columns=["Document"])
diff --git a/cuBERT_topic_modelling/vectorizer/vectorizer.py b/cuBERT_topic_modelling/vectorizer/vectorizer.py
@@ -25,13 +25,14 @@ def preprocess_text_gpu(self, doc):
         doc = doc.str.filter_characters(
             {"a": "z", "0": "9", " ": " ", "A": "Z"}, True, ""
         )
-        doc[doc == ""] = "emptydoc"
-
+        
         # TODO: check if its required
         # sklearn by default removes tokens of
         # length 1, if its remove alphanumerics
         # if remove_single_token_len:
         doc = doc.str.filter_tokens(2)
+        doc = doc.str.strip()
+        doc[doc == ""] = "emptydoc"
 
         return doc
 

Original file line number	Diff line number	Diff line change
`@@ -11,6 +11,7 @@ def input_data_docs_df():`
`11`	`11`	`"This document is the second document.",`
`12`	`12`	`"And this is the third one.",`
`13`	`13`	`"Is this the first document?",`
	`14`	`+ ""`
`14`	`15`	`]`
`15`	`16`
`16`	`17`	`docs_df = pd.DataFrame(data_trivial, columns=["Document"])`