diff --git a/pinecone_text/sparse/bm25_tokenizer.py b/pinecone_text/sparse/bm25_tokenizer.py
index 9513176..bbdf23c 100644
--- a/pinecone_text/sparse/bm25_tokenizer.py
+++ b/pinecone_text/sparse/bm25_tokenizer.py
@@ -44,7 +44,7 @@ def nltk_setup() -> None:
         nltk.download("stopwords")
 
     def __call__(self, text: str) -> List[str]:
-        tokens = word_tokenize(text)
+        tokens = word_tokenize(text, self.language)
         if self.lower_case:
             tokens = [word.lower() for word in tokens]
         if self.remove_punctuation: