From 7b072bf03c1cb34b96d916f9a44252783728d7d3 Mon Sep 17 00:00:00 2001 From: elampe-freeday Date: Mon, 23 Sep 2024 14:43:56 +0200 Subject: [PATCH] Add the language to the word_tokenize function --- pinecone_text/sparse/bm25_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pinecone_text/sparse/bm25_tokenizer.py b/pinecone_text/sparse/bm25_tokenizer.py index 9513176..bbdf23c 100644 --- a/pinecone_text/sparse/bm25_tokenizer.py +++ b/pinecone_text/sparse/bm25_tokenizer.py @@ -44,7 +44,7 @@ def nltk_setup() -> None: nltk.download("stopwords") def __call__(self, text: str) -> List[str]: - tokens = word_tokenize(text) + tokens = word_tokenize(text, self.language) if self.lower_case: tokens = [word.lower() for word in tokens] if self.remove_punctuation: