Skip to content

Commit

Permalink
fix: replace blingfire with nltk sent_tokenize
Browse files Browse the repository at this point in the history
  • Loading branch information
parambharat committed Sep 27, 2024
1 parent 2f3e7ba commit 1ddf621
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 3 deletions.
4 changes: 2 additions & 2 deletions rag-advanced/notebooks/scripts/chunking.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import Callable, List, Optional

import numpy as np
- from blingfire import text_to_sentences
+ from nltk import sent_tokenize
from sklearn.metrics.pairwise import cosine_distances
from tqdm.notebook import tqdm

Expand All @@ -26,7 +26,7 @@ def sentence_splitter(text: str) -> List[str]:
Returns:
List[str]: A list of sentences.
"""
- return text_to_sentences(text).split("\n")
+ return sent_tokenize(text)


def split_into_chunks(
Expand Down
1 change: 0 additions & 1 deletion rag-advanced/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
weave>=0.51.2
cohere>=5.9.4
beautifulsoup4>=4.12.3
- blingfire>=0.1.8
levenshtein>=0.25.1
markdown-it-py>=3.0.0
nltk>=3.8.1
Expand Down

0 comments on commit 1ddf621

Please sign in to comment.