import requests.exceptions
import tiktoken
from googlesearch import search
- from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
- from langchain_community.vectorstores.faiss import FAISS
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from colorama import Fore, Style
from core.tools.model_loader import load_model
from core.models.configurations import load_llm_config
from core.tools.utils import purify_name
- from core.tools.dbops import get_vec_db_by_name
from core.classes.query import WebQuery
- from core.tools.utils import is_text_junk, remove_characters

encoder = tiktoken.get_encoding("cl100k_base")
output_parser = StrOutputParser()
@@ -40,10 +36,6 @@ def docs_to_context(docs_and_scores: list[Document], token_limit: int) -> str:
    return context_text


- def rag_query_lookup(prompt_text: str) -> str:
-     pass
-
-
def query_for_urls(query: WebQuery, url_amount=embed_config.article_limit) -> list[str]:
    print(f"{Fore.CYAN}{Style.BRIGHT}Searching for:{Style.RESET_ALL} ", query.web_query)
@@ -67,59 +59,3 @@ def download_article(url):
    except requests.exceptions.ConnectionError:
        return None
    return document
-
-
- def populate_db_with_google_search(database: FAISS, query: WebQuery):
-     url_list = query_for_urls(query)
-
-     for url in url_list:
-         document = download_article(url)
-
-         text_splitter = RecursiveCharacterTextSplitter(
-             separators=embed_config.buffer_stops,
-             chunk_size=query.db_chunk_size,
-             chunk_overlap=embed_config.chunk_overlap,
-             keep_separator=False,
-             strip_whitespace=True,
-         )
-
-         chunks = text_splitter.split_documents(document)
-
-         for chunk in chunks:
-             if is_text_junk(chunk.page_content):
-                 chunks.remove(chunk)
-                 continue
-
-             chunk.page_content = remove_characters(chunk.page_content, ["\n", "`"])
-             chunk.page_content = (
-                 query.db_embedding_prefix
-                 + chunk.page_content
-                 + query.db_embedding_postfix
-             )
-
-         if len(chunks) != 0:
-             database.add_documents(documents=chunks, embeddings=embeddings)
-
-         db_name = embedding_model_safe_name + query.db_save_file_extension
-         database.save_local(folder_path="store/vector", index_name=db_name)
-
-         print(f"{Fore.CYAN}Document vectorization completed.{Fore.RESET}")
-
-
- def web_query_google_lookup(
-     query: WebQuery, token_limit: int = embed_config.model_token_limit
- ):
-     db_name = embedding_model_safe_name + query.db_save_file_extension
-     db = get_vec_db_by_name(db_name, embeddings)
-
-     populate_db_with_google_search(db, query)
-
-     # return the document with the highest prompt similarity score (for now only browsing the first search result)
-     embedding_vector = embeddings.embed_query(query.db_embed_query)
-     docs_and_scores = db.similarity_search_by_vector(
-         embedding_vector, k=round(token_limit / 64)
-     )
-
-     print(f"{Fore.CYAN}Database search completed.{Fore.RESET}")
-
-     return docs_to_context(docs_and_scores, llm_config.model_token_limit)
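
For reference, a minimal sketch (not part of this commit) of how the removed lookup path fit together. It reuses the helpers still present in this file (query_for_urls, download_article, docs_to_context) and assumes the repo objects referenced by the removed code (embeddings, embed_config, llm_config, embedding_model_safe_name, get_vec_db_by_name, is_text_junk, remove_characters) are in scope; unlike the removed code, it skips failed downloads and filters junk chunks into a new list instead of mutating the list while iterating over it.

# Sketch only; repo-specific names below are assumptions taken from the removed code.
from langchain.text_splitter import RecursiveCharacterTextSplitter

def web_lookup_sketch(query: WebQuery, token_limit: int) -> str:
    db_name = embedding_model_safe_name + query.db_save_file_extension
    db = get_vec_db_by_name(db_name, embeddings)
    splitter = RecursiveCharacterTextSplitter(
        separators=embed_config.buffer_stops,
        chunk_size=query.db_chunk_size,
        chunk_overlap=embed_config.chunk_overlap,
        keep_separator=False,
        strip_whitespace=True,
    )

    for url in query_for_urls(query):
        document = download_article(url)
        if document is None:  # download_article returns None on connection errors
            continue
        # Filter into a new list rather than calling chunks.remove() mid-iteration.
        chunks = [c for c in splitter.split_documents(document) if not is_text_junk(c.page_content)]
        for chunk in chunks:
            chunk.page_content = remove_characters(chunk.page_content, ["\n", "`"])
            chunk.page_content = query.db_embedding_prefix + chunk.page_content + query.db_embedding_postfix
        if chunks:
            db.add_documents(documents=chunks)

    db.save_local(folder_path="store/vector", index_name=db_name)
    embedding_vector = embeddings.embed_query(query.db_embed_query)
    docs_and_scores = db.similarity_search_by_vector(embedding_vector, k=round(token_limit / 64))
    return docs_to_context(docs_and_scores, llm_config.model_token_limit)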