
Commit 1a60d2b

Remove legacy workflow (#33)
1 parent 42745d2 commit 1a60d2b

8 files changed (+1, -214 lines)

core/chainables/web.py (-18)

@@ -1,25 +1,7 @@
 import datetime
-
-from core.tools.scraper import web_query_google_lookup
-from core.classes.query import WebQuery
 from langchain_core.prompts import ChatPromptTemplate


-def web_news_lookup(prompt_text: str):
-    query = WebQuery("news", prompt_core=prompt_text)
-    return web_query_google_lookup(query)
-
-
-def web_wiki_lookup(prompt_text: str):
-    query = WebQuery("wiki", prompt_core=prompt_text)
-    return web_query_google_lookup(query)
-
-
-def web_docs_lookup(prompt_text: str):
-    query = WebQuery("docs", prompt_core=prompt_text)
-    return web_query_google_lookup(query)
-
-
 def web_docs_lookup_prompt():
     return ChatPromptTemplate.from_messages(
         [
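For context on what was dropped here: the three removed helpers only differed in the mode string handed to WebQuery, and each delegated to web_query_google_lookup(). A minimal usage sketch of that legacy call path, assuming the pre-#33 tree (the prompt text is a placeholder):

# Hypothetical usage of the removed legacy helpers (pre-#33 tree assumed).
from core.chainables.web import web_wiki_lookup

# web_wiki_lookup built WebQuery("wiki", prompt_core=...) and returned the
# context string produced by web_query_google_lookup().
context = web_wiki_lookup("history of the transistor")
print(context)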

core/classes/query.py (-1)

@@ -1,6 +1,5 @@
 from __future__ import annotations

-import datetime
 from typing import Literal

 from core.tools import utils

core/lookup.py (-79)

This file was deleted.

core/tools/scraper.py (-64)

@@ -1,18 +1,14 @@
 import requests.exceptions
 import tiktoken
 from googlesearch import search
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import WebBaseLoader
-from langchain_community.vectorstores.faiss import FAISS
 from langchain_core.documents import Document
 from langchain_core.output_parsers import StrOutputParser
 from colorama import Fore, Style
 from core.tools.model_loader import load_model
 from core.models.configurations import load_llm_config
 from core.tools.utils import purify_name
-from core.tools.dbops import get_vec_db_by_name
 from core.classes.query import WebQuery
-from core.tools.utils import is_text_junk, remove_characters

 encoder = tiktoken.get_encoding("cl100k_base")
 output_parser = StrOutputParser()
@@ -40,10 +36,6 @@ def docs_to_context(docs_and_scores: list[Document], token_limit: int) -> str:
     return context_text


-def rag_query_lookup(prompt_text: str) -> str:
-    pass
-
-
 def query_for_urls(query: WebQuery, url_amount=embed_config.article_limit) -> list[str]:
     print(f"{Fore.CYAN}{Style.BRIGHT}Searching for:{Style.RESET_ALL}", query.web_query)

@@ -67,59 +59,3 @@ def download_article(url):
     except requests.exceptions.ConnectionError:
         return None
     return document
-
-
-def populate_db_with_google_search(database: FAISS, query: WebQuery):
-    url_list = query_for_urls(query)
-
-    for url in url_list:
-        document = download_article(url)
-
-        text_splitter = RecursiveCharacterTextSplitter(
-            separators=embed_config.buffer_stops,
-            chunk_size=query.db_chunk_size,
-            chunk_overlap=embed_config.chunk_overlap,
-            keep_separator=False,
-            strip_whitespace=True,
-        )
-
-        chunks = text_splitter.split_documents(document)
-
-        for chunk in chunks:
-            if is_text_junk(chunk.page_content):
-                chunks.remove(chunk)
-                continue
-
-            chunk.page_content = remove_characters(chunk.page_content, ["\n", "`"])
-            chunk.page_content = (
-                query.db_embedding_prefix
-                + chunk.page_content
-                + query.db_embedding_postfix
-            )
-
-        if len(chunks) != 0:
-            database.add_documents(documents=chunks, embeddings=embeddings)
-
-        db_name = embedding_model_safe_name + query.db_save_file_extension
-        database.save_local(folder_path="store/vector", index_name=db_name)
-
-    print(f"{Fore.CYAN}Document vectorization completed.{Fore.RESET}")
-
-
-def web_query_google_lookup(
-    query: WebQuery, token_limit: int = embed_config.model_token_limit
-):
-    db_name = embedding_model_safe_name + query.db_save_file_extension
-    db = get_vec_db_by_name(db_name, embeddings)
-
-    populate_db_with_google_search(db, query)
-
-    # return the document with the highest prompt similarity score (for now only browsing the first search result)
-    embedding_vector = embeddings.embed_query(query.db_embed_query)
-    docs_and_scores = db.similarity_search_by_vector(
-        embedding_vector, k=round(token_limit / 64)
-    )
-
-    print(f"{Fore.CYAN}Database search completed.{Fore.RESET}")
-
-    return docs_to_context(docs_and_scores, llm_config.model_token_limit)
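One detail of the removed populate_db_with_google_search() worth noting: calling chunks.remove(chunk) inside `for chunk in chunks:` mutates the list being iterated, so the element following each removed chunk is skipped. A small, purely illustrative sketch of the same junk-filtering step done without in-place removal; is_text_junk is the helper the removed code imported, and the sample documents are placeholders:

# Illustrative only, not code from this repository.
from langchain_core.documents import Document

from core.tools.utils import is_text_junk


def drop_junk_chunks(chunks: list[Document]) -> list[Document]:
    # Build a new list instead of removing elements mid-iteration.
    return [chunk for chunk in chunks if not is_text_junk(chunk.page_content)]


chunks = [
    Document(page_content="Accept all cookies"),
    Document(page_content="Some article text."),
]
chunks = drop_junk_chunks(chunks)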

core/tools/utils.py (-29)

@@ -56,35 +56,6 @@ def remove_characters(text: str, wordlist: list[str]):
     return text


-def timeout_function(task, timeout=2.0):
-    # FIXME: THIS FUNCTION MAY BE BROKEN, TEST THIS
-
-    ctx = multiprocessing.get_context("spawn")
-    q = ctx.Queue()
-
-    def wrapper(q):
-        task_result = task()
-        q.put(task_result)
-
-    thread_loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(thread_loop)
-
-    thread = ctx.Process(target=wrapper, args=(q,))
-
-    thread.start()
-    thread.join(timeout)  # close thread if work is finished
-    if thread.is_alive():
-        thread.kill()
-        return None
-
-    result = q.get()
-
-    thread_loop.run_until_complete(asyncio.sleep(0))
-    thread_loop.close()
-
-    return result
-
-
 def extract_links(text: str):
     return re.findall(r"(https?://\S+\.\S+/)", text)
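The removed timeout_function() carried a FIXME and mixed a spawned process with a throwaway asyncio event loop; with the "spawn" start method it also targeted a nested function, which cannot be pickled and so would fail at Process.start(). Purely as an illustration of the same idea (not part of this repository), a minimal process-based timeout can look like the sketch below; run_with_timeout and _worker are hypothetical names, and task must be a picklable top-level callable:

# Illustrative sketch only, not code from this repository.
import multiprocessing


def _worker(task, queue):
    # Runs in the child process and reports the task's result back.
    queue.put(task())


def run_with_timeout(task, timeout=2.0):
    # Run `task` in a separate process; give up after `timeout` seconds.
    ctx = multiprocessing.get_context("spawn")
    queue = ctx.Queue()

    proc = ctx.Process(target=_worker, args=(task, queue))
    proc.start()
    proc.join(timeout)

    if proc.is_alive():  # the task overran its budget
        proc.kill()
        proc.join()
        return None

    return queue.get() if not queue.empty() else None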

main.py (+1, -21)

@@ -1,32 +1,12 @@
-import curses
 import requests

 from colorama import init as colorama_init, Fore, Style
-from terminal_gui import user_input, select_input
-from core.lookup import web_lookup


 colorama_init()

 try:
-    try:
-        mode_input = curses.wrapper(select_input)
-        text_input = curses.wrapper(user_input)
-        print(f"{Fore.GREEN}{Style.BRIGHT}Mode: {Fore.RESET}{mode_input}")
-        print(f"{Fore.GREEN}{Style.BRIGHT}Input: {Fore.RESET}{text_input}")
-    except curses.error:
-        # terminal is not present,
-        # user likely tries running through IDE
-        print(
-            f"{Fore.YELLOW}Terminal not detected, full functionality may not be available.{Fore.RESET}"
-        )
-        mode_input = "Wiki"
-        text_input = input(f"{Fore.GREEN}{Style.BRIGHT}(user){Fore.RESET} ")
-
-    chain_output = web_lookup.invoke({"input": text_input, "mode": mode_input})
-    print(f"{Fore.GREEN}{Style.BRIGHT}(llm){Fore.RESET} ", end="")
-    print(chain_output, end="", flush=True)
-    print(end="\n")
+    pass
 except requests.exceptions.ConnectionError:
     print(
         f"{Fore.RED}{Style.BRIGHT}Connection error, make sure Ollama server is running...{Fore.RESET}{Style.RESET_ALL}"

terminal_gui.py (-1)

@@ -1,4 +1,3 @@
-import argparse
 import curses

 HIGHLIGHTED_COLOR_ID = 1

workers/summarizer.py (-1)

@@ -1,5 +1,4 @@
 from core.databases.db_crawl_tasks import (
-    db_are_tasks_completed,
     db_are_crawl_tasks_fully_embedded,
 )
 from core.databases.db_embeddings import (
