diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3d68e9f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,18 @@ +# Use an official Python runtime as a parent image +FROM python:3.10-slim-buster + +# Set the working directory to /app +WORKDIR /app + +# Copy the current directory contents into the container at /app +COPY . /app +COPY ./data/* /app/data/ + +# Install any needed packages specified in requirements.txt +RUN pip install -r requirements.txt + +# Expose the port that the Gradio app will run on +EXPOSE 7860 + +# Run the command to start the Gradio app +CMD ["python", "chat_web.py"] diff --git a/README.md b/README.md index d7c78dd..3ef39ec 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,15 @@ cd chat-with-your-doc 3. Install the required Python packages: +Create virtual environment: + +```bash +python3 -m venv .venv +source .venv/bin/activate +``` + +Install dependencies: + ```bash pip install -r requirements.txt ``` @@ -66,32 +75,31 @@ The CLI application is built to support both `ingest` and `chat` commands. Pytho This command would take the documents as input, split the texts, generate the embeddings and store in a vector store `FAISS`. The vector store would be store locally for later used for chat. -```bash -$ python chat_cli.py ingest --help - - Usage: chat_cli.py ingest [OPTIONS] DOC_PATH INDEX_NAME +![](./static/cli_ingest.png) -Arguments: -doc_path TEXT Path to the documents to be ingested, support glob pattern [required] -index_name TEXT Name of the index to be created [default: None] [required] - -Options: ---help Show this message and exit. +For example if you want to put all the PDFs in the directory into one single vector store named `surface`, you could run: + +```bash +$ python chat_cli.py ingest --path "./data/source_documents/*.pdf" --name surface ``` +Note that the path should be enclosed with double quotes to avoid shell expansion. 
### **Chat** -This command would start a interactive chat, with documents as a external knowledge base in a vector store. You could choose which knowledge base to load for chat. +This command would start an interactive chat, with documents as an external knowledge base in a vector store. You could choose which knowledge base to load for chat. -```bash -$ python chat_cli.py chat --help +![CLI Chat](./static/cli_chat.png) -Usage: chat_cli.py chat [OPTIONS] +Two sample documents about Surface have been provided in the [data/source_document](data/source_documents) directory and already ingested into the default vector store `index`, stored in the [data/vector_store](data/vector_store). You could run the following command to start a chat with the documents: -Options: ---index-name TEXT [default: index] ---help Show this message and exit. +```bash +$ python chat_cli.py chat +``` + +Or you could specify the vector store to load for chat: +```bash +$ python chat_cli.py chat --name surface ``` ## Usage: Web diff --git a/chat_cli.py b/chat_cli.py index 2ead252..0c415e5 100644 --- a/chat_cli.py +++ b/chat_cli.py @@ -12,19 +12,19 @@ @app.command() def ingest( - doc_path : Annotated[str, typer.Argument(help="Path to the documents to be ingested, support glob pattern", show_default=False)], - index_name : Annotated[str, typer.Argument(help="Name of the index to be created")]): + path : Annotated[str, typer.Option(help="Path to the documents to be ingested, support glob pattern", show_default=False)], + name : Annotated[str, typer.Option(help="Name of the index to be created", show_default=False)]): #support for glob in doc_path - file_list = glob.glob(doc_path) + file_list = glob.glob(path) # print(file_list) docChatbot.init_vector_db_from_documents(file_list) - docChatbot.save_vector_db_to_local(VECTORDB_PATH, index_name) + docChatbot.save_vector_db_to_local(VECTORDB_PATH, name) @app.command() -def chat(index_name : str = "index"): +def chat(name : str = "index"): - 
docChatbot.load_vector_db_from_local(VECTORDB_PATH, index_name) + docChatbot.load_vector_db_from_local(VECTORDB_PATH, name) docChatbot.init_chatchain() chat_history = [] diff --git a/chat_web.py b/chat_web.py index a7b1e1e..bad5a26 100644 --- a/chat_web.py +++ b/chat_web.py @@ -71,8 +71,19 @@ def get_answer(message, chat_history): a = "" if chat[1] == None else chat[1] ch.append((q, a)) + #todo: need to handle exception result_answer, result_source = docChatbot.get_answer_with_source(message, ch) + output_source = "\n\n" + i = 0 + for doc in result_source: + reference_html = f"""
Reference [{i+1}] {os.path.basename(doc.metadata["source"])} P{doc.metadata['page']+1} \n""" + reference_html += f"""{doc.page_content}\n""" + reference_html += f"""
""" + output_source += reference_html + i += 1 + #todo: show referenced pdf content in web ui + chat_history.append((message, result_answer)) return "", chat_history @@ -146,4 +157,7 @@ def get_answer(message, chat_history): -demo.launch() \ No newline at end of file +demo.launch( + server_name="0.0.0.0", + server_port=8000 +) \ No newline at end of file diff --git a/chatbot.py b/chatbot.py index 29528fd..7692e74 100644 --- a/chatbot.py +++ b/chatbot.py @@ -24,22 +24,23 @@ class DocChatbot: def __init__(self) -> None: #init for OpenAI GPT-4 and Embeddings load_dotenv() - openai.api_type = "azure" - openai.api_version = "2023-03-15-preview" - openai.api_base = os.getenv("OPENAI_API_BASE") - openai.api_key = os.getenv("OPENAI_API_KEY") self.llm = AzureChatOpenAI( deployment_name=os.getenv("OPENAI_DEPLOYMENT_NAME"), temperature=0, - openai_api_version="2023-03-15-preview" + openai_api_version="2023-05-15", + openai_api_type="azure", + openai_api_base=os.getenv("OPENAI_API_BASE"), + openai_api_key=os.getenv("OPENAI_API_KEY"), + request_timeout=30 ) self.embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", chunk_size=1) def init_chatchain(self, chain_type : str = "stuff") -> None: # init for ConversationalRetrievalChain - CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template("""Given the following conversation and a follow up question, rephrase the follow up question. + CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template("""Given the following conversation and a follow up question, rephrase the follow up question. + The follow up question should be in the same language with the input. For example, if the input is in Chinese, the follow up question or the standalone question below should be in Chinese too. 
Chat History: {chat_history} @@ -103,8 +104,8 @@ def init_vector_db_from_documents(self, file_list: List[str]): docs.extend(doc) print("Processed document: " + file) + print("Generating embeddings and ingesting to vector db.") self.vector_db = FAISS.from_documents(docs, OpenAIEmbeddings(chunk_size=1)) - print("Generated embeddings and ingested to vector db.") - + print("Vector db initialized.") \ No newline at end of file diff --git a/data/source_documents/FY2223-Q3-JFM-8-K-Final.pdf b/data/source_documents/FY2223-Q3-JFM-8-K-Final.pdf deleted file mode 100644 index 07c71ff..0000000 Binary files a/data/source_documents/FY2223-Q3-JFM-8-K-Final.pdf and /dev/null differ diff --git a/data/source_documents/surface.pdf b/data/source_documents/surface.pdf new file mode 100644 index 0000000..747d8c0 Binary files /dev/null and b/data/source_documents/surface.pdf differ diff --git a/data/source_documents/Surface_Pro_9_Guide.pdf b/data/source_documents/surface9_service.pdf similarity index 100% rename from data/source_documents/Surface_Pro_9_Guide.pdf rename to data/source_documents/surface9_service.pdf diff --git a/data/vector_store/index.faiss b/data/vector_store/index.faiss index feba2b9..1437598 100644 Binary files a/data/vector_store/index.faiss and b/data/vector_store/index.faiss differ diff --git a/data/vector_store/index.pkl b/data/vector_store/index.pkl index fd0bfe8..d5276d4 100644 Binary files a/data/vector_store/index.pkl and b/data/vector_store/index.pkl differ diff --git a/data/vector_store/pgq3.faiss b/data/vector_store/pgq3.faiss deleted file mode 100644 index cad828b..0000000 Binary files a/data/vector_store/pgq3.faiss and /dev/null differ diff --git a/data/vector_store/pgq3.pkl b/data/vector_store/pgq3.pkl deleted file mode 100644 index 528cc16..0000000 Binary files a/data/vector_store/pgq3.pkl and /dev/null differ diff --git a/requirements.txt b/requirements.txt index 2ba3ef2..9311e95 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,15 @@ 
aiofiles==23.1.0 aiohttp==3.8.4 aiosignal==1.3.1 -altair==4.2.2 +altair==5.0.0 anyio==3.6.2 -argilla==1.6.0 +argilla==1.7.0 async-timeout==4.0.2 attrs==23.1.0 backoff==2.2.1 -certifi==2022.12.7 +certifi==2023.5.7 cffi==1.15.1 +chardet==5.1.0 charset-normalizer==3.1.0 click==8.1.3 commonmark==0.9.1 @@ -17,17 +18,16 @@ cryptography==40.0.2 cycler==0.11.0 dataclasses-json==0.5.7 Deprecated==1.2.13 -entrypoints==0.4 et-xmlfile==1.1.0 faiss-cpu==1.7.4 fastapi==0.95.1 ffmpy==0.3.0 filelock==3.12.0 -fonttools==4.39.3 +fonttools==4.39.4 frozenlist==1.3.3 -fsspec==2023.4.0 -gradio==3.28.1 -gradio_client==0.1.4 +fsspec==2023.5.0 +gradio==3.30.0 +gradio_client==0.2.4 greenlet==2.0.2 h11==0.14.0 httpcore==0.16.3 @@ -39,8 +39,8 @@ Jinja2==3.1.2 joblib==1.2.0 jsonschema==4.17.3 kiwisolver==1.4.4 -langchain==0.0.153 -linkify-it-py==2.0.1 +langchain==0.0.191 +linkify-it-py==2.0.2 lxml==4.9.2 Markdown==3.4.3 markdown-it-py==2.2.0 @@ -58,17 +58,15 @@ nltk==3.8.1 numexpr==2.8.4 numpy==1.23.5 olefile==0.46 -openai==0.27.0 +openai==0.27.6 openapi-schema-pydantic==1.2.4 openpyxl==3.1.2 -orjson==3.8.11 +orjson==3.8.12 packaging==23.1 pandas==1.5.3 -pdfminer==20191125 pdfminer.six==20221105 Pillow==9.5.0 pycparser==2.21 -pycryptodome==3.17 pydantic==1.10.7 pydub==0.25.1 Pygments==2.15.1 @@ -84,28 +82,28 @@ python-multipart==0.0.6 python-pptx==0.6.21 pytz==2023.3 PyYAML==6.0 -regex==2023.3.23 -requests==2.29.0 +regex==2023.5.5 +requests==2.30.0 rfc3986==1.5.0 rich==13.0.1 semantic-version==2.10.0 six==1.16.0 sniffio==1.3.0 -SQLAlchemy==2.0.12 +SQLAlchemy==2.0.13 starlette==0.26.1 tenacity==8.2.2 -tiktoken==0.3.3 +tiktoken==0.4.0 toolz==0.12.0 tqdm==4.65.0 typer==0.9.0 typing-inspect==0.8.0 typing_extensions==4.5.0 -tzdata==2023.3 uc-micro-py==1.0.2 -unstructured==0.6.2 -urllib3==1.26.15 +unstructured==0.7.1 +urllib3==2.0.2 uvicorn==0.22.0 -websockets==11.0.2 +websockets==11.0.3 wrapt==1.14.1 +xlrd==2.0.1 XlsxWriter==3.1.0 yarl==1.9.2 diff --git a/static/cli_chat.png b/static/cli_chat.png 
new file mode 100644 index 0000000..22e236f Binary files /dev/null and b/static/cli_chat.png differ diff --git a/static/cli_ingest.png b/static/cli_ingest.png new file mode 100644 index 0000000..d7c1686 Binary files /dev/null and b/static/cli_ingest.png differ