diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index dfe0770..0000000
--- a/.gitattributes
+++ /dev/null
@@ -1,2 +0,0 @@
-# Auto detect text files and perform LF normalization
-* text=auto
diff --git a/.gitignore b/.gitignore
index 7f9a822..6769e21 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
 .env
-test_server.py
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..09e90ca
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+# Fork of the JayZhou repository
+
+## app.py is the original file
+
+## app2.py is the file created by me
+
+## Key points
+
+1. To test the FastAPI service, open the Swagger UI at http://127.0.0.1:8000/docs (see the run command below)
+
+2. Import multiple versions of pydantic: the v1 aliases for LangChain compatibility, the v2 classes for FastAPI
+
+3. Use StructuredTool to create tools for the agent (see the sketch below)
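+
+To start the service locally, one option is to point uvicorn (pulled in by the `fastapi[all]` requirement) at the `api` object defined in app2.py, for example `uvicorn app2:api --reload`; the exact command may differ in your setup.
+
+The snippet below is a trimmed, self-contained sketch of key points 2 and 3 as they appear in app2.py; the `scrape_website` body here is only a placeholder standing in for the real Apify-based implementation:
+
+```python
+from pydantic import BaseModel as BaseModelv2                        # v2: used for the FastAPI request model
+from pydantic.v1 import BaseModel as BaseModelv1, Field as Fieldv1   # v1: compatible with LangChain tools
+from langchain.tools import StructuredTool
+
+
+class ScrapeWebsiteInput(BaseModelv1):
+    """Inputs for scraping a website on the given objective"""
+    objective: str = Fieldv1(description="The objective & task that users give to the agent")
+    url: str = Fieldv1(description="The url of the website to be scraped")
+
+
+def scrape_website(objective: str, url: str) -> str:
+    # Placeholder body; app2.py runs the Apify website-content-crawler here.
+    return f"scraped {url} for: {objective}"
+
+
+# Wrap the plain function as a structured (multi-argument) agent tool.
+scrapeWebsiteTool = StructuredTool.from_function(
+    func=scrape_website,
+    name="scrape_website",
+    description="useful when you need to get data from a website url; pass both url and objective",
+    args_schema=ScrapeWebsiteInput,
+    return_direct=False,
+)
+
+
+class Query(BaseModelv2):   # the FastAPI request body uses plain (v2) pydantic
+    topic: str
+```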
diff --git a/app2.py b/app2.py
new file mode 100644
index 0000000..0714f5b
--- /dev/null
+++ b/app2.py
@@ -0,0 +1,254 @@
+import os
+import time
+from dotenv import load_dotenv
+import json
+import requests
+
+from langchain.indexes import VectorstoreIndexCreator
+from langchain_community.document_loaders.base import Document
+from langchain_community.utilities import ApifyWrapper
+
+from langchain_openai import ChatOpenAI
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.prompts import PromptTemplate
+from langchain.chains.summarize import load_summarize_chain
+
+# Import both pydantic versions under aliases
+from pydantic import BaseModel as BaseModelv2, Field as Fieldv2  # pydantic v2 is not compatible with langchain
+from pydantic.v1 import BaseModel as BaseModelv1, Field as Fieldv1  # pydantic v1 is compatible with langchain
+
+from langchain.tools import StructuredTool
+
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+
+from langchain.agents.format_scratchpad.openai_tools import (
+    format_to_openai_tool_messages
+)
+from langchain.agents.output_parsers.openai_tools import (
+    OpenAIToolsAgentOutputParser
+)
+
+from langchain.agents import AgentExecutor
+
+import streamlit as st
+
+from fastapi import FastAPI
+
+
+# 0. Load env variables
+load_dotenv()
+APIFY_API_TOKEN = os.getenv("APIFY_API_TOKEN")
+SERPER_API_KEY = os.getenv("SERPER_API_KEY")
+
+# 1. Function for searching Google via Serper
+def search(query) -> str:
+    """Search a query on Google for the content and the referenced URL"""
+    print(f"Googling {query}...")
+    url = "https://google.serper.dev/search"
+
+    payload = json.dumps({
+        "q": query
+    })
+
+    headers = {
+        'X-API-KEY': SERPER_API_KEY,
+        'Content-Type': 'application/json'
+    }
+
+    response = requests.request("POST", url, headers=headers, data=payload)
+
+    print("SEARCH FOUND: \n", response.text)
+
+    return response.text
+
+# 2. Function for scraping a website with Apify
+def scrape_website(objective: str, url: str) -> str:
+
+    print(f"Scraping website [{url}]...")
+    apify = ApifyWrapper()
+
+    loader = apify.call_actor(
+        actor_id="apify/website-content-crawler",
+        run_input={"startUrls": [{"url": url}]},
+        dataset_mapping_function=lambda item: Document(
+            page_content=item["text"] or "", metadata={"source": item["url"]}
+        ),
+    )
+
+    index = VectorstoreIndexCreator().from_loaders([loader])
+
+    result = index.query_with_sources(objective)
+
+    print("ANSWERS: \n", result["answer"])
+    print("SOURCES: \n", result["sources"])
+
+    output = result["answer"]
+    if len(output) > 10000:
+        output = summarize(objective, output)
+
+    return output
+
+# 3. Function for summarizing the website content
+def summarize(objective, content):
+    print("Summarizing...")
+
+    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=500)
+    docs = text_splitter.create_documents([content])
+    map_prompt = """
+    Write a summary of the following text for {objective}:
+    "{text}"
+    SUMMARY:
+    """
+    map_prompt_template = PromptTemplate(
+        template=map_prompt, input_variables=["text", "objective"])
+
+    summary_chain = load_summarize_chain(
+        llm=llm,
+        chain_type='map_reduce',
+        map_prompt=map_prompt_template,
+        combine_prompt=map_prompt_template,
+        verbose=True
+    )
+
+    output = summary_chain.run(input_documents=docs, objective=objective)
+
+    print("SUMMARIZED: \n", output)
+
+    return output
+
+# 4. Tool for searching the internet
+searchTool = StructuredTool.from_function(
+    func=search,
+    name="Search",
+    description="Search a query on Google for the content and the referenced URL. You should ask targeted questions"
+)
+
+# 5. Tool for scraping a website
+class ScrapeWebsiteInput(BaseModelv1):
+    """Inputs for scraping a website on the given objective"""
+    objective: str = Fieldv1(description="The objective & task that users give to the agent")
+    url: str = Fieldv1(description="The url of the website to be scraped")
+
+scrapeWebsiteTool = StructuredTool.from_function(
+    func=scrape_website,
+    name="scrape_website",
+    description="useful when you need to get data from a website url, passing both url and objective to the function; DO NOT make up any url, the url should only be from the search results",
+    args_schema=ScrapeWebsiteInput,
+    return_direct=False
+)
+
+tools = [searchTool, scrapeWebsiteTool]
+
+MEMORY_KEY = "search_history"
+
+prompt = ChatPromptTemplate.from_messages([
+    (
+        "system",
+        """You are a world-class researcher who can do detailed research on any topic and produce fact-based results;
+        you do not make things up, and you will try as hard as possible to gather facts & data to back up the research.
+
+        Please make sure you complete the objective above with the following rules:
+        1/ You should do enough research to gather as much information as possible about the objective
+        2/ If there are URLs of relevant links & articles, you will scrape them to gather more information
+        3/ After scraping & searching, you should ask yourself "are there any new things I should search & scrape based on the data I collected to increase the research quality?"
+        If the answer is yes, continue; but don't do this more than 3 iterations
+        4/ You should not make things up, you should only write facts & data that you have gathered
+        5/ In the final output, you should include all reference data & links to back up your research
+        """
+    ),
+    MessagesPlaceholder(variable_name=MEMORY_KEY),
+    ("user", "{input}"),
+    MessagesPlaceholder(variable_name="agent_scratchpad"),
+])
+
+llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
+
+llm_with_tools = llm.bind_tools(tools)
+
+agent = (
+    {
+        "input": lambda x: x["input"],
+        "agent_scratchpad": lambda x: format_to_openai_tool_messages(
+            x["intermediate_steps"]
+        ),
+        "search_history": lambda x: x["search_history"],
+    }
+    | prompt
+    | llm_with_tools
+    | OpenAIToolsAgentOutputParser()
+)
+
+agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
+
+def do_research(topic: str):
+    search_history = []
+
+    result = agent_executor.invoke({
+        "input": topic,
+        "search_history": search_history
+    })
+
+    print("RESULT: ", result)
+
+    return result
+
+
+def test():
+    topic = "Why has Meta's Threads product grown more quickly than other products?"
+
+    #search_result = search(topic)
+    #searchTool.invoke(topic)
+    #print(isinstance(ScrapeWebsiteInput, BaseModel))
+    #print(type(ScrapeWebsiteInput))
+    # scrape_website(
+    #     objective=topic,
+    #     url="https://www.nytimes.com/2023/07/11/technology/threads-zuckerberg-meta-google-plus.html"
+    # )
+    # scrapeWebsiteTool.invoke({
+    #     "objective": topic,
+    #     "url": "https://www.techtarget.com/whatis/feature/Meta-Threads-explained-Everything-you-need-to-know"
+    # })
+    result = do_research(topic)
+    print(result)
+
+
+def app():
+    st.set_page_config(page_title="AI research agent", page_icon=":bird:")
+
+    st.header("AI research agent :bird:")
+    topic = st.text_input("Research Topic:")
+
+    placeholder = st.empty()
+
+    if topic:
+        start_time = time.time()
+        placeholder.text("Doing research ...")
+
+        result = do_research(topic)
+
+        end_time = time.time()
+        research_time = int(end_time - start_time)
+
+        placeholder.text(f"Here is what I have found after [{research_time} seconds]:")
+
+        st.info(result["output"])
+
+# if __name__ == "__main__":
+#     # test()  # Local testing
+#     app()  # Streamlit webapp
+
+# FastAPI service
+api = FastAPI()
+
+class Query(BaseModelv2):
+    topic: str
+
+@api.post("/")
+def service(query: Query):
+    topic = query.topic
+    result = do_research(topic)
+    return result
+
diff --git a/requirements.txt b/requirements.txt
index 8a1d7ee..60ef66e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 fastapi[all]
 openai
 python-dotenv
-pydantic==1.*
+pydantic
 langchain
 bs4
 tiktoken
\ No newline at end of file
diff --git a/test-api.txt b/test-api.txt
new file mode 100644
index 0000000..3ef5f1a
--- /dev/null
+++ b/test-api.txt
@@ -0,0 +1,7 @@
+curl -X 'POST' \
+  'http://127.0.0.1:8000/' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "topic": "Common ways to get rich"
+}'
\ No newline at end of file