Skip to content

Commit bfa3fbe

Browse files
committed
initial commit
1 parent becc3ba commit bfa3fbe

File tree

75 files changed

+7218
-379
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+7218
-379
lines changed

.gitignore

+91-379
Large diffs are not rendered by default.

azure.yaml

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# yaml-language-server: $schema=https://raw.githubusercontent.com/Azure/azure-dev/main/schemas/v1.0/azure.yaml.json
2+
3+
name: rag-flow
4+
workflows:
5+
up:
6+
steps:
7+
- azd: provision
8+
hooks:
9+
postprovision:
10+
posix:
11+
shell: sh
12+
continueOnError: false
13+
interactive: true
14+
run: infra/hooks/postprovision.sh
15+
windows:
16+
shell: pwsh
17+
continueOnError: false
18+
interactive: true
19+
run: infra/hooks/postprovision.ps1
20+
21+
services:
22+
rag-flow:
23+
project: ./flow
24+
language: python
25+
host: function

azure.yaml.json

+1,016
Large diffs are not rendered by default.

data/sample-documents-indexing.ipynb

+254
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Generating your search index\n",
8+
"This notebook is designed to automatically create the RAG search index for you. It uses the [sample documents CSV file](sample-documents.csv) to create the index. In order to do so, it needs the names and keys for the following services:\n",
9+
"\n",
10+
"- Azure Search Service\n",
11+
"- Azure OpenAI Service"
12+
]
13+
},
14+
{
15+
"cell_type": "code",
16+
"execution_count": null,
17+
"metadata": {},
18+
"outputs": [],
19+
"source": [
20+
"import os\n",
21+
"import pandas as pd\n",
22+
"from azure.identity import DefaultAzureCredential\n",
23+
"from azure.identity import DefaultAzureCredential, get_bearer_token_provider\n",
24+
"from azure.search.documents import SearchClient\n",
25+
"from azure.search.documents.indexes import SearchIndexClient\n",
26+
"from azure.search.documents.indexes.models import (\n",
27+
" HnswParameters,\n",
28+
" HnswAlgorithmConfiguration,\n",
29+
" SemanticPrioritizedFields,\n",
30+
" SearchableField,\n",
31+
" SearchField,\n",
32+
" SearchFieldDataType,\n",
33+
" SearchIndex,\n",
34+
" SemanticSearch,\n",
35+
" SemanticConfiguration,\n",
36+
" SemanticField,\n",
37+
" SimpleField,\n",
38+
" VectorSearch,\n",
39+
" VectorSearchAlgorithmKind,\n",
40+
" VectorSearchAlgorithmMetric,\n",
41+
" ExhaustiveKnnAlgorithmConfiguration,\n",
42+
" ExhaustiveKnnParameters,\n",
43+
" VectorSearchProfile,\n",
44+
")\n",
45+
"from typing import List, Dict\n",
46+
"from openai import AzureOpenAI\n",
47+
"from dotenv import load_dotenv\n",
48+
"\n",
49+
"from pathlib import Path\n",
50+
"\n",
51+
"load_dotenv()"
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": 2,
57+
"metadata": {},
58+
"outputs": [],
59+
"source": [
60+
"def delete_index(search_index_client: SearchIndexClient, search_index: str):\n",
61+
" print(f\"deleting index {search_index}\")\n",
62+
" search_index_client.delete_index(search_index)"
63+
]
64+
},
65+
{
66+
"cell_type": "code",
67+
"execution_count": 3,
68+
"metadata": {},
69+
"outputs": [],
70+
"source": [
71+
"def create_index_definition(name: str) -> SearchIndex:\n",
72+
" \"\"\"\n",
73+
" Returns an Azure Cognitive Search index with the given name.\n",
74+
" \"\"\"\n",
75+
" # The fields we want to index. The \"embedding\" field is a vector field that will\n",
76+
" # be used for vector search.\n",
77+
" fields = [\n",
78+
" SimpleField(name=\"id\", type=SearchFieldDataType.String, key=True),\n",
79+
" SearchableField(name=\"content\", type=SearchFieldDataType.String),\n",
80+
" SimpleField(name=\"filepath\", type=SearchFieldDataType.String),\n",
81+
" SearchableField(name=\"title\", type=SearchFieldDataType.String),\n",
82+
" SimpleField(name=\"url\", type=SearchFieldDataType.String),\n",
83+
" SearchField(\n",
84+
" name=\"contentVector\",\n",
85+
" type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n",
86+
" searchable=True,\n",
87+
" # Size of the vector created by the text-embedding-ada-002 model.\n",
88+
" vector_search_dimensions=1536,\n",
89+
" vector_search_profile_name=\"myHnswProfile\",\n",
90+
" ),\n",
91+
" ]\n",
92+
"\n",
93+
" # The \"content\" field should be prioritized for semantic ranking.\n",
94+
" semantic_config = SemanticConfiguration(\n",
95+
" name=\"default\",\n",
96+
" prioritized_fields=SemanticPrioritizedFields(\n",
97+
" title_field=SemanticField(field_name=\"title\"),\n",
98+
" keywords_fields=[],\n",
99+
" content_fields=[SemanticField(field_name=\"content\")],\n",
100+
" ),\n",
101+
" )\n",
102+
"\n",
103+
" # For vector search, we want to use the HNSW (Hierarchical Navigable Small World)\n",
104+
" # algorithm (a type of approximate nearest neighbor search algorithm) with cosine\n",
105+
" # distance.\n",
106+
" vector_search = VectorSearch(\n",
107+
" algorithms=[\n",
108+
" HnswAlgorithmConfiguration(\n",
109+
" name=\"myHnsw\",\n",
110+
" kind=VectorSearchAlgorithmKind.HNSW,\n",
111+
" parameters=HnswParameters(\n",
112+
" m=4,\n",
113+
" ef_construction=400,\n",
114+
" ef_search=500,\n",
115+
" metric=VectorSearchAlgorithmMetric.COSINE,\n",
116+
" ),\n",
117+
" ),\n",
118+
" ExhaustiveKnnAlgorithmConfiguration(\n",
119+
" name=\"myExhaustiveKnn\",\n",
120+
" kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,\n",
121+
" parameters=ExhaustiveKnnParameters(\n",
122+
" metric=VectorSearchAlgorithmMetric.COSINE\n",
123+
" ),\n",
124+
" ),\n",
125+
" ],\n",
126+
" profiles=[\n",
127+
" VectorSearchProfile(\n",
128+
" name=\"myHnswProfile\",\n",
129+
" algorithm_configuration_name=\"myHnsw\",\n",
130+
" ),\n",
131+
" VectorSearchProfile(\n",
132+
" name=\"myExhaustiveKnnProfile\",\n",
133+
" algorithm_configuration_name=\"myExhaustiveKnn\",\n",
134+
" ),\n",
135+
" ],\n",
136+
" )\n",
137+
"\n",
138+
" # Create the semantic settings with the configuration\n",
139+
" semantic_search = SemanticSearch(configurations=[semantic_config])\n",
140+
"\n",
141+
" # Create the search index.\n",
142+
" index = SearchIndex(\n",
143+
" name=name,\n",
144+
" fields=fields,\n",
145+
" semantic_search=semantic_search,\n",
146+
" vector_search=vector_search,\n",
147+
" )\n",
148+
"\n",
149+
" return index"
150+
]
151+
},
152+
{
153+
"cell_type": "code",
154+
"execution_count": 4,
155+
"metadata": {},
156+
"outputs": [],
157+
"source": [
158+
"def gen_documents(\n",
159+
" path: str,\n",
160+
") -> List[Dict[str, any]]:\n",
161+
" openai_service_endoint = os.environ[\"AZURE_OPENAI_ENDPOINT\"]\n",
162+
" openai_deployment = \"text-embedding-ada-002\"\n",
163+
"\n",
164+
" token_provider = get_bearer_token_provider(DefaultAzureCredential(), \"https://cognitiveservices.azure.com/.default\")\n",
165+
" # openai.Embedding.create() -> client.embeddings.create()\n",
166+
" client = AzureOpenAI(\n",
167+
" api_version=\"2023-07-01-preview\",\n",
168+
" azure_endpoint=openai_service_endoint,\n",
169+
" azure_deployment=openai_deployment,\n",
170+
" azure_ad_token_provider=token_provider\n",
171+
" )\n",
172+
"\n",
173+
" documents = pd.read_csv(path)\n",
174+
" items = []\n",
175+
" for document in documents.to_dict(\"records\"):\n",
176+
" content = document[\"description\"]\n",
177+
" id = str(document[\"id\"])\n",
178+
" title = document[\"name\"]\n",
179+
" url = document[\"url\"]\n",
180+
" emb = client.embeddings.create(input=content, model=openai_deployment)\n",
181+
" rec = {\n",
182+
" \"id\": id,\n",
183+
" \"content\": content,\n",
184+
" \"filepath\": f\"{title.lower().replace(' ', '-')}\",\n",
185+
" \"title\": title,\n",
186+
" \"url\": url,\n",
187+
" \"contentVector\": emb.data[0].embedding,\n",
188+
" }\n",
189+
" items.append(rec)\n",
190+
"\n",
191+
" return items"
192+
]
193+
},
194+
{
195+
"cell_type": "code",
196+
"execution_count": null,
197+
"metadata": {},
198+
"outputs": [],
199+
"source": [
200+
"rag_search = os.environ[\"AZURE_SEARCH_ENDPOINT\"]\n",
201+
"index_name = \"rag-index\"\n",
202+
"\n",
203+
"search_index_client = SearchIndexClient(\n",
204+
" rag_search, DefaultAzureCredential()\n",
205+
")\n",
206+
"\n",
207+
"delete_index(search_index_client, index_name)\n",
208+
"index = create_index_definition(index_name)\n",
209+
"print(f\"creating index {index_name}\")\n",
210+
"search_index_client.create_or_update_index(index)\n",
211+
"print(f\"index {index_name} created\")"
212+
]
213+
},
214+
{
215+
"cell_type": "code",
216+
"execution_count": null,
217+
"metadata": {},
218+
"outputs": [],
219+
"source": [
220+
"print(f\"indexing documents\")\n",
221+
"docs = gen_documents(\"sample-documents.csv\")\n",
222+
"# Upload our data to the index.\n",
223+
"search_client = SearchClient(\n",
224+
" endpoint=rag_search,\n",
225+
" index_name=index_name,\n",
226+
" credential=DefaultAzureCredential(),\n",
227+
")\n",
228+
"print(f\"uploading {len(docs)} documents to index {index_name}\")\n",
229+
"ds = search_client.upload_documents(docs)"
230+
]
231+
}
232+
],
233+
"metadata": {
234+
"kernelspec": {
235+
"display_name": ".venv",
236+
"language": "python",
237+
"name": "python3"
238+
},
239+
"language_info": {
240+
"codemirror_mode": {
241+
"name": "ipython",
242+
"version": 3
243+
},
244+
"file_extension": ".py",
245+
"mimetype": "text/x-python",
246+
"name": "python",
247+
"nbconvert_exporter": "python",
248+
"pygments_lexer": "ipython3",
249+
"version": "3.12.2"
250+
}
251+
},
252+
"nbformat": 4,
253+
"nbformat_minor": 2
254+
}

data/sample-documents.csv

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
id,name,description,url
2+
1,Mare Imbrium,"A vast lunar mare in the Moon's Imbrium Basin, formed by ancient volcanic activity",https://example.com
3+
2,Tycho Crater,"A prominent lunar impact crater known for its bright ray system, named after the Danish astronomer Tycho Brahe",https://example.com
4+
3,Sea of Tranquility,"The landing site of Apollo 11, the first manned moon landing, marking humanity's first steps on another celestial body",https://example.com
5+
4,Copernicus Crater,"A large, prominent crater that is a dominant feature in the Moon's Oceanus Procellarum, showcasing terraced walls and central peaks",https://example.com
6+
5,Earth to Moon Distance,"The average distance from Earth to the Moon is about 384,400 kilometers or 238,855 miles",https://example.com
7+
6,Sinus Iridum,"Known as the Bay of Rainbows, a plain of basaltic lava surrounded by the rugged Jura Mountains",https://example.com
8+
7,Clavius Crater,"One of the largest crater formations on the Moon, visible with the naked eye, featuring a notable arc of smaller craters within it",https://example.com
9+
8,Mare Serenitatis,"A lunar mare located to the east of Mare Imbrium, notable for its relatively smooth and flat basaltic plains",https://example.com
10+
9,Mare Frigoris,"A lunar mare that stretches across the Moon's northern edge, distinguished by its cold, dark basaltic surface",https://example.com
11+
10,Lunar Surface,"The Moon's surface is covered with a layer of regolith, a mix of fine dust and rocky debris created by millennia of meteorite impacts",https://example.com
12+
11,South Pole-Aitken Basin,"One of the largest and oldest impact features on the Moon, stretching across the Moon's southern hemisphere",https://example.com
13+
12,Hadley Rille,"A sinuous rille or channel associated with the Mare Imbrium, explored by the Apollo 15 mission",https://example.com
14+
13,Mountains of the Moon,"A range of mountains on the Moon that includes some of the highest peaks, offering striking views and geological features",https://example.com
15+
14,Plato Crater,"A large, ancient crater with a dark, flat floor, located near the lunar Alps",https://example.com
16+
15,Mare Nectaris,"A lunar mare that forms a basin on the Moon's surface, surrounded by rugged highlands and ringed by mountain ranges",https://example.com
17+
16,Aristarchus Plateau,"A region on the Moon noted for its high albedo and geological complexity, featuring the Aristarchus Crater and Schroter's Valley",https://example.com
18+
17,Schroter's Valley,"The largest sinuous rille on the Moon, located near the Aristarchus Plateau, resembling a winding river",https://example.com
19+
18,Mare Humorum,"A smaller mare on the Moon, surrounded by the lunar highlands, characterized by its circular shape and dark basaltic plains",https://example.com
20+
19,Lunar Maria,"The large, dark plains on the Moon's surface, formed by ancient volcanic eruptions and covering about 16% of the lunar surface",https://example.com
21+
20,Taurus-Littrow Valley,"The valley explored by the Apollo 17 astronauts, featuring a mix of highlands and volcanic features",https://example.com

flow/__init__.py

Whitespace-only changes.

flow/ai_search.py

+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
from typing import List
2+
import os
3+
from azure.identity import DefaultAzureCredential
4+
from azure.search.documents import SearchClient
5+
from azure.search.documents.models import (
6+
VectorizedQuery,
7+
QueryType,
8+
QueryCaptionType,
9+
QueryAnswerType,
10+
)
11+
12+
def retrieve_documentation(
    question: str,
    index_name: str,
    embedding: List[float],
) -> List[dict]:
    """Query an Azure AI Search index and return the top matching documents.

    The query sends both the question text and its embedding vector in a
    single request, using the semantic query type with the "default"
    semantic configuration.

    Args:
        question: The user's question, sent as the search text.
        index_name: Name of the search index to query.
        embedding: Embedding vector of the question, matched against the
            "contentVector" field of the index.

    Returns:
        A list of at most three dicts, each with the keys "id", "title",
        "content" and "url" copied from the corresponding search result.

    Raises:
        KeyError: If the AZURE_SEARCH_ENDPOINT environment variable is not set.
    """
    search_client = SearchClient(
        endpoint=os.environ["AZURE_SEARCH_ENDPOINT"],
        index_name=index_name,
        credential=DefaultAzureCredential(),
    )

    # Vector part of the query: nearest neighbours of the question embedding.
    vector_query = VectorizedQuery(
        vector=embedding, k_nearest_neighbors=3, fields="contentVector"
    )

    results = search_client.search(
        search_text=question,
        vector_queries=[vector_query],
        query_type=QueryType.SEMANTIC,
        semantic_configuration_name="default",
        query_caption=QueryCaptionType.EXTRACTIVE,
        query_answer=QueryAnswerType.EXTRACTIVE,
        top=3,
    )

    # Keep only the fields callers need, as plain dicts.
    return [
        {
            "id": doc["id"],
            "title": doc["title"],
            "content": doc["content"],
            "url": doc["url"],
        }
        for doc in results
    ]

flow/chat.json

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"documents": {
3+
"id": "1",
4+
"title": "Lunar Exploration Guide",
5+
"name": "Lunar Exploration Guide",
6+
"description": "Explore the moon's wonders with our Lunar Exploration Guide! Covering ancient craters, the Mare Frigoris, and more, learn about lunar phases, its impact on Earth, and the history of moon exploration. Discover astronaut technology and future lunar missions. This guide is packed with maps, photos, and facts for lunar enthusiasts.",
7+
"content": "The moon's surface is a fascinating landscape marked by vast plains called 'maria', numerous craters, and rugged highlands. The maria, Latin for 'seas', are large, dark basaltic plains formed by ancient volcanic eruptions, while the craters resulted from collisions with asteroids and comets. The highlands, or 'terrae', are lighter areas that rise above the maria, consisting of anorthosite, a type of lunar rock. This diverse topography offers insights into the moon's geological history and the solar system's early years."
8+
},
9+
"question": "tell me about the moon's surface",
10+
"chat_history": []
11+
}

0 commit comments

Comments
 (0)