kailani-h
diff --git a/‎.gitignore
+91-379 b/‎.gitignore
+91-379
diff --git a/‎azure.yaml
+25 b/‎azure.yaml
+25
diff --git a/‎azure.yaml.json
+1,016 b/‎azure.yaml.json
+1,016
diff --git a/‎data/sample-documents-indexing.ipynb
+254 b/‎data/sample-documents-indexing.ipynb
+254
diff --git a/‎data/sample-documents.csv
+21 b/‎data/sample-documents.csv
+21
diff --git a/‎flow/__init__.py b/‎flow/__init__.py
diff --git a/‎flow/ai_search.py
+49 b/‎flow/ai_search.py
+49
diff --git a/‎flow/chat.json
+11 b/‎flow/chat.json
+11
@@ -0,0 +1,25 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/Azure/azure-dev/main/schemas/v1.0/azure.yaml.json
+
+# Project definition for the "rag-flow" application (validated against the
+# azure-dev azure.yaml schema referenced above).
+name: rag-flow
+workflows:
+  # The "up" workflow is restricted to a single step: provision.
+  up:
+    steps:
+      - azd: provision
+hooks:
+  # Post-provision hook: one script per platform, kept under infra/hooks/.
+  # NOTE(review): continueOnError: false presumably makes a failing script
+  # abort the workflow -- confirm against the azd hook documentation.
+  postprovision:
+    # POSIX shells run the .sh variant.
+    posix:
+      shell: sh
+      continueOnError: false
+      interactive: true
+      run: infra/hooks/postprovision.sh
+    # Windows runs the PowerShell variant.
+    windows:
+      shell: pwsh
+      continueOnError: false
+      interactive: true
+      run: infra/hooks/postprovision.ps1
+
+services:
+  # Single service: Python sources in ./flow, declared with host "function".
+  rag-flow:
+    project: ./flow
+    language: python
+    host: function
@@ -0,0 +1,254 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Generating your search index\n",
+    "This notebook is designed to automatically create the RAG search index for you. It uses the [sample documents CSV file](sample-documents.csv) to create the index. In order to do so, it needs the endpoints of the following services (authentication uses `DefaultAzureCredential`):\n",
+    "\n",
+    "- Azure Search Service\n",
+    "- Azure OpenAI Service"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "from azure.identity import DefaultAzureCredential\n",
+    "from azure.identity import DefaultAzureCredential, get_bearer_token_provider\n",
+    "from azure.search.documents import SearchClient\n",
+    "from azure.search.documents.indexes import SearchIndexClient\n",
+    "from azure.search.documents.indexes.models import (\n",
+    "    HnswParameters,\n",
+    "    HnswAlgorithmConfiguration,\n",
+    "    SemanticPrioritizedFields,\n",
+    "    SearchableField,\n",
+    "    SearchField,\n",
+    "    SearchFieldDataType,\n",
+    "    SearchIndex,\n",
+    "    SemanticSearch,\n",
+    "    SemanticConfiguration,\n",
+    "    SemanticField,\n",
+    "    SimpleField,\n",
+    "    VectorSearch,\n",
+    "    VectorSearchAlgorithmKind,\n",
+    "    VectorSearchAlgorithmMetric,\n",
+    "    ExhaustiveKnnAlgorithmConfiguration,\n",
+    "    ExhaustiveKnnParameters,\n",
+    "    VectorSearchProfile,\n",
+    ")\n",
+    "from typing import List, Dict\n",
+    "from openai import AzureOpenAI\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "from pathlib import Path\n",
+    "\n",
+    "load_dotenv()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def delete_index(search_index_client: SearchIndexClient, search_index: str):\n",
+    "    print(f\"deleting index {search_index}\")\n",
+    "    search_index_client.delete_index(search_index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_index_definition(name: str) -> SearchIndex:\n",
+    "    \"\"\"\n",
+    "    Returns an Azure Cognitive Search index with the given name.\n",
+    "    \"\"\"\n",
+    "    # The fields we want to index. The \"contentVector\" field is a vector field that\n",
+    "    # will be used for vector search.\n",
+    "    fields = [\n",
+    "        SimpleField(name=\"id\", type=SearchFieldDataType.String, key=True),\n",
+    "        SearchableField(name=\"content\", type=SearchFieldDataType.String),\n",
+    "        SimpleField(name=\"filepath\", type=SearchFieldDataType.String),\n",
+    "        SearchableField(name=\"title\", type=SearchFieldDataType.String),\n",
+    "        SimpleField(name=\"url\", type=SearchFieldDataType.String),\n",
+    "        SearchField(\n",
+    "            name=\"contentVector\",\n",
+    "            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n",
+    "            searchable=True,\n",
+    "            # Size of the vector created by the text-embedding-ada-002 model.\n",
+    "            vector_search_dimensions=1536,\n",
+    "            vector_search_profile_name=\"myHnswProfile\",\n",
+    "        ),\n",
+    "    ]\n",
+    "\n",
+    "    # The \"content\" field should be prioritized for semantic ranking.\n",
+    "    semantic_config = SemanticConfiguration(\n",
+    "        name=\"default\",\n",
+    "        prioritized_fields=SemanticPrioritizedFields(\n",
+    "            title_field=SemanticField(field_name=\"title\"),\n",
+    "            keywords_fields=[],\n",
+    "            content_fields=[SemanticField(field_name=\"content\")],\n",
+    "        ),\n",
+    "    )\n",
+    "\n",
+    "    # For vector search, we want to use the HNSW (Hierarchical Navigable Small World)\n",
+    "    # algorithm (a type of approximate nearest neighbor search algorithm) with cosine\n",
+    "    # distance.\n",
+    "    vector_search = VectorSearch(\n",
+    "        algorithms=[\n",
+    "            HnswAlgorithmConfiguration(\n",
+    "                name=\"myHnsw\",\n",
+    "                kind=VectorSearchAlgorithmKind.HNSW,\n",
+    "                parameters=HnswParameters(\n",
+    "                    m=4,\n",
+    "                    ef_construction=400,\n",
+    "                    ef_search=500,\n",
+    "                    metric=VectorSearchAlgorithmMetric.COSINE,\n",
+    "                ),\n",
+    "            ),\n",
+    "            ExhaustiveKnnAlgorithmConfiguration(\n",
+    "                name=\"myExhaustiveKnn\",\n",
+    "                kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,\n",
+    "                parameters=ExhaustiveKnnParameters(\n",
+    "                    metric=VectorSearchAlgorithmMetric.COSINE\n",
+    "                ),\n",
+    "            ),\n",
+    "        ],\n",
+    "        profiles=[\n",
+    "            VectorSearchProfile(\n",
+    "                name=\"myHnswProfile\",\n",
+    "                algorithm_configuration_name=\"myHnsw\",\n",
+    "            ),\n",
+    "            VectorSearchProfile(\n",
+    "                name=\"myExhaustiveKnnProfile\",\n",
+    "                algorithm_configuration_name=\"myExhaustiveKnn\",\n",
+    "            ),\n",
+    "        ],\n",
+    "    )\n",
+    "\n",
+    "    # Create the semantic settings with the configuration\n",
+    "    semantic_search = SemanticSearch(configurations=[semantic_config])\n",
+    "\n",
+    "    # Create the search index.\n",
+    "    index = SearchIndex(\n",
+    "        name=name,\n",
+    "        fields=fields,\n",
+    "        semantic_search=semantic_search,\n",
+    "        vector_search=vector_search,\n",
+    "    )\n",
+    "\n",
+    "    return index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def gen_documents(\n",
+    "    path: str,\n",
+    ") -> List[Dict[str, any]]:\n",
+    "    openai_service_endoint = os.environ[\"AZURE_OPENAI_ENDPOINT\"]\n",
+    "    openai_deployment = \"text-embedding-ada-002\"\n",
+    "\n",
+    "    token_provider = get_bearer_token_provider(DefaultAzureCredential(), \"https://cognitiveservices.azure.com/.default\")\n",
+    "    # openai.Embedding.create() -> client.embeddings.create()\n",
+    "    client = AzureOpenAI(\n",
+    "        api_version=\"2023-07-01-preview\",\n",
+    "        azure_endpoint=openai_service_endoint,\n",
+    "        azure_deployment=openai_deployment,\n",
+    "         azure_ad_token_provider=token_provider\n",
+    "    )\n",
+    "\n",
+    "    documents = pd.read_csv(path)\n",
+    "    items = []\n",
+    "    for document in documents.to_dict(\"records\"):\n",
+    "        content = document[\"description\"]\n",
+    "        id = str(document[\"id\"])\n",
+    "        title = document[\"name\"]\n",
+    "        url = document[\"url\"]\n",
+    "        emb = client.embeddings.create(input=content, model=openai_deployment)\n",
+    "        rec = {\n",
+    "            \"id\": id,\n",
+    "            \"content\": content,\n",
+    "            \"filepath\": f\"{title.lower().replace(' ', '-')}\",\n",
+    "            \"title\": title,\n",
+    "            \"url\": url,\n",
+    "            \"contentVector\": emb.data[0].embedding,\n",
+    "        }\n",
+    "        items.append(rec)\n",
+    "\n",
+    "    return items"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rag_search = os.environ[\"AZURE_SEARCH_ENDPOINT\"]\n",
+    "index_name = \"rag-index\"\n",
+    "\n",
+    "search_index_client = SearchIndexClient(\n",
+    "    rag_search, DefaultAzureCredential()\n",
+    ")\n",
+    "\n",
+    "delete_index(search_index_client, index_name)\n",
+    "index = create_index_definition(index_name)\n",
+    "print(f\"creating index {index_name}\")\n",
+    "search_index_client.create_or_update_index(index)\n",
+    "print(f\"index {index_name} created\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"indexing documents\")\n",
+    "docs = gen_documents(\"sample-documents.csv\")\n",
+    "# Upload our data to the index.\n",
+    "search_client = SearchClient(\n",
+    "    endpoint=rag_search,\n",
+    "    index_name=index_name,\n",
+    "    credential=DefaultAzureCredential(),\n",
+    ")\n",
+    "print(f\"uploading {len(docs)} documents to index {index_name}\")\n",
+    "ds = search_client.upload_documents(docs)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,21 @@
+id,name,description,url
+1,Mare Imbrium,"A vast lunar mare in the Moon's Imbrium Basin, formed by ancient volcanic activity",https://example.com
+2,Tycho Crater,"A prominent lunar impact crater known for its bright ray system, named after the Danish astronomer Tycho Brahe",https://example.com
+3,Sea of Tranquility,"The landing site of Apollo 11, the first manned moon landing, marking humanity's first steps on another celestial body",https://example.com
+4,Copernicus Crater,"A large, prominent crater that is a dominant feature in the Moon's Oceanus Procellarum, showcasing terraced walls and central peaks",https://example.com
+5,Earth to Moon Distance,"The average distance from Earth to the Moon is about 384,400 kilometers or 238,855 miles",https://example.com
+6,Sinus Iridum,"Known as the Bay of Rainbows, a plain of basaltic lava surrounded by the rugged Jura Mountains",https://example.com
+7,Clavius Crater,"One of the largest crater formations on the Moon, visible with the naked eye, featuring a notable arc of smaller craters within it",https://example.com
+8,Mare Serenitatis,"A lunar mare located to the east of Mare Imbrium, notable for its relatively smooth and flat basaltic plains",https://example.com
+9,Mare Frigoris,"A lunar mare that stretches across the Moon's northern edge, distinguished by its cold, dark basaltic surface",https://example.com
+10,Lunar Surface,"The Moon's surface is covered with a layer of regolith, a mix of fine dust and rocky debris created by millennia of meteorite impacts",https://example.com
+11,South Pole-Aitken Basin,"One of the largest and oldest impact features on the Moon, stretching across the Moon's southern hemisphere",https://example.com
+12,Hadley Rille,"A sinuous rille or channel associated with the Mare Imbrium, explored by the Apollo 15 mission",https://example.com
+13,Mountains of the Moon,"A range of mountains on the Moon that includes some of the highest peaks, offering striking views and geological features",https://example.com
+14,Plato Crater,"A large, ancient crater with a dark, flat floor, located near the lunar Alps",https://example.com
+15,Mare Nectaris,"A lunar mare that forms a basin on the Moon's surface, surrounded by rugged highlands and ringed by mountain ranges",https://example.com
+16,Aristarchus Plateau,"A region on the Moon noted for its high albedo and geological complexity, featuring the Aristarchus Crater and Schroter's Valley",https://example.com
+17,Schroter's Valley,"The largest sinuous rille on the Moon, located near the Aristarchus Plateau, resembling a winding river",https://example.com
+18,Mare Humorum,"A smaller mare on the Moon, surrounded by the lunar highlands, characterized by its circular shape and dark basaltic plains",https://example.com
+19,Lunar Maria,"The large, dark plains on the Moon's surface, formed by ancient volcanic eruptions and covering about 16% of the lunar surface",https://example.com
+20,Taurus-Littrow Valley,"The valley explored by the Apollo 17 astronauts, featuring a mix of highlands and volcanic features",https://example.com
@@ -0,0 +1,49 @@
+from typing import List
+import os
+from azure.identity import DefaultAzureCredential
+from azure.search.documents import SearchClient
+from azure.search.documents.models import (
+    VectorizedQuery,
+    QueryType,
+    QueryCaptionType,
+    QueryAnswerType,
+)
+
+def retrieve_documentation(
+    question: str,
+    index_name: str,
+    embedding: List[float],
+) -> str:
+
+    
+    search_client = SearchClient(
+        endpoint=os.environ["AZURE_SEARCH_ENDPOINT"],
+        index_name=index_name,
+        credential=DefaultAzureCredential()
+    )
+
+    vector_query = VectorizedQuery(
+        vector=embedding, k_nearest_neighbors=3, fields="contentVector"
+    )
+
+    results = search_client.search(
+        search_text=question,
+        vector_queries=[vector_query],
+        query_type=QueryType.SEMANTIC,
+        semantic_configuration_name="default",
+        query_caption=QueryCaptionType.EXTRACTIVE,
+        query_answer=QueryAnswerType.EXTRACTIVE,
+        top=3,
+    )
+
+    docs = [
+        {
+            "id": doc["id"],
+            "title": doc["title"],
+            "content": doc["content"],
+            "url": doc["url"],
+        }
+        for doc in results
+    ]
+
+    return docs
@@ -0,0 +1,11 @@
+{
+  "documents": {
+    "id": "1",
+    "title": "Lunar Exploration Guide",
+    "name": "Lunar Exploration Guide",
+    "description": "Explore the moon's wonders with our Lunar Exploration Guide! Covering ancient craters, the Mare Frigoris, and more, learn about lunar phases, its impact on Earth, and the history of moon exploration. Discover astronaut technology and future lunar missions. This guide is packed with maps, photos, and facts for lunar enthusiasts.",
+    "content": "The moon's surface is a fascinating landscape marked by vast plains called 'maria', numerous craters, and rugged highlands. The maria, Latin for 'seas', are large, dark basaltic plains formed by ancient volcanic eruptions, while the craters resulted from collisions with asteroids and comets. The highlands, or 'terrae', are lighter areas that rise above the maria, consisting of anorthosite, a type of lunar rock. This diverse topography offers insights into the moon's geological history and the solar system's early years."
+  },
+  "question": "tell me about the moon's surface", 
+  "chat_history": []
+}