
Commit f8bb760

[python/knowpro] Got the first knowledge extraction working (#823)
- No secondary indexes yet.
- Auth is a mess (see auth.py).
- Lots of infrastructure flexibility is missing.
- No searching.

But I am still excited.
1 parent 0b0c21c commit f8bb760

14 files changed: +53028 -49 lines
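
A minimal way to exercise the new extractor, mirroring demo.py further down in this commit. This is a sketch only; it assumes the Azure OpenAI environment variables read by typechat.create_language_model() are already set (auth.py below can produce AZURE_OPENAI_API_KEY).

# Sketch mirroring demo.py from this commit; assumes the Azure OpenAI
# environment variables consumed by typechat.create_language_model() are set.
import asyncio

from typeagent.knowpro import convknowledge


async def main():
    ke = convknowledge.KnowledgeExtractor()
    # extract() returns a kplib.KnowledgeResponse (entities, actions, topics) or None.
    print(await ke.extract("There is a book about hobbits called the Lord of the Rings."))


asyncio.run(main())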

.gitattributes

+1
@@ -13,3 +13,4 @@ pnpm-lock.yaml merge=binary
 *.jpg -text
 *.gif -text
 *.pdf -text
+*.bin -text

python/ta/Makefile

+2-1
@@ -34,7 +34,8 @@ clean:
 .PHONY: help
 help:
 	@echo "Usage: make [target]"
-	@echo "make help # Help"
+	@echo "make help # Help (this message)"
+	@echo "make # Same as 'make all'"
 	@echo "make all # venv, format, check, test, build"
 	@echo "make format # Run black"
 	@echo "make check # Run pyright"

python/ta/auth.py

+24
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Utility to set AZURE_OPENAI_API_KEY to a valid token.
+
+Usage: eval `./auth.py`
+
+NOTE: The token is only valid for a short time.
+"""
+
+import sys
+
+from azure.identity import DeviceCodeCredential
+
+save_stdout = sys.stdout
+sys.stdout = sys.stderr
+
+# TODO: Do something non-interactive.
+credential = DeviceCodeCredential()
+token = credential.get_token("https://cognitiveservices.azure.com/.default")
+
+sys.stdout = save_stdout
+print(f"export AZURE_OPENAI_API_KEY={token.token}")

python/ta/demo.py

+20
@@ -0,0 +1,20 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import asyncio
+import os
+
+import dotenv
+
+from typeagent.knowpro import convknowledge
+
+
+async def main():
+    dotenv.load_dotenv(os.path.expanduser("~/TypeAgent/ts/.env"))  # TODO: Only works in dev tree
+    # for k, v in os.environ.items():
+    #     print(f"{k}={v!r}")
+    ke = convknowledge.KnowledgeExtractor()
+    print(await ke.extract("There is a book about hobbits called the Lord of the Rings."))
+
+
+asyncio.run(main())

python/ta/requirements.txt

+8-2
@@ -1,6 +1,12 @@
-# runtime deps
+# Runtime deps
+
 typechat @ git+https://github.com/microsoft/TypeChat#subdirectory=python
-# build deps
+python-dotenv
+pydantic
+azure-identity
+
+# Build deps
+
 build
 black
 pyright

python/ta/testdata/Episode_53_AdrianTchaikovsky.txt

+380
Large diffs are not rendered by default.

python/ta/testdata/Episode_53_AdrianTchaikovsky_index_data.json

+52,365
Large diffs are not rendered by default.
Binary file not shown.

python/ta/typeagent/knowpro/convindex.py

+93-4
@@ -4,9 +4,12 @@
 from dataclasses import dataclass, field
 from typing import Any, Callable

+import typechat
+
 from .interfaces import (
     # Interfaces.
     IConversation,
+    IConversationSecondaryIndexes,
     IMessage,
     ITermToSemanticRefIndex,
     # Other imports.
@@ -20,11 +23,12 @@
     SemanticRef,
     TermToSemanticRefIndexItemData,
     TermToSemanticRefIndexData,
+    TextIndexingResult,
     TextLocation,
     TextRange,
     Topic,
 )
-from . import kplib
+from . import convknowledge, importing, kplib


 def text_range_from_location(
@@ -91,12 +95,14 @@ def add_facet(


 def add_topic_to_index(
-    topic: Topic,
+    topic: Topic | str,
     semantic_refs: list[SemanticRef],
     semantic_ref_index: ITermToSemanticRefIndex,
     message_ordinal: MessageOrdinal,
     chunk_ordinal: int = 0,
 ) -> None:
+    if isinstance(topic, str):
+        topic = Topic(text=topic)
     ref_ordinal = len(semantic_refs)
     semantic_refs.append(
         SemanticRef(
@@ -143,6 +149,22 @@ def add_action_to_index(
     add_facet(action.subject_entity_facet, ref_ordinal, semantic_ref_index)


+def add_knowledge_to_index(
+    semantic_refs: list[SemanticRef],
+    semantic_ref_index: ITermToSemanticRefIndex,
+    message_ordinal: MessageOrdinal,
+    knowledge: kplib.KnowledgeResponse,
+) -> None:
+    for entity in knowledge.entities:
+        add_entity_to_index(entity, semantic_refs, semantic_ref_index, message_ordinal)
+    for action in knowledge.actions:
+        add_action_to_index(action, semantic_refs, semantic_ref_index, message_ordinal)
+    for inverse_action in knowledge.inverse_actions:
+        add_action_to_index(inverse_action, semantic_refs, semantic_ref_index, message_ordinal)
+    for topic in knowledge.topics:
+        add_topic_to_index(topic, semantic_refs, semantic_ref_index, message_ordinal)
+
+
 def add_metadata_to_index[TMessage: IMessage](
     messages: list[TMessage],
     semantic_refs: list[SemanticRef],
@@ -242,16 +264,83 @@ def _prepare_term(self, term: str) -> str:
         # ...


+def create_knowledge_extractor(
+    model: typechat.TypeChatLanguageModel | None = None,
+) -> convknowledge.KnowledgeExtractor:
+    return convknowledge.KnowledgeExtractor(model)
+
+
 async def build_conversation_index(
     conversation: IConversation,
-    conversation_settings: Any,  # TODO: ConversationSettings
+    conversation_settings: importing.ConversationSettings,
     event_handler: IndexingEventHandlers | None = None,
 ) -> IndexingResults:
     result = IndexingResults()
+    result.semantic_refs = await build_semantic_ref_index(
+        conversation, None, event_handler
+    )
     # TODO
-    # result.semantic_refs = await build_semantic_ref_index(conversation, None, event_handler)
     # if result.semantic_refs and not result.semantic_refs.error and conversation.semantic_ref_index:
     #     result.secondary_index_results = await build_secondary_indexes(
     #         conversation, conversation_settings, event_handler
     #     )
     return result
+
+
+async def build_semantic_ref_index[TM: IMessage, TC: IConversationSecondaryIndexes](
+    conversation: IConversation[TM, ConversationIndex, TC],
+    extractor: convknowledge.KnowledgeExtractor | None = None,
+    event_handler: IndexingEventHandlers | None = None,
+) -> TextIndexingResult:
+    semantic_ref_index = conversation.semantic_ref_index
+    if semantic_ref_index is None:
+        conversation.semantic_ref_index = semantic_ref_index = ConversationIndex()
+
+    semantic_refs = conversation.semantic_refs
+    if semantic_refs is None:
+        conversation.semantic_refs = semantic_refs = []
+
+    if extractor is None:
+        extractor = create_knowledge_extractor()
+
+    indexing_result = TextIndexingResult()
+
+    for message_ordinal, message in enumerate(conversation.messages):
+        print(f"\nPROCESSING MESSAGE {message_ordinal}")
+        chunk_ordinal = 0
+        # Only one chunk per message for now.
+        text = message.text_chunks[chunk_ordinal]
+        # TODO: retries
+        knowledge = await extractor.extract(text)
+        if knowledge is None:
+            indexing_result.error = f"Failed to extract knowledge from message {message_ordinal}: {text}"
+            print(indexing_result.error)
+            break
+        if knowledge.entities or knowledge.actions or knowledge.inverse_actions or knowledge.topics:
+            add_knowledge_to_index(
+                semantic_refs,
+                semantic_ref_index,
+                message_ordinal,
+                knowledge,
+            )
+        completed_chunk = TextLocation(message_ordinal, chunk_ordinal)
+        indexing_result.completed_upto = completed_chunk
+        if event_handler and event_handler.on_knowledge_extracted:
+            if not event_handler.on_knowledge_extracted(completed_chunk, knowledge):
+                print("BREAK")
+                break
+
+    # dump(semantic_ref_index, semantic_refs)
+
+    return indexing_result
+
+
+def dump(semantic_ref_index: ConversationIndex, semantic_refs: list[SemanticRef]) -> None:
+    print("semantic_ref_index = {")
+    for k, v in semantic_ref_index._map.items():
+        print(f" {k!r}: {v},")
+    print("}\n")
+    print("semantic_refs = {")
+    for semantic_ref in semantic_refs:
+        print(f" {semantic_ref},")
+    print("}\n")

python/ta/typeagent/knowpro/convknowledge.py

+65

@@ -0,0 +1,65 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from dataclasses import dataclass, field
+import os
+
+import typechat
+
+from . import kplib
+
+
+def create_typechat_model() -> typechat.TypeChatLanguageModel:
+    return typechat.create_language_model(dict(os.environ))
+
+
+@dataclass
+class KnowledgeExtractor:
+
+    def __init__(self, model: typechat.TypeChatLanguageModel | None = None):
+        if model is None:
+            model = create_typechat_model()
+        assert model is not None
+        self.model = model
+        self.translator = self.create_translator(self.model)
+
+    async def extract(self, message: str) -> kplib.KnowledgeResponse | None:
+        result: typechat.Result[kplib.KnowledgeResponse] = await self.extract_knowledge(
+            message
+        )
+        if isinstance(result, typechat.Success):
+            return result.value
+        else:
+            return None
+
+    async def extract_knowledge(
+        self, message: str
+    ) -> typechat.Result[kplib.KnowledgeResponse]:
+        result = await self.translator.translate(message)
+        # TODO
+        # if isinstance(result, typechat.Success):
+        #     self.merge_action_knowledge(result.data)
+        return result
+
+    def create_translator(
+        self, model: typechat.TypeChatLanguageModel
+    ) -> typechat.TypeChatJsonTranslator[kplib.KnowledgeResponse]:
+        schema = kplib.KnowledgeResponse
+        type_name = "KnowledgeResponse"
+        validator = typechat.TypeChatValidator[kplib.KnowledgeResponse](schema)
+        translator = typechat.TypeChatJsonTranslator[kplib.KnowledgeResponse](
+            model, validator, kplib.KnowledgeResponse
+        )
+        schema_text = translator._schema_str
+
+        def create_request_prompt(intent: str) -> str:
+            return (
+                f'You are a service that translates user messages in a conversation into JSON objects of type "{type_name}" according to the following TypeScript definitions:\n'
+                + f"```\n{schema_text}```\n"
+                + f"The following are messages in a conversation:\n"
+                + f'"""\n{intent}\n"""\n'
+                + f"The following is the user request translated into a JSON object with 2 spaces of indentation and no properties with the value undefined:\n"
+            )
+
+        translator._create_request_prompt = create_request_prompt
+        return translator

python/ta/typeagent/knowpro/interfaces.py

+25-4
@@ -59,6 +59,9 @@ class ScoredSemanticRefOrdinal:
     semantic_ref_ordinal: SemanticRefOrdinal
     score: float

+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({self.semantic_ref_ordinal}, {self.score})"
+
     def serialize(self) -> "ScoredSemanticRefOrdinalData":
         return ScoredSemanticRefOrdinalData(
             semanticRefOrdinal=self.semantic_ref_ordinal, score=self.score
@@ -115,13 +118,16 @@ class Tag:

 @dataclass(order=True)
 class TextLocation:
-    # The index of the message.
+    # The ordinal of the message.
     message_ordinal: MessageOrdinal
-    # The index of the chunk.
+    # The ordinal of the chunk.
     chunk_ordinal: int = 0
-    # The index of the character within the chunk.
+    # The ordinal of the character within the chunk.
     char_ordinal: int = 0

+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({self.message_ordinal}, {self.chunk_ordinal}, {self.char_ordinal})"
+

 # A text range within a session.
 @dataclass(order=True)
@@ -131,6 +137,12 @@ class TextRange:
     # The end of the range (exclusive). If None, the range is a single point.
     end: TextLocation | None = None

+    def __repr__(self) -> str:
+        if self.end is None:
+            return f"{self.__class__.__name__}({self.start})"
+        else:
+            return f"{self.__class__.__name__}({self.start}, {self.end})"
+
     def __contains__(self, other: Self) -> bool:
         otherend = other.end or other.start
         selfend = self.end or self.start
@@ -144,13 +156,22 @@ class SemanticRef:
     knowledge_type: KnowledgeType
     knowledge: Knowledge

+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({self.semantic_ref_ordinal}, {self.range}, {self.knowledge_type!r}, {self.knowledge})"
+

 @dataclass
 class DateRange:
     start: Datetime
     # Inclusive. If None, the range is unbounded.
     end: Datetime | None = None

+    def __repr__(self) -> str:
+        if self.end is None:
+            return f"{self.__class__.__name__}({self.start})"
+        else:
+            return f"{self.__class__.__name__}({self.start}, {self.end})"
+
     def __contains__(self, datetime: Datetime) -> bool:
         if self.end is None:
             return self.start <= datetime
@@ -410,7 +431,7 @@ class IndexingEventHandlers:

 @dataclass
 class TextIndexingResult:
-    completedUpto: TextLocation | None = None
+    completed_upto: TextLocation | None = None
     error: str | None = None