
Commit f8bb760

[python/knowpro] Got the first knowledge extraction working (#823)
- No secondary indexes yet.
- Auth is a mess (see auth.py).
- Lots of infrastructure flexibility is missing.
- No searching.

But I am still excited.
1 parent 0b0c21c commit f8bb760

14 files changed: +53028 -49 lines
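
A minimal way to exercise the new extractor, mirroring demo.py further down in this commit. This is a sketch only; it assumes the Azure OpenAI environment variables read by typechat.create_language_model() are already set (auth.py below can produce AZURE_OPENAI_API_KEY).

# Sketch mirroring demo.py from this commit; assumes the Azure OpenAI
# environment variables consumed by typechat.create_language_model() are set.
import asyncio

from typeagent.knowpro import convknowledge


async def main():
    ke = convknowledge.KnowledgeExtractor()
    # extract() returns a kplib.KnowledgeResponse (entities, actions, topics) or None.
    print(await ke.extract("There is a book about hobbits called the Lord of the Rings."))


asyncio.run(main())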

.gitattributes

+1
@@ -13,3 +13,4 @@ pnpm-lock.yaml merge=binary
 *.jpg -text
 *.gif -text
 *.pdf -text
+*.bin -text

python/ta/Makefile

+2-1
@@ -34,7 +34,8 @@ clean:
 .PHONY: help
 help:
 	@echo "Usage: make [target]"
-	@echo "make help # Help"
+	@echo "make help # Help (this message)"
+	@echo "make # Same as 'make all'"
 	@echo "make all # venv, format, check, test, build"
 	@echo "make format # Run black"
 	@echo "make check # Run pyright"

python/ta/auth.py

+24
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Utility to set AZURE_OPENAI_API_KEY to a valid token.
+
+Usage: eval `./auth.py`
+
+NOTE: The token is only valid for a short time.
+"""
+
+import sys
+
+from azure.identity import DeviceCodeCredential
+
+save_stdout = sys.stdout
+sys.stdout = sys.stderr
+
+# TODO: Do something non-interactive.
+credential = DeviceCodeCredential()
+token = credential.get_token("https://cognitiveservices.azure.com/.default")
+
+sys.stdout = save_stdout
+print(f"export AZURE_OPENAI_API_KEY={token.token}")

python/ta/demo.py

+20
@@ -0,0 +1,20 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import asyncio
+import os
+
+import dotenv
+
+from typeagent.knowpro import convknowledge
+
+
+async def main():
+    dotenv.load_dotenv(os.path.expanduser("~/TypeAgent/ts/.env"))  # TODO: Only works in dev tree
+    # for k, v in os.environ.items():
+    #     print(f"{k}={v!r}")
+    ke = convknowledge.KnowledgeExtractor()
+    print(await ke.extract("There is a book about hobbits called the Lord of the Rings."))
+
+
+asyncio.run(main())

python/ta/requirements.txt

+8-2
@@ -1,6 +1,12 @@
-# runtime deps
+# Runtime deps
+
 typechat @ git+https://github.com/microsoft/TypeChat#subdirectory=python
-# build deps
+python-dotenv
+pydantic
+azure-identity
+
+# Build deps
+
 build
 black
 pyright

python/ta/testdata/Episode_53_AdrianTchaikovsky.txt

+380
Large diffs are not rendered by default.

python/ta/testdata/Episode_53_AdrianTchaikovsky_index_data.json

+52,365
Large diffs are not rendered by default.
Binary file not shown.

python/ta/typeagent/knowpro/convindex.py

+93-4
@@ -4,9 +4,12 @@
 from dataclasses import dataclass, field
 from typing import Any, Callable

+import typechat
+
 from .interfaces import (
     # Interfaces.
     IConversation,
+    IConversationSecondaryIndexes,
     IMessage,
     ITermToSemanticRefIndex,
     # Other imports.
@@ -20,11 +23,12 @@
     SemanticRef,
     TermToSemanticRefIndexItemData,
     TermToSemanticRefIndexData,
+    TextIndexingResult,
     TextLocation,
     TextRange,
     Topic,
 )
-from . import kplib
+from . import convknowledge, importing, kplib


 def text_range_from_location(
@@ -91,12 +95,14 @@ def add_facet(


 def add_topic_to_index(
-    topic: Topic,
+    topic: Topic | str,
     semantic_refs: list[SemanticRef],
     semantic_ref_index: ITermToSemanticRefIndex,
     message_ordinal: MessageOrdinal,
     chunk_ordinal: int = 0,
 ) -> None:
+    if isinstance(topic, str):
+        topic = Topic(text=topic)
     ref_ordinal = len(semantic_refs)
     semantic_refs.append(
         SemanticRef(
@@ -143,6 +149,22 @@ def add_action_to_index(
     add_facet(action.subject_entity_facet, ref_ordinal, semantic_ref_index)


+def add_knowledge_to_index(
+    semantic_refs: list[SemanticRef],
+    semantic_ref_index: ITermToSemanticRefIndex,
+    message_ordinal: MessageOrdinal,
+    knowledge: kplib.KnowledgeResponse,
+) -> None:
+    for entity in knowledge.entities:
+        add_entity_to_index(entity, semantic_refs, semantic_ref_index, message_ordinal)
+    for action in knowledge.actions:
+        add_action_to_index(action, semantic_refs, semantic_ref_index, message_ordinal)
+    for inverse_action in knowledge.inverse_actions:
+        add_action_to_index(inverse_action, semantic_refs, semantic_ref_index, message_ordinal)
+    for topic in knowledge.topics:
+        add_topic_to_index(topic, semantic_refs, semantic_ref_index, message_ordinal)
+
+
 def add_metadata_to_index[TMessage: IMessage](
     messages: list[TMessage],
     semantic_refs: list[SemanticRef],
@@ -242,16 +264,83 @@ def _prepare_term(self, term: str) -> str:
         # ...


+def create_knowledge_extractor(
+    model: typechat.TypeChatLanguageModel | None = None,
+) -> convknowledge.KnowledgeExtractor:
+    return convknowledge.KnowledgeExtractor(model)
+
+
 async def build_conversation_index(
     conversation: IConversation,
-    conversation_settings: Any,  # TODO: ConversationSettings
+    conversation_settings: importing.ConversationSettings,
     event_handler: IndexingEventHandlers | None = None,
 ) -> IndexingResults:
     result = IndexingResults()
+    result.semantic_refs = await build_semantic_ref_index(
+        conversation, None, event_handler
+    )
     # TODO
-    # result.semantic_refs = await build_semantic_ref_index(conversation, None, event_handler)
     # if result.semantic_refs and not result.semantic_refs.error and conversation.semantic_ref_index:
     #     result.secondary_index_results = await build_secondary_indexes(
     #         conversation, conversation_settings, event_handler
     #     )
     return result
+
+
+async def build_semantic_ref_index[TM: IMessage, TC: IConversationSecondaryIndexes](
+    conversation: IConversation[TM, ConversationIndex, TC],
+    extractor: convknowledge.KnowledgeExtractor | None = None,
+    event_handler: IndexingEventHandlers | None = None,
+) -> TextIndexingResult:
+    semantic_ref_index = conversation.semantic_ref_index
+    if semantic_ref_index is None:
+        conversation.semantic_ref_index = semantic_ref_index = ConversationIndex()
+
+    semantic_refs = conversation.semantic_refs
+    if semantic_refs is None:
+        conversation.semantic_refs = semantic_refs = []
+
+    if extractor is None:
+        extractor = create_knowledge_extractor()
+
+    indexing_result = TextIndexingResult()
+
+    for message_ordinal, message in enumerate(conversation.messages):
+        print(f"\nPROCESSING MESSAGE {message_ordinal}")
+        chunk_ordinal = 0
+        # Only one chunk per message for now.
+        text = message.text_chunks[chunk_ordinal]
+        # TODO: retries
+        knowledge = await extractor.extract(text)
+        if knowledge is None:
+            indexing_result.error = f"Failed to extract knowledge from message {message_ordinal}: {text}"
+            print(indexing_result.error)
+            break
+        if knowledge.entities or knowledge.actions or knowledge.inverse_actions or knowledge.topics:
+            add_knowledge_to_index(
+                semantic_refs,
+                semantic_ref_index,
+                message_ordinal,
+                knowledge,
+            )
+        completed_chunk = TextLocation(message_ordinal, chunk_ordinal)
+        indexing_result.completed_upto = completed_chunk
+        if event_handler and event_handler.on_knowledge_extracted:
+            if not event_handler.on_knowledge_extracted(completed_chunk, knowledge):
+                print("BREAK")
+                break
+
+    # dump(semantic_ref_index, semantic_refs)
+
+    return indexing_result
+
+
+def dump(semantic_ref_index: ConversationIndex, semantic_refs: list[SemanticRef]) -> None:
+    print("semantic_ref_index = {")
+    for k, v in semantic_ref_index._map.items():
+        print(f" {k!r}: {v},")
+    print("}\n")
+    print("semantic_refs = {")
+    for semantic_ref in semantic_refs:
+        print(f" {semantic_ref},")
+    print("}\n")

python/ta/typeagent/knowpro/convknowledge.py

+65

@@ -0,0 +1,65 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from dataclasses import dataclass, field
+import os
+
+import typechat
+
+from . import kplib
+
+
+def create_typechat_model() -> typechat.TypeChatLanguageModel:
+    return typechat.create_language_model(dict(os.environ))
+
+
+@dataclass
+class KnowledgeExtractor:
+
+    def __init__(self, model: typechat.TypeChatLanguageModel | None = None):
+        if model is None:
+            model = create_typechat_model()
+        assert model is not None
+        self.model = model
+        self.translator = self.create_translator(self.model)
+
+    async def extract(self, message: str) -> kplib.KnowledgeResponse | None:
+        result: typechat.Result[kplib.KnowledgeResponse] = await self.extract_knowledge(
+            message
+        )
+        if isinstance(result, typechat.Success):
+            return result.value
+        else:
+            return None
+
+    async def extract_knowledge(
+        self, message: str
+    ) -> typechat.Result[kplib.KnowledgeResponse]:
+        result = await self.translator.translate(message)
+        # TODO
+        # if isinstance(result, typechat.Success):
+        #     self.merge_action_knowledge(result.data)
+        return result
+
+    def create_translator(
+        self, model: typechat.TypeChatLanguageModel
+    ) -> typechat.TypeChatJsonTranslator[kplib.KnowledgeResponse]:
+        schema = kplib.KnowledgeResponse
+        type_name = "KnowledgeResponse"
+        validator = typechat.TypeChatValidator[kplib.KnowledgeResponse](schema)
+        translator = typechat.TypeChatJsonTranslator[kplib.KnowledgeResponse](
+            model, validator, kplib.KnowledgeResponse
+        )
+        schema_text = translator._schema_str
+
+        def create_request_prompt(intent: str) -> str:
+            return (
+                f'You are a service that translates user messages in a conversation into JSON objects of type "{type_name}" according to the following TypeScript definitions:\n'
+                + f"```\n{schema_text}```\n"
+                + f"The following are messages in a conversation:\n"
+                + f'"""\n{intent}\n"""\n'
+                + f"The following is the user request translated into a JSON object with 2 spaces of indentation and no properties with the value undefined:\n"
+            )
+
+        translator._create_request_prompt = create_request_prompt
+        return translator

python/ta/typeagent/knowpro/interfaces.py

+25-4
@@ -59,6 +59,9 @@ class ScoredSemanticRefOrdinal:
     semantic_ref_ordinal: SemanticRefOrdinal
     score: float

+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({self.semantic_ref_ordinal}, {self.score})"
+
     def serialize(self) -> "ScoredSemanticRefOrdinalData":
         return ScoredSemanticRefOrdinalData(
             semanticRefOrdinal=self.semantic_ref_ordinal, score=self.score
@@ -115,13 +118,16 @@ class Tag:

 @dataclass(order=True)
 class TextLocation:
-    # The index of the message.
+    # The ordinal of the message.
     message_ordinal: MessageOrdinal
-    # The index of the chunk.
+    # The ordinal of the chunk.
     chunk_ordinal: int = 0
-    # The index of the character within the chunk.
+    # The ordinal of the character within the chunk.
     char_ordinal: int = 0

+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({self.message_ordinal}, {self.chunk_ordinal}, {self.char_ordinal})"
+

 # A text range within a session.
 @dataclass(order=True)
@@ -131,6 +137,12 @@ class TextRange:
     # The end of the range (exclusive). If None, the range is a single point.
     end: TextLocation | None = None

+    def __repr__(self) -> str:
+        if self.end is None:
+            return f"{self.__class__.__name__}({self.start})"
+        else:
+            return f"{self.__class__.__name__}({self.start}, {self.end})"
+
     def __contains__(self, other: Self) -> bool:
         otherend = other.end or other.start
         selfend = self.end or self.start
@@ -144,13 +156,22 @@ class SemanticRef:
     knowledge_type: KnowledgeType
     knowledge: Knowledge

+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({self.semantic_ref_ordinal}, {self.range}, {self.knowledge_type!r}, {self.knowledge})"
+

 @dataclass
 class DateRange:
     start: Datetime
     # Inclusive. If None, the range is unbounded.
     end: Datetime | None = None

+    def __repr__(self) -> str:
+        if self.end is None:
+            return f"{self.__class__.__name__}({self.start})"
+        else:
+            return f"{self.__class__.__name__}({self.start}, {self.end})"
+
     def __contains__(self, datetime: Datetime) -> bool:
         if self.end is None:
             return self.start <= datetime
@@ -410,7 +431,7 @@ class IndexingEventHandlers:

 @dataclass
 class TextIndexingResult:
-    completedUpto: TextLocation | None = None
+    completed_upto: TextLocation | None = None
     error: str | None = None