@@ -1,3 +1,4 @@
+import json
 from typing import AsyncIterator, Optional, Union

 import structlog
@@ -11,32 +12,90 @@


 async def ollama_stream_generator(
-    stream: AsyncIterator[ChatResponse],
+    stream: AsyncIterator[ChatResponse], is_cline_client: bool
 ) -> AsyncIterator[str]:
     """OpenAI-style SSE format"""
     try:
         async for chunk in stream:
             try:
-                yield f"{chunk.model_dump_json()}\n\n"
+                # TODO We should wire in the client info so we can respond with
+                # the correct format and start to handle multiple clients
+                # in a more robust way.
+                if not is_cline_client:
+                    yield f"{chunk.model_dump_json()}\n\n"
+                else:
+                    # First get the raw dict from the chunk
+                    chunk_dict = chunk.model_dump()
+                    # Create response dictionary in OpenAI-like format
+                    response = {
+                        "id": f"chatcmpl-{chunk_dict.get('created_at', '')}",
+                        "object": "chat.completion.chunk",
+                        "created": chunk_dict.get("created_at"),
+                        "model": chunk_dict.get("model"),
+                        "choices": [
+                            {
+                                "index": 0,
+                                "delta": {
+                                    "content": chunk_dict.get("message", {}).get("content", ""),
+                                    "role": chunk_dict.get("message", {}).get("role", "assistant"),
+                                },
+                                "finish_reason": (
+                                    chunk_dict.get("done_reason")
+                                    if chunk_dict.get("done", False)
+                                    else None
+                                ),
+                            }
+                        ],
+                    }
+                    # Preserve existing type or add default if missing
+                    response["type"] = chunk_dict.get("type", "stream")
+
+                    # Add optional fields that might be present in the final message
+                    optional_fields = [
+                        "total_duration",
+                        "load_duration",
+                        "prompt_eval_count",
+                        "prompt_eval_duration",
+                        "eval_count",
+                        "eval_duration",
+                    ]
+                    for field in optional_fields:
+                        if field in chunk_dict:
+                            response[field] = chunk_dict[field]
+
+                    yield f"data: {json.dumps(response)}\n\n"
             except Exception as e:
-                yield f"{str(e)}\n\n"
+                logger.error(f"Error in stream generator: {str(e)}")
+                yield f"data: {json.dumps({'error': str(e), 'type': 'error', 'choices': []})}\n\n"
     except Exception as e:
-        yield f"{str(e)}\n\n"
+        logger.error(f"Stream error: {str(e)}")
+        yield f"data: {json.dumps({'error': str(e), 'type': 'error', 'choices': []})}\n\n"


 class OllamaShim(BaseCompletionHandler):

     def __init__(self, base_url):
         self.client = AsyncClient(host=base_url, timeout=300)
+        self.is_cline_client = False

     async def execute_completion(
         self,
         request: ChatCompletionRequest,
         api_key: Optional[str],
         stream: bool = False,
         is_fim_request: bool = False,
+        is_cline_client: bool = False,
     ) -> Union[ChatResponse, GenerateResponse]:
         """Stream response directly from Ollama API."""
+
+        # TODO: I don't like this, but it's a quick fix for now until we start
+        # passing through the client info so we can respond with the correct
+        # format.
+        # Determine if the client is a Cline client
+        self.is_cline_client = any(
+            "Cline" in message["content"] for message in request.get("messages", [])
+        )
+
         if is_fim_request:
             prompt = request["messages"][0]["content"]
             response = await self.client.generate(
@@ -57,7 +116,7 @@ def _create_streaming_response(self, stream: AsyncIterator[ChatResponse]) -> Str
         is the format that FastAPI expects for streaming responses.
         """
         return StreamingResponse(
-            ollama_stream_generator(stream),
+            ollama_stream_generator(stream, self.is_cline_client),
             media_type="application/x-ndjson",
             headers={
                 "Cache-Control": "no-cache",
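And a minimal smoke test one could run against the generator in this diff. FakeChunk is a hypothetical stand-in that mimics only the two ChatResponse methods the generator calls (model_dump and model_dump_json); it assumes ollama_stream_generator is importable from the changed module.

import asyncio
import json


class FakeChunk:
    """Hypothetical stand-in for ollama.ChatResponse; not part of the library."""

    def __init__(self, data: dict):
        self._data = data

    def model_dump(self) -> dict:
        return self._data

    def model_dump_json(self) -> str:
        return json.dumps(self._data)


async def fake_stream():
    # A single "final" chunk so the Cline branch also sets finish_reason.
    yield FakeChunk(
        {
            "model": "llama3.2",
            "created_at": "2025-01-01T00:00:00Z",
            "message": {"role": "assistant", "content": "Hello"},
            "done": True,
            "done_reason": "stop",
            "eval_count": 1,
        }
    )


async def main() -> None:
    # is_cline_client=True exercises the OpenAI-style SSE branch added in this PR;
    # passing False yields the raw Ollama chunk as before.
    async for line in ollama_stream_generator(fake_stream(), is_cline_client=True):
        print(line, end="")


asyncio.run(main())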