Chunk tolerance annotation in streaming completion docs #5190

Open · wants to merge 4 commits into base: main
@@ -184,15 +184,10 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Comparing usage returns in the above Non Streaming `model_client.create(messages=messages)` vs streaming `model_client.create_stream(messages=messages)` we see differences.\n",
"The non streaming response by default returns valid prompt and completion token usage counts. \n",
"The streamed response by default returns zero values.\n",
"Comparing usage returns in the above non-streaming `model_client.create(messages=messages)` to streaming `model_client.create_stream(messages=messages)`, we see differences. The non-streaming response by default returns a valid prompt and completion token usage counts. The streamed response by default returns zero values.\n",
"\n",
"as documented in the OPENAI API Reference an additional parameter `stream_options` can be specified to return valid usage counts. see [stream_options](https://platform.openai.com/docs/api-reference/chat/create#chat-create-stream_options)\n",
"As documented in the OpenAI API Reference, an additional parameter `stream_options` can be specified to return valid usage counts. See [stream_options](https://platform.openai.com/docs/api-reference/chat/create#chat-create-stream_options). Only set this when using streaming, i.e. when using `create_stream`. To enable this, set `extra_create_args={\"stream_options\": {\"include_usage\": True}},` when calling `create_stream`. Depending on which completion client is being used, the maximum empty chunks allowed may need to be adjusted, e.g. `max_consecutive_empty_chunk_tolerance=2`, to account for the trailing empty message containing usage information.\n",
"\n",
"Only set this when you using streaming ie , using `create_stream` \n",
"\n",
"to enable this in `create_stream` set `extra_create_args={\"stream_options\": {\"include_usage\": True}},`\n",
"\n",
"```{note}\n",
"Note whilst other API's like LiteLLM also support this, it is not always guarenteed that it is fully supported or correct.\n",
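Putting the cell's advice together, here is a minimal sketch, assuming `autogen_ext`'s `OpenAIChatCompletionClient`, an `OPENAI_API_KEY` in the environment, and an illustrative model name; the tolerance value follows the text above:

```python
# A sketch comparing usage from create() vs create_stream(); the model name,
# message, and tolerance value are illustrative assumptions.
import asyncio

from autogen_core.models import UserMessage
from autogen_ext.models.openai import OpenAIChatCompletionClient


async def main() -> None:
    model_client = OpenAIChatCompletionClient(model="gpt-4o-mini")
    messages = [UserMessage(content="Hello!", source="user")]

    # Non-streaming: prompt and completion token counts are populated by default.
    result = await model_client.create(messages=messages)
    print(result.usage)

    # Streaming: usage stays zero unless stream_options requests it; the usage
    # arrives in a trailing empty chunk, so the empty-chunk tolerance is raised.
    last = None
    async for item in model_client.create_stream(
        messages=messages,
        extra_create_args={"stream_options": {"include_usage": True}},
        max_consecutive_empty_chunk_tolerance=2,
    ):
        last = item  # string chunks, then a final CreateResult carrying usage
    print(last.usage)


asyncio.run(main())
```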
@@ -1008,6 +1008,8 @@ class OpenAIChatCompletionClient(BaseOpenAIChatCompletionClient, Component[OpenA

client = ChatCompletionClient.load_component(config)

Note: When usage information is requested (see the `documentation <https://platform.openai.com/docs/api-reference/chat/streaming#chat/streaming-choices>`_) with the ``create_stream`` method, ``max_consecutive_empty_chunk_tolerance`` should be increased to permit the trailing empty chunk carrying the usage information, e.g. ``completion_client.create_stream(..., max_consecutive_empty_chunk_tolerance=2, extra_create_args={"stream_options": {"include_usage": True}})``.
Collaborator: Also, apologies for the confusion in my comment on a different issue. I think this should go in the API doc of the `create_stream` method in `BaseOpenAIChatCompletionClient`.


To view the full list of available configuration options, see the :py:class:`OpenAIClientConfigurationConfigModel` class.

"""
@@ -1117,7 +1119,7 @@ class AzureOpenAIChatCompletionClient(
# api_key="sk-...", # For key-based authentication. `AZURE_OPENAI_API_KEY` environment variable can also be used instead.
)

To load the client that uses identity based aith from a configuration, you can use the `load_component` method:
To load the client that uses identity based auth from a configuration, you can use the `load_component` method:

.. code-block:: python

@@ -1142,7 +1144,8 @@ class AzureOpenAIChatCompletionClient(

client = ChatCompletionClient.load_component(config)


Note: When usage information is requested (see the `documentation <https://platform.openai.com/docs/api-reference/chat/streaming#chat/streaming-choices>`_) with the ``create_stream`` method, ``max_consecutive_empty_chunk_tolerance`` should be increased to permit the trailing empty chunk carrying the usage information, e.g. ``completion_client.create_stream(..., max_consecutive_empty_chunk_tolerance=2, extra_create_args={"stream_options": {"include_usage": True}})``.

To view the full list of available configuration options, see the :py:class:`AzureOpenAIClientConfigurationConfigModel` class.


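A hedged sketch combining the identity-based setup with the usage-enabled stream described in the note; the deployment, model, endpoint, and scope values are placeholders:

```python
# Sketch: identity-based auth plus the usage-enabled stream from the note.
# Deployment, model, endpoint, and scope values are placeholders.
import asyncio

from autogen_core.models import UserMessage
from autogen_ext.auth.azure import AzureTokenProvider
from autogen_ext.models.openai import AzureOpenAIChatCompletionClient
from azure.identity import DefaultAzureCredential

token_provider = AzureTokenProvider(
    DefaultAzureCredential(),
    "https://cognitiveservices.azure.com/.default",
)

client = AzureOpenAIChatCompletionClient(
    azure_deployment="my-deployment",  # placeholder
    model="gpt-4o",
    api_version="2024-06-01",
    azure_endpoint="https://my-endpoint.openai.azure.com/",  # placeholder
    azure_ad_token_provider=token_provider,
)


async def main() -> None:
    last = None
    async for item in client.create_stream(
        messages=[UserMessage(content="Hello!", source="user")],
        max_consecutive_empty_chunk_tolerance=2,  # admit the trailing usage-only chunk
        extra_create_args={"stream_options": {"include_usage": True}},
    ):
        last = item
    print(last.usage)


asyncio.run(main())
```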