diff --git a/.gitignore b/.gitignore
index 8779740800..70815df7f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,7 @@ dist
.envrc
codegen.log
Brewfile.lock.json
+
+.DS_Store
+
+examples/*.mp3
diff --git a/.release-please-manifest.json b/.release-please-manifest.json
index 4556676715..42bc7e250e 100644
--- a/.release-please-manifest.json
+++ b/.release-please-manifest.json
@@ -1,3 +1,3 @@
{
- ".": "1.67.0"
+ ".": "1.68.0"
}
\ No newline at end of file
diff --git a/.stats.yml b/.stats.yml
index e0b06dc22a..abb9371314 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,2 +1,2 @@
-configured_endpoints: 81
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-b26121d5df6eb5d3032a45a267473798b15fcfec76dd44a3256cf1238be05fa4.yml
+configured_endpoints: 82
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c22f59c66aec7914b6ee653d3098d1c1c8c16c180d2a158e819c8ddbf476f74b.yml
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ddd8b945c6..78ae21f27f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,14 @@
# Changelog
+## 1.68.0 (2025-03-20)
+
+Full Changelog: [v1.67.0...v1.68.0](https://github.com/openai/openai-python/compare/v1.67.0...v1.68.0)
+
+### Features
+
+* add audio helpers ([423655c](https://github.com/openai/openai-python/commit/423655ca9077cfd258f1e52f6eb386fc8307fa5f))
+* **api:** new models for TTS, STT, + new audio features for Realtime ([#2232](https://github.com/openai/openai-python/issues/2232)) ([ab5192d](https://github.com/openai/openai-python/commit/ab5192d0a7b417ade622ec94dd48f86beb90692c))
+
## 1.67.0 (2025-03-19)
Full Changelog: [v1.66.5...v1.67.0](https://github.com/openai/openai-python/compare/v1.66.5...v1.67.0)
diff --git a/api.md b/api.md
index 7f3a9392a2..a5f81c624c 100644
--- a/api.md
+++ b/api.md
@@ -151,7 +151,11 @@ Types:
```python
from openai.types.audio import (
Transcription,
+ TranscriptionInclude,
TranscriptionSegment,
+ TranscriptionStreamEvent,
+ TranscriptionTextDeltaEvent,
+ TranscriptionTextDoneEvent,
TranscriptionVerbose,
TranscriptionWord,
TranscriptionCreateResponse,
@@ -338,7 +342,9 @@ from openai.types.beta.realtime import (
ConversationItemDeleteEvent,
ConversationItemDeletedEvent,
ConversationItemInputAudioTranscriptionCompletedEvent,
+ ConversationItemInputAudioTranscriptionDeltaEvent,
ConversationItemInputAudioTranscriptionFailedEvent,
+ ConversationItemRetrieveEvent,
ConversationItemTruncateEvent,
ConversationItemTruncatedEvent,
ConversationItemWithReference,
@@ -375,6 +381,8 @@ from openai.types.beta.realtime import (
SessionCreatedEvent,
SessionUpdateEvent,
SessionUpdatedEvent,
+ TranscriptionSessionUpdate,
+ TranscriptionSessionUpdatedEvent,
)
```
@@ -390,6 +398,18 @@ Methods:
- client.beta.realtime.sessions.create(\*\*params) -> SessionCreateResponse
+### TranscriptionSessions
+
+Types:
+
+```python
+from openai.types.beta.realtime import TranscriptionSession
+```
+
+Methods:
+
+- client.beta.realtime.transcription_sessions.create(\*\*params) -> TranscriptionSession
+
## Assistants
Types:
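
> Editor's note: a minimal sketch of the new `transcription_sessions` surface documented above. The no-argument call and printing the whole object are assumptions; consult the generated method signature for the full parameter set.

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# Mint an ephemeral realtime transcription session, e.g. to hand to a browser client.
session = client.beta.realtime.transcription_sessions.create()
print(session)
```
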
diff --git a/examples/audio.py b/examples/audio.py
index 85f47bfb06..af41fe601b 100755
--- a/examples/audio.py
+++ b/examples/audio.py
@@ -1,6 +1,5 @@
#!/usr/bin/env rye run python
-import time
from pathlib import Path
from openai import OpenAI
@@ -12,8 +11,6 @@
def main() -> None:
- stream_to_speakers()
-
# Create text-to-speech audio file
with openai.audio.speech.with_streaming_response.create(
model="tts-1",
@@ -37,28 +34,5 @@ def main() -> None:
print(translation.text)
-def stream_to_speakers() -> None:
- import pyaudio
-
- player_stream = pyaudio.PyAudio().open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
-
- start_time = time.time()
-
- with openai.audio.speech.with_streaming_response.create(
- model="tts-1",
- voice="alloy",
- response_format="pcm", # similar to WAV, but without a header chunk at the start.
- input="""I see skies of blue and clouds of white
- The bright blessed days, the dark sacred nights
- And I think to myself
- What a wonderful world""",
- ) as response:
- print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
- for chunk in response.iter_bytes(chunk_size=1024):
- player_stream.write(chunk)
-
- print(f"Done in {int((time.time() - start_time) * 1000)}ms.")
-
-
if __name__ == "__main__":
main()
diff --git a/examples/speech_to_text.py b/examples/speech_to_text.py
new file mode 100755
index 0000000000..cc3f56b424
--- /dev/null
+++ b/examples/speech_to_text.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env rye run python
+
+import asyncio
+
+from openai import AsyncOpenAI
+from openai.helpers import Microphone
+
+# gets OPENAI_API_KEY from your environment variables
+openai = AsyncOpenAI()
+
+
+async def main() -> None:
+ print("Recording for the next 10 seconds...")
+ recording = await Microphone(timeout=10).record()
+ print("Recording complete")
+ transcription = await openai.audio.transcriptions.create(
+ model="whisper-1",
+ file=recording,
+ )
+
+ print(transcription.text)
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/examples/text_to_speech.py b/examples/text_to_speech.py
new file mode 100755
index 0000000000..ac8b12b0ab
--- /dev/null
+++ b/examples/text_to_speech.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env rye run python
+
+import time
+import asyncio
+
+from openai import AsyncOpenAI
+from openai.helpers import LocalAudioPlayer
+
+# gets OPENAI_API_KEY from your environment variables
+openai = AsyncOpenAI()
+
+
+async def main() -> None:
+ start_time = time.time()
+
+ async with openai.audio.speech.with_streaming_response.create(
+ model="tts-1",
+ voice="alloy",
+ response_format="pcm", # similar to WAV, but without a header chunk at the start.
+ input="""I see skies of blue and clouds of white
+ The bright blessed days, the dark sacred nights
+ And I think to myself
+ What a wonderful world""",
+ ) as response:
+ print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
+ await LocalAudioPlayer().play(response)
+ print(f"Time to play: {int((time.time() - start_time) * 1000)}ms")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/pyproject.toml b/pyproject.toml
index a0a7eba2f5..5ee7157038 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "openai"
-version = "1.67.0"
+version = "1.68.0"
description = "The official Python library for the openai API"
dynamic = ["readme"]
license = "Apache-2.0"
@@ -16,6 +16,8 @@ dependencies = [
"sniffio",
"tqdm > 4",
"jiter>=0.4.0, <1",
+ "sounddevice>=0.5.1",
+ "numpy>=2.0.2",
]
requires-python = ">= 3.8"
classifiers = [
diff --git a/requirements-dev.lock b/requirements-dev.lock
index 48e49f926c..0755ddb3c5 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -33,6 +33,7 @@ certifi==2023.7.22
# via requests
cffi==1.16.0
# via cryptography
+ # via sounddevice
charset-normalizer==3.3.2
# via requests
click==8.1.7
@@ -92,7 +93,7 @@ nest-asyncio==1.6.0
nodeenv==1.8.0
# via pyright
nox==2023.4.22
-numpy==1.26.3
+numpy==2.0.2
# via openai
# via pandas
# via pandas-stubs
@@ -102,7 +103,7 @@ packaging==23.2
# via black
# via nox
# via pytest
-pandas==2.1.4
+pandas==2.2.3
# via openai
pandas-stubs==2.1.4.231227
# via openai
@@ -154,6 +155,8 @@ sniffio==1.3.0
# via trio
sortedcontainers==2.4.0
# via trio
+sounddevice==0.5.1
+ # via openai
time-machine==2.9.0
toml==0.10.2
# via inline-snapshot
diff --git a/requirements.lock b/requirements.lock
index b935c0ee59..fa88e26c0f 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -18,6 +18,8 @@ anyio==4.1.0
certifi==2023.7.22
# via httpcore
# via httpx
+cffi==1.17.1
+ # via sounddevice
distro==1.8.0
# via openai
exceptiongroup==1.2.2
@@ -41,6 +43,8 @@ pandas==2.2.3
# via openai
pandas-stubs==2.2.2.240807
# via openai
+pycparser==2.22
+ # via cffi
pydantic==2.10.3
# via openai
pydantic-core==2.27.1
@@ -54,6 +58,8 @@ six==1.16.0
sniffio==1.3.0
# via anyio
# via openai
+sounddevice==0.5.1
+ # via openai
tqdm==4.66.5
# via openai
types-pytz==2024.2.0.20241003
diff --git a/src/openai/_version.py b/src/openai/_version.py
index b63e6ad189..23e4e7ffb7 100644
--- a/src/openai/_version.py
+++ b/src/openai/_version.py
@@ -1,4 +1,4 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
__title__ = "openai"
-__version__ = "1.67.0" # x-release-please-version
+__version__ = "1.68.0" # x-release-please-version
diff --git a/src/openai/helpers.py b/src/openai/helpers.py
new file mode 100644
index 0000000000..1a10168a96
--- /dev/null
+++ b/src/openai/helpers.py
@@ -0,0 +1,4 @@
+from .helpers.microphone import Microphone
+from .helpers.local_audio_player import LocalAudioPlayer
+
+__all__ = ["LocalAudioPlayer", "Microphone"]
diff --git a/src/openai/helpers/__init__.py b/src/openai/helpers/__init__.py
new file mode 100644
index 0000000000..ab3044da59
--- /dev/null
+++ b/src/openai/helpers/__init__.py
@@ -0,0 +1,4 @@
+from .microphone import Microphone
+from .local_audio_player import LocalAudioPlayer
+
+__all__ = ["Microphone", "LocalAudioPlayer"]
diff --git a/src/openai/helpers/local_audio_player.py b/src/openai/helpers/local_audio_player.py
new file mode 100644
index 0000000000..46a16ce6bb
--- /dev/null
+++ b/src/openai/helpers/local_audio_player.py
@@ -0,0 +1,162 @@
+# mypy: ignore-errors
+import queue
+import asyncio
+from typing import Any, Union, Callable, AsyncGenerator, cast
+
+import numpy as np
+import sounddevice as sd # type: ignore
+import numpy.typing as npt
+
+from .. import _legacy_response
+from .._response import StreamedBinaryAPIResponse, AsyncStreamedBinaryAPIResponse
+
+SAMPLE_RATE = 24000
+
+
+class LocalAudioPlayer:
+ def __init__(
+ self,
+ should_stop: Union[Callable[[], bool], None] = None,
+ ):
+ self.channels = 1
+ self.dtype = np.float32
+ self.should_stop = should_stop
+
+ async def _tts_response_to_buffer(
+ self,
+ response: Union[
+ _legacy_response.HttpxBinaryResponseContent,
+ AsyncStreamedBinaryAPIResponse,
+ StreamedBinaryAPIResponse,
+ ],
+ ) -> npt.NDArray[np.float32]:
+ chunks: list[bytes] = []
+ if isinstance(response, _legacy_response.HttpxBinaryResponseContent) or isinstance(
+ response, StreamedBinaryAPIResponse
+ ):
+ for chunk in response.iter_bytes(chunk_size=1024):
+ if chunk:
+ chunks.append(chunk)
+ else:
+ async for chunk in response.iter_bytes(chunk_size=1024):
+ if chunk:
+ chunks.append(chunk)
+
+ audio_bytes = b"".join(chunks)
+ audio_np = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32767.0
+ audio_np = audio_np.reshape(-1, 1)
+ return audio_np
+
+ async def play(
+ self,
+ input: Union[
+ npt.NDArray[np.int16],
+ npt.NDArray[np.float32],
+ _legacy_response.HttpxBinaryResponseContent,
+ AsyncStreamedBinaryAPIResponse,
+ StreamedBinaryAPIResponse,
+ ],
+ ) -> None:
+ audio_content: npt.NDArray[np.float32]
+ if isinstance(input, np.ndarray):
+ if input.dtype == np.int16 and self.dtype == np.float32:
+ audio_content = (input.astype(np.float32) / 32767.0).reshape(-1, self.channels)
+ elif input.dtype == np.float32:
+ audio_content = cast(npt.NDArray[np.float32], input)
+ else:
+ raise ValueError(f"Unsupported dtype: {input.dtype}")
+ else:
+ audio_content = await self._tts_response_to_buffer(input)
+
+ loop = asyncio.get_event_loop()
+ event = asyncio.Event()
+ idx = 0
+
+ def callback(
+ outdata: npt.NDArray[np.float32],
+ frame_count: int,
+ _time_info: Any,
+ _status: Any,
+ ):
+ nonlocal idx
+
+ remainder = len(audio_content) - idx
+ if remainder == 0 or (callable(self.should_stop) and self.should_stop()):
+ loop.call_soon_threadsafe(event.set)
+ raise sd.CallbackStop
+ valid_frames = frame_count if remainder >= frame_count else remainder
+ outdata[:valid_frames] = audio_content[idx : idx + valid_frames]
+ outdata[valid_frames:] = 0
+ idx += valid_frames
+
+ stream = sd.OutputStream(
+ samplerate=SAMPLE_RATE,
+ callback=callback,
+ dtype=audio_content.dtype,
+ channels=audio_content.shape[1],
+ )
+ with stream:
+ await event.wait()
+
+ async def play_stream(
+ self,
+ buffer_stream: AsyncGenerator[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None], None],
+ ) -> None:
+ loop = asyncio.get_event_loop()
+ event = asyncio.Event()
+ buffer_queue: queue.Queue[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None]] = queue.Queue(maxsize=50)
+
+ async def buffer_producer():
+ async for buffer in buffer_stream:
+ if buffer is None:
+ break
+ await loop.run_in_executor(None, buffer_queue.put, buffer)
+ await loop.run_in_executor(None, buffer_queue.put, None) # Signal completion
+
+ def callback(
+ outdata: npt.NDArray[np.float32],
+ frame_count: int,
+ _time_info: Any,
+ _status: Any,
+ ):
+ nonlocal current_buffer, buffer_pos
+
+ frames_written = 0
+ while frames_written < frame_count:
+ if current_buffer is None or buffer_pos >= len(current_buffer):
+ try:
+ current_buffer = buffer_queue.get(timeout=0.1)
+ if current_buffer is None:
+ loop.call_soon_threadsafe(event.set)
+ raise sd.CallbackStop
+ buffer_pos = 0
+
+ if current_buffer.dtype == np.int16 and self.dtype == np.float32:
+ current_buffer = (current_buffer.astype(np.float32) / 32767.0).reshape(-1, self.channels)
+
+ except queue.Empty:
+ outdata[frames_written:] = 0
+ return
+
+ remaining_frames = len(current_buffer) - buffer_pos
+ frames_to_write = min(frame_count - frames_written, remaining_frames)
+ outdata[frames_written : frames_written + frames_to_write] = current_buffer[
+ buffer_pos : buffer_pos + frames_to_write
+ ]
+ buffer_pos += frames_to_write
+ frames_written += frames_to_write
+
+ current_buffer = None
+ buffer_pos = 0
+
+ producer_task = asyncio.create_task(buffer_producer())
+
+ with sd.OutputStream(
+ samplerate=SAMPLE_RATE,
+ channels=self.channels,
+ dtype=self.dtype,
+ callback=callback,
+ ):
+ await event.wait()
+
+ await producer_task
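
> Editor's note: a minimal usage sketch for the `LocalAudioPlayer` helper above, playing a locally generated buffer rather than an API response. It assumes `sounddevice` can open the default output device; the 440 Hz tone is purely illustrative.

```python
import asyncio

import numpy as np

from openai.helpers import LocalAudioPlayer


async def main() -> None:
    # One second of a 440 Hz sine tone at the player's 24 kHz sample rate, mono float32.
    t = np.linspace(0, 1, 24_000, endpoint=False, dtype=np.float32)
    tone = (0.2 * np.sin(2 * np.pi * 440 * t)).astype(np.float32).reshape(-1, 1)
    await LocalAudioPlayer().play(tone)


if __name__ == "__main__":
    asyncio.run(main())
```
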
diff --git a/src/openai/helpers/microphone.py b/src/openai/helpers/microphone.py
new file mode 100644
index 0000000000..18650909aa
--- /dev/null
+++ b/src/openai/helpers/microphone.py
@@ -0,0 +1,98 @@
+# mypy: ignore-errors
+import io
+import time
+import wave
+import asyncio
+from typing import Any, Type, Union, Generic, TypeVar, Callable, overload
+from typing_extensions import Literal
+
+import numpy as np
+import sounddevice as sd # type: ignore
+import numpy.typing as npt
+
+from openai._types import FileTypes, FileContent
+
+SAMPLE_RATE = 24000
+
+DType = TypeVar("DType", bound=np.generic)
+
+
+class Microphone(Generic[DType]):
+ def __init__(
+ self,
+ channels: int = 1,
+ dtype: Type[DType] = np.int16,
+ should_record: Union[Callable[[], bool], None] = None,
+ timeout: Union[float, None] = None,
+ ):
+ self.channels = channels
+ self.dtype = dtype
+ self.should_record = should_record
+ self.buffer_chunks = []
+ self.timeout = timeout
+ self.has_record_function = callable(should_record)
+
+ def _ndarray_to_wav(self, audio_data: npt.NDArray[DType]) -> FileTypes:
+ buffer: FileContent = io.BytesIO()
+ with wave.open(buffer, "w") as wav_file:
+ wav_file.setnchannels(self.channels)
+ wav_file.setsampwidth(np.dtype(self.dtype).itemsize)
+ wav_file.setframerate(SAMPLE_RATE)
+ wav_file.writeframes(audio_data.tobytes())
+ buffer.seek(0)
+ return ("audio.wav", buffer, "audio/wav")
+
+ @overload
+ async def record(self, return_ndarray: Literal[True]) -> npt.NDArray[DType]: ...
+
+ @overload
+ async def record(self, return_ndarray: Literal[False]) -> FileTypes: ...
+
+ @overload
+ async def record(self, return_ndarray: None = ...) -> FileTypes: ...
+
+ async def record(self, return_ndarray: Union[bool, None] = False) -> Union[npt.NDArray[DType], FileTypes]:
+ loop = asyncio.get_event_loop()
+ event = asyncio.Event()
+ self.buffer_chunks: list[npt.NDArray[DType]] = []
+ start_time = time.perf_counter()
+
+ def callback(
+ indata: npt.NDArray[DType],
+ _frame_count: int,
+ _time_info: Any,
+ _status: Any,
+ ):
+ execution_time = time.perf_counter() - start_time
+ reached_recording_timeout = execution_time > self.timeout if self.timeout is not None else False
+ if reached_recording_timeout:
+ loop.call_soon_threadsafe(event.set)
+ raise sd.CallbackStop
+
+ should_be_recording = self.should_record() if callable(self.should_record) else True
+ if not should_be_recording:
+ loop.call_soon_threadsafe(event.set)
+ raise sd.CallbackStop
+
+ self.buffer_chunks.append(indata.copy())
+
+ stream = sd.InputStream(
+ callback=callback,
+ dtype=self.dtype,
+ samplerate=SAMPLE_RATE,
+ channels=self.channels,
+ )
+ with stream:
+ await event.wait()
+
+ # Concatenate all chunks into a single buffer, handle empty case
+ concatenated_chunks: npt.NDArray[DType] = (
+ np.concatenate(self.buffer_chunks, axis=0)
+ if len(self.buffer_chunks) > 0
+ else np.array([], dtype=self.dtype)
+ )
+
+ if return_ndarray:
+ return concatenated_chunks
+ else:
+ return self._ndarray_to_wav(concatenated_chunks)
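
> Editor's note: a sketch of the `Microphone` helper's ndarray path, which `examples/speech_to_text.py` does not exercise. It assumes a working default input device; the duration and printed shape are approximate.

```python
import asyncio

from openai.helpers import Microphone


async def main() -> None:
    # Record roughly three seconds from the default input device and keep the raw samples.
    samples = await Microphone(timeout=3).record(return_ndarray=True)
    print(samples.shape, samples.dtype)  # roughly (72000, 1) int16 at 24 kHz mono


if __name__ == "__main__":
    asyncio.run(main())
```
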
diff --git a/src/openai/resources/audio/speech.py b/src/openai/resources/audio/speech.py
index ad01118161..529e3a47ea 100644
--- a/src/openai/resources/audio/speech.py
+++ b/src/openai/resources/audio/speech.py
@@ -54,6 +54,7 @@ def create(
input: str,
model: Union[str, SpeechModel],
voice: Literal["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"],
+ instructions: str | NotGiven = NOT_GIVEN,
response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] | NotGiven = NOT_GIVEN,
speed: float | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
@@ -71,13 +72,16 @@ def create(
model:
One of the available [TTS models](https://platform.openai.com/docs/models#tts):
- `tts-1` or `tts-1-hd`
+ `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`.
voice: The voice to use when generating the audio. Supported voices are `alloy`, `ash`,
`coral`, `echo`, `fable`, `onyx`, `nova`, `sage` and `shimmer`. Previews of the
voices are available in the
[Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
+ instructions: Control the voice of your generated audio with additional instructions. Does not
+ work with `tts-1` or `tts-1-hd`.
+
           response_format: The format to return audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`,
`wav`, and `pcm`.
@@ -100,6 +104,7 @@ def create(
"input": input,
"model": model,
"voice": voice,
+ "instructions": instructions,
"response_format": response_format,
"speed": speed,
},
@@ -138,6 +143,7 @@ async def create(
input: str,
model: Union[str, SpeechModel],
voice: Literal["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"],
+ instructions: str | NotGiven = NOT_GIVEN,
response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] | NotGiven = NOT_GIVEN,
speed: float | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
@@ -155,13 +161,16 @@ async def create(
model:
One of the available [TTS models](https://platform.openai.com/docs/models#tts):
- `tts-1` or `tts-1-hd`
+ `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`.
voice: The voice to use when generating the audio. Supported voices are `alloy`, `ash`,
`coral`, `echo`, `fable`, `onyx`, `nova`, `sage` and `shimmer`. Previews of the
voices are available in the
[Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
+ instructions: Control the voice of your generated audio with additional instructions. Does not
+ work with `tts-1` or `tts-1-hd`.
+
           response_format: The format to return audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`,
`wav`, and `pcm`.
@@ -184,6 +193,7 @@ async def create(
"input": input,
"model": model,
"voice": voice,
+ "instructions": instructions,
"response_format": response_format,
"speed": speed,
},
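
> Editor's note: a sketch of the new `instructions` parameter on speech synthesis. The instruction text and output path are illustrative; per the docstring above, `instructions` does not work with `tts-1` or `tts-1-hd`.

```python
from openai import OpenAI

client = OpenAI()

with client.audio.speech.with_streaming_response.create(
    model="gpt-4o-mini-tts",
    voice="alloy",
    input="The quick brown fox jumped over the lazy dog.",
    instructions="Speak slowly, in a calm and reassuring tone.",
) as response:
    response.stream_to_file("speech.mp3")  # illustrative output path
```
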
diff --git a/src/openai/resources/audio/transcriptions.py b/src/openai/resources/audio/transcriptions.py
index f338ad067d..2a77f91d69 100644
--- a/src/openai/resources/audio/transcriptions.py
+++ b/src/openai/resources/audio/transcriptions.py
@@ -3,7 +3,7 @@
from __future__ import annotations
import logging
-from typing import TYPE_CHECKING, List, Union, Mapping, cast
+from typing import TYPE_CHECKING, List, Union, Mapping, Optional, cast
from typing_extensions import Literal, overload, assert_never
import httpx
@@ -13,6 +13,7 @@
from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven, FileTypes
from ..._utils import (
extract_files,
+ required_args,
maybe_transform,
deepcopy_minimal,
async_maybe_transform,
@@ -20,12 +21,16 @@
from ..._compat import cached_property
from ..._resource import SyncAPIResource, AsyncAPIResource
from ..._response import to_streamed_response_wrapper, async_to_streamed_response_wrapper
+from ..._streaming import Stream, AsyncStream
from ...types.audio import transcription_create_params
from ..._base_client import make_request_options
from ...types.audio_model import AudioModel
from ...types.audio.transcription import Transcription
from ...types.audio_response_format import AudioResponseFormat
+from ...types.audio.transcription_include import TranscriptionInclude
from ...types.audio.transcription_verbose import TranscriptionVerbose
+from ...types.audio.transcription_stream_event import TranscriptionStreamEvent
+from ...types.audio.transcription_create_response import TranscriptionCreateResponse
__all__ = ["Transcriptions", "AsyncTranscriptions"]
@@ -58,6 +63,7 @@ def create(
*,
file: FileTypes,
model: Union[str, AudioModel],
+ include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
response_format: Union[Literal["json"], NotGiven] = NOT_GIVEN,
language: str | NotGiven = NOT_GIVEN,
prompt: str | NotGiven = NOT_GIVEN,
@@ -77,6 +83,7 @@ def create(
*,
file: FileTypes,
model: Union[str, AudioModel],
+ include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
response_format: Literal["verbose_json"],
language: str | NotGiven = NOT_GIVEN,
prompt: str | NotGiven = NOT_GIVEN,
@@ -97,6 +104,7 @@ def create(
file: FileTypes,
model: Union[str, AudioModel],
response_format: Literal["text", "srt", "vtt"],
+ include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
language: str | NotGiven = NOT_GIVEN,
prompt: str | NotGiven = NOT_GIVEN,
temperature: float | NotGiven = NOT_GIVEN,
@@ -109,11 +117,96 @@ def create(
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> str: ...
+ @overload
+ def create(
+ self,
+ *,
+ file: FileTypes,
+ model: Union[str, AudioModel],
+ stream: Literal[True],
+ include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
+ language: str | NotGiven = NOT_GIVEN,
+ prompt: str | NotGiven = NOT_GIVEN,
+ response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN,
+ temperature: float | NotGiven = NOT_GIVEN,
+ timestamp_granularities: List[Literal["word", "segment"]] | NotGiven = NOT_GIVEN,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+ ) -> Stream[TranscriptionStreamEvent]:
+ """
+ Transcribes audio into the input language.
+
+ Args:
+ file:
+ The audio file object (not file name) to transcribe, in one of these formats:
+ flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+
+ model: ID of the model to use. The options are `gpt-4o-transcribe`,
+ `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
+ Whisper V2 model).
+
+ stream: If set to true, the model response data will be streamed to the client as it is
+ generated using
+ [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+ See the
+ [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+ for more information.
+
+ Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+
+ include: Additional information to include in the transcription response. `logprobs` will
+ return the log probabilities of the tokens in the response to understand the
+ model's confidence in the transcription. `logprobs` only works with
+ response_format set to `json` and only with the models `gpt-4o-transcribe` and
+ `gpt-4o-mini-transcribe`.
+
+ language: The language of the input audio. Supplying the input language in
+ [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ format will improve accuracy and latency.
+
+ prompt: An optional text to guide the model's style or continue a previous audio
+ segment. The
+ [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+ should match the audio language.
+
+ response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
+ `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
+ the only supported format is `json`.
+
+ temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
+ output more random, while lower values like 0.2 will make it more focused and
+ deterministic. If set to 0, the model will use
+ [log probability](https://en.wikipedia.org/wiki/Log_probability) to
+ automatically increase the temperature until certain thresholds are hit.
+
+ timestamp_granularities: The timestamp granularities to populate for this transcription.
+              `response_format` must be set to `verbose_json` to use timestamp granularities.
+ Either or both of these options are supported: `word`, or `segment`. Note: There
+ is no additional latency for segment timestamps, but generating word timestamps
+ incurs additional latency.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ ...
+
+ @overload
def create(
self,
*,
file: FileTypes,
model: Union[str, AudioModel],
+ stream: bool,
+ include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
language: str | NotGiven = NOT_GIVEN,
prompt: str | NotGiven = NOT_GIVEN,
response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN,
@@ -125,7 +218,7 @@ def create(
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
- ) -> Transcription | TranscriptionVerbose | str:
+ ) -> TranscriptionCreateResponse | Stream[TranscriptionStreamEvent]:
"""
Transcribes audio into the input language.
@@ -134,8 +227,24 @@ def create(
The audio file object (not file name) to transcribe, in one of these formats:
flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
- model: ID of the model to use. Only `whisper-1` (which is powered by our open source
- Whisper V2 model) is currently available.
+ model: ID of the model to use. The options are `gpt-4o-transcribe`,
+ `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
+ Whisper V2 model).
+
+ stream: If set to true, the model response data will be streamed to the client as it is
+ generated using
+ [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+ See the
+ [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+ for more information.
+
+ Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+
+ include: Additional information to include in the transcription response. `logprobs` will
+ return the log probabilities of the tokens in the response to understand the
+ model's confidence in the transcription. `logprobs` only works with
+ response_format set to `json` and only with the models `gpt-4o-transcribe` and
+ `gpt-4o-mini-transcribe`.
language: The language of the input audio. Supplying the input language in
[ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -147,7 +256,8 @@ def create(
should match the audio language.
response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
- `verbose_json`, or `vtt`.
+ `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
+ the only supported format is `json`.
temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
output more random, while lower values like 0.2 will make it more focused and
@@ -169,13 +279,37 @@ def create(
timeout: Override the client-level default timeout for this request, in seconds
"""
+ ...
+
+ @required_args(["file", "model"], ["file", "model", "stream"])
+ def create(
+ self,
+ *,
+ file: FileTypes,
+ model: Union[str, AudioModel],
+ include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
+ language: str | NotGiven = NOT_GIVEN,
+ prompt: str | NotGiven = NOT_GIVEN,
+ response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN,
+ stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN,
+ temperature: float | NotGiven = NOT_GIVEN,
+ timestamp_granularities: List[Literal["word", "segment"]] | NotGiven = NOT_GIVEN,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+ ) -> str | Transcription | TranscriptionVerbose | Stream[TranscriptionStreamEvent]:
body = deepcopy_minimal(
{
"file": file,
"model": model,
+ "include": include,
"language": language,
"prompt": prompt,
"response_format": response_format,
+ "stream": stream,
"temperature": temperature,
"timestamp_granularities": timestamp_granularities,
}
@@ -193,6 +327,8 @@ def create(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
cast_to=_get_response_format_type(response_format),
+ stream=stream or False,
+ stream_cls=Stream[TranscriptionStreamEvent],
)
@@ -226,6 +362,7 @@ async def create(
language: str | NotGiven = NOT_GIVEN,
prompt: str | NotGiven = NOT_GIVEN,
temperature: float | NotGiven = NOT_GIVEN,
+ include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
timestamp_granularities: List[Literal["word", "segment"]] | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
@@ -241,6 +378,7 @@ async def create(
*,
file: FileTypes,
model: Union[str, AudioModel],
+ include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
response_format: Literal["verbose_json"],
language: str | NotGiven = NOT_GIVEN,
prompt: str | NotGiven = NOT_GIVEN,
@@ -260,6 +398,7 @@ async def create(
*,
file: FileTypes,
model: Union[str, AudioModel],
+ include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
response_format: Literal["text", "srt", "vtt"],
language: str | NotGiven = NOT_GIVEN,
prompt: str | NotGiven = NOT_GIVEN,
@@ -273,11 +412,96 @@ async def create(
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> str: ...
+ @overload
+ async def create(
+ self,
+ *,
+ file: FileTypes,
+ model: Union[str, AudioModel],
+ stream: Literal[True],
+ include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
+ language: str | NotGiven = NOT_GIVEN,
+ prompt: str | NotGiven = NOT_GIVEN,
+ response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN,
+ temperature: float | NotGiven = NOT_GIVEN,
+ timestamp_granularities: List[Literal["word", "segment"]] | NotGiven = NOT_GIVEN,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+ ) -> AsyncStream[TranscriptionStreamEvent]:
+ """
+ Transcribes audio into the input language.
+
+ Args:
+ file:
+ The audio file object (not file name) to transcribe, in one of these formats:
+ flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+
+ model: ID of the model to use. The options are `gpt-4o-transcribe`,
+ `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
+ Whisper V2 model).
+
+ stream: If set to true, the model response data will be streamed to the client as it is
+ generated using
+ [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+ See the
+ [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+ for more information.
+
+ Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+
+ include: Additional information to include in the transcription response. `logprobs` will
+ return the log probabilities of the tokens in the response to understand the
+ model's confidence in the transcription. `logprobs` only works with
+ response_format set to `json` and only with the models `gpt-4o-transcribe` and
+ `gpt-4o-mini-transcribe`.
+
+ language: The language of the input audio. Supplying the input language in
+ [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ format will improve accuracy and latency.
+
+ prompt: An optional text to guide the model's style or continue a previous audio
+ segment. The
+ [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+ should match the audio language.
+
+ response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
+ `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
+ the only supported format is `json`.
+
+ temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
+ output more random, while lower values like 0.2 will make it more focused and
+ deterministic. If set to 0, the model will use
+ [log probability](https://en.wikipedia.org/wiki/Log_probability) to
+ automatically increase the temperature until certain thresholds are hit.
+
+ timestamp_granularities: The timestamp granularities to populate for this transcription.
+              `response_format` must be set to `verbose_json` to use timestamp granularities.
+ Either or both of these options are supported: `word`, or `segment`. Note: There
+ is no additional latency for segment timestamps, but generating word timestamps
+ incurs additional latency.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ ...
+
+ @overload
async def create(
self,
*,
file: FileTypes,
model: Union[str, AudioModel],
+ stream: bool,
+ include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
language: str | NotGiven = NOT_GIVEN,
prompt: str | NotGiven = NOT_GIVEN,
response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN,
@@ -289,7 +513,7 @@ async def create(
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
- ) -> Transcription | TranscriptionVerbose | str:
+ ) -> TranscriptionCreateResponse | AsyncStream[TranscriptionStreamEvent]:
"""
Transcribes audio into the input language.
@@ -298,8 +522,24 @@ async def create(
The audio file object (not file name) to transcribe, in one of these formats:
flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
- model: ID of the model to use. Only `whisper-1` (which is powered by our open source
- Whisper V2 model) is currently available.
+ model: ID of the model to use. The options are `gpt-4o-transcribe`,
+ `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
+ Whisper V2 model).
+
+ stream: If set to true, the model response data will be streamed to the client as it is
+ generated using
+ [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+ See the
+ [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+ for more information.
+
+ Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+
+ include: Additional information to include in the transcription response. `logprobs` will
+ return the log probabilities of the tokens in the response to understand the
+ model's confidence in the transcription. `logprobs` only works with
+ response_format set to `json` and only with the models `gpt-4o-transcribe` and
+ `gpt-4o-mini-transcribe`.
language: The language of the input audio. Supplying the input language in
[ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -311,7 +551,8 @@ async def create(
should match the audio language.
response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
- `verbose_json`, or `vtt`.
+ `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
+ the only supported format is `json`.
temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
output more random, while lower values like 0.2 will make it more focused and
@@ -333,13 +574,37 @@ async def create(
timeout: Override the client-level default timeout for this request, in seconds
"""
+ ...
+
+ @required_args(["file", "model"], ["file", "model", "stream"])
+ async def create(
+ self,
+ *,
+ file: FileTypes,
+ model: Union[str, AudioModel],
+ include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
+ language: str | NotGiven = NOT_GIVEN,
+ prompt: str | NotGiven = NOT_GIVEN,
+ response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN,
+ stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN,
+ temperature: float | NotGiven = NOT_GIVEN,
+ timestamp_granularities: List[Literal["word", "segment"]] | NotGiven = NOT_GIVEN,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+ ) -> Transcription | TranscriptionVerbose | str | AsyncStream[TranscriptionStreamEvent]:
body = deepcopy_minimal(
{
"file": file,
"model": model,
+ "include": include,
"language": language,
"prompt": prompt,
"response_format": response_format,
+ "stream": stream,
"temperature": temperature,
"timestamp_granularities": timestamp_granularities,
}
@@ -357,6 +622,8 @@ async def create(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
cast_to=_get_response_format_type(response_format),
+ stream=stream or False,
+ stream_cls=AsyncStream[TranscriptionStreamEvent],
)
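
> Editor's note: a sketch of the streaming transcription path added above. The audio file path is illustrative, and the `transcript.text.delta` / `transcript.text.done` type strings are assumptions based on the new stream event types; adjust to the generated models if they differ.

```python
from openai import OpenAI

client = OpenAI()

with open("speech.mp3", "rb") as audio_file:  # illustrative input file
    stream = client.audio.transcriptions.create(
        model="gpt-4o-transcribe",
        file=audio_file,
        stream=True,
        include=["logprobs"],
    )
    for event in stream:
        if event.type == "transcript.text.delta":
            print(event.delta, end="", flush=True)
        elif event.type == "transcript.text.done":
            print()
```
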
diff --git a/src/openai/resources/audio/translations.py b/src/openai/resources/audio/translations.py
index cd3132dc57..f55dbd0ee5 100644
--- a/src/openai/resources/audio/translations.py
+++ b/src/openai/resources/audio/translations.py
@@ -9,7 +9,6 @@
import httpx
from ... import _legacy_response
-from ...types import AudioResponseFormat
from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven, FileTypes
from ..._utils import (
extract_files,
@@ -109,7 +108,7 @@ def create(
file: FileTypes,
model: Union[str, AudioModel],
prompt: str | NotGiven = NOT_GIVEN,
- response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN,
+ response_format: Union[Literal["json", "text", "srt", "verbose_json", "vtt"], NotGiven] = NOT_GIVEN,
temperature: float | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
diff --git a/src/openai/resources/beta/realtime/__init__.py b/src/openai/resources/beta/realtime/__init__.py
index 474434e6e1..7ab3d9931c 100644
--- a/src/openai/resources/beta/realtime/__init__.py
+++ b/src/openai/resources/beta/realtime/__init__.py
@@ -16,6 +16,14 @@
SessionsWithStreamingResponse,
AsyncSessionsWithStreamingResponse,
)
+from .transcription_sessions import (
+ TranscriptionSessions,
+ AsyncTranscriptionSessions,
+ TranscriptionSessionsWithRawResponse,
+ AsyncTranscriptionSessionsWithRawResponse,
+ TranscriptionSessionsWithStreamingResponse,
+ AsyncTranscriptionSessionsWithStreamingResponse,
+)
__all__ = [
"Sessions",
@@ -24,6 +32,12 @@
"AsyncSessionsWithRawResponse",
"SessionsWithStreamingResponse",
"AsyncSessionsWithStreamingResponse",
+ "TranscriptionSessions",
+ "AsyncTranscriptionSessions",
+ "TranscriptionSessionsWithRawResponse",
+ "AsyncTranscriptionSessionsWithRawResponse",
+ "TranscriptionSessionsWithStreamingResponse",
+ "AsyncTranscriptionSessionsWithStreamingResponse",
"Realtime",
"AsyncRealtime",
"RealtimeWithRawResponse",
diff --git a/src/openai/resources/beta/realtime/realtime.py b/src/openai/resources/beta/realtime/realtime.py
index cd610d9089..76e57f8cb7 100644
--- a/src/openai/resources/beta/realtime/realtime.py
+++ b/src/openai/resources/beta/realtime/realtime.py
@@ -32,7 +32,19 @@
from ...._resource import SyncAPIResource, AsyncAPIResource
from ...._exceptions import OpenAIError
from ...._base_client import _merge_mappings
-from ....types.beta.realtime import session_update_event_param, response_create_event_param
+from ....types.beta.realtime import (
+ session_update_event_param,
+ response_create_event_param,
+ transcription_session_update_param,
+)
+from .transcription_sessions import (
+ TranscriptionSessions,
+ AsyncTranscriptionSessions,
+ TranscriptionSessionsWithRawResponse,
+ AsyncTranscriptionSessionsWithRawResponse,
+ TranscriptionSessionsWithStreamingResponse,
+ AsyncTranscriptionSessionsWithStreamingResponse,
+)
from ....types.websocket_connection_options import WebsocketConnectionOptions
from ....types.beta.realtime.realtime_client_event import RealtimeClientEvent
from ....types.beta.realtime.realtime_server_event import RealtimeServerEvent
@@ -55,6 +67,10 @@ class Realtime(SyncAPIResource):
def sessions(self) -> Sessions:
return Sessions(self._client)
+ @cached_property
+ def transcription_sessions(self) -> TranscriptionSessions:
+ return TranscriptionSessions(self._client)
+
@cached_property
def with_raw_response(self) -> RealtimeWithRawResponse:
"""
@@ -107,6 +123,10 @@ class AsyncRealtime(AsyncAPIResource):
def sessions(self) -> AsyncSessions:
return AsyncSessions(self._client)
+ @cached_property
+ def transcription_sessions(self) -> AsyncTranscriptionSessions:
+ return AsyncTranscriptionSessions(self._client)
+
@cached_property
def with_raw_response(self) -> AsyncRealtimeWithRawResponse:
"""
@@ -162,6 +182,10 @@ def __init__(self, realtime: Realtime) -> None:
def sessions(self) -> SessionsWithRawResponse:
return SessionsWithRawResponse(self._realtime.sessions)
+ @cached_property
+ def transcription_sessions(self) -> TranscriptionSessionsWithRawResponse:
+ return TranscriptionSessionsWithRawResponse(self._realtime.transcription_sessions)
+
class AsyncRealtimeWithRawResponse:
def __init__(self, realtime: AsyncRealtime) -> None:
@@ -171,6 +195,10 @@ def __init__(self, realtime: AsyncRealtime) -> None:
def sessions(self) -> AsyncSessionsWithRawResponse:
return AsyncSessionsWithRawResponse(self._realtime.sessions)
+ @cached_property
+ def transcription_sessions(self) -> AsyncTranscriptionSessionsWithRawResponse:
+ return AsyncTranscriptionSessionsWithRawResponse(self._realtime.transcription_sessions)
+
class RealtimeWithStreamingResponse:
def __init__(self, realtime: Realtime) -> None:
@@ -180,6 +208,10 @@ def __init__(self, realtime: Realtime) -> None:
def sessions(self) -> SessionsWithStreamingResponse:
return SessionsWithStreamingResponse(self._realtime.sessions)
+ @cached_property
+ def transcription_sessions(self) -> TranscriptionSessionsWithStreamingResponse:
+ return TranscriptionSessionsWithStreamingResponse(self._realtime.transcription_sessions)
+
class AsyncRealtimeWithStreamingResponse:
def __init__(self, realtime: AsyncRealtime) -> None:
@@ -189,14 +221,19 @@ def __init__(self, realtime: AsyncRealtime) -> None:
def sessions(self) -> AsyncSessionsWithStreamingResponse:
return AsyncSessionsWithStreamingResponse(self._realtime.sessions)
+ @cached_property
+ def transcription_sessions(self) -> AsyncTranscriptionSessionsWithStreamingResponse:
+ return AsyncTranscriptionSessionsWithStreamingResponse(self._realtime.transcription_sessions)
+
class AsyncRealtimeConnection:
"""Represents a live websocket connection to the Realtime API"""
session: AsyncRealtimeSessionResource
response: AsyncRealtimeResponseResource
- conversation: AsyncRealtimeConversationResource
input_audio_buffer: AsyncRealtimeInputAudioBufferResource
+ conversation: AsyncRealtimeConversationResource
+ transcription_session: AsyncRealtimeTranscriptionSessionResource
_connection: AsyncWebsocketConnection
@@ -205,8 +242,9 @@ def __init__(self, connection: AsyncWebsocketConnection) -> None:
self.session = AsyncRealtimeSessionResource(self)
self.response = AsyncRealtimeResponseResource(self)
- self.conversation = AsyncRealtimeConversationResource(self)
self.input_audio_buffer = AsyncRealtimeInputAudioBufferResource(self)
+ self.conversation = AsyncRealtimeConversationResource(self)
+ self.transcription_session = AsyncRealtimeTranscriptionSessionResource(self)
async def __aiter__(self) -> AsyncIterator[RealtimeServerEvent]:
"""
@@ -377,8 +415,9 @@ class RealtimeConnection:
session: RealtimeSessionResource
response: RealtimeResponseResource
- conversation: RealtimeConversationResource
input_audio_buffer: RealtimeInputAudioBufferResource
+ conversation: RealtimeConversationResource
+ transcription_session: RealtimeTranscriptionSessionResource
_connection: WebsocketConnection
@@ -387,8 +426,9 @@ def __init__(self, connection: WebsocketConnection) -> None:
self.session = RealtimeSessionResource(self)
self.response = RealtimeResponseResource(self)
- self.conversation = RealtimeConversationResource(self)
self.input_audio_buffer = RealtimeInputAudioBufferResource(self)
+ self.conversation = RealtimeConversationResource(self)
+ self.transcription_session = RealtimeTranscriptionSessionResource(self)
def __iter__(self) -> Iterator[RealtimeServerEvent]:
"""
@@ -582,20 +622,6 @@ def update(self, *, session: session_update_event_param.Session, event_id: str |
class RealtimeResponseResource(BaseRealtimeConnectionResource):
- def cancel(self, *, event_id: str | NotGiven = NOT_GIVEN, response_id: str | NotGiven = NOT_GIVEN) -> None:
- """Send this event to cancel an in-progress response.
-
- The server will respond
- with a `response.cancelled` event or an error if there is no response to
- cancel.
- """
- self._connection.send(
- cast(
- RealtimeClientEventParam,
- strip_not_given({"type": "response.cancel", "event_id": event_id, "response_id": response_id}),
- )
- )
-
def create(
self,
*,
@@ -626,6 +652,70 @@ def create(
)
)
+ def cancel(self, *, event_id: str | NotGiven = NOT_GIVEN, response_id: str | NotGiven = NOT_GIVEN) -> None:
+ """Send this event to cancel an in-progress response.
+
+ The server will respond
+ with a `response.cancelled` event or an error if there is no response to
+ cancel.
+ """
+ self._connection.send(
+ cast(
+ RealtimeClientEventParam,
+ strip_not_given({"type": "response.cancel", "event_id": event_id, "response_id": response_id}),
+ )
+ )
+
+
+class RealtimeInputAudioBufferResource(BaseRealtimeConnectionResource):
+ def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
+ """Send this event to clear the audio bytes in the buffer.
+
+ The server will
+ respond with an `input_audio_buffer.cleared` event.
+ """
+ self._connection.send(
+ cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.clear", "event_id": event_id}))
+ )
+
+ def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
+ """
+ Send this event to commit the user input audio buffer, which will create a
+ new user message item in the conversation. This event will produce an error
+ if the input audio buffer is empty. When in Server VAD mode, the client does
+        not need to send this event; the server will commit the audio buffer
+ automatically.
+
+ Committing the input audio buffer will trigger input audio transcription
+ (if enabled in session configuration), but it will not create a response
+ from the model. The server will respond with an `input_audio_buffer.committed`
+ event.
+ """
+ self._connection.send(
+ cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id}))
+ )
+
+ def append(self, *, audio: str, event_id: str | NotGiven = NOT_GIVEN) -> None:
+ """Send this event to append audio bytes to the input audio buffer.
+
+ The audio
+ buffer is temporary storage you can write to and later commit. In Server VAD
+ mode, the audio buffer is used to detect speech and the server will decide
+ when to commit. When Server VAD is disabled, you must commit the audio buffer
+ manually.
+
+ The client may choose how much audio to place in each event up to a maximum
+        of 15 MiB; for example, streaming smaller chunks from the client may allow the
+        VAD to be more responsive. Unlike most other client events, the server will
+ not send a confirmation response to this event.
+ """
+ self._connection.send(
+ cast(
+ RealtimeClientEventParam,
+ strip_not_given({"type": "input_audio_buffer.append", "audio": audio, "event_id": event_id}),
+ )
+ )
+
class RealtimeConversationResource(BaseRealtimeConnectionResource):
@cached_property
@@ -711,53 +801,30 @@ def truncate(
)
)
-
-class RealtimeInputAudioBufferResource(BaseRealtimeConnectionResource):
- def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
- """Send this event to clear the audio bytes in the buffer.
-
- The server will
- respond with an `input_audio_buffer.cleared` event.
+ def retrieve(self, *, item_id: str, event_id: str | NotGiven = NOT_GIVEN) -> None:
"""
- self._connection.send(
- cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.clear", "event_id": event_id}))
- )
-
- def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
- """
- Send this event to commit the user input audio buffer, which will create a
- new user message item in the conversation. This event will produce an error
- if the input audio buffer is empty. When in Server VAD mode, the client does
- not need to send this event, the server will commit the audio buffer
- automatically.
-
- Committing the input audio buffer will trigger input audio transcription
- (if enabled in session configuration), but it will not create a response
- from the model. The server will respond with an `input_audio_buffer.committed`
- event.
+ Send this event when you want to retrieve the server's representation of a specific item in the conversation history. This is useful, for example, to inspect user audio after noise cancellation and VAD.
+ The server will respond with a `conversation.item.retrieved` event,
+ unless the item does not exist in the conversation history, in which case the
+ server will respond with an error.
"""
self._connection.send(
- cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id}))
+ cast(
+ RealtimeClientEventParam,
+ strip_not_given({"type": "conversation.item.retrieve", "item_id": item_id, "event_id": event_id}),
+ )
)
- def append(self, *, audio: str, event_id: str | NotGiven = NOT_GIVEN) -> None:
- """Send this event to append audio bytes to the input audio buffer.
- The audio
- buffer is temporary storage you can write to and later commit. In Server VAD
- mode, the audio buffer is used to detect speech and the server will decide
- when to commit. When Server VAD is disabled, you must commit the audio buffer
- manually.
-
- The client may choose how much audio to place in each event up to a maximum
- of 15 MiB, for example streaming smaller chunks from the client may allow the
- VAD to be more responsive. Unlike made other client events, the server will
- not send a confirmation response to this event.
- """
+class RealtimeTranscriptionSessionResource(BaseRealtimeConnectionResource):
+ def update(
+ self, *, session: transcription_session_update_param.Session, event_id: str | NotGiven = NOT_GIVEN
+ ) -> None:
+ """Send this event to update a transcription session."""
self._connection.send(
cast(
RealtimeClientEventParam,
- strip_not_given({"type": "input_audio_buffer.append", "audio": audio, "event_id": event_id}),
+ strip_not_given({"type": "transcription_session.update", "session": session, "event_id": event_id}),
)
)
@@ -792,20 +859,6 @@ async def update(
class AsyncRealtimeResponseResource(BaseAsyncRealtimeConnectionResource):
- async def cancel(self, *, event_id: str | NotGiven = NOT_GIVEN, response_id: str | NotGiven = NOT_GIVEN) -> None:
- """Send this event to cancel an in-progress response.
-
- The server will respond
- with a `response.cancelled` event or an error if there is no response to
- cancel.
- """
- await self._connection.send(
- cast(
- RealtimeClientEventParam,
- strip_not_given({"type": "response.cancel", "event_id": event_id, "response_id": response_id}),
- )
- )
-
async def create(
self,
*,
@@ -836,6 +889,70 @@ async def create(
)
)
+ async def cancel(self, *, event_id: str | NotGiven = NOT_GIVEN, response_id: str | NotGiven = NOT_GIVEN) -> None:
+ """Send this event to cancel an in-progress response.
+
+ The server will respond
+ with a `response.cancelled` event or an error if there is no response to
+ cancel.
+ """
+ await self._connection.send(
+ cast(
+ RealtimeClientEventParam,
+ strip_not_given({"type": "response.cancel", "event_id": event_id, "response_id": response_id}),
+ )
+ )
+
+
+class AsyncRealtimeInputAudioBufferResource(BaseAsyncRealtimeConnectionResource):
+ async def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
+ """Send this event to clear the audio bytes in the buffer.
+
+ The server will
+ respond with an `input_audio_buffer.cleared` event.
+ """
+ await self._connection.send(
+ cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.clear", "event_id": event_id}))
+ )
+
+ async def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
+ """
+ Send this event to commit the user input audio buffer, which will create a
+ new user message item in the conversation. This event will produce an error
+ if the input audio buffer is empty. When in Server VAD mode, the client does
+        not need to send this event; the server will commit the audio buffer
+ automatically.
+
+ Committing the input audio buffer will trigger input audio transcription
+ (if enabled in session configuration), but it will not create a response
+ from the model. The server will respond with an `input_audio_buffer.committed`
+ event.
+ """
+ await self._connection.send(
+ cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id}))
+ )
+
+ async def append(self, *, audio: str, event_id: str | NotGiven = NOT_GIVEN) -> None:
+ """Send this event to append audio bytes to the input audio buffer.
+
+ The audio
+ buffer is temporary storage you can write to and later commit. In Server VAD
+ mode, the audio buffer is used to detect speech and the server will decide
+ when to commit. When Server VAD is disabled, you must commit the audio buffer
+ manually.
+
+ The client may choose how much audio to place in each event up to a maximum
+        of 15 MiB; for example, streaming smaller chunks from the client may allow the
+        VAD to be more responsive. Unlike most other client events, the server will
+ not send a confirmation response to this event.
+ """
+ await self._connection.send(
+ cast(
+ RealtimeClientEventParam,
+ strip_not_given({"type": "input_audio_buffer.append", "audio": audio, "event_id": event_id}),
+ )
+ )
+
class AsyncRealtimeConversationResource(BaseAsyncRealtimeConnectionResource):
@cached_property
@@ -921,52 +1038,29 @@ async def truncate(
)
)
-
-class AsyncRealtimeInputAudioBufferResource(BaseAsyncRealtimeConnectionResource):
- async def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
- """Send this event to clear the audio bytes in the buffer.
-
- The server will
- respond with an `input_audio_buffer.cleared` event.
- """
- await self._connection.send(
- cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.clear", "event_id": event_id}))
- )
-
- async def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
+ async def retrieve(self, *, item_id: str, event_id: str | NotGiven = NOT_GIVEN) -> None:
"""
- Send this event to commit the user input audio buffer, which will create a
- new user message item in the conversation. This event will produce an error
- if the input audio buffer is empty. When in Server VAD mode, the client does
- not need to send this event, the server will commit the audio buffer
- automatically.
-
- Committing the input audio buffer will trigger input audio transcription
- (if enabled in session configuration), but it will not create a response
- from the model. The server will respond with an `input_audio_buffer.committed`
- event.
+ Send this event when you want to retrieve the server's representation of a specific item in the conversation history. This is useful, for example, to inspect user audio after noise cancellation and VAD.
+ The server will respond with a `conversation.item.retrieved` event,
+ unless the item does not exist in the conversation history, in which case the
+ server will respond with an error.
"""
await self._connection.send(
- cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id}))
+ cast(
+ RealtimeClientEventParam,
+ strip_not_given({"type": "conversation.item.retrieve", "item_id": item_id, "event_id": event_id}),
+ )
)
- async def append(self, *, audio: str, event_id: str | NotGiven = NOT_GIVEN) -> None:
- """Send this event to append audio bytes to the input audio buffer.
-
- The audio
- buffer is temporary storage you can write to and later commit. In Server VAD
- mode, the audio buffer is used to detect speech and the server will decide
- when to commit. When Server VAD is disabled, you must commit the audio buffer
- manually.
- The client may choose how much audio to place in each event up to a maximum
- of 15 MiB, for example streaming smaller chunks from the client may allow the
- VAD to be more responsive. Unlike made other client events, the server will
- not send a confirmation response to this event.
- """
+class AsyncRealtimeTranscriptionSessionResource(BaseAsyncRealtimeConnectionResource):
+ async def update(
+ self, *, session: transcription_session_update_param.Session, event_id: str | NotGiven = NOT_GIVEN
+ ) -> None:
+ """Send this event to update a transcription session."""
await self._connection.send(
cast(
RealtimeClientEventParam,
- strip_not_given({"type": "input_audio_buffer.append", "audio": audio, "event_id": event_id}),
+ strip_not_given({"type": "transcription_session.update", "session": session, "event_id": event_id}),
)
)
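
For orientation, a hedged sketch of how these reorganized async connection resources are typically driven; the model name, audio bytes, and item ID below are placeholders rather than anything defined in this PR:

```python
import asyncio
import base64

from openai import AsyncOpenAI


async def main() -> None:
    client = AsyncOpenAI()
    async with client.beta.realtime.connect(model="gpt-4o-realtime-preview") as connection:
        # Stream a chunk of base64-encoded audio, then commit the buffer to create
        # a user item (only needed when server VAD is disabled).
        await connection.input_audio_buffer.append(audio=base64.b64encode(b"\x00" * 3200).decode())
        await connection.input_audio_buffer.commit()

        # The cancel helper now sits alongside create on the response resource.
        await connection.response.create()
        await connection.response.cancel()

        # Retrieve the server's representation of a known item (ID is illustrative).
        await connection.conversation.item.retrieve(item_id="item_123")

        # Adjust transcription settings mid-connection via the new resource.
        await connection.transcription_session.update(
            session={"input_audio_transcription": {"model": "gpt-4o-transcribe"}}
        )


asyncio.run(main())
```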
diff --git a/src/openai/resources/beta/realtime/sessions.py b/src/openai/resources/beta/realtime/sessions.py
index 4b337b7c19..5884e54de2 100644
--- a/src/openai/resources/beta/realtime/sessions.py
+++ b/src/openai/resources/beta/realtime/sessions.py
@@ -47,6 +47,7 @@ def create(
self,
*,
input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
+ input_audio_noise_reduction: session_create_params.InputAudioNoiseReduction | NotGiven = NOT_GIVEN,
input_audio_transcription: session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
instructions: str | NotGiven = NOT_GIVEN,
max_response_output_tokens: Union[int, Literal["inf"]] | NotGiven = NOT_GIVEN,
@@ -86,14 +87,20 @@ def create(
`pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
(mono), and little-endian byte order.
+ input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
+ off. Noise reduction filters audio added to the input audio buffer before it is
+ sent to VAD and the model. Filtering the audio can improve VAD and turn
+ detection accuracy (reducing false positives) and model performance by improving
+ perception of the input audio.
+
input_audio_transcription: Configuration for input audio transcription, defaults to off and can be set to
`null` to turn off once on. Input audio transcription is not native to the
model, since the model consumes audio directly. Transcription runs
asynchronously through
- [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
- and should be treated as rough guidance rather than the representation
- understood by the model. The client can optionally set the language and prompt
- for transcription, these fields will be passed to the Whisper API.
+ [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ and should be treated as guidance of input audio content rather than precisely
+ what the model heard. The client can optionally set the language and prompt for
+ transcription, these offer additional guidance to the transcription service.
instructions: The default system instructions (i.e. system message) prepended to model calls.
This field allows the client to guide the model on desired responses. The model
@@ -119,16 +126,24 @@ def create(
output_audio_format: The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
For `pcm16`, output audio is sampled at a rate of 24kHz.
- temperature: Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+ temperature: Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
+ temperature of 0.8 is highly recommended for best performance.
tool_choice: How the model chooses tools. Options are `auto`, `none`, `required`, or specify
a function.
tools: Tools (functions) available to the model.
- turn_detection: Configuration for turn detection. Can be set to `null` to turn off. Server VAD
- means that the model will detect the start and end of speech based on audio
- volume and respond at the end of user speech.
+          turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ set to `null` to turn off, in which case the client must manually trigger model
+ response. Server VAD means that the model will detect the start and end of
+ speech based on audio volume and respond at the end of user speech. Semantic VAD
+              is more advanced and uses a turn detection model (in conjunction with VAD) to
+ semantically estimate whether the user has finished speaking, then dynamically
+ sets a timeout based on this probability. For example, if user audio trails off
+ with "uhhm", the model will score a low probability of turn end and wait longer
+ for the user to continue speaking. This can be useful for more natural
+ conversations, but may have a higher latency.
voice: The voice the model uses to respond. Voice cannot be changed during the session
once the model has responded with audio at least once. Current voice options are
@@ -148,6 +163,7 @@ def create(
body=maybe_transform(
{
"input_audio_format": input_audio_format,
+ "input_audio_noise_reduction": input_audio_noise_reduction,
"input_audio_transcription": input_audio_transcription,
"instructions": instructions,
"max_response_output_tokens": max_response_output_tokens,
@@ -193,6 +209,7 @@ async def create(
self,
*,
input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
+ input_audio_noise_reduction: session_create_params.InputAudioNoiseReduction | NotGiven = NOT_GIVEN,
input_audio_transcription: session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
instructions: str | NotGiven = NOT_GIVEN,
max_response_output_tokens: Union[int, Literal["inf"]] | NotGiven = NOT_GIVEN,
@@ -232,14 +249,20 @@ async def create(
`pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
(mono), and little-endian byte order.
+ input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
+ off. Noise reduction filters audio added to the input audio buffer before it is
+ sent to VAD and the model. Filtering the audio can improve VAD and turn
+ detection accuracy (reducing false positives) and model performance by improving
+ perception of the input audio.
+
input_audio_transcription: Configuration for input audio transcription, defaults to off and can be set to
`null` to turn off once on. Input audio transcription is not native to the
model, since the model consumes audio directly. Transcription runs
asynchronously through
- [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
- and should be treated as rough guidance rather than the representation
- understood by the model. The client can optionally set the language and prompt
- for transcription, these fields will be passed to the Whisper API.
+ [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ and should be treated as guidance of input audio content rather than precisely
+ what the model heard. The client can optionally set the language and prompt for
+ transcription, these offer additional guidance to the transcription service.
instructions: The default system instructions (i.e. system message) prepended to model calls.
This field allows the client to guide the model on desired responses. The model
@@ -265,16 +288,24 @@ async def create(
output_audio_format: The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
For `pcm16`, output audio is sampled at a rate of 24kHz.
- temperature: Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+ temperature: Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
+ temperature of 0.8 is highly recommended for best performance.
tool_choice: How the model chooses tools. Options are `auto`, `none`, `required`, or specify
a function.
tools: Tools (functions) available to the model.
- turn_detection: Configuration for turn detection. Can be set to `null` to turn off. Server VAD
- means that the model will detect the start and end of speech based on audio
- volume and respond at the end of user speech.
+          turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ set to `null` to turn off, in which case the client must manually trigger model
+ response. Server VAD means that the model will detect the start and end of
+ speech based on audio volume and respond at the end of user speech. Semantic VAD
+              is more advanced and uses a turn detection model (in conjunction with VAD) to
+ semantically estimate whether the user has finished speaking, then dynamically
+ sets a timeout based on this probability. For example, if user audio trails off
+ with "uhhm", the model will score a low probability of turn end and wait longer
+ for the user to continue speaking. This can be useful for more natural
+ conversations, but may have a higher latency.
voice: The voice the model uses to respond. Voice cannot be changed during the session
once the model has responded with audio at least once. Current voice options are
@@ -294,6 +325,7 @@ async def create(
body=await async_maybe_transform(
{
"input_audio_format": input_audio_format,
+ "input_audio_noise_reduction": input_audio_noise_reduction,
"input_audio_transcription": input_audio_transcription,
"instructions": instructions,
"max_response_output_tokens": max_response_output_tokens,
diff --git a/src/openai/resources/beta/realtime/transcription_sessions.py b/src/openai/resources/beta/realtime/transcription_sessions.py
new file mode 100644
index 0000000000..0917da71fa
--- /dev/null
+++ b/src/openai/resources/beta/realtime/transcription_sessions.py
@@ -0,0 +1,277 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import List
+from typing_extensions import Literal
+
+import httpx
+
+from .... import _legacy_response
+from ...._types import NOT_GIVEN, Body, Query, Headers, NotGiven
+from ...._utils import (
+ maybe_transform,
+ async_maybe_transform,
+)
+from ...._compat import cached_property
+from ...._resource import SyncAPIResource, AsyncAPIResource
+from ...._response import to_streamed_response_wrapper, async_to_streamed_response_wrapper
+from ...._base_client import make_request_options
+from ....types.beta.realtime import transcription_session_create_params
+from ....types.beta.realtime.transcription_session import TranscriptionSession
+
+__all__ = ["TranscriptionSessions", "AsyncTranscriptionSessions"]
+
+
+class TranscriptionSessions(SyncAPIResource):
+ @cached_property
+ def with_raw_response(self) -> TranscriptionSessionsWithRawResponse:
+ """
+ This property can be used as a prefix for any HTTP method call to return
+ the raw response object instead of the parsed content.
+
+ For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
+ """
+ return TranscriptionSessionsWithRawResponse(self)
+
+ @cached_property
+ def with_streaming_response(self) -> TranscriptionSessionsWithStreamingResponse:
+ """
+ An alternative to `.with_raw_response` that doesn't eagerly read the response body.
+
+ For more information, see https://www.github.com/openai/openai-python#with_streaming_response
+ """
+ return TranscriptionSessionsWithStreamingResponse(self)
+
+ def create(
+ self,
+ *,
+ include: List[str] | NotGiven = NOT_GIVEN,
+ input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
+ input_audio_noise_reduction: transcription_session_create_params.InputAudioNoiseReduction
+ | NotGiven = NOT_GIVEN,
+ input_audio_transcription: transcription_session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
+ modalities: List[Literal["text", "audio"]] | NotGiven = NOT_GIVEN,
+ turn_detection: transcription_session_create_params.TurnDetection | NotGiven = NOT_GIVEN,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+ ) -> TranscriptionSession:
+ """
+ Create an ephemeral API token for use in client-side applications with the
+ Realtime API specifically for realtime transcriptions. Can be configured with
+ the same session parameters as the `transcription_session.update` client event.
+
+ It responds with a session object, plus a `client_secret` key which contains a
+ usable ephemeral API token that can be used to authenticate browser clients for
+ the Realtime API.
+
+ Args:
+ include:
+              The set of items to include in the transcription. Currently available items are:
+
+ - `item.input_audio_transcription.logprobs`
+
+ input_audio_format: The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
+ `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
+ (mono), and little-endian byte order.
+
+ input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
+ off. Noise reduction filters audio added to the input audio buffer before it is
+ sent to VAD and the model. Filtering the audio can improve VAD and turn
+ detection accuracy (reducing false positives) and model performance by improving
+ perception of the input audio.
+
+ input_audio_transcription: Configuration for input audio transcription. The client can optionally set the
+ language and prompt for transcription, these offer additional guidance to the
+ transcription service.
+
+ modalities: The set of modalities the model can respond with. To disable audio, set this to
+ ["text"].
+
+          turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ set to `null` to turn off, in which case the client must manually trigger model
+ response. Server VAD means that the model will detect the start and end of
+ speech based on audio volume and respond at the end of user speech. Semantic VAD
+              is more advanced and uses a turn detection model (in conjunction with VAD) to
+ semantically estimate whether the user has finished speaking, then dynamically
+ sets a timeout based on this probability. For example, if user audio trails off
+ with "uhhm", the model will score a low probability of turn end and wait longer
+ for the user to continue speaking. This can be useful for more natural
+ conversations, but may have a higher latency.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ extra_headers = {"OpenAI-Beta": "assistants=v2", **(extra_headers or {})}
+ return self._post(
+ "/realtime/transcription_sessions",
+ body=maybe_transform(
+ {
+ "include": include,
+ "input_audio_format": input_audio_format,
+ "input_audio_noise_reduction": input_audio_noise_reduction,
+ "input_audio_transcription": input_audio_transcription,
+ "modalities": modalities,
+ "turn_detection": turn_detection,
+ },
+ transcription_session_create_params.TranscriptionSessionCreateParams,
+ ),
+ options=make_request_options(
+ extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
+ ),
+ cast_to=TranscriptionSession,
+ )
+
+
+class AsyncTranscriptionSessions(AsyncAPIResource):
+ @cached_property
+ def with_raw_response(self) -> AsyncTranscriptionSessionsWithRawResponse:
+ """
+ This property can be used as a prefix for any HTTP method call to return
+ the raw response object instead of the parsed content.
+
+ For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
+ """
+ return AsyncTranscriptionSessionsWithRawResponse(self)
+
+ @cached_property
+ def with_streaming_response(self) -> AsyncTranscriptionSessionsWithStreamingResponse:
+ """
+ An alternative to `.with_raw_response` that doesn't eagerly read the response body.
+
+ For more information, see https://www.github.com/openai/openai-python#with_streaming_response
+ """
+ return AsyncTranscriptionSessionsWithStreamingResponse(self)
+
+ async def create(
+ self,
+ *,
+ include: List[str] | NotGiven = NOT_GIVEN,
+ input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
+ input_audio_noise_reduction: transcription_session_create_params.InputAudioNoiseReduction
+ | NotGiven = NOT_GIVEN,
+ input_audio_transcription: transcription_session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
+ modalities: List[Literal["text", "audio"]] | NotGiven = NOT_GIVEN,
+ turn_detection: transcription_session_create_params.TurnDetection | NotGiven = NOT_GIVEN,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+ ) -> TranscriptionSession:
+ """
+ Create an ephemeral API token for use in client-side applications with the
+ Realtime API specifically for realtime transcriptions. Can be configured with
+ the same session parameters as the `transcription_session.update` client event.
+
+ It responds with a session object, plus a `client_secret` key which contains a
+ usable ephemeral API token that can be used to authenticate browser clients for
+ the Realtime API.
+
+ Args:
+ include:
+              The set of items to include in the transcription. Currently available items are:
+
+ - `item.input_audio_transcription.logprobs`
+
+ input_audio_format: The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
+ `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
+ (mono), and little-endian byte order.
+
+ input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
+ off. Noise reduction filters audio added to the input audio buffer before it is
+ sent to VAD and the model. Filtering the audio can improve VAD and turn
+ detection accuracy (reducing false positives) and model performance by improving
+ perception of the input audio.
+
+ input_audio_transcription: Configuration for input audio transcription. The client can optionally set the
+ language and prompt for transcription, these offer additional guidance to the
+ transcription service.
+
+ modalities: The set of modalities the model can respond with. To disable audio, set this to
+ ["text"].
+
+          turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ set to `null` to turn off, in which case the client must manually trigger model
+ response. Server VAD means that the model will detect the start and end of
+ speech based on audio volume and respond at the end of user speech. Semantic VAD
+              is more advanced and uses a turn detection model (in conjunction with VAD) to
+ semantically estimate whether the user has finished speaking, then dynamically
+ sets a timeout based on this probability. For example, if user audio trails off
+ with "uhhm", the model will score a low probability of turn end and wait longer
+ for the user to continue speaking. This can be useful for more natural
+ conversations, but may have a higher latency.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ extra_headers = {"OpenAI-Beta": "assistants=v2", **(extra_headers or {})}
+ return await self._post(
+ "/realtime/transcription_sessions",
+ body=await async_maybe_transform(
+ {
+ "include": include,
+ "input_audio_format": input_audio_format,
+ "input_audio_noise_reduction": input_audio_noise_reduction,
+ "input_audio_transcription": input_audio_transcription,
+ "modalities": modalities,
+ "turn_detection": turn_detection,
+ },
+ transcription_session_create_params.TranscriptionSessionCreateParams,
+ ),
+ options=make_request_options(
+ extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
+ ),
+ cast_to=TranscriptionSession,
+ )
+
+
+class TranscriptionSessionsWithRawResponse:
+ def __init__(self, transcription_sessions: TranscriptionSessions) -> None:
+ self._transcription_sessions = transcription_sessions
+
+ self.create = _legacy_response.to_raw_response_wrapper(
+ transcription_sessions.create,
+ )
+
+
+class AsyncTranscriptionSessionsWithRawResponse:
+ def __init__(self, transcription_sessions: AsyncTranscriptionSessions) -> None:
+ self._transcription_sessions = transcription_sessions
+
+ self.create = _legacy_response.async_to_raw_response_wrapper(
+ transcription_sessions.create,
+ )
+
+
+class TranscriptionSessionsWithStreamingResponse:
+ def __init__(self, transcription_sessions: TranscriptionSessions) -> None:
+ self._transcription_sessions = transcription_sessions
+
+ self.create = to_streamed_response_wrapper(
+ transcription_sessions.create,
+ )
+
+
+class AsyncTranscriptionSessionsWithStreamingResponse:
+ def __init__(self, transcription_sessions: AsyncTranscriptionSessions) -> None:
+ self._transcription_sessions = transcription_sessions
+
+ self.create = async_to_streamed_response_wrapper(
+ transcription_sessions.create,
+ )
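
A hedged usage sketch for the new resource; the `include` entry and transcription options mirror the docstrings above, and none of the arguments are required:

```python
from openai import OpenAI

client = OpenAI()

# Mint an ephemeral token scoped to realtime transcription; every argument is
# optional and the values below are illustrative.
transcription_session = client.beta.realtime.transcription_sessions.create(
    include=["item.input_audio_transcription.logprobs"],
    input_audio_format="pcm16",
    input_audio_noise_reduction={"type": "far_field"},
    input_audio_transcription={"model": "gpt-4o-transcribe", "language": "en"},
    turn_detection={"type": "server_vad"},
)
print(transcription_session.client_secret)
```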
diff --git a/src/openai/types/audio/__init__.py b/src/openai/types/audio/__init__.py
index 822e0f3a8d..396944ee47 100644
--- a/src/openai/types/audio/__init__.py
+++ b/src/openai/types/audio/__init__.py
@@ -8,9 +8,13 @@
from .transcription_word import TranscriptionWord as TranscriptionWord
from .translation_verbose import TranslationVerbose as TranslationVerbose
from .speech_create_params import SpeechCreateParams as SpeechCreateParams
+from .transcription_include import TranscriptionInclude as TranscriptionInclude
from .transcription_segment import TranscriptionSegment as TranscriptionSegment
from .transcription_verbose import TranscriptionVerbose as TranscriptionVerbose
from .translation_create_params import TranslationCreateParams as TranslationCreateParams
+from .transcription_stream_event import TranscriptionStreamEvent as TranscriptionStreamEvent
from .transcription_create_params import TranscriptionCreateParams as TranscriptionCreateParams
from .translation_create_response import TranslationCreateResponse as TranslationCreateResponse
from .transcription_create_response import TranscriptionCreateResponse as TranscriptionCreateResponse
+from .transcription_text_done_event import TranscriptionTextDoneEvent as TranscriptionTextDoneEvent
+from .transcription_text_delta_event import TranscriptionTextDeltaEvent as TranscriptionTextDeltaEvent
diff --git a/src/openai/types/audio/speech_create_params.py b/src/openai/types/audio/speech_create_params.py
index ed1a1ce748..958680710b 100644
--- a/src/openai/types/audio/speech_create_params.py
+++ b/src/openai/types/audio/speech_create_params.py
@@ -17,7 +17,7 @@ class SpeechCreateParams(TypedDict, total=False):
model: Required[Union[str, SpeechModel]]
"""
One of the available [TTS models](https://platform.openai.com/docs/models#tts):
- `tts-1` or `tts-1-hd`
+ `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`.
"""
voice: Required[Literal["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"]]
@@ -28,6 +28,12 @@ class SpeechCreateParams(TypedDict, total=False):
[Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
"""
+ instructions: str
+ """Control the voice of your generated audio with additional instructions.
+
+ Does not work with `tts-1` or `tts-1-hd`.
+ """
+
response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]
"""The format to audio in.
diff --git a/src/openai/types/audio/speech_model.py b/src/openai/types/audio/speech_model.py
index bd685ab34d..f004f805da 100644
--- a/src/openai/types/audio/speech_model.py
+++ b/src/openai/types/audio/speech_model.py
@@ -4,4 +4,4 @@
__all__ = ["SpeechModel"]
-SpeechModel: TypeAlias = Literal["tts-1", "tts-1-hd"]
+SpeechModel: TypeAlias = Literal["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
diff --git a/src/openai/types/audio/transcription.py b/src/openai/types/audio/transcription.py
index edb5f227fc..1576385404 100644
--- a/src/openai/types/audio/transcription.py
+++ b/src/openai/types/audio/transcription.py
@@ -1,11 +1,30 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+from typing import List, Optional
from ..._models import BaseModel
-__all__ = ["Transcription"]
+__all__ = ["Transcription", "Logprob"]
+
+
+class Logprob(BaseModel):
+ token: Optional[str] = None
+ """The token in the transcription."""
+
+ bytes: Optional[List[float]] = None
+ """The bytes of the token."""
+
+ logprob: Optional[float] = None
+ """The log probability of the token."""
class Transcription(BaseModel):
text: str
"""The transcribed text."""
+
+ logprobs: Optional[List[Logprob]] = None
+ """The log probabilities of the tokens in the transcription.
+
+ Only returned with the models `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`
+ if `logprobs` is added to the `include` array.
+ """
diff --git a/src/openai/types/audio/transcription_create_params.py b/src/openai/types/audio/transcription_create_params.py
index f1779c35e6..0cda4c7907 100644
--- a/src/openai/types/audio/transcription_create_params.py
+++ b/src/openai/types/audio/transcription_create_params.py
@@ -2,17 +2,22 @@
from __future__ import annotations
-from typing import List, Union
+from typing import List, Union, Optional
from typing_extensions import Literal, Required, TypedDict
from ..._types import FileTypes
from ..audio_model import AudioModel
+from .transcription_include import TranscriptionInclude
from ..audio_response_format import AudioResponseFormat
-__all__ = ["TranscriptionCreateParams"]
+__all__ = [
+ "TranscriptionCreateParamsBase",
+ "TranscriptionCreateParamsNonStreaming",
+ "TranscriptionCreateParamsStreaming",
+]
-class TranscriptionCreateParams(TypedDict, total=False):
+class TranscriptionCreateParamsBase(TypedDict, total=False):
file: Required[FileTypes]
"""
The audio file object (not file name) to transcribe, in one of these formats:
@@ -22,8 +27,17 @@ class TranscriptionCreateParams(TypedDict, total=False):
model: Required[Union[str, AudioModel]]
"""ID of the model to use.
- Only `whisper-1` (which is powered by our open source Whisper V2 model) is
- currently available.
+ The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`
+ (which is powered by our open source Whisper V2 model).
+ """
+
+ include: List[TranscriptionInclude]
+ """Additional information to include in the transcription response.
+
+ `logprobs` will return the log probabilities of the tokens in the response to
+ understand the model's confidence in the transcription. `logprobs` only works
+ with response_format set to `json` and only with the models `gpt-4o-transcribe`
+ and `gpt-4o-mini-transcribe`.
"""
language: str
@@ -45,7 +59,8 @@ class TranscriptionCreateParams(TypedDict, total=False):
response_format: AudioResponseFormat
"""
The format of the output, in one of these options: `json`, `text`, `srt`,
- `verbose_json`, or `vtt`.
+ `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
+ the only supported format is `json`.
"""
temperature: float
@@ -65,3 +80,34 @@ class TranscriptionCreateParams(TypedDict, total=False):
is no additional latency for segment timestamps, but generating word timestamps
incurs additional latency.
"""
+
+
+class TranscriptionCreateParamsNonStreaming(TranscriptionCreateParamsBase, total=False):
+ stream: Optional[Literal[False]]
+ """
+ If set to true, the model response data will be streamed to the client as it is
+ generated using
+ [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+ See the
+ [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+ for more information.
+
+ Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+ """
+
+
+class TranscriptionCreateParamsStreaming(TranscriptionCreateParamsBase):
+ stream: Required[Literal[True]]
+ """
+ If set to true, the model response data will be streamed to the client as it is
+ generated using
+ [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+ See the
+ [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+ for more information.
+
+ Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+ """
+
+
+TranscriptionCreateParams = Union[TranscriptionCreateParamsNonStreaming, TranscriptionCreateParamsStreaming]
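
With the params split into non-streaming and streaming variants, passing `stream=True` yields server-sent transcription events; a hedged sketch of consuming them (file path is a placeholder):

```python
from pathlib import Path

from openai import OpenAI

client = OpenAI()

# stream=True selects the streaming overload; per the note above, whisper-1
# ignores streaming, so a gpt-4o transcribe model is assumed here.
stream = client.audio.transcriptions.create(
    model="gpt-4o-mini-transcribe",
    file=Path("speech.mp3"),
    stream=True,
)
for event in stream:
    if event.type == "transcript.text.delta":
        print(event.delta, end="", flush=True)
    elif event.type == "transcript.text.done":
        print()
```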
diff --git a/src/openai/types/audio/transcription_include.py b/src/openai/types/audio/transcription_include.py
new file mode 100644
index 0000000000..0e464ac934
--- /dev/null
+++ b/src/openai/types/audio/transcription_include.py
@@ -0,0 +1,7 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing_extensions import Literal, TypeAlias
+
+__all__ = ["TranscriptionInclude"]
+
+TranscriptionInclude: TypeAlias = Literal["logprobs"]
diff --git a/src/openai/types/audio/transcription_stream_event.py b/src/openai/types/audio/transcription_stream_event.py
new file mode 100644
index 0000000000..757077a280
--- /dev/null
+++ b/src/openai/types/audio/transcription_stream_event.py
@@ -0,0 +1,14 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Union
+from typing_extensions import Annotated, TypeAlias
+
+from ..._utils import PropertyInfo
+from .transcription_text_done_event import TranscriptionTextDoneEvent
+from .transcription_text_delta_event import TranscriptionTextDeltaEvent
+
+__all__ = ["TranscriptionStreamEvent"]
+
+TranscriptionStreamEvent: TypeAlias = Annotated[
+ Union[TranscriptionTextDeltaEvent, TranscriptionTextDoneEvent], PropertyInfo(discriminator="type")
+]
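
Because the union above is discriminated on `type`, event handlers can narrow with `isinstance`; a small, hedged helper as an illustration:

```python
from openai.types.audio import (
    TranscriptionStreamEvent,
    TranscriptionTextDeltaEvent,
    TranscriptionTextDoneEvent,
)


def render(event: TranscriptionStreamEvent) -> str:
    # The union is discriminated on `type`, so isinstance narrowing is reliable.
    if isinstance(event, TranscriptionTextDeltaEvent):
        return event.delta
    if isinstance(event, TranscriptionTextDoneEvent):
        return event.text
    return ""
```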
diff --git a/src/openai/types/audio/transcription_text_delta_event.py b/src/openai/types/audio/transcription_text_delta_event.py
new file mode 100644
index 0000000000..f8d5355491
--- /dev/null
+++ b/src/openai/types/audio/transcription_text_delta_event.py
@@ -0,0 +1,35 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import List, Optional
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+
+__all__ = ["TranscriptionTextDeltaEvent", "Logprob"]
+
+
+class Logprob(BaseModel):
+ token: Optional[str] = None
+ """The token that was used to generate the log probability."""
+
+ bytes: Optional[List[object]] = None
+ """The bytes that were used to generate the log probability."""
+
+ logprob: Optional[float] = None
+ """The log probability of the token."""
+
+
+class TranscriptionTextDeltaEvent(BaseModel):
+ delta: str
+ """The text delta that was additionally transcribed."""
+
+ type: Literal["transcript.text.delta"]
+ """The type of the event. Always `transcript.text.delta`."""
+
+ logprobs: Optional[List[Logprob]] = None
+ """The log probabilities of the delta.
+
+ Only included if you
+ [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
+ with the `include[]` parameter set to `logprobs`.
+ """
diff --git a/src/openai/types/audio/transcription_text_done_event.py b/src/openai/types/audio/transcription_text_done_event.py
new file mode 100644
index 0000000000..3f1a713a52
--- /dev/null
+++ b/src/openai/types/audio/transcription_text_done_event.py
@@ -0,0 +1,35 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import List, Optional
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+
+__all__ = ["TranscriptionTextDoneEvent", "Logprob"]
+
+
+class Logprob(BaseModel):
+ token: Optional[str] = None
+ """The token that was used to generate the log probability."""
+
+ bytes: Optional[List[object]] = None
+ """The bytes that were used to generate the log probability."""
+
+ logprob: Optional[float] = None
+ """The log probability of the token."""
+
+
+class TranscriptionTextDoneEvent(BaseModel):
+ text: str
+ """The text that was transcribed."""
+
+ type: Literal["transcript.text.done"]
+ """The type of the event. Always `transcript.text.done`."""
+
+ logprobs: Optional[List[Logprob]] = None
+ """The log probabilities of the individual tokens in the transcription.
+
+ Only included if you
+ [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
+ with the `include[]` parameter set to `logprobs`.
+ """
diff --git a/src/openai/types/audio/translation_create_params.py b/src/openai/types/audio/translation_create_params.py
index 62f85b8757..b23a185375 100644
--- a/src/openai/types/audio/translation_create_params.py
+++ b/src/openai/types/audio/translation_create_params.py
@@ -3,11 +3,10 @@
from __future__ import annotations
from typing import Union
-from typing_extensions import Required, TypedDict
+from typing_extensions import Literal, Required, TypedDict
from ..._types import FileTypes
from ..audio_model import AudioModel
-from ..audio_response_format import AudioResponseFormat
__all__ = ["TranslationCreateParams"]
@@ -34,7 +33,7 @@ class TranslationCreateParams(TypedDict, total=False):
should be in English.
"""
- response_format: AudioResponseFormat
+ response_format: Literal["json", "text", "srt", "verbose_json", "vtt"]
"""
The format of the output, in one of these options: `json`, `text`, `srt`,
`verbose_json`, or `vtt`.
diff --git a/src/openai/types/audio_model.py b/src/openai/types/audio_model.py
index 94ae84c015..4d14d60181 100644
--- a/src/openai/types/audio_model.py
+++ b/src/openai/types/audio_model.py
@@ -4,4 +4,4 @@
__all__ = ["AudioModel"]
-AudioModel: TypeAlias = Literal["whisper-1"]
+AudioModel: TypeAlias = Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]
diff --git a/src/openai/types/beta/realtime/__init__.py b/src/openai/types/beta/realtime/__init__.py
index cd0616dcfa..0374b9b457 100644
--- a/src/openai/types/beta/realtime/__init__.py
+++ b/src/openai/types/beta/realtime/__init__.py
@@ -15,6 +15,7 @@
from .session_create_params import SessionCreateParams as SessionCreateParams
from .session_created_event import SessionCreatedEvent as SessionCreatedEvent
from .session_updated_event import SessionUpdatedEvent as SessionUpdatedEvent
+from .transcription_session import TranscriptionSession as TranscriptionSession
from .response_created_event import ResponseCreatedEvent as ResponseCreatedEvent
from .conversation_item_param import ConversationItemParam as ConversationItemParam
from .realtime_connect_params import RealtimeConnectParams as RealtimeConnectParams
@@ -32,6 +33,7 @@
from .realtime_client_event_param import RealtimeClientEventParam as RealtimeClientEventParam
from .response_cancel_event_param import ResponseCancelEventParam as ResponseCancelEventParam
from .response_create_event_param import ResponseCreateEventParam as ResponseCreateEventParam
+from .transcription_session_update import TranscriptionSessionUpdate as TranscriptionSessionUpdate
from .conversation_item_create_event import ConversationItemCreateEvent as ConversationItemCreateEvent
from .conversation_item_delete_event import ConversationItemDeleteEvent as ConversationItemDeleteEvent
from .input_audio_buffer_clear_event import InputAudioBufferClearEvent as InputAudioBufferClearEvent
@@ -41,6 +43,7 @@
from .input_audio_buffer_append_event import InputAudioBufferAppendEvent as InputAudioBufferAppendEvent
from .input_audio_buffer_commit_event import InputAudioBufferCommitEvent as InputAudioBufferCommitEvent
from .response_output_item_done_event import ResponseOutputItemDoneEvent as ResponseOutputItemDoneEvent
+from .conversation_item_retrieve_event import ConversationItemRetrieveEvent as ConversationItemRetrieveEvent
from .conversation_item_truncate_event import ConversationItemTruncateEvent as ConversationItemTruncateEvent
from .conversation_item_with_reference import ConversationItemWithReference as ConversationItemWithReference
from .input_audio_buffer_cleared_event import InputAudioBufferClearedEvent as InputAudioBufferClearedEvent
@@ -49,6 +52,9 @@
from .conversation_item_truncated_event import ConversationItemTruncatedEvent as ConversationItemTruncatedEvent
from .response_content_part_added_event import ResponseContentPartAddedEvent as ResponseContentPartAddedEvent
from .input_audio_buffer_committed_event import InputAudioBufferCommittedEvent as InputAudioBufferCommittedEvent
+from .transcription_session_update_param import TranscriptionSessionUpdateParam as TranscriptionSessionUpdateParam
+from .transcription_session_create_params import TranscriptionSessionCreateParams as TranscriptionSessionCreateParams
+from .transcription_session_updated_event import TranscriptionSessionUpdatedEvent as TranscriptionSessionUpdatedEvent
from .conversation_item_create_event_param import ConversationItemCreateEventParam as ConversationItemCreateEventParam
from .conversation_item_delete_event_param import ConversationItemDeleteEventParam as ConversationItemDeleteEventParam
from .input_audio_buffer_clear_event_param import InputAudioBufferClearEventParam as InputAudioBufferClearEventParam
@@ -58,6 +64,9 @@
from .response_audio_transcript_delta_event import (
ResponseAudioTranscriptDeltaEvent as ResponseAudioTranscriptDeltaEvent,
)
+from .conversation_item_retrieve_event_param import (
+ ConversationItemRetrieveEventParam as ConversationItemRetrieveEventParam,
+)
from .conversation_item_truncate_event_param import (
ConversationItemTruncateEventParam as ConversationItemTruncateEventParam,
)
@@ -76,6 +85,9 @@
from .response_function_call_arguments_delta_event import (
ResponseFunctionCallArgumentsDeltaEvent as ResponseFunctionCallArgumentsDeltaEvent,
)
+from .conversation_item_input_audio_transcription_delta_event import (
+ ConversationItemInputAudioTranscriptionDeltaEvent as ConversationItemInputAudioTranscriptionDeltaEvent,
+)
from .conversation_item_input_audio_transcription_failed_event import (
ConversationItemInputAudioTranscriptionFailedEvent as ConversationItemInputAudioTranscriptionFailedEvent,
)
diff --git a/src/openai/types/beta/realtime/conversation_item_input_audio_transcription_completed_event.py b/src/openai/types/beta/realtime/conversation_item_input_audio_transcription_completed_event.py
index ded79cc0f7..469811693c 100644
--- a/src/openai/types/beta/realtime/conversation_item_input_audio_transcription_completed_event.py
+++ b/src/openai/types/beta/realtime/conversation_item_input_audio_transcription_completed_event.py
@@ -1,10 +1,22 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+from typing import List, Optional
from typing_extensions import Literal
from ...._models import BaseModel
-__all__ = ["ConversationItemInputAudioTranscriptionCompletedEvent"]
+__all__ = ["ConversationItemInputAudioTranscriptionCompletedEvent", "Logprob"]
+
+
+class Logprob(BaseModel):
+ token: str
+ """The token that was used to generate the log probability."""
+
+ bytes: List[int]
+ """The bytes that were used to generate the log probability."""
+
+ logprob: float
+ """The log probability of the token."""
class ConversationItemInputAudioTranscriptionCompletedEvent(BaseModel):
@@ -24,3 +36,6 @@ class ConversationItemInputAudioTranscriptionCompletedEvent(BaseModel):
"""
The event type, must be `conversation.item.input_audio_transcription.completed`.
"""
+
+ logprobs: Optional[List[Logprob]] = None
+ """The log probabilities of the transcription."""
diff --git a/src/openai/types/beta/realtime/conversation_item_input_audio_transcription_delta_event.py b/src/openai/types/beta/realtime/conversation_item_input_audio_transcription_delta_event.py
new file mode 100644
index 0000000000..924d06d98a
--- /dev/null
+++ b/src/openai/types/beta/realtime/conversation_item_input_audio_transcription_delta_event.py
@@ -0,0 +1,39 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import List, Optional
+from typing_extensions import Literal
+
+from ...._models import BaseModel
+
+__all__ = ["ConversationItemInputAudioTranscriptionDeltaEvent", "Logprob"]
+
+
+class Logprob(BaseModel):
+ token: str
+ """The token that was used to generate the log probability."""
+
+ bytes: List[int]
+ """The bytes that were used to generate the log probability."""
+
+ logprob: float
+ """The log probability of the token."""
+
+
+class ConversationItemInputAudioTranscriptionDeltaEvent(BaseModel):
+ event_id: str
+ """The unique ID of the server event."""
+
+ item_id: str
+ """The ID of the item."""
+
+ type: Literal["conversation.item.input_audio_transcription.delta"]
+ """The event type, must be `conversation.item.input_audio_transcription.delta`."""
+
+ content_index: Optional[int] = None
+ """The index of the content part in the item's content array."""
+
+ delta: Optional[str] = None
+ """The text delta."""
+
+ logprobs: Optional[List[Logprob]] = None
+ """The log probabilities of the transcription."""
diff --git a/src/openai/types/beta/realtime/conversation_item_retrieve_event.py b/src/openai/types/beta/realtime/conversation_item_retrieve_event.py
new file mode 100644
index 0000000000..822386055c
--- /dev/null
+++ b/src/openai/types/beta/realtime/conversation_item_retrieve_event.py
@@ -0,0 +1,19 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Optional
+from typing_extensions import Literal
+
+from ...._models import BaseModel
+
+__all__ = ["ConversationItemRetrieveEvent"]
+
+
+class ConversationItemRetrieveEvent(BaseModel):
+ item_id: str
+ """The ID of the item to retrieve."""
+
+ type: Literal["conversation.item.retrieve"]
+ """The event type, must be `conversation.item.retrieve`."""
+
+ event_id: Optional[str] = None
+ """Optional client-generated ID used to identify this event."""
diff --git a/src/openai/types/beta/realtime/conversation_item_retrieve_event_param.py b/src/openai/types/beta/realtime/conversation_item_retrieve_event_param.py
new file mode 100644
index 0000000000..71b3ffa499
--- /dev/null
+++ b/src/openai/types/beta/realtime/conversation_item_retrieve_event_param.py
@@ -0,0 +1,18 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing_extensions import Literal, Required, TypedDict
+
+__all__ = ["ConversationItemRetrieveEventParam"]
+
+
+class ConversationItemRetrieveEventParam(TypedDict, total=False):
+ item_id: Required[str]
+ """The ID of the item to retrieve."""
+
+ type: Required[Literal["conversation.item.retrieve"]]
+ """The event type, must be `conversation.item.retrieve`."""
+
+ event_id: str
+ """Optional client-generated ID used to identify this event."""
diff --git a/src/openai/types/beta/realtime/realtime_client_event.py b/src/openai/types/beta/realtime/realtime_client_event.py
index 0769184cd0..f962a505cd 100644
--- a/src/openai/types/beta/realtime/realtime_client_event.py
+++ b/src/openai/types/beta/realtime/realtime_client_event.py
@@ -7,26 +7,30 @@
from .session_update_event import SessionUpdateEvent
from .response_cancel_event import ResponseCancelEvent
from .response_create_event import ResponseCreateEvent
+from .transcription_session_update import TranscriptionSessionUpdate
from .conversation_item_create_event import ConversationItemCreateEvent
from .conversation_item_delete_event import ConversationItemDeleteEvent
from .input_audio_buffer_clear_event import InputAudioBufferClearEvent
from .input_audio_buffer_append_event import InputAudioBufferAppendEvent
from .input_audio_buffer_commit_event import InputAudioBufferCommitEvent
+from .conversation_item_retrieve_event import ConversationItemRetrieveEvent
from .conversation_item_truncate_event import ConversationItemTruncateEvent
__all__ = ["RealtimeClientEvent"]
RealtimeClientEvent: TypeAlias = Annotated[
Union[
- SessionUpdateEvent,
- InputAudioBufferAppendEvent,
- InputAudioBufferCommitEvent,
- InputAudioBufferClearEvent,
ConversationItemCreateEvent,
- ConversationItemTruncateEvent,
ConversationItemDeleteEvent,
- ResponseCreateEvent,
+ ConversationItemRetrieveEvent,
+ ConversationItemTruncateEvent,
+ InputAudioBufferAppendEvent,
+ InputAudioBufferClearEvent,
+ InputAudioBufferCommitEvent,
ResponseCancelEvent,
+ ResponseCreateEvent,
+ SessionUpdateEvent,
+ TranscriptionSessionUpdate,
],
PropertyInfo(discriminator="type"),
]
diff --git a/src/openai/types/beta/realtime/realtime_client_event_param.py b/src/openai/types/beta/realtime/realtime_client_event_param.py
index 4020892c33..6fdba4b87c 100644
--- a/src/openai/types/beta/realtime/realtime_client_event_param.py
+++ b/src/openai/types/beta/realtime/realtime_client_event_param.py
@@ -8,23 +8,27 @@
from .session_update_event_param import SessionUpdateEventParam
from .response_cancel_event_param import ResponseCancelEventParam
from .response_create_event_param import ResponseCreateEventParam
+from .transcription_session_update_param import TranscriptionSessionUpdateParam
from .conversation_item_create_event_param import ConversationItemCreateEventParam
from .conversation_item_delete_event_param import ConversationItemDeleteEventParam
from .input_audio_buffer_clear_event_param import InputAudioBufferClearEventParam
from .input_audio_buffer_append_event_param import InputAudioBufferAppendEventParam
from .input_audio_buffer_commit_event_param import InputAudioBufferCommitEventParam
+from .conversation_item_retrieve_event_param import ConversationItemRetrieveEventParam
from .conversation_item_truncate_event_param import ConversationItemTruncateEventParam
__all__ = ["RealtimeClientEventParam"]
RealtimeClientEventParam: TypeAlias = Union[
- SessionUpdateEventParam,
- InputAudioBufferAppendEventParam,
- InputAudioBufferCommitEventParam,
- InputAudioBufferClearEventParam,
ConversationItemCreateEventParam,
- ConversationItemTruncateEventParam,
ConversationItemDeleteEventParam,
- ResponseCreateEventParam,
+ ConversationItemRetrieveEventParam,
+ ConversationItemTruncateEventParam,
+ InputAudioBufferAppendEventParam,
+ InputAudioBufferClearEventParam,
+ InputAudioBufferCommitEventParam,
ResponseCancelEventParam,
+ ResponseCreateEventParam,
+ SessionUpdateEventParam,
+ TranscriptionSessionUpdateParam,
]
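
Since the param union now includes the retrieve and transcription-session events, a dict of that shape can also be passed to the low-level `connection.send(...)` helper; a hedged sketch with a placeholder model name and item ID:

```python
import asyncio

from openai import AsyncOpenAI


async def main() -> None:
    client = AsyncOpenAI()
    async with client.beta.realtime.connect(model="gpt-4o-realtime-preview") as connection:
        # Equivalent to connection.conversation.item.retrieve(...), sent as a raw
        # RealtimeClientEventParam-shaped dict; the item ID is illustrative.
        await connection.send({"type": "conversation.item.retrieve", "item_id": "item_123"})


asyncio.run(main())
```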
diff --git a/src/openai/types/beta/realtime/realtime_server_event.py b/src/openai/types/beta/realtime/realtime_server_event.py
index 5f8ed55b13..ba1d324445 100644
--- a/src/openai/types/beta/realtime/realtime_server_event.py
+++ b/src/openai/types/beta/realtime/realtime_server_event.py
@@ -1,10 +1,12 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
from typing import Union
-from typing_extensions import Annotated, TypeAlias
+from typing_extensions import Literal, Annotated, TypeAlias
from ...._utils import PropertyInfo
+from ...._models import BaseModel
from .error_event import ErrorEvent
+from .conversation_item import ConversationItem
from .response_done_event import ResponseDoneEvent
from .session_created_event import SessionCreatedEvent
from .session_updated_event import SessionUpdatedEvent
@@ -24,49 +26,66 @@
from .conversation_item_truncated_event import ConversationItemTruncatedEvent
from .response_content_part_added_event import ResponseContentPartAddedEvent
from .input_audio_buffer_committed_event import InputAudioBufferCommittedEvent
+from .transcription_session_updated_event import TranscriptionSessionUpdatedEvent
from .response_audio_transcript_done_event import ResponseAudioTranscriptDoneEvent
from .response_audio_transcript_delta_event import ResponseAudioTranscriptDeltaEvent
from .input_audio_buffer_speech_started_event import InputAudioBufferSpeechStartedEvent
from .input_audio_buffer_speech_stopped_event import InputAudioBufferSpeechStoppedEvent
from .response_function_call_arguments_done_event import ResponseFunctionCallArgumentsDoneEvent
from .response_function_call_arguments_delta_event import ResponseFunctionCallArgumentsDeltaEvent
+from .conversation_item_input_audio_transcription_delta_event import ConversationItemInputAudioTranscriptionDeltaEvent
from .conversation_item_input_audio_transcription_failed_event import ConversationItemInputAudioTranscriptionFailedEvent
from .conversation_item_input_audio_transcription_completed_event import (
ConversationItemInputAudioTranscriptionCompletedEvent,
)
-__all__ = ["RealtimeServerEvent"]
+__all__ = ["RealtimeServerEvent", "ConversationItemRetrieved"]
+
+
+class ConversationItemRetrieved(BaseModel):
+ event_id: str
+ """The unique ID of the server event."""
+
+ item: ConversationItem
+ """The item to add to the conversation."""
+
+ type: Literal["conversation.item.retrieved"]
+ """The event type, must be `conversation.item.retrieved`."""
+
RealtimeServerEvent: TypeAlias = Annotated[
Union[
- ErrorEvent,
- SessionCreatedEvent,
- SessionUpdatedEvent,
ConversationCreatedEvent,
- InputAudioBufferCommittedEvent,
- InputAudioBufferClearedEvent,
- InputAudioBufferSpeechStartedEvent,
- InputAudioBufferSpeechStoppedEvent,
ConversationItemCreatedEvent,
+ ConversationItemDeletedEvent,
ConversationItemInputAudioTranscriptionCompletedEvent,
+ ConversationItemInputAudioTranscriptionDeltaEvent,
ConversationItemInputAudioTranscriptionFailedEvent,
+ ConversationItemRetrieved,
ConversationItemTruncatedEvent,
- ConversationItemDeletedEvent,
+ ErrorEvent,
+ InputAudioBufferClearedEvent,
+ InputAudioBufferCommittedEvent,
+ InputAudioBufferSpeechStartedEvent,
+ InputAudioBufferSpeechStoppedEvent,
+ RateLimitsUpdatedEvent,
+ ResponseAudioDeltaEvent,
+ ResponseAudioDoneEvent,
+ ResponseAudioTranscriptDeltaEvent,
+ ResponseAudioTranscriptDoneEvent,
+ ResponseContentPartAddedEvent,
+ ResponseContentPartDoneEvent,
ResponseCreatedEvent,
ResponseDoneEvent,
+ ResponseFunctionCallArgumentsDeltaEvent,
+ ResponseFunctionCallArgumentsDoneEvent,
ResponseOutputItemAddedEvent,
ResponseOutputItemDoneEvent,
- ResponseContentPartAddedEvent,
- ResponseContentPartDoneEvent,
ResponseTextDeltaEvent,
ResponseTextDoneEvent,
- ResponseAudioTranscriptDeltaEvent,
- ResponseAudioTranscriptDoneEvent,
- ResponseAudioDeltaEvent,
- ResponseAudioDoneEvent,
- ResponseFunctionCallArgumentsDeltaEvent,
- ResponseFunctionCallArgumentsDoneEvent,
- RateLimitsUpdatedEvent,
+ SessionCreatedEvent,
+ SessionUpdatedEvent,
+ TranscriptionSessionUpdatedEvent,
],
PropertyInfo(discriminator="type"),
]
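
Beyond the alphabetical reordering, the practical effect is that event loops should now expect the retrieved-item and transcription events; a hedged dispatch sketch (model name is a placeholder):

```python
import asyncio

from openai import AsyncOpenAI


async def listen() -> None:
    client = AsyncOpenAI()
    async with client.beta.realtime.connect(model="gpt-4o-realtime-preview") as connection:
        async for event in connection:
            if event.type == "conversation.item.input_audio_transcription.delta":
                print(event.delta or "", end="", flush=True)
            elif event.type == "conversation.item.retrieved":
                print("retrieved item:", event.item.id)
            elif event.type == "transcription_session.updated":
                print("transcription session updated")


asyncio.run(listen())
```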
diff --git a/src/openai/types/beta/realtime/session.py b/src/openai/types/beta/realtime/session.py
index aee20fa906..3ed53ff5f8 100644
--- a/src/openai/types/beta/realtime/session.py
+++ b/src/openai/types/beta/realtime/session.py
@@ -5,14 +5,40 @@
from ...._models import BaseModel
-__all__ = ["Session", "InputAudioTranscription", "Tool", "TurnDetection"]
+__all__ = ["Session", "InputAudioNoiseReduction", "InputAudioTranscription", "Tool", "TurnDetection"]
+
+
+class InputAudioNoiseReduction(BaseModel):
+ type: Optional[Literal["near_field", "far_field"]] = None
+ """Type of noise reduction.
+
+ `near_field` is for close-talking microphones such as headphones, `far_field` is
+ for far-field microphones such as laptop or conference room microphones.
+ """
class InputAudioTranscription(BaseModel):
+ language: Optional[str] = None
+ """The language of the input audio.
+
+ Supplying the input language in
+ [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ format will improve accuracy and latency.
+ """
+
model: Optional[str] = None
"""
- The model to use for transcription, `whisper-1` is the only currently supported
- model.
+ The model to use for transcription, current options are `gpt-4o-transcribe`,
+ `gpt-4o-mini-transcribe`, and `whisper-1`.
+ """
+
+ prompt: Optional[str] = None
+ """
+ An optional text to guide the model's style or continue a previous audio
+ segment. For `whisper-1`, the
+ [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+ For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+ "expect words related to technology".
"""
@@ -35,46 +61,56 @@ class Tool(BaseModel):
class TurnDetection(BaseModel):
create_response: Optional[bool] = None
- """Whether or not to automatically generate a response when a VAD stop event
+ """
+ Whether or not to automatically generate a response when a VAD stop event
occurs.
+ """
+
+ eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
+ """Used only for `semantic_vad` mode.
- `true` by default.
+ The eagerness of the model to respond. `low` will wait longer for the user to
+ continue speaking, `high` will respond more quickly. `auto` is the default and
+ is equivalent to `medium`.
"""
interrupt_response: Optional[bool] = None
"""
Whether or not to automatically interrupt any ongoing response with output to
the default conversation (i.e. `conversation` of `auto`) when a VAD start event
- occurs. `true` by default.
+ occurs.
"""
prefix_padding_ms: Optional[int] = None
- """Amount of audio to include before the VAD detected speech (in milliseconds).
+ """Used only for `server_vad` mode.
+ Amount of audio to include before the VAD detected speech (in milliseconds).
Defaults to 300ms.
"""
silence_duration_ms: Optional[int] = None
- """Duration of silence to detect speech stop (in milliseconds).
+ """Used only for `server_vad` mode.
- Defaults to 500ms. With shorter values the model will respond more quickly, but
- may jump in on short pauses from the user.
+ Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ With shorter values the model will respond more quickly, but may jump in on
+ short pauses from the user.
"""
threshold: Optional[float] = None
- """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
+ """Used only for `server_vad` mode.
- A higher threshold will require louder audio to activate the model, and thus
- might perform better in noisy environments.
+ Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
+ threshold will require louder audio to activate the model, and thus might
+ perform better in noisy environments.
"""
- type: Optional[Literal["server_vad"]] = None
- """Type of turn detection, only `server_vad` is currently supported."""
+ type: Optional[Literal["server_vad", "semantic_vad"]] = None
+ """Type of turn detection."""
class Session(BaseModel):
id: Optional[str] = None
- """Unique identifier for the session object."""
+ """Unique identifier for the session that looks like `sess_1234567890abcdef`."""
input_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
"""The format of input audio.
@@ -84,13 +120,25 @@ class Session(BaseModel):
byte order.
"""
+ input_audio_noise_reduction: Optional[InputAudioNoiseReduction] = None
+ """Configuration for input audio noise reduction.
+
+ This can be set to `null` to turn off. Noise reduction filters audio added to
+ the input audio buffer before it is sent to VAD and the model. Filtering the
+ audio can improve VAD and turn detection accuracy (reducing false positives) and
+ model performance by improving perception of the input audio.
+ """
+
input_audio_transcription: Optional[InputAudioTranscription] = None
"""
Configuration for input audio transcription, defaults to off and can be set to
`null` to turn off once on. Input audio transcription is not native to the
model, since the model consumes audio directly. Transcription runs
- asynchronously through Whisper and should be treated as rough guidance rather
- than the representation understood by the model.
+ asynchronously through
+ [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ and should be treated as guidance of input audio content rather than precisely
+ what the model heard. The client can optionally set the language and prompt for
+ transcription, these offer additional guidance to the transcription service.
"""
instructions: Optional[str] = None
@@ -122,16 +170,14 @@ class Session(BaseModel):
To disable audio, set this to ["text"].
"""
- model: Union[
- str,
+ model: Optional[
Literal[
"gpt-4o-realtime-preview",
"gpt-4o-realtime-preview-2024-10-01",
"gpt-4o-realtime-preview-2024-12-17",
"gpt-4o-mini-realtime-preview",
"gpt-4o-mini-realtime-preview-2024-12-17",
- ],
- None,
+ ]
] = None
"""The Realtime model used for this session."""
@@ -143,7 +189,11 @@ class Session(BaseModel):
"""
temperature: Optional[float] = None
- """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8."""
+ """Sampling temperature for the model, limited to [0.6, 1.2].
+
+ For audio models a temperature of 0.8 is highly recommended for best
+ performance.
+ """
tool_choice: Optional[str] = None
"""How the model chooses tools.
@@ -155,11 +205,17 @@ class Session(BaseModel):
"""Tools (functions) available to the model."""
turn_detection: Optional[TurnDetection] = None
- """Configuration for turn detection.
-
- Can be set to `null` to turn off. Server VAD means that the model will detect
- the start and end of speech based on audio volume and respond at the end of user
- speech.
+    """Configuration for turn detection, either Server VAD or Semantic VAD.
+
+ This can be set to `null` to turn off, in which case the client must manually
+ trigger model response. Server VAD means that the model will detect the start
+ and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ with VAD) to semantically estimate whether the user has finished speaking, then
+ dynamically sets a timeout based on this probability. For example, if user audio
+ trails off with "uhhm", the model will score a low probability of turn end and
+ wait longer for the user to continue speaking. This can be useful for more
+ natural conversations, but may have a higher latency.
"""
voice: Optional[Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]] = None
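The expanded `Session` surface above (noise reduction, semantic VAD, the transcription language/prompt fields) is easiest to see end to end in a create call. The following is an illustrative sketch only, not part of this diff; the field shapes follow the updated `SessionCreateParams` and the concrete model, language, and voice values are placeholders.

```python
# Illustrative sketch only (not part of this diff): creating a Realtime
# session that exercises the new fields documented above.
from openai import OpenAI

client = OpenAI()

session = client.beta.realtime.sessions.create(
    input_audio_format="pcm16",
    # New: optional noise reduction applied before audio reaches VAD and the model.
    input_audio_noise_reduction={"type": "near_field"},
    input_audio_transcription={
        "model": "gpt-4o-transcribe",
        "language": "en",
        "prompt": "expect words related to technology",
    },
    # New: semantic VAD; `eagerness` applies only in this mode.
    turn_detection={
        "type": "semantic_vad",
        "eagerness": "low",
        "create_response": True,
        "interrupt_response": True,
    },
    voice="alloy",
)
print(session)
```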
diff --git a/src/openai/types/beta/realtime/session_create_params.py b/src/openai/types/beta/realtime/session_create_params.py
index bbc86d7c7d..fe4a1c8636 100644
--- a/src/openai/types/beta/realtime/session_create_params.py
+++ b/src/openai/types/beta/realtime/session_create_params.py
@@ -5,7 +5,7 @@
from typing import List, Union, Iterable
from typing_extensions import Literal, TypedDict
-__all__ = ["SessionCreateParams", "InputAudioTranscription", "Tool", "TurnDetection"]
+__all__ = ["SessionCreateParams", "InputAudioNoiseReduction", "InputAudioTranscription", "Tool", "TurnDetection"]
class SessionCreateParams(TypedDict, total=False):
@@ -17,16 +17,25 @@ class SessionCreateParams(TypedDict, total=False):
byte order.
"""
+ input_audio_noise_reduction: InputAudioNoiseReduction
+ """Configuration for input audio noise reduction.
+
+ This can be set to `null` to turn off. Noise reduction filters audio added to
+ the input audio buffer before it is sent to VAD and the model. Filtering the
+ audio can improve VAD and turn detection accuracy (reducing false positives) and
+ model performance by improving perception of the input audio.
+ """
+
input_audio_transcription: InputAudioTranscription
"""
Configuration for input audio transcription, defaults to off and can be set to
`null` to turn off once on. Input audio transcription is not native to the
model, since the model consumes audio directly. Transcription runs
asynchronously through
- [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
- and should be treated as rough guidance rather than the representation
- understood by the model. The client can optionally set the language and prompt
- for transcription, these fields will be passed to the Whisper API.
+ [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ and should be treated as guidance of input audio content rather than precisely
+ what the model heard. The client can optionally set the language and prompt for
+ transcription, these offer additional guidance to the transcription service.
"""
instructions: str
@@ -75,7 +84,11 @@ class SessionCreateParams(TypedDict, total=False):
"""
temperature: float
- """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8."""
+ """Sampling temperature for the model, limited to [0.6, 1.2].
+
+ For audio models a temperature of 0.8 is highly recommended for best
+ performance.
+ """
tool_choice: str
"""How the model chooses tools.
@@ -87,11 +100,17 @@ class SessionCreateParams(TypedDict, total=False):
"""Tools (functions) available to the model."""
turn_detection: TurnDetection
- """Configuration for turn detection.
+    """Configuration for turn detection, either Server VAD or Semantic VAD.
- Can be set to `null` to turn off. Server VAD means that the model will detect
- the start and end of speech based on audio volume and respond at the end of user
- speech.
+ This can be set to `null` to turn off, in which case the client must manually
+ trigger model response. Server VAD means that the model will detect the start
+ and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ with VAD) to semantically estimate whether the user has finished speaking, then
+ dynamically sets a timeout based on this probability. For example, if user audio
+ trails off with "uhhm", the model will score a low probability of turn end and
+ wait longer for the user to continue speaking. This can be useful for more
+ natural conversations, but may have a higher latency.
"""
voice: Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]
@@ -103,6 +122,15 @@ class SessionCreateParams(TypedDict, total=False):
"""
+class InputAudioNoiseReduction(TypedDict, total=False):
+ type: Literal["near_field", "far_field"]
+ """Type of noise reduction.
+
+ `near_field` is for close-talking microphones such as headphones, `far_field` is
+ for far-field microphones such as laptop or conference room microphones.
+ """
+
+
class InputAudioTranscription(TypedDict, total=False):
language: str
"""The language of the input audio.
@@ -114,16 +142,17 @@ class InputAudioTranscription(TypedDict, total=False):
model: str
"""
- The model to use for transcription, `whisper-1` is the only currently supported
- model.
+ The model to use for transcription, current options are `gpt-4o-transcribe`,
+ `gpt-4o-mini-transcribe`, and `whisper-1`.
"""
prompt: str
- """An optional text to guide the model's style or continue a previous audio
- segment.
-
- The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language.
+ """
+ An optional text to guide the model's style or continue a previous audio
+ segment. For `whisper-1`, the
+ [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+ For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+ "expect words related to technology".
"""
@@ -146,38 +175,48 @@ class Tool(TypedDict, total=False):
class TurnDetection(TypedDict, total=False):
create_response: bool
- """Whether or not to automatically generate a response when a VAD stop event
+ """
+ Whether or not to automatically generate a response when a VAD stop event
occurs.
+ """
+
+ eagerness: Literal["low", "medium", "high", "auto"]
+ """Used only for `semantic_vad` mode.
- `true` by default.
+ The eagerness of the model to respond. `low` will wait longer for the user to
+ continue speaking, `high` will respond more quickly. `auto` is the default and
+ is equivalent to `medium`.
"""
interrupt_response: bool
"""
Whether or not to automatically interrupt any ongoing response with output to
the default conversation (i.e. `conversation` of `auto`) when a VAD start event
- occurs. `true` by default.
+ occurs.
"""
prefix_padding_ms: int
- """Amount of audio to include before the VAD detected speech (in milliseconds).
+ """Used only for `server_vad` mode.
+ Amount of audio to include before the VAD detected speech (in milliseconds).
Defaults to 300ms.
"""
silence_duration_ms: int
- """Duration of silence to detect speech stop (in milliseconds).
+ """Used only for `server_vad` mode.
- Defaults to 500ms. With shorter values the model will respond more quickly, but
- may jump in on short pauses from the user.
+ Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ With shorter values the model will respond more quickly, but may jump in on
+ short pauses from the user.
"""
threshold: float
- """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
+ """Used only for `server_vad` mode.
- A higher threshold will require louder audio to activate the model, and thus
- might perform better in noisy environments.
+ Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
+ threshold will require louder audio to activate the model, and thus might
+ perform better in noisy environments.
"""
- type: str
- """Type of turn detection, only `server_vad` is currently supported."""
+ type: Literal["server_vad", "semantic_vad"]
+ """Type of turn detection."""
diff --git a/src/openai/types/beta/realtime/session_update_event.py b/src/openai/types/beta/realtime/session_update_event.py
index 999cd8d660..00180f593d 100644
--- a/src/openai/types/beta/realtime/session_update_event.py
+++ b/src/openai/types/beta/realtime/session_update_event.py
@@ -5,7 +5,23 @@
from ...._models import BaseModel
-__all__ = ["SessionUpdateEvent", "Session", "SessionInputAudioTranscription", "SessionTool", "SessionTurnDetection"]
+__all__ = [
+ "SessionUpdateEvent",
+ "Session",
+ "SessionInputAudioNoiseReduction",
+ "SessionInputAudioTranscription",
+ "SessionTool",
+ "SessionTurnDetection",
+]
+
+
+class SessionInputAudioNoiseReduction(BaseModel):
+ type: Optional[Literal["near_field", "far_field"]] = None
+ """Type of noise reduction.
+
+ `near_field` is for close-talking microphones such as headphones, `far_field` is
+ for far-field microphones such as laptop or conference room microphones.
+ """
class SessionInputAudioTranscription(BaseModel):
@@ -19,16 +35,17 @@ class SessionInputAudioTranscription(BaseModel):
model: Optional[str] = None
"""
- The model to use for transcription, `whisper-1` is the only currently supported
- model.
+ The model to use for transcription, current options are `gpt-4o-transcribe`,
+ `gpt-4o-mini-transcribe`, and `whisper-1`.
"""
prompt: Optional[str] = None
- """An optional text to guide the model's style or continue a previous audio
- segment.
-
- The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language.
+ """
+ An optional text to guide the model's style or continue a previous audio
+ segment. For `whisper-1`, the
+ [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+ For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+ "expect words related to technology".
"""
@@ -51,41 +68,51 @@ class SessionTool(BaseModel):
class SessionTurnDetection(BaseModel):
create_response: Optional[bool] = None
- """Whether or not to automatically generate a response when a VAD stop event
+ """
+ Whether or not to automatically generate a response when a VAD stop event
occurs.
+ """
- `true` by default.
+ eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
+ """Used only for `semantic_vad` mode.
+
+ The eagerness of the model to respond. `low` will wait longer for the user to
+ continue speaking, `high` will respond more quickly. `auto` is the default and
+ is equivalent to `medium`.
"""
interrupt_response: Optional[bool] = None
"""
Whether or not to automatically interrupt any ongoing response with output to
the default conversation (i.e. `conversation` of `auto`) when a VAD start event
- occurs. `true` by default.
+ occurs.
"""
prefix_padding_ms: Optional[int] = None
- """Amount of audio to include before the VAD detected speech (in milliseconds).
+ """Used only for `server_vad` mode.
+ Amount of audio to include before the VAD detected speech (in milliseconds).
Defaults to 300ms.
"""
silence_duration_ms: Optional[int] = None
- """Duration of silence to detect speech stop (in milliseconds).
+ """Used only for `server_vad` mode.
- Defaults to 500ms. With shorter values the model will respond more quickly, but
- may jump in on short pauses from the user.
+ Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ With shorter values the model will respond more quickly, but may jump in on
+ short pauses from the user.
"""
threshold: Optional[float] = None
- """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
+ """Used only for `server_vad` mode.
- A higher threshold will require louder audio to activate the model, and thus
- might perform better in noisy environments.
+ Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
+ threshold will require louder audio to activate the model, and thus might
+ perform better in noisy environments.
"""
- type: Optional[str] = None
- """Type of turn detection, only `server_vad` is currently supported."""
+ type: Optional[Literal["server_vad", "semantic_vad"]] = None
+ """Type of turn detection."""
class Session(BaseModel):
@@ -97,16 +124,25 @@ class Session(BaseModel):
byte order.
"""
+ input_audio_noise_reduction: Optional[SessionInputAudioNoiseReduction] = None
+ """Configuration for input audio noise reduction.
+
+ This can be set to `null` to turn off. Noise reduction filters audio added to
+ the input audio buffer before it is sent to VAD and the model. Filtering the
+ audio can improve VAD and turn detection accuracy (reducing false positives) and
+ model performance by improving perception of the input audio.
+ """
+
input_audio_transcription: Optional[SessionInputAudioTranscription] = None
"""
Configuration for input audio transcription, defaults to off and can be set to
`null` to turn off once on. Input audio transcription is not native to the
model, since the model consumes audio directly. Transcription runs
asynchronously through
- [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
- and should be treated as rough guidance rather than the representation
- understood by the model. The client can optionally set the language and prompt
- for transcription, these fields will be passed to the Whisper API.
+ [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ and should be treated as guidance of input audio content rather than precisely
+ what the model heard. The client can optionally set the language and prompt for
+ transcription, these offer additional guidance to the transcription service.
"""
instructions: Optional[str] = None
@@ -157,7 +193,11 @@ class Session(BaseModel):
"""
temperature: Optional[float] = None
- """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8."""
+ """Sampling temperature for the model, limited to [0.6, 1.2].
+
+ For audio models a temperature of 0.8 is highly recommended for best
+ performance.
+ """
tool_choice: Optional[str] = None
"""How the model chooses tools.
@@ -169,11 +209,17 @@ class Session(BaseModel):
"""Tools (functions) available to the model."""
turn_detection: Optional[SessionTurnDetection] = None
- """Configuration for turn detection.
-
- Can be set to `null` to turn off. Server VAD means that the model will detect
- the start and end of speech based on audio volume and respond at the end of user
- speech.
+    """Configuration for turn detection, either Server VAD or Semantic VAD.
+
+ This can be set to `null` to turn off, in which case the client must manually
+ trigger model response. Server VAD means that the model will detect the start
+ and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ with VAD) to semantically estimate whether the user has finished speaking, then
+ dynamically sets a timeout based on this probability. For example, if user audio
+ trails off with "uhhm", the model will score a low probability of turn end and
+ wait longer for the user to continue speaking. This can be useful for more
+ natural conversations, but may have a higher latency.
"""
voice: Optional[Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]] = None
diff --git a/src/openai/types/beta/realtime/session_update_event_param.py b/src/openai/types/beta/realtime/session_update_event_param.py
index 07fdba9d85..b8bce8fbd0 100644
--- a/src/openai/types/beta/realtime/session_update_event_param.py
+++ b/src/openai/types/beta/realtime/session_update_event_param.py
@@ -8,12 +8,22 @@
__all__ = [
"SessionUpdateEventParam",
"Session",
+ "SessionInputAudioNoiseReduction",
"SessionInputAudioTranscription",
"SessionTool",
"SessionTurnDetection",
]
+class SessionInputAudioNoiseReduction(TypedDict, total=False):
+ type: Literal["near_field", "far_field"]
+ """Type of noise reduction.
+
+ `near_field` is for close-talking microphones such as headphones, `far_field` is
+ for far-field microphones such as laptop or conference room microphones.
+ """
+
+
class SessionInputAudioTranscription(TypedDict, total=False):
language: str
"""The language of the input audio.
@@ -25,16 +35,17 @@ class SessionInputAudioTranscription(TypedDict, total=False):
model: str
"""
- The model to use for transcription, `whisper-1` is the only currently supported
- model.
+ The model to use for transcription, current options are `gpt-4o-transcribe`,
+ `gpt-4o-mini-transcribe`, and `whisper-1`.
"""
prompt: str
- """An optional text to guide the model's style or continue a previous audio
- segment.
-
- The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language.
+ """
+ An optional text to guide the model's style or continue a previous audio
+ segment. For `whisper-1`, the
+ [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+ For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+ "expect words related to technology".
"""
@@ -57,41 +68,51 @@ class SessionTool(TypedDict, total=False):
class SessionTurnDetection(TypedDict, total=False):
create_response: bool
- """Whether or not to automatically generate a response when a VAD stop event
+ """
+ Whether or not to automatically generate a response when a VAD stop event
occurs.
+ """
- `true` by default.
+ eagerness: Literal["low", "medium", "high", "auto"]
+ """Used only for `semantic_vad` mode.
+
+ The eagerness of the model to respond. `low` will wait longer for the user to
+ continue speaking, `high` will respond more quickly. `auto` is the default and
+ is equivalent to `medium`.
"""
interrupt_response: bool
"""
Whether or not to automatically interrupt any ongoing response with output to
the default conversation (i.e. `conversation` of `auto`) when a VAD start event
- occurs. `true` by default.
+ occurs.
"""
prefix_padding_ms: int
- """Amount of audio to include before the VAD detected speech (in milliseconds).
+ """Used only for `server_vad` mode.
+ Amount of audio to include before the VAD detected speech (in milliseconds).
Defaults to 300ms.
"""
silence_duration_ms: int
- """Duration of silence to detect speech stop (in milliseconds).
+ """Used only for `server_vad` mode.
- Defaults to 500ms. With shorter values the model will respond more quickly, but
- may jump in on short pauses from the user.
+ Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ With shorter values the model will respond more quickly, but may jump in on
+ short pauses from the user.
"""
threshold: float
- """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
+ """Used only for `server_vad` mode.
- A higher threshold will require louder audio to activate the model, and thus
- might perform better in noisy environments.
+ Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
+ threshold will require louder audio to activate the model, and thus might
+ perform better in noisy environments.
"""
- type: str
- """Type of turn detection, only `server_vad` is currently supported."""
+ type: Literal["server_vad", "semantic_vad"]
+ """Type of turn detection."""
class Session(TypedDict, total=False):
@@ -103,16 +124,25 @@ class Session(TypedDict, total=False):
byte order.
"""
+ input_audio_noise_reduction: SessionInputAudioNoiseReduction
+ """Configuration for input audio noise reduction.
+
+ This can be set to `null` to turn off. Noise reduction filters audio added to
+ the input audio buffer before it is sent to VAD and the model. Filtering the
+ audio can improve VAD and turn detection accuracy (reducing false positives) and
+ model performance by improving perception of the input audio.
+ """
+
input_audio_transcription: SessionInputAudioTranscription
"""
Configuration for input audio transcription, defaults to off and can be set to
`null` to turn off once on. Input audio transcription is not native to the
model, since the model consumes audio directly. Transcription runs
asynchronously through
- [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
- and should be treated as rough guidance rather than the representation
- understood by the model. The client can optionally set the language and prompt
- for transcription, these fields will be passed to the Whisper API.
+ [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ and should be treated as guidance of input audio content rather than precisely
+ what the model heard. The client can optionally set the language and prompt for
+ transcription, these offer additional guidance to the transcription service.
"""
instructions: str
@@ -161,7 +191,11 @@ class Session(TypedDict, total=False):
"""
temperature: float
- """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8."""
+ """Sampling temperature for the model, limited to [0.6, 1.2].
+
+ For audio models a temperature of 0.8 is highly recommended for best
+ performance.
+ """
tool_choice: str
"""How the model chooses tools.
@@ -173,11 +207,17 @@ class Session(TypedDict, total=False):
"""Tools (functions) available to the model."""
turn_detection: SessionTurnDetection
- """Configuration for turn detection.
-
- Can be set to `null` to turn off. Server VAD means that the model will detect
- the start and end of speech based on audio volume and respond at the end of user
- speech.
+    """Configuration for turn detection, either Server VAD or Semantic VAD.
+
+ This can be set to `null` to turn off, in which case the client must manually
+ trigger model response. Server VAD means that the model will detect the start
+ and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ with VAD) to semantically estimate whether the user has finished speaking, then
+ dynamically sets a timeout based on this probability. For example, if user audio
+ trails off with "uhhm", the model will score a low probability of turn end and
+ wait longer for the user to continue speaking. This can be useful for more
+ natural conversations, but may have a higher latency.
"""
voice: Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]
diff --git a/src/openai/types/beta/realtime/transcription_session.py b/src/openai/types/beta/realtime/transcription_session.py
new file mode 100644
index 0000000000..7c7abf37b6
--- /dev/null
+++ b/src/openai/types/beta/realtime/transcription_session.py
@@ -0,0 +1,100 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import List, Optional
+from typing_extensions import Literal
+
+from ...._models import BaseModel
+
+__all__ = ["TranscriptionSession", "ClientSecret", "InputAudioTranscription", "TurnDetection"]
+
+
+class ClientSecret(BaseModel):
+ expires_at: int
+ """Timestamp for when the token expires.
+
+ Currently, all tokens expire after one minute.
+ """
+
+ value: str
+ """
+ Ephemeral key usable in client environments to authenticate connections to the
+ Realtime API. Use this in client-side environments rather than a standard API
+ token, which should only be used server-side.
+ """
+
+
+class InputAudioTranscription(BaseModel):
+ language: Optional[str] = None
+ """The language of the input audio.
+
+ Supplying the input language in
+ [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ format will improve accuracy and latency.
+ """
+
+ model: Optional[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]] = None
+ """The model to use for transcription.
+
+ Can be `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, or `whisper-1`.
+ """
+
+ prompt: Optional[str] = None
+ """An optional text to guide the model's style or continue a previous audio
+ segment.
+
+ The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+ should match the audio language.
+ """
+
+
+class TurnDetection(BaseModel):
+ prefix_padding_ms: Optional[int] = None
+ """Amount of audio to include before the VAD detected speech (in milliseconds).
+
+ Defaults to 300ms.
+ """
+
+ silence_duration_ms: Optional[int] = None
+ """Duration of silence to detect speech stop (in milliseconds).
+
+ Defaults to 500ms. With shorter values the model will respond more quickly, but
+ may jump in on short pauses from the user.
+ """
+
+ threshold: Optional[float] = None
+ """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
+
+ A higher threshold will require louder audio to activate the model, and thus
+ might perform better in noisy environments.
+ """
+
+ type: Optional[str] = None
+ """Type of turn detection, only `server_vad` is currently supported."""
+
+
+class TranscriptionSession(BaseModel):
+ client_secret: ClientSecret
+ """Ephemeral key returned by the API.
+
+ Only present when the session is created on the server via REST API.
+ """
+
+ input_audio_format: Optional[str] = None
+ """The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`."""
+
+ input_audio_transcription: Optional[InputAudioTranscription] = None
+ """Configuration of the transcription model."""
+
+ modalities: Optional[List[Literal["text", "audio"]]] = None
+ """The set of modalities the model can respond with.
+
+ To disable audio, set this to ["text"].
+ """
+
+ turn_detection: Optional[TurnDetection] = None
+ """Configuration for turn detection.
+
+ Can be set to `null` to turn off. Server VAD means that the model will detect
+ the start and end of speech based on audio volume and respond at the end of user
+ speech.
+ """
diff --git a/src/openai/types/beta/realtime/transcription_session_create_params.py b/src/openai/types/beta/realtime/transcription_session_create_params.py
new file mode 100644
index 0000000000..4066dc4c5d
--- /dev/null
+++ b/src/openai/types/beta/realtime/transcription_session_create_params.py
@@ -0,0 +1,143 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import List
+from typing_extensions import Literal, TypedDict
+
+__all__ = ["TranscriptionSessionCreateParams", "InputAudioNoiseReduction", "InputAudioTranscription", "TurnDetection"]
+
+
+class TranscriptionSessionCreateParams(TypedDict, total=False):
+ include: List[str]
+    """The set of items to include in the transcription. Currently available items are:
+
+ - `item.input_audio_transcription.logprobs`
+ """
+
+ input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"]
+ """The format of input audio.
+
+ Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
+ be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
+ byte order.
+ """
+
+ input_audio_noise_reduction: InputAudioNoiseReduction
+ """Configuration for input audio noise reduction.
+
+ This can be set to `null` to turn off. Noise reduction filters audio added to
+ the input audio buffer before it is sent to VAD and the model. Filtering the
+ audio can improve VAD and turn detection accuracy (reducing false positives) and
+ model performance by improving perception of the input audio.
+ """
+
+ input_audio_transcription: InputAudioTranscription
+ """Configuration for input audio transcription.
+
+ The client can optionally set the language and prompt for transcription, these
+ offer additional guidance to the transcription service.
+ """
+
+ modalities: List[Literal["text", "audio"]]
+ """The set of modalities the model can respond with.
+
+ To disable audio, set this to ["text"].
+ """
+
+ turn_detection: TurnDetection
+    """Configuration for turn detection, either Server VAD or Semantic VAD.
+
+ This can be set to `null` to turn off, in which case the client must manually
+ trigger model response. Server VAD means that the model will detect the start
+ and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ with VAD) to semantically estimate whether the user has finished speaking, then
+ dynamically sets a timeout based on this probability. For example, if user audio
+ trails off with "uhhm", the model will score a low probability of turn end and
+ wait longer for the user to continue speaking. This can be useful for more
+ natural conversations, but may have a higher latency.
+ """
+
+
+class InputAudioNoiseReduction(TypedDict, total=False):
+ type: Literal["near_field", "far_field"]
+ """Type of noise reduction.
+
+ `near_field` is for close-talking microphones such as headphones, `far_field` is
+ for far-field microphones such as laptop or conference room microphones.
+ """
+
+
+class InputAudioTranscription(TypedDict, total=False):
+ language: str
+ """The language of the input audio.
+
+ Supplying the input language in
+ [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ format will improve accuracy and latency.
+ """
+
+ model: Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]
+ """
+ The model to use for transcription, current options are `gpt-4o-transcribe`,
+ `gpt-4o-mini-transcribe`, and `whisper-1`.
+ """
+
+ prompt: str
+ """
+ An optional text to guide the model's style or continue a previous audio
+ segment. For `whisper-1`, the
+ [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+ For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+ "expect words related to technology".
+ """
+
+
+class TurnDetection(TypedDict, total=False):
+ create_response: bool
+ """
+ Whether or not to automatically generate a response when a VAD stop event
+ occurs.
+ """
+
+ eagerness: Literal["low", "medium", "high", "auto"]
+ """Used only for `semantic_vad` mode.
+
+ The eagerness of the model to respond. `low` will wait longer for the user to
+ continue speaking, `high` will respond more quickly. `auto` is the default and
+ is equivalent to `medium`.
+ """
+
+ interrupt_response: bool
+ """
+ Whether or not to automatically interrupt any ongoing response with output to
+ the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ occurs.
+ """
+
+ prefix_padding_ms: int
+ """Used only for `server_vad` mode.
+
+ Amount of audio to include before the VAD detected speech (in milliseconds).
+ Defaults to 300ms.
+ """
+
+ silence_duration_ms: int
+ """Used only for `server_vad` mode.
+
+ Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ With shorter values the model will respond more quickly, but may jump in on
+ short pauses from the user.
+ """
+
+ threshold: float
+ """Used only for `server_vad` mode.
+
+ Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
+ threshold will require louder audio to activate the model, and thus might
+ perform better in noisy environments.
+ """
+
+ type: Literal["server_vad", "semantic_vad"]
+ """Type of turn detection."""
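Taken together, the new `TranscriptionSession` model and `TranscriptionSessionCreateParams` describe the `transcription_sessions.create` endpoint added in this release. A hedged sketch of minting a transcription session server-side and reading the ephemeral key back; all concrete values are placeholders.

```python
# Illustrative sketch only: create a Realtime transcription session and read
# the ephemeral client secret from the returned TranscriptionSession.
from openai import OpenAI

client = OpenAI()

transcription_session = client.beta.realtime.transcription_sessions.create(
    include=["item.input_audio_transcription.logprobs"],
    input_audio_format="pcm16",
    input_audio_noise_reduction={"type": "far_field"},
    input_audio_transcription={
        "model": "gpt-4o-mini-transcribe",
        "language": "en",
        "prompt": "expect words related to technology",
    },
    modalities=["text"],
    turn_detection={
        "type": "server_vad",
        "prefix_padding_ms": 300,
        "silence_duration_ms": 500,
        "threshold": 0.5,
    },
)

# The ephemeral key is intended for client-side use and expires quickly
# (currently after one minute, per the ClientSecret docstring above).
print(transcription_session.client_secret.value)
print(transcription_session.client_secret.expires_at)
```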
diff --git a/src/openai/types/beta/realtime/transcription_session_update.py b/src/openai/types/beta/realtime/transcription_session_update.py
new file mode 100644
index 0000000000..043ac02e07
--- /dev/null
+++ b/src/openai/types/beta/realtime/transcription_session_update.py
@@ -0,0 +1,160 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import List, Optional
+from typing_extensions import Literal
+
+from ...._models import BaseModel
+
+__all__ = [
+ "TranscriptionSessionUpdate",
+ "Session",
+ "SessionInputAudioNoiseReduction",
+ "SessionInputAudioTranscription",
+ "SessionTurnDetection",
+]
+
+
+class SessionInputAudioNoiseReduction(BaseModel):
+ type: Optional[Literal["near_field", "far_field"]] = None
+ """Type of noise reduction.
+
+ `near_field` is for close-talking microphones such as headphones, `far_field` is
+ for far-field microphones such as laptop or conference room microphones.
+ """
+
+
+class SessionInputAudioTranscription(BaseModel):
+ language: Optional[str] = None
+ """The language of the input audio.
+
+ Supplying the input language in
+ [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ format will improve accuracy and latency.
+ """
+
+ model: Optional[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]] = None
+ """
+ The model to use for transcription, current options are `gpt-4o-transcribe`,
+ `gpt-4o-mini-transcribe`, and `whisper-1`.
+ """
+
+ prompt: Optional[str] = None
+ """
+ An optional text to guide the model's style or continue a previous audio
+ segment. For `whisper-1`, the
+ [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+ For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+ "expect words related to technology".
+ """
+
+
+class SessionTurnDetection(BaseModel):
+ create_response: Optional[bool] = None
+ """
+ Whether or not to automatically generate a response when a VAD stop event
+ occurs.
+ """
+
+ eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
+ """Used only for `semantic_vad` mode.
+
+ The eagerness of the model to respond. `low` will wait longer for the user to
+ continue speaking, `high` will respond more quickly. `auto` is the default and
+ is equivalent to `medium`.
+ """
+
+ interrupt_response: Optional[bool] = None
+ """
+ Whether or not to automatically interrupt any ongoing response with output to
+ the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ occurs.
+ """
+
+ prefix_padding_ms: Optional[int] = None
+ """Used only for `server_vad` mode.
+
+ Amount of audio to include before the VAD detected speech (in milliseconds).
+ Defaults to 300ms.
+ """
+
+ silence_duration_ms: Optional[int] = None
+ """Used only for `server_vad` mode.
+
+ Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ With shorter values the model will respond more quickly, but may jump in on
+ short pauses from the user.
+ """
+
+ threshold: Optional[float] = None
+ """Used only for `server_vad` mode.
+
+ Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
+ threshold will require louder audio to activate the model, and thus might
+ perform better in noisy environments.
+ """
+
+ type: Optional[Literal["server_vad", "semantic_vad"]] = None
+ """Type of turn detection."""
+
+
+class Session(BaseModel):
+ include: Optional[List[str]] = None
+    """The set of items to include in the transcription. Currently available items are:
+
+ - `item.input_audio_transcription.logprobs`
+ """
+
+ input_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
+ """The format of input audio.
+
+ Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
+ be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
+ byte order.
+ """
+
+ input_audio_noise_reduction: Optional[SessionInputAudioNoiseReduction] = None
+ """Configuration for input audio noise reduction.
+
+ This can be set to `null` to turn off. Noise reduction filters audio added to
+ the input audio buffer before it is sent to VAD and the model. Filtering the
+ audio can improve VAD and turn detection accuracy (reducing false positives) and
+ model performance by improving perception of the input audio.
+ """
+
+ input_audio_transcription: Optional[SessionInputAudioTranscription] = None
+ """Configuration for input audio transcription.
+
+ The client can optionally set the language and prompt for transcription, these
+ offer additional guidance to the transcription service.
+ """
+
+ modalities: Optional[List[Literal["text", "audio"]]] = None
+ """The set of modalities the model can respond with.
+
+ To disable audio, set this to ["text"].
+ """
+
+ turn_detection: Optional[SessionTurnDetection] = None
+    """Configuration for turn detection, either Server VAD or Semantic VAD.
+
+ This can be set to `null` to turn off, in which case the client must manually
+ trigger model response. Server VAD means that the model will detect the start
+ and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ with VAD) to semantically estimate whether the user has finished speaking, then
+ dynamically sets a timeout based on this probability. For example, if user audio
+ trails off with "uhhm", the model will score a low probability of turn end and
+ wait longer for the user to continue speaking. This can be useful for more
+ natural conversations, but may have a higher latency.
+ """
+
+
+class TranscriptionSessionUpdate(BaseModel):
+ session: Session
+ """Realtime transcription session object configuration."""
+
+ type: Literal["transcription_session.update"]
+ """The event type, must be `transcription_session.update`."""
+
+ event_id: Optional[str] = None
+ """Optional client-generated ID used to identify this event."""
diff --git a/src/openai/types/beta/realtime/transcription_session_update_param.py b/src/openai/types/beta/realtime/transcription_session_update_param.py
new file mode 100644
index 0000000000..997a36d77b
--- /dev/null
+++ b/src/openai/types/beta/realtime/transcription_session_update_param.py
@@ -0,0 +1,160 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import List
+from typing_extensions import Literal, Required, TypedDict
+
+__all__ = [
+ "TranscriptionSessionUpdateParam",
+ "Session",
+ "SessionInputAudioNoiseReduction",
+ "SessionInputAudioTranscription",
+ "SessionTurnDetection",
+]
+
+
+class SessionInputAudioNoiseReduction(TypedDict, total=False):
+ type: Literal["near_field", "far_field"]
+ """Type of noise reduction.
+
+ `near_field` is for close-talking microphones such as headphones, `far_field` is
+ for far-field microphones such as laptop or conference room microphones.
+ """
+
+
+class SessionInputAudioTranscription(TypedDict, total=False):
+ language: str
+ """The language of the input audio.
+
+ Supplying the input language in
+ [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ format will improve accuracy and latency.
+ """
+
+ model: Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]
+ """
+ The model to use for transcription, current options are `gpt-4o-transcribe`,
+ `gpt-4o-mini-transcribe`, and `whisper-1`.
+ """
+
+ prompt: str
+ """
+ An optional text to guide the model's style or continue a previous audio
+ segment. For `whisper-1`, the
+ [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+ For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+ "expect words related to technology".
+ """
+
+
+class SessionTurnDetection(TypedDict, total=False):
+ create_response: bool
+ """
+ Whether or not to automatically generate a response when a VAD stop event
+ occurs.
+ """
+
+ eagerness: Literal["low", "medium", "high", "auto"]
+ """Used only for `semantic_vad` mode.
+
+ The eagerness of the model to respond. `low` will wait longer for the user to
+ continue speaking, `high` will respond more quickly. `auto` is the default and
+ is equivalent to `medium`.
+ """
+
+ interrupt_response: bool
+ """
+ Whether or not to automatically interrupt any ongoing response with output to
+ the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ occurs.
+ """
+
+ prefix_padding_ms: int
+ """Used only for `server_vad` mode.
+
+ Amount of audio to include before the VAD detected speech (in milliseconds).
+ Defaults to 300ms.
+ """
+
+ silence_duration_ms: int
+ """Used only for `server_vad` mode.
+
+ Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ With shorter values the model will respond more quickly, but may jump in on
+ short pauses from the user.
+ """
+
+ threshold: float
+ """Used only for `server_vad` mode.
+
+ Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
+ threshold will require louder audio to activate the model, and thus might
+ perform better in noisy environments.
+ """
+
+ type: Literal["server_vad", "semantic_vad"]
+ """Type of turn detection."""
+
+
+class Session(TypedDict, total=False):
+ include: List[str]
+    """The set of items to include in the transcription. Currently available items are:
+
+ - `item.input_audio_transcription.logprobs`
+ """
+
+ input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"]
+ """The format of input audio.
+
+ Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
+ be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
+ byte order.
+ """
+
+ input_audio_noise_reduction: SessionInputAudioNoiseReduction
+ """Configuration for input audio noise reduction.
+
+ This can be set to `null` to turn off. Noise reduction filters audio added to
+ the input audio buffer before it is sent to VAD and the model. Filtering the
+ audio can improve VAD and turn detection accuracy (reducing false positives) and
+ model performance by improving perception of the input audio.
+ """
+
+ input_audio_transcription: SessionInputAudioTranscription
+ """Configuration for input audio transcription.
+
+ The client can optionally set the language and prompt for transcription, these
+ offer additional guidance to the transcription service.
+ """
+
+ modalities: List[Literal["text", "audio"]]
+ """The set of modalities the model can respond with.
+
+ To disable audio, set this to ["text"].
+ """
+
+ turn_detection: SessionTurnDetection
+    """Configuration for turn detection, either Server VAD or Semantic VAD.
+
+ This can be set to `null` to turn off, in which case the client must manually
+ trigger model response. Server VAD means that the model will detect the start
+ and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ with VAD) to semantically estimate whether the user has finished speaking, then
+ dynamically sets a timeout based on this probability. For example, if user audio
+ trails off with "uhhm", the model will score a low probability of turn end and
+ wait longer for the user to continue speaking. This can be useful for more
+ natural conversations, but may have a higher latency.
+ """
+
+
+class TranscriptionSessionUpdateParam(TypedDict, total=False):
+ session: Required[Session]
+ """Realtime transcription session object configuration."""
+
+ type: Required[Literal["transcription_session.update"]]
+ """The event type, must be `transcription_session.update`."""
+
+ event_id: str
+ """Optional client-generated ID used to identify this event."""
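The `*_param` TypedDicts above are what client code builds when issuing a `transcription_session.update` event on an existing Realtime connection. A minimal sketch of constructing such a payload; how the event is actually transmitted (e.g. over the WebSocket) is outside this diff, and the concrete values are placeholders.

```python
# Illustrative sketch only: build a transcription_session.update payload using
# the TypedDicts added above. Transport of the event is not shown here.
from openai.types.beta.realtime.transcription_session_update_param import (
    TranscriptionSessionUpdateParam,
)

event: TranscriptionSessionUpdateParam = {
    "type": "transcription_session.update",
    "event_id": "event_123",  # optional client-generated ID
    "session": {
        "input_audio_format": "pcm16",
        "input_audio_transcription": {
            "model": "gpt-4o-transcribe",
            "language": "en",
        },
        # Switch to semantic VAD mid-session; `eagerness` is semantic_vad-only.
        "turn_detection": {"type": "semantic_vad", "eagerness": "high"},
    },
}
```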
diff --git a/src/openai/types/beta/realtime/transcription_session_updated_event.py b/src/openai/types/beta/realtime/transcription_session_updated_event.py
new file mode 100644
index 0000000000..ffc100bcc2
--- /dev/null
+++ b/src/openai/types/beta/realtime/transcription_session_updated_event.py
@@ -0,0 +1,24 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing_extensions import Literal
+
+from ...._models import BaseModel
+from .transcription_session import TranscriptionSession
+
+__all__ = ["TranscriptionSessionUpdatedEvent"]
+
+
+class TranscriptionSessionUpdatedEvent(BaseModel):
+ event_id: str
+ """The unique ID of the server event."""
+
+ session: TranscriptionSession
+ """A new Realtime transcription session configuration.
+
+ When a session is created on the server via REST API, the session object also
+ contains an ephemeral key. Default TTL for keys is one minute. This property is
+ not present when a session is updated via the WebSocket API.
+ """
+
+ type: Literal["transcription_session.updated"]
+ """The event type, must be `transcription_session.updated`."""
diff --git a/tests/api_resources/audio/test_speech.py b/tests/api_resources/audio/test_speech.py
index 781ebeceb9..808f6ef66c 100644
--- a/tests/api_resources/audio/test_speech.py
+++ b/tests/api_resources/audio/test_speech.py
@@ -41,6 +41,7 @@ def test_method_create_with_all_params(self, client: OpenAI, respx_mock: MockRou
input="string",
model="string",
voice="alloy",
+ instructions="instructions",
response_format="mp3",
speed=0.25,
)
@@ -104,6 +105,7 @@ async def test_method_create_with_all_params(self, async_client: AsyncOpenAI, re
input="string",
model="string",
voice="alloy",
+ instructions="instructions",
response_format="mp3",
speed=0.25,
)
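The only change to the speech tests is the new `instructions` parameter. A hedged usage sketch follows; the `gpt-4o-mini-tts` model name reflects this release's TTS additions rather than anything shown in this hunk, and the output path is a placeholder.

```python
# Illustrative sketch only: text-to-speech with the new `instructions`
# parameter. Model name and output path are assumptions, not from this diff.
from pathlib import Path

from openai import OpenAI

client = OpenAI()

speech_path = Path(__file__).parent / "speech.mp3"

with client.audio.speech.with_streaming_response.create(
    model="gpt-4o-mini-tts",
    voice="alloy",
    input="The quick brown fox jumped over the lazy dog.",
    instructions="Speak slowly, in a calm and friendly tone.",
    response_format="mp3",
) as response:
    response.stream_to_file(speech_path)
```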
diff --git a/tests/api_resources/audio/test_transcriptions.py b/tests/api_resources/audio/test_transcriptions.py
index bdb7e0dfb6..19215e11df 100644
--- a/tests/api_resources/audio/test_transcriptions.py
+++ b/tests/api_resources/audio/test_transcriptions.py
@@ -18,31 +18,33 @@ class TestTranscriptions:
parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"])
@parametrize
- def test_method_create(self, client: OpenAI) -> None:
+ def test_method_create_overload_1(self, client: OpenAI) -> None:
transcription = client.audio.transcriptions.create(
file=b"raw file contents",
- model="whisper-1",
+ model="gpt-4o-transcribe",
)
assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"])
@parametrize
- def test_method_create_with_all_params(self, client: OpenAI) -> None:
+ def test_method_create_with_all_params_overload_1(self, client: OpenAI) -> None:
transcription = client.audio.transcriptions.create(
file=b"raw file contents",
- model="whisper-1",
- language="string",
- prompt="string",
+ model="gpt-4o-transcribe",
+ include=["logprobs"],
+ language="language",
+ prompt="prompt",
response_format="json",
+ stream=False,
temperature=0,
timestamp_granularities=["word"],
)
assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"])
@parametrize
- def test_raw_response_create(self, client: OpenAI) -> None:
+ def test_raw_response_create_overload_1(self, client: OpenAI) -> None:
response = client.audio.transcriptions.with_raw_response.create(
file=b"raw file contents",
- model="whisper-1",
+ model="gpt-4o-transcribe",
)
assert response.is_closed is True
@@ -51,10 +53,10 @@ def test_raw_response_create(self, client: OpenAI) -> None:
assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"])
@parametrize
- def test_streaming_response_create(self, client: OpenAI) -> None:
+ def test_streaming_response_create_overload_1(self, client: OpenAI) -> None:
with client.audio.transcriptions.with_streaming_response.create(
file=b"raw file contents",
- model="whisper-1",
+ model="gpt-4o-transcribe",
) as response:
assert not response.is_closed
assert response.http_request.headers.get("X-Stainless-Lang") == "python"
@@ -64,36 +66,89 @@ def test_streaming_response_create(self, client: OpenAI) -> None:
assert cast(Any, response.is_closed) is True
+ @parametrize
+ def test_method_create_overload_2(self, client: OpenAI) -> None:
+ transcription_stream = client.audio.transcriptions.create(
+ file=b"raw file contents",
+ model="gpt-4o-transcribe",
+ stream=True,
+ )
+ transcription_stream.response.close()
+
+ @parametrize
+ def test_method_create_with_all_params_overload_2(self, client: OpenAI) -> None:
+ transcription_stream = client.audio.transcriptions.create(
+ file=b"raw file contents",
+ model="gpt-4o-transcribe",
+ stream=True,
+ include=["logprobs"],
+ language="language",
+ prompt="prompt",
+ response_format="json",
+ temperature=0,
+ timestamp_granularities=["word"],
+ )
+ transcription_stream.response.close()
+
+ @parametrize
+ def test_raw_response_create_overload_2(self, client: OpenAI) -> None:
+ response = client.audio.transcriptions.with_raw_response.create(
+ file=b"raw file contents",
+ model="gpt-4o-transcribe",
+ stream=True,
+ )
+
+ assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+ stream = response.parse()
+ stream.close()
+
+ @parametrize
+ def test_streaming_response_create_overload_2(self, client: OpenAI) -> None:
+ with client.audio.transcriptions.with_streaming_response.create(
+ file=b"raw file contents",
+ model="gpt-4o-transcribe",
+ stream=True,
+ ) as response:
+ assert not response.is_closed
+ assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+
+ stream = response.parse()
+ stream.close()
+
+ assert cast(Any, response.is_closed) is True
+
class TestAsyncTranscriptions:
parametrize = pytest.mark.parametrize("async_client", [False, True], indirect=True, ids=["loose", "strict"])
@parametrize
- async def test_method_create(self, async_client: AsyncOpenAI) -> None:
+ async def test_method_create_overload_1(self, async_client: AsyncOpenAI) -> None:
transcription = await async_client.audio.transcriptions.create(
file=b"raw file contents",
- model="whisper-1",
+ model="gpt-4o-transcribe",
)
assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"])
@parametrize
- async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> None:
+ async def test_method_create_with_all_params_overload_1(self, async_client: AsyncOpenAI) -> None:
transcription = await async_client.audio.transcriptions.create(
file=b"raw file contents",
- model="whisper-1",
- language="string",
- prompt="string",
+ model="gpt-4o-transcribe",
+ include=["logprobs"],
+ language="language",
+ prompt="prompt",
response_format="json",
+ stream=False,
temperature=0,
timestamp_granularities=["word"],
)
assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"])
@parametrize
- async def test_raw_response_create(self, async_client: AsyncOpenAI) -> None:
+ async def test_raw_response_create_overload_1(self, async_client: AsyncOpenAI) -> None:
response = await async_client.audio.transcriptions.with_raw_response.create(
file=b"raw file contents",
- model="whisper-1",
+ model="gpt-4o-transcribe",
)
assert response.is_closed is True
@@ -102,10 +157,10 @@ async def test_raw_response_create(self, async_client: AsyncOpenAI) -> None:
assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"])
@parametrize
- async def test_streaming_response_create(self, async_client: AsyncOpenAI) -> None:
+ async def test_streaming_response_create_overload_1(self, async_client: AsyncOpenAI) -> None:
async with async_client.audio.transcriptions.with_streaming_response.create(
file=b"raw file contents",
- model="whisper-1",
+ model="gpt-4o-transcribe",
) as response:
assert not response.is_closed
assert response.http_request.headers.get("X-Stainless-Lang") == "python"
@@ -114,3 +169,54 @@ async def test_streaming_response_create(self, async_client: AsyncOpenAI) -> Non
assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"])
assert cast(Any, response.is_closed) is True
+
+ @parametrize
+ async def test_method_create_overload_2(self, async_client: AsyncOpenAI) -> None:
+ transcription_stream = await async_client.audio.transcriptions.create(
+ file=b"raw file contents",
+ model="gpt-4o-transcribe",
+ stream=True,
+ )
+ await transcription_stream.response.aclose()
+
+ @parametrize
+ async def test_method_create_with_all_params_overload_2(self, async_client: AsyncOpenAI) -> None:
+ transcription_stream = await async_client.audio.transcriptions.create(
+ file=b"raw file contents",
+ model="gpt-4o-transcribe",
+ stream=True,
+ include=["logprobs"],
+ language="language",
+ prompt="prompt",
+ response_format="json",
+ temperature=0,
+ timestamp_granularities=["word"],
+ )
+ await transcription_stream.response.aclose()
+
+ @parametrize
+ async def test_raw_response_create_overload_2(self, async_client: AsyncOpenAI) -> None:
+ response = await async_client.audio.transcriptions.with_raw_response.create(
+ file=b"raw file contents",
+ model="gpt-4o-transcribe",
+ stream=True,
+ )
+
+ assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+ stream = response.parse()
+ await stream.close()
+
+ @parametrize
+ async def test_streaming_response_create_overload_2(self, async_client: AsyncOpenAI) -> None:
+ async with async_client.audio.transcriptions.with_streaming_response.create(
+ file=b"raw file contents",
+ model="gpt-4o-transcribe",
+ stream=True,
+ ) as response:
+ assert not response.is_closed
+ assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+
+ stream = await response.parse()
+ await stream.close()
+
+ assert cast(Any, response.is_closed) is True
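The `overload_2` tests above cover the new streaming transcription path (`stream=True`). A sketch of consuming that stream is shown below; the event field names (`type`, `delta`, `text`) are assumed from the transcription stream event types added in this release, not from this hunk, and the audio path is a placeholder.

```python
# Illustrative sketch only: streaming speech-to-text with stream=True.
# Event attribute names are assumptions based on the new transcription
# stream event types; adjust if the shapes differ.
from openai import OpenAI

client = OpenAI()

with open("speech.mp3", "rb") as audio_file:  # placeholder path
    stream = client.audio.transcriptions.create(
        file=audio_file,
        model="gpt-4o-transcribe",
        stream=True,
    )
    for event in stream:
        if event.type == "transcript.text.delta":
            print(event.delta, end="", flush=True)
        elif event.type == "transcript.text.done":
            print()  # final newline once the full transcript has arrived
```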
diff --git a/tests/api_resources/beta/realtime/test_sessions.py b/tests/api_resources/beta/realtime/test_sessions.py
index 5ea308ca0d..c0a426a417 100644
--- a/tests/api_resources/beta/realtime/test_sessions.py
+++ b/tests/api_resources/beta/realtime/test_sessions.py
@@ -26,6 +26,7 @@ def test_method_create(self, client: OpenAI) -> None:
def test_method_create_with_all_params(self, client: OpenAI) -> None:
session = client.beta.realtime.sessions.create(
input_audio_format="pcm16",
+ input_audio_noise_reduction={"type": "near_field"},
input_audio_transcription={
"language": "language",
"model": "model",
@@ -48,11 +49,12 @@ def test_method_create_with_all_params(self, client: OpenAI) -> None:
],
turn_detection={
"create_response": True,
+ "eagerness": "low",
"interrupt_response": True,
"prefix_padding_ms": 0,
"silence_duration_ms": 0,
"threshold": 0,
- "type": "type",
+ "type": "server_vad",
},
voice="alloy",
)
@@ -91,6 +93,7 @@ async def test_method_create(self, async_client: AsyncOpenAI) -> None:
async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> None:
session = await async_client.beta.realtime.sessions.create(
input_audio_format="pcm16",
+ input_audio_noise_reduction={"type": "near_field"},
input_audio_transcription={
"language": "language",
"model": "model",
@@ -113,11 +116,12 @@ async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) ->
],
turn_detection={
"create_response": True,
+ "eagerness": "low",
"interrupt_response": True,
"prefix_padding_ms": 0,
"silence_duration_ms": 0,
"threshold": 0,
- "type": "type",
+ "type": "server_vad",
},
voice="alloy",
)
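
Outside the test suite, the new session options added in this hunk are passed the same way when minting a Realtime session. A rough sketch under the same parameter names (the concrete values below are placeholders, not recommendations):

```python
# Rough sketch: creates a Realtime session using the noise-reduction and
# server_vad turn-detection fields covered by the updated tests. Values are
# placeholders chosen for illustration.
from openai import OpenAI

client = OpenAI()

session = client.beta.realtime.sessions.create(
    input_audio_format="pcm16",
    input_audio_noise_reduction={"type": "near_field"},
    turn_detection={
        "type": "server_vad",
        "prefix_padding_ms": 300,
        "silence_duration_ms": 500,
        "threshold": 0.5,
    },
    voice="alloy",
)
print(session)
```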
diff --git a/tests/api_resources/beta/realtime/test_transcription_sessions.py b/tests/api_resources/beta/realtime/test_transcription_sessions.py
new file mode 100644
index 0000000000..4826185bea
--- /dev/null
+++ b/tests/api_resources/beta/realtime/test_transcription_sessions.py
@@ -0,0 +1,120 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+import os
+from typing import Any, cast
+
+import pytest
+
+from openai import OpenAI, AsyncOpenAI
+from tests.utils import assert_matches_type
+from openai.types.beta.realtime import TranscriptionSession
+
+base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010")
+
+
+class TestTranscriptionSessions:
+ parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"])
+
+ @parametrize
+ def test_method_create(self, client: OpenAI) -> None:
+ transcription_session = client.beta.realtime.transcription_sessions.create()
+ assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+ @parametrize
+ def test_method_create_with_all_params(self, client: OpenAI) -> None:
+ transcription_session = client.beta.realtime.transcription_sessions.create(
+ include=["string"],
+ input_audio_format="pcm16",
+ input_audio_noise_reduction={"type": "near_field"},
+ input_audio_transcription={
+ "language": "language",
+ "model": "gpt-4o-transcribe",
+ "prompt": "prompt",
+ },
+ modalities=["text"],
+ turn_detection={
+ "create_response": True,
+ "eagerness": "low",
+ "interrupt_response": True,
+ "prefix_padding_ms": 0,
+ "silence_duration_ms": 0,
+ "threshold": 0,
+ "type": "server_vad",
+ },
+ )
+ assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+ @parametrize
+ def test_raw_response_create(self, client: OpenAI) -> None:
+ response = client.beta.realtime.transcription_sessions.with_raw_response.create()
+
+ assert response.is_closed is True
+ assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+ transcription_session = response.parse()
+ assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+ @parametrize
+ def test_streaming_response_create(self, client: OpenAI) -> None:
+ with client.beta.realtime.transcription_sessions.with_streaming_response.create() as response:
+ assert not response.is_closed
+ assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+
+ transcription_session = response.parse()
+ assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+ assert cast(Any, response.is_closed) is True
+
+
+class TestAsyncTranscriptionSessions:
+ parametrize = pytest.mark.parametrize("async_client", [False, True], indirect=True, ids=["loose", "strict"])
+
+ @parametrize
+ async def test_method_create(self, async_client: AsyncOpenAI) -> None:
+ transcription_session = await async_client.beta.realtime.transcription_sessions.create()
+ assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+ @parametrize
+ async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> None:
+ transcription_session = await async_client.beta.realtime.transcription_sessions.create(
+ include=["string"],
+ input_audio_format="pcm16",
+ input_audio_noise_reduction={"type": "near_field"},
+ input_audio_transcription={
+ "language": "language",
+ "model": "gpt-4o-transcribe",
+ "prompt": "prompt",
+ },
+ modalities=["text"],
+ turn_detection={
+ "create_response": True,
+ "eagerness": "low",
+ "interrupt_response": True,
+ "prefix_padding_ms": 0,
+ "silence_duration_ms": 0,
+ "threshold": 0,
+ "type": "server_vad",
+ },
+ )
+ assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+ @parametrize
+ async def test_raw_response_create(self, async_client: AsyncOpenAI) -> None:
+ response = await async_client.beta.realtime.transcription_sessions.with_raw_response.create()
+
+ assert response.is_closed is True
+ assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+ transcription_session = response.parse()
+ assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+ @parametrize
+ async def test_streaming_response_create(self, async_client: AsyncOpenAI) -> None:
+ async with async_client.beta.realtime.transcription_sessions.with_streaming_response.create() as response:
+ assert not response.is_closed
+ assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+
+ transcription_session = await response.parse()
+ assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+ assert cast(Any, response.is_closed) is True
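
The new `transcription_sessions.create` endpoint covered by this file is called the same way outside the tests. A short sketch mirroring the parameters exercised above (the values are illustrative only):

```python
# Short sketch of the new transcription-sessions endpoint; parameter values
# mirror the generated tests and are illustrative only.
from openai import OpenAI

client = OpenAI()

transcription_session = client.beta.realtime.transcription_sessions.create(
    input_audio_format="pcm16",
    input_audio_noise_reduction={"type": "near_field"},
    input_audio_transcription={
        "model": "gpt-4o-transcribe",
        "language": "en",
        "prompt": "Transcribe the incoming audio.",
    },
    turn_detection={"type": "server_vad"},
)
print(transcription_session)
```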
diff --git a/tests/lib/test_audio.py b/tests/lib/test_audio.py
index 0f53b316ba..ff8dba4714 100644
--- a/tests/lib/test_audio.py
+++ b/tests/lib/test_audio.py
@@ -26,7 +26,7 @@ def test_translation_create_overloads_in_sync(sync: bool, client: OpenAI, async_
assert_signatures_in_sync(
fn,
overload,
- exclude_params={"response_format"},
+ exclude_params={"response_format", "stream"},
description=f" for overload {i}",
)
@@ -60,7 +60,7 @@ def test_transcription_create_overloads_in_sync(sync: bool, client: OpenAI, asyn
assert_signatures_in_sync(
fn,
overload,
- exclude_params={"response_format"},
+ exclude_params={"response_format", "stream"},
description=f" for overload {i}",
)
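
The `exclude_params` change above reflects that the public `create()` implementation accepts a plain `stream` flag while each typed overload pins it to a `Literal`, so an exact signature comparison between them would always report a mismatch. A simplified sketch of that shape (not the SDK's actual stubs):

```python
# Simplified illustration of why "stream" is excluded from the overload
# signature comparison: the overloads pin stream to Literal values while the
# implementation accepts both. This is not the SDK's actual code.
from typing import Literal, Union, overload

from openai import Stream
from openai.types.audio import Transcription, TranscriptionStreamEvent


@overload
def create(*, model: str, stream: Literal[False] = False) -> Transcription: ...


@overload
def create(*, model: str, stream: Literal[True]) -> Stream[TranscriptionStreamEvent]: ...


def create(*, model: str, stream: bool = False) -> Union[Transcription, Stream[TranscriptionStreamEvent]]:
    """Dispatch to the streaming or non-streaming code path based on `stream`."""
    raise NotImplementedError
```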