diff --git a/.gitignore b/.gitignore index 8779740800..70815df7f6 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,7 @@ dist .envrc codegen.log Brewfile.lock.json + +.DS_Store + +examples/*.mp3 diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 4556676715..42bc7e250e 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "1.67.0" + ".": "1.68.0" } \ No newline at end of file diff --git a/.stats.yml b/.stats.yml index e0b06dc22a..abb9371314 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,2 +1,2 @@ -configured_endpoints: 81 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-b26121d5df6eb5d3032a45a267473798b15fcfec76dd44a3256cf1238be05fa4.yml +configured_endpoints: 82 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c22f59c66aec7914b6ee653d3098d1c1c8c16c180d2a158e819c8ddbf476f74b.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index ddd8b945c6..78ae21f27f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## 1.68.0 (2025-03-20) + +Full Changelog: [v1.67.0...v1.68.0](https://github.com/openai/openai-python/compare/v1.67.0...v1.68.0) + +### Features + +* add audio helpers ([423655c](https://github.com/openai/openai-python/commit/423655ca9077cfd258f1e52f6eb386fc8307fa5f)) +* **api:** new models for TTS, STT, + new audio features for Realtime ([#2232](https://github.com/openai/openai-python/issues/2232)) ([ab5192d](https://github.com/openai/openai-python/commit/ab5192d0a7b417ade622ec94dd48f86beb90692c)) + ## 1.67.0 (2025-03-19) Full Changelog: [v1.66.5...v1.67.0](https://github.com/openai/openai-python/compare/v1.66.5...v1.67.0) diff --git a/api.md b/api.md index 7f3a9392a2..a5f81c624c 100644 --- a/api.md +++ b/api.md @@ -151,7 +151,11 @@ Types: ```python from openai.types.audio import ( Transcription, + TranscriptionInclude, TranscriptionSegment, + TranscriptionStreamEvent, + TranscriptionTextDeltaEvent, + TranscriptionTextDoneEvent, TranscriptionVerbose, TranscriptionWord, TranscriptionCreateResponse, @@ -338,7 +342,9 @@ from openai.types.beta.realtime import ( ConversationItemDeleteEvent, ConversationItemDeletedEvent, ConversationItemInputAudioTranscriptionCompletedEvent, + ConversationItemInputAudioTranscriptionDeltaEvent, ConversationItemInputAudioTranscriptionFailedEvent, + ConversationItemRetrieveEvent, ConversationItemTruncateEvent, ConversationItemTruncatedEvent, ConversationItemWithReference, @@ -375,6 +381,8 @@ from openai.types.beta.realtime import ( SessionCreatedEvent, SessionUpdateEvent, SessionUpdatedEvent, + TranscriptionSessionUpdate, + TranscriptionSessionUpdatedEvent, ) ``` @@ -390,6 +398,18 @@ Methods: - client.beta.realtime.sessions.create(\*\*params) -> SessionCreateResponse +### TranscriptionSessions + +Types: + +```python +from openai.types.beta.realtime import TranscriptionSession +``` + +Methods: + +- client.beta.realtime.transcription_sessions.create(\*\*params) -> TranscriptionSession + ## Assistants Types: diff --git a/examples/audio.py b/examples/audio.py index 85f47bfb06..af41fe601b 100755 --- a/examples/audio.py +++ b/examples/audio.py @@ -1,6 +1,5 @@ #!/usr/bin/env rye run python -import time from pathlib import Path from openai import OpenAI @@ -12,8 +11,6 @@ def main() -> None: - stream_to_speakers() - # Create text-to-speech audio file with openai.audio.speech.with_streaming_response.create( model="tts-1", @@ -37,28 +34,5 @@ def main() -> None: print(translation.text) 
-def stream_to_speakers() -> None: - import pyaudio - - player_stream = pyaudio.PyAudio().open(format=pyaudio.paInt16, channels=1, rate=24000, output=True) - - start_time = time.time() - - with openai.audio.speech.with_streaming_response.create( - model="tts-1", - voice="alloy", - response_format="pcm", # similar to WAV, but without a header chunk at the start. - input="""I see skies of blue and clouds of white - The bright blessed days, the dark sacred nights - And I think to myself - What a wonderful world""", - ) as response: - print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms") - for chunk in response.iter_bytes(chunk_size=1024): - player_stream.write(chunk) - - print(f"Done in {int((time.time() - start_time) * 1000)}ms.") - - if __name__ == "__main__": main() diff --git a/examples/speech_to_text.py b/examples/speech_to_text.py new file mode 100755 index 0000000000..cc3f56b424 --- /dev/null +++ b/examples/speech_to_text.py @@ -0,0 +1,25 @@ +#!/usr/bin/env rye run python + +import asyncio + +from openai import AsyncOpenAI +from openai.helpers import Microphone + +# gets OPENAI_API_KEY from your environment variables +openai = AsyncOpenAI() + + +async def main() -> None: + print("Recording for the next 10 seconds...") + recording = await Microphone(timeout=10).record() + print("Recording complete") + transcription = await openai.audio.transcriptions.create( + model="whisper-1", + file=recording, + ) + + print(transcription.text) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/text_to_speech.py b/examples/text_to_speech.py new file mode 100755 index 0000000000..ac8b12b0ab --- /dev/null +++ b/examples/text_to_speech.py @@ -0,0 +1,31 @@ +#!/usr/bin/env rye run python + +import time +import asyncio + +from openai import AsyncOpenAI +from openai.helpers import LocalAudioPlayer + +# gets OPENAI_API_KEY from your environment variables +openai = AsyncOpenAI() + + +async def main() -> None: + start_time = time.time() + + async with openai.audio.speech.with_streaming_response.create( + model="tts-1", + voice="alloy", + response_format="pcm", # similar to WAV, but without a header chunk at the start. 
+ input="""I see skies of blue and clouds of white + The bright blessed days, the dark sacred nights + And I think to myself + What a wonderful world""", + ) as response: + print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms") + await LocalAudioPlayer().play(response) + print(f"Time to play: {int((time.time() - start_time) * 1000)}ms") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/pyproject.toml b/pyproject.toml index a0a7eba2f5..5ee7157038 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "openai" -version = "1.67.0" +version = "1.68.0" description = "The official Python library for the openai API" dynamic = ["readme"] license = "Apache-2.0" @@ -16,6 +16,8 @@ dependencies = [ "sniffio", "tqdm > 4", "jiter>=0.4.0, <1", + "sounddevice>=0.5.1", + "numpy>=2.0.2", ] requires-python = ">= 3.8" classifiers = [ diff --git a/requirements-dev.lock b/requirements-dev.lock index 48e49f926c..0755ddb3c5 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -33,6 +33,7 @@ certifi==2023.7.22 # via requests cffi==1.16.0 # via cryptography + # via sounddevice charset-normalizer==3.3.2 # via requests click==8.1.7 @@ -92,7 +93,7 @@ nest-asyncio==1.6.0 nodeenv==1.8.0 # via pyright nox==2023.4.22 -numpy==1.26.3 +numpy==2.0.2 # via openai # via pandas # via pandas-stubs @@ -102,7 +103,7 @@ packaging==23.2 # via black # via nox # via pytest -pandas==2.1.4 +pandas==2.2.3 # via openai pandas-stubs==2.1.4.231227 # via openai @@ -154,6 +155,8 @@ sniffio==1.3.0 # via trio sortedcontainers==2.4.0 # via trio +sounddevice==0.5.1 + # via openai time-machine==2.9.0 toml==0.10.2 # via inline-snapshot diff --git a/requirements.lock b/requirements.lock index b935c0ee59..fa88e26c0f 100644 --- a/requirements.lock +++ b/requirements.lock @@ -18,6 +18,8 @@ anyio==4.1.0 certifi==2023.7.22 # via httpcore # via httpx +cffi==1.17.1 + # via sounddevice distro==1.8.0 # via openai exceptiongroup==1.2.2 @@ -41,6 +43,8 @@ pandas==2.2.3 # via openai pandas-stubs==2.2.2.240807 # via openai +pycparser==2.22 + # via cffi pydantic==2.10.3 # via openai pydantic-core==2.27.1 @@ -54,6 +58,8 @@ six==1.16.0 sniffio==1.3.0 # via anyio # via openai +sounddevice==0.5.1 + # via openai tqdm==4.66.5 # via openai types-pytz==2024.2.0.20241003 diff --git a/src/openai/_version.py b/src/openai/_version.py index b63e6ad189..23e4e7ffb7 100644 --- a/src/openai/_version.py +++ b/src/openai/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
__title__ = "openai" -__version__ = "1.67.0" # x-release-please-version +__version__ = "1.68.0" # x-release-please-version diff --git a/src/openai/helpers.py b/src/openai/helpers.py new file mode 100644 index 0000000000..1a10168a96 --- /dev/null +++ b/src/openai/helpers.py @@ -0,0 +1,4 @@ +from .helpers.microphone import Microphone +from .helpers.local_audio_player import LocalAudioPlayer + +__all__ = ["LocalAudioPlayer", "Microphone"] diff --git a/src/openai/helpers/__init__.py b/src/openai/helpers/__init__.py new file mode 100644 index 0000000000..ab3044da59 --- /dev/null +++ b/src/openai/helpers/__init__.py @@ -0,0 +1,4 @@ +from .microphone import Microphone +from .local_audio_player import LocalAudioPlayer + +__all__ = ["Microphone", "LocalAudioPlayer"] diff --git a/src/openai/helpers/local_audio_player.py b/src/openai/helpers/local_audio_player.py new file mode 100644 index 0000000000..46a16ce6bb --- /dev/null +++ b/src/openai/helpers/local_audio_player.py @@ -0,0 +1,162 @@ +# mypy: ignore-errors +import queue +import asyncio +from typing import Any, Union, Callable, AsyncGenerator, cast + +import numpy as np +import sounddevice as sd # type: ignore +import numpy.typing as npt + +from .. import _legacy_response +from .._response import StreamedBinaryAPIResponse, AsyncStreamedBinaryAPIResponse + +SAMPLE_RATE = 24000 + + +class LocalAudioPlayer: + def __init__( + self, + should_stop: Union[Callable[[], bool], None] = None, + ): + self.channels = 1 + self.dtype = np.float32 + self.should_stop = should_stop + + async def _tts_response_to_buffer( + self, + response: Union[ + _legacy_response.HttpxBinaryResponseContent, + AsyncStreamedBinaryAPIResponse, + StreamedBinaryAPIResponse, + ], + ) -> npt.NDArray[np.float32]: + chunks: list[bytes] = [] + if isinstance(response, _legacy_response.HttpxBinaryResponseContent) or isinstance( + response, StreamedBinaryAPIResponse + ): + for chunk in response.iter_bytes(chunk_size=1024): + if chunk: + chunks.append(chunk) + else: + async for chunk in response.iter_bytes(chunk_size=1024): + if chunk: + chunks.append(chunk) + + audio_bytes = b"".join(chunks) + audio_np = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32767.0 + audio_np = audio_np.reshape(-1, 1) + return audio_np + + async def play( + self, + input: Union[ + npt.NDArray[np.int16], + npt.NDArray[np.float32], + _legacy_response.HttpxBinaryResponseContent, + AsyncStreamedBinaryAPIResponse, + StreamedBinaryAPIResponse, + ], + ) -> None: + audio_content: npt.NDArray[np.float32] + if isinstance(input, np.ndarray): + if input.dtype == np.int16 and self.dtype == np.float32: + audio_content = (input.astype(np.float32) / 32767.0).reshape(-1, self.channels) + elif input.dtype == np.float32: + audio_content = cast(npt.NDArray[np.float32], input) + else: + raise ValueError(f"Unsupported dtype: {input.dtype}") + else: + audio_content = await self._tts_response_to_buffer(input) + + loop = asyncio.get_event_loop() + event = asyncio.Event() + idx = 0 + + def callback( + outdata: npt.NDArray[np.float32], + frame_count: int, + _time_info: Any, + _status: Any, + ): + nonlocal idx + + remainder = len(audio_content) - idx + if remainder == 0 or (callable(self.should_stop) and self.should_stop()): + loop.call_soon_threadsafe(event.set) + raise sd.CallbackStop + valid_frames = frame_count if remainder >= frame_count else remainder + outdata[:valid_frames] = audio_content[idx : idx + valid_frames] + outdata[valid_frames:] = 0 + idx += valid_frames + + stream = sd.OutputStream( + 
samplerate=SAMPLE_RATE, + callback=callback, + dtype=audio_content.dtype, + channels=audio_content.shape[1], + ) + with stream: + await event.wait() + + async def play_stream( + self, + buffer_stream: AsyncGenerator[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None], None], + ) -> None: + loop = asyncio.get_event_loop() + event = asyncio.Event() + buffer_queue: queue.Queue[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None]] = queue.Queue(maxsize=50) + + async def buffer_producer(): + async for buffer in buffer_stream: + if buffer is None: + break + await loop.run_in_executor(None, buffer_queue.put, buffer) + await loop.run_in_executor(None, buffer_queue.put, None) # Signal completion + + def callback( + outdata: npt.NDArray[np.float32], + frame_count: int, + _time_info: Any, + _status: Any, + ): + nonlocal current_buffer, buffer_pos + + frames_written = 0 + while frames_written < frame_count: + if current_buffer is None or buffer_pos >= len(current_buffer): + try: + current_buffer = buffer_queue.get(timeout=0.1) + if current_buffer is None: + loop.call_soon_threadsafe(event.set) + raise sd.CallbackStop + buffer_pos = 0 + + if current_buffer.dtype == np.int16 and self.dtype == np.float32: + current_buffer = (current_buffer.astype(np.float32) / 32767.0).reshape(-1, self.channels) + + except queue.Empty: + outdata[frames_written:] = 0 + return + + remaining_frames = len(current_buffer) - buffer_pos + frames_to_write = min(frame_count - frames_written, remaining_frames) + outdata[frames_written : frames_written + frames_to_write] = current_buffer[ + buffer_pos : buffer_pos + frames_to_write + ] + buffer_pos += frames_to_write + frames_written += frames_to_write + + current_buffer = None + buffer_pos = 0 + + producer_task = asyncio.create_task(buffer_producer()) + + with sd.OutputStream( + samplerate=SAMPLE_RATE, + channels=self.channels, + dtype=self.dtype, + callback=callback, + ): + await event.wait() + + await producer_task diff --git a/src/openai/helpers/microphone.py b/src/openai/helpers/microphone.py new file mode 100644 index 0000000000..18650909aa --- /dev/null +++ b/src/openai/helpers/microphone.py @@ -0,0 +1,98 @@ +# mypy: ignore-errors +import io +import time +import wave +import asyncio +from typing import Any, Type, Union, Generic, TypeVar, Callable, overload +from typing_extensions import Literal + +import numpy as np +import sounddevice as sd # type: ignore +import numpy.typing as npt + +from openai._types import FileTypes, FileContent + +SAMPLE_RATE = 24000 + +DType = TypeVar("DType", bound=np.generic) + + +class Microphone(Generic[DType]): + def __init__( + self, + channels: int = 1, + dtype: Type[DType] = np.int16, + should_record: Union[Callable[[], bool], None] = None, + timeout: Union[float, None] = None, + ): + self.channels = channels + self.dtype = dtype + self.should_record = should_record + self.buffer_chunks = [] + self.timeout = timeout + self.has_record_function = callable(should_record) + + def _ndarray_to_wav(self, audio_data: npt.NDArray[DType]) -> FileTypes: + buffer: FileContent = io.BytesIO() + with wave.open(buffer, "w") as wav_file: + wav_file.setnchannels(self.channels) + wav_file.setsampwidth(np.dtype(self.dtype).itemsize) + wav_file.setframerate(SAMPLE_RATE) + wav_file.writeframes(audio_data.tobytes()) + buffer.seek(0) + return ("audio.wav", buffer, "audio/wav") + + @overload + async def record(self, return_ndarray: Literal[True]) -> npt.NDArray[DType]: ... 
+ + @overload + async def record(self, return_ndarray: Literal[False]) -> FileTypes: ... + + @overload + async def record(self, return_ndarray: None = ...) -> FileTypes: ... + + async def record(self, return_ndarray: Union[bool, None] = False) -> Union[npt.NDArray[DType], FileTypes]: + loop = asyncio.get_event_loop() + event = asyncio.Event() + self.buffer_chunks: list[npt.NDArray[DType]] = [] + start_time = time.perf_counter() + + def callback( + indata: npt.NDArray[DType], + _frame_count: int, + _time_info: Any, + _status: Any, + ): + execution_time = time.perf_counter() - start_time + reached_recording_timeout = execution_time > self.timeout if self.timeout is not None else False + if reached_recording_timeout: + loop.call_soon_threadsafe(event.set) + raise sd.CallbackStop + + should_be_recording = self.should_record() if callable(self.should_record) else True + if not should_be_recording: + loop.call_soon_threadsafe(event.set) + raise sd.CallbackStop + + self.buffer_chunks.append(indata.copy()) + + stream = sd.InputStream( + callback=callback, + dtype=self.dtype, + samplerate=SAMPLE_RATE, + channels=self.channels, + ) + with stream: + await event.wait() + + # Concatenate all chunks into a single buffer, handle empty case + concatenated_chunks: npt.NDArray[DType] = ( + np.concatenate(self.buffer_chunks, axis=0) + if len(self.buffer_chunks) > 0 + else np.array([], dtype=self.dtype) + ) + + if return_ndarray: + return concatenated_chunks + else: + return self._ndarray_to_wav(concatenated_chunks) diff --git a/src/openai/resources/audio/speech.py b/src/openai/resources/audio/speech.py index ad01118161..529e3a47ea 100644 --- a/src/openai/resources/audio/speech.py +++ b/src/openai/resources/audio/speech.py @@ -54,6 +54,7 @@ def create( input: str, model: Union[str, SpeechModel], voice: Literal["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"], + instructions: str | NotGiven = NOT_GIVEN, response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] | NotGiven = NOT_GIVEN, speed: float | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. @@ -71,13 +72,16 @@ def create( model: One of the available [TTS models](https://platform.openai.com/docs/models#tts): - `tts-1` or `tts-1-hd` + `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`. voice: The voice to use when generating the audio. Supported voices are `alloy`, `ash`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage` and `shimmer`. Previews of the voices are available in the [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options). + instructions: Control the voice of your generated audio with additional instructions. Does not + work with `tts-1` or `tts-1-hd`. + response_format: The format to audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`. @@ -100,6 +104,7 @@ def create( "input": input, "model": model, "voice": voice, + "instructions": instructions, "response_format": response_format, "speed": speed, }, @@ -138,6 +143,7 @@ async def create( input: str, model: Union[str, SpeechModel], voice: Literal["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"], + instructions: str | NotGiven = NOT_GIVEN, response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] | NotGiven = NOT_GIVEN, speed: float | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. 
@@ -155,13 +161,16 @@ async def create( model: One of the available [TTS models](https://platform.openai.com/docs/models#tts): - `tts-1` or `tts-1-hd` + `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`. voice: The voice to use when generating the audio. Supported voices are `alloy`, `ash`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage` and `shimmer`. Previews of the voices are available in the [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options). + instructions: Control the voice of your generated audio with additional instructions. Does not + work with `tts-1` or `tts-1-hd`. + response_format: The format to audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`. @@ -184,6 +193,7 @@ async def create( "input": input, "model": model, "voice": voice, + "instructions": instructions, "response_format": response_format, "speed": speed, }, diff --git a/src/openai/resources/audio/transcriptions.py b/src/openai/resources/audio/transcriptions.py index f338ad067d..2a77f91d69 100644 --- a/src/openai/resources/audio/transcriptions.py +++ b/src/openai/resources/audio/transcriptions.py @@ -3,7 +3,7 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, List, Union, Mapping, cast +from typing import TYPE_CHECKING, List, Union, Mapping, Optional, cast from typing_extensions import Literal, overload, assert_never import httpx @@ -13,6 +13,7 @@ from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven, FileTypes from ..._utils import ( extract_files, + required_args, maybe_transform, deepcopy_minimal, async_maybe_transform, @@ -20,12 +21,16 @@ from ..._compat import cached_property from ..._resource import SyncAPIResource, AsyncAPIResource from ..._response import to_streamed_response_wrapper, async_to_streamed_response_wrapper +from ..._streaming import Stream, AsyncStream from ...types.audio import transcription_create_params from ..._base_client import make_request_options from ...types.audio_model import AudioModel from ...types.audio.transcription import Transcription from ...types.audio_response_format import AudioResponseFormat +from ...types.audio.transcription_include import TranscriptionInclude from ...types.audio.transcription_verbose import TranscriptionVerbose +from ...types.audio.transcription_stream_event import TranscriptionStreamEvent +from ...types.audio.transcription_create_response import TranscriptionCreateResponse __all__ = ["Transcriptions", "AsyncTranscriptions"] @@ -58,6 +63,7 @@ def create( *, file: FileTypes, model: Union[str, AudioModel], + include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN, response_format: Union[Literal["json"], NotGiven] = NOT_GIVEN, language: str | NotGiven = NOT_GIVEN, prompt: str | NotGiven = NOT_GIVEN, @@ -77,6 +83,7 @@ def create( *, file: FileTypes, model: Union[str, AudioModel], + include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN, response_format: Literal["verbose_json"], language: str | NotGiven = NOT_GIVEN, prompt: str | NotGiven = NOT_GIVEN, @@ -97,6 +104,7 @@ def create( file: FileTypes, model: Union[str, AudioModel], response_format: Literal["text", "srt", "vtt"], + include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN, language: str | NotGiven = NOT_GIVEN, prompt: str | NotGiven = NOT_GIVEN, temperature: float | NotGiven = NOT_GIVEN, @@ -109,11 +117,96 @@ def create( timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, ) -> str: ... 
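A quick sketch of the new `instructions` parameter added to `audio.speech.create` above; the model and voice choices and the output path are illustrative assumptions, not values fixed by this diff:

```python
from openai import OpenAI

client = OpenAI()

# `instructions` steers delivery; per the docstring above it does not
# work with `tts-1` or `tts-1-hd`, so pair it with `gpt-4o-mini-tts`.
with client.audio.speech.with_streaming_response.create(
    model="gpt-4o-mini-tts",
    voice="alloy",
    input="What a wonderful world",
    instructions="Speak slowly, in a warm and reassuring tone.",
) as response:
    response.stream_to_file("speech.mp3")  # assumed output path
```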
+ @overload + def create( + self, + *, + file: FileTypes, + model: Union[str, AudioModel], + stream: Literal[True], + include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN, + language: str | NotGiven = NOT_GIVEN, + prompt: str | NotGiven = NOT_GIVEN, + response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + timestamp_granularities: List[Literal["word", "segment"]] | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> Stream[TranscriptionStreamEvent]: + """ + Transcribes audio into the input language. + + Args: + file: + The audio file object (not file name) to transcribe, in one of these formats: + flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + + model: ID of the model to use. The options are `gpt-4o-transcribe`, + `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source + Whisper V2 model). + + stream: If set to true, the model response data will be streamed to the client as it is + generated using + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format). + See the + [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions) + for more information. + + Note: Streaming is not supported for the `whisper-1` model and will be ignored. + + include: Additional information to include in the transcription response. `logprobs` will + return the log probabilities of the tokens in the response to understand the + model's confidence in the transcription. `logprobs` only works with + response_format set to `json` and only with the models `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`. + + language: The language of the input audio. Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + format will improve accuracy and latency. + + prompt: An optional text to guide the model's style or continue a previous audio + segment. The + [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) + should match the audio language. + + response_format: The format of the output, in one of these options: `json`, `text`, `srt`, + `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, + the only supported format is `json`. + + temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the + output more random, while lower values like 0.2 will make it more focused and + deterministic. If set to 0, the model will use + [log probability](https://en.wikipedia.org/wiki/Log_probability) to + automatically increase the temperature until certain thresholds are hit. + + timestamp_granularities: The timestamp granularities to populate for this transcription. + `response_format` must be set to `verbose_json` to use timestamp granularities. + Either or both of these options are supported: `word`, or `segment`. Note: There + is no additional latency for segment timestamps, but generating word timestamps + incurs additional latency.
+ + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload def create( self, *, file: FileTypes, model: Union[str, AudioModel], + stream: bool, + include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN, language: str | NotGiven = NOT_GIVEN, prompt: str | NotGiven = NOT_GIVEN, response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN, @@ -125,7 +218,7 @@ def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> Transcription | TranscriptionVerbose | str: + ) -> TranscriptionCreateResponse | Stream[TranscriptionStreamEvent]: """ Transcribes audio into the input language. @@ -134,8 +227,24 @@ def create( The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. - model: ID of the model to use. Only `whisper-1` (which is powered by our open source - Whisper V2 model) is currently available. + model: ID of the model to use. The options are `gpt-4o-transcribe`, + `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source + Whisper V2 model). + + stream: If set to true, the model response data will be streamed to the client as it is + generated using + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format). + See the + [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions) + for more information. + + Note: Streaming is not supported for the `whisper-1` model and will be ignored. + + include: Additional information to include in the transcription response. `logprobs` will + return the log probabilities of the tokens in the response to understand the + model's confidence in the transcription. `logprobs` only works with + response_format set to `json` and only with the models `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`. language: The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) @@ -147,7 +256,8 @@ def create( should match the audio language. response_format: The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. + `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, + the only supported format is `json`. temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and @@ -169,13 +279,37 @@ def create( timeout: Override the client-level default timeout for this request, in seconds """ + ... 
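A sketch of how the new streaming overload is meant to be consumed, assuming a local `speech.mp3` and that the delta/done events carry `delta` and `text` fields as their names suggest:

```python
from openai import OpenAI
from openai.types.audio import TranscriptionTextDeltaEvent, TranscriptionTextDoneEvent

client = OpenAI()

# Streaming requires `gpt-4o-transcribe` or `gpt-4o-mini-transcribe`;
# per the docstring above, `whisper-1` ignores `stream=True`.
with open("speech.mp3", "rb") as audio_file:
    stream = client.audio.transcriptions.create(
        model="gpt-4o-transcribe",
        file=audio_file,
        stream=True,
    )
    for event in stream:
        if isinstance(event, TranscriptionTextDeltaEvent):
            print(event.delta, end="", flush=True)  # incremental text
        elif isinstance(event, TranscriptionTextDoneEvent):
            print()  # event.text holds the completed transcript
```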
+ + @required_args(["file", "model"], ["file", "model", "stream"]) + def create( + self, + *, + file: FileTypes, + model: Union[str, AudioModel], + include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN, + language: str | NotGiven = NOT_GIVEN, + prompt: str | NotGiven = NOT_GIVEN, + response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + timestamp_granularities: List[Literal["word", "segment"]] | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> str | Transcription | TranscriptionVerbose | Stream[TranscriptionStreamEvent]: body = deepcopy_minimal( { "file": file, "model": model, + "include": include, "language": language, "prompt": prompt, "response_format": response_format, + "stream": stream, "temperature": temperature, "timestamp_granularities": timestamp_granularities, } @@ -193,6 +327,8 @@ def create( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), cast_to=_get_response_format_type(response_format), + stream=stream or False, + stream_cls=Stream[TranscriptionStreamEvent], ) @@ -226,6 +362,7 @@ async def create( language: str | NotGiven = NOT_GIVEN, prompt: str | NotGiven = NOT_GIVEN, temperature: float | NotGiven = NOT_GIVEN, + include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN, timestamp_granularities: List[Literal["word", "segment"]] | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. @@ -241,6 +378,7 @@ async def create( *, file: FileTypes, model: Union[str, AudioModel], + include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN, response_format: Literal["verbose_json"], language: str | NotGiven = NOT_GIVEN, prompt: str | NotGiven = NOT_GIVEN, @@ -260,6 +398,7 @@ async def create( *, file: FileTypes, model: Union[str, AudioModel], + include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN, response_format: Literal["text", "srt", "vtt"], language: str | NotGiven = NOT_GIVEN, prompt: str | NotGiven = NOT_GIVEN, @@ -273,11 +412,96 @@ async def create( timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, ) -> str: ... + @overload + async def create( + self, + *, + file: FileTypes, + model: Union[str, AudioModel], + stream: Literal[True], + include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN, + language: str | NotGiven = NOT_GIVEN, + prompt: str | NotGiven = NOT_GIVEN, + response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + timestamp_granularities: List[Literal["word", "segment"]] | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> AsyncStream[TranscriptionStreamEvent]: + """ + Transcribes audio into the input language. + + Args: + file: + The audio file object (not file name) to transcribe, in one of these formats: + flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + + model: ID of the model to use. The options are `gpt-4o-transcribe`, + `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source + Whisper V2 model). + + stream: If set to true, the model response data will be streamed to the client as it is + generated using + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format). + See the + [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions) + for more information. + + Note: Streaming is not supported for the `whisper-1` model and will be ignored. + + include: Additional information to include in the transcription response. `logprobs` will + return the log probabilities of the tokens in the response to understand the + model's confidence in the transcription. `logprobs` only works with + response_format set to `json` and only with the models `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`. + + language: The language of the input audio. Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + format will improve accuracy and latency. + + prompt: An optional text to guide the model's style or continue a previous audio + segment. The + [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) + should match the audio language. + + response_format: The format of the output, in one of these options: `json`, `text`, `srt`, + `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, + the only supported format is `json`. + + temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the + output more random, while lower values like 0.2 will make it more focused and + deterministic. If set to 0, the model will use + [log probability](https://en.wikipedia.org/wiki/Log_probability) to + automatically increase the temperature until certain thresholds are hit. + + timestamp_granularities: The timestamp granularities to populate for this transcription. + `response_format` must be set to `verbose_json` to use timestamp granularities. + Either or both of these options are supported: `word`, or `segment`. Note: There + is no additional latency for segment timestamps, but generating word timestamps + incurs additional latency. + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ...
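The async surface mirrors this. A sketch combining `stream=True` with the new `include` parameter (the `logprobs` literal comes from the docstring above; the event handling is schematic):

```python
import asyncio

from openai import AsyncOpenAI

client = AsyncOpenAI()


async def main() -> None:
    with open("speech.mp3", "rb") as audio_file:  # assumed local file
        stream = await client.audio.transcriptions.create(
            model="gpt-4o-mini-transcribe",
            file=audio_file,
            stream=True,
            include=["logprobs"],  # json-only, gpt-4o transcribe models only
        )
        async for event in stream:
            print(event)


asyncio.run(main())
```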
+ + @overload async def create( self, *, file: FileTypes, model: Union[str, AudioModel], + stream: bool, + include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN, language: str | NotGiven = NOT_GIVEN, prompt: str | NotGiven = NOT_GIVEN, response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN, @@ -289,7 +513,7 @@ async def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> Transcription | TranscriptionVerbose | str: + ) -> TranscriptionCreateResponse | AsyncStream[TranscriptionStreamEvent]: """ Transcribes audio into the input language. @@ -298,8 +522,24 @@ async def create( The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. - model: ID of the model to use. Only `whisper-1` (which is powered by our open source - Whisper V2 model) is currently available. + model: ID of the model to use. The options are `gpt-4o-transcribe`, + `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source + Whisper V2 model). + + stream: If set to true, the model response data will be streamed to the client as it is + generated using + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format). + See the + [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions) + for more information. + + Note: Streaming is not supported for the `whisper-1` model and will be ignored. + + include: Additional information to include in the transcription response. `logprobs` will + return the log probabilities of the tokens in the response to understand the + model's confidence in the transcription. `logprobs` only works with + response_format set to `json` and only with the models `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`. language: The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) @@ -311,7 +551,8 @@ async def create( should match the audio language. response_format: The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. + `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, + the only supported format is `json`. temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and @@ -333,13 +574,37 @@ async def create( timeout: Override the client-level default timeout for this request, in seconds """ + ... + + @required_args(["file", "model"], ["file", "model", "stream"]) + async def create( + self, + *, + file: FileTypes, + model: Union[str, AudioModel], + include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN, + language: str | NotGiven = NOT_GIVEN, + prompt: str | NotGiven = NOT_GIVEN, + response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + timestamp_granularities: List[Literal["word", "segment"]] | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> Transcription | TranscriptionVerbose | str | AsyncStream[TranscriptionStreamEvent]: body = deepcopy_minimal( { "file": file, "model": model, + "include": include, "language": language, "prompt": prompt, "response_format": response_format, + "stream": stream, "temperature": temperature, "timestamp_granularities": timestamp_granularities, } @@ -357,6 +622,8 @@ async def create( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), cast_to=_get_response_format_type(response_format), + stream=stream or False, + stream_cls=AsyncStream[TranscriptionStreamEvent], ) diff --git a/src/openai/resources/audio/translations.py b/src/openai/resources/audio/translations.py index cd3132dc57..f55dbd0ee5 100644 --- a/src/openai/resources/audio/translations.py +++ b/src/openai/resources/audio/translations.py @@ -9,7 +9,6 @@ import httpx from ... import _legacy_response -from ...types import AudioResponseFormat from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven, FileTypes from ..._utils import ( extract_files, @@ -109,7 +108,7 @@ def create( file: FileTypes, model: Union[str, AudioModel], prompt: str | NotGiven = NOT_GIVEN, - response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN, + response_format: Union[Literal["json", "text", "srt", "verbose_json", "vtt"], NotGiven] = NOT_GIVEN, temperature: float | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. 
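Beyond the bundled examples, the two new helpers compose directly: `Microphone.record(return_ndarray=True)` returns an `int16` ndarray, and `LocalAudioPlayer.play` accepts one, rescaling it to `float32` internally (see the helper code above). A minimal record-and-play-back sketch:

```python
import asyncio

from openai.helpers import Microphone, LocalAudioPlayer


async def main() -> None:
    # Record up to 5 seconds from the default input device...
    recording = await Microphone(timeout=5).record(return_ndarray=True)
    # ...then play it straight back through the default output device.
    await LocalAudioPlayer().play(recording)


asyncio.run(main())
```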
diff --git a/src/openai/resources/beta/realtime/__init__.py b/src/openai/resources/beta/realtime/__init__.py index 474434e6e1..7ab3d9931c 100644 --- a/src/openai/resources/beta/realtime/__init__.py +++ b/src/openai/resources/beta/realtime/__init__.py @@ -16,6 +16,14 @@ SessionsWithStreamingResponse, AsyncSessionsWithStreamingResponse, ) +from .transcription_sessions import ( + TranscriptionSessions, + AsyncTranscriptionSessions, + TranscriptionSessionsWithRawResponse, + AsyncTranscriptionSessionsWithRawResponse, + TranscriptionSessionsWithStreamingResponse, + AsyncTranscriptionSessionsWithStreamingResponse, +) __all__ = [ "Sessions", @@ -24,6 +32,12 @@ "AsyncSessionsWithRawResponse", "SessionsWithStreamingResponse", "AsyncSessionsWithStreamingResponse", + "TranscriptionSessions", + "AsyncTranscriptionSessions", + "TranscriptionSessionsWithRawResponse", + "AsyncTranscriptionSessionsWithRawResponse", + "TranscriptionSessionsWithStreamingResponse", + "AsyncTranscriptionSessionsWithStreamingResponse", "Realtime", "AsyncRealtime", "RealtimeWithRawResponse", diff --git a/src/openai/resources/beta/realtime/realtime.py b/src/openai/resources/beta/realtime/realtime.py index cd610d9089..76e57f8cb7 100644 --- a/src/openai/resources/beta/realtime/realtime.py +++ b/src/openai/resources/beta/realtime/realtime.py @@ -32,7 +32,19 @@ from ...._resource import SyncAPIResource, AsyncAPIResource from ...._exceptions import OpenAIError from ...._base_client import _merge_mappings -from ....types.beta.realtime import session_update_event_param, response_create_event_param +from ....types.beta.realtime import ( + session_update_event_param, + response_create_event_param, + transcription_session_update_param, +) +from .transcription_sessions import ( + TranscriptionSessions, + AsyncTranscriptionSessions, + TranscriptionSessionsWithRawResponse, + AsyncTranscriptionSessionsWithRawResponse, + TranscriptionSessionsWithStreamingResponse, + AsyncTranscriptionSessionsWithStreamingResponse, +) from ....types.websocket_connection_options import WebsocketConnectionOptions from ....types.beta.realtime.realtime_client_event import RealtimeClientEvent from ....types.beta.realtime.realtime_server_event import RealtimeServerEvent @@ -55,6 +67,10 @@ class Realtime(SyncAPIResource): def sessions(self) -> Sessions: return Sessions(self._client) + @cached_property + def transcription_sessions(self) -> TranscriptionSessions: + return TranscriptionSessions(self._client) + @cached_property def with_raw_response(self) -> RealtimeWithRawResponse: """ @@ -107,6 +123,10 @@ class AsyncRealtime(AsyncAPIResource): def sessions(self) -> AsyncSessions: return AsyncSessions(self._client) + @cached_property + def transcription_sessions(self) -> AsyncTranscriptionSessions: + return AsyncTranscriptionSessions(self._client) + @cached_property def with_raw_response(self) -> AsyncRealtimeWithRawResponse: """ @@ -162,6 +182,10 @@ def __init__(self, realtime: Realtime) -> None: def sessions(self) -> SessionsWithRawResponse: return SessionsWithRawResponse(self._realtime.sessions) + @cached_property + def transcription_sessions(self) -> TranscriptionSessionsWithRawResponse: + return TranscriptionSessionsWithRawResponse(self._realtime.transcription_sessions) + class AsyncRealtimeWithRawResponse: def __init__(self, realtime: AsyncRealtime) -> None: @@ -171,6 +195,10 @@ def __init__(self, realtime: AsyncRealtime) -> None: def sessions(self) -> AsyncSessionsWithRawResponse: return AsyncSessionsWithRawResponse(self._realtime.sessions) + @cached_property + def 
transcription_sessions(self) -> AsyncTranscriptionSessionsWithRawResponse: + return AsyncTranscriptionSessionsWithRawResponse(self._realtime.transcription_sessions) + class RealtimeWithStreamingResponse: def __init__(self, realtime: Realtime) -> None: @@ -180,6 +208,10 @@ def __init__(self, realtime: Realtime) -> None: def sessions(self) -> SessionsWithStreamingResponse: return SessionsWithStreamingResponse(self._realtime.sessions) + @cached_property + def transcription_sessions(self) -> TranscriptionSessionsWithStreamingResponse: + return TranscriptionSessionsWithStreamingResponse(self._realtime.transcription_sessions) + class AsyncRealtimeWithStreamingResponse: def __init__(self, realtime: AsyncRealtime) -> None: @@ -189,14 +221,19 @@ def __init__(self, realtime: AsyncRealtime) -> None: def sessions(self) -> AsyncSessionsWithStreamingResponse: return AsyncSessionsWithStreamingResponse(self._realtime.sessions) + @cached_property + def transcription_sessions(self) -> AsyncTranscriptionSessionsWithStreamingResponse: + return AsyncTranscriptionSessionsWithStreamingResponse(self._realtime.transcription_sessions) + class AsyncRealtimeConnection: """Represents a live websocket connection to the Realtime API""" session: AsyncRealtimeSessionResource response: AsyncRealtimeResponseResource - conversation: AsyncRealtimeConversationResource input_audio_buffer: AsyncRealtimeInputAudioBufferResource + conversation: AsyncRealtimeConversationResource + transcription_session: AsyncRealtimeTranscriptionSessionResource _connection: AsyncWebsocketConnection @@ -205,8 +242,9 @@ def __init__(self, connection: AsyncWebsocketConnection) -> None: self.session = AsyncRealtimeSessionResource(self) self.response = AsyncRealtimeResponseResource(self) - self.conversation = AsyncRealtimeConversationResource(self) self.input_audio_buffer = AsyncRealtimeInputAudioBufferResource(self) + self.conversation = AsyncRealtimeConversationResource(self) + self.transcription_session = AsyncRealtimeTranscriptionSessionResource(self) async def __aiter__(self) -> AsyncIterator[RealtimeServerEvent]: """ @@ -377,8 +415,9 @@ class RealtimeConnection: session: RealtimeSessionResource response: RealtimeResponseResource - conversation: RealtimeConversationResource input_audio_buffer: RealtimeInputAudioBufferResource + conversation: RealtimeConversationResource + transcription_session: RealtimeTranscriptionSessionResource _connection: WebsocketConnection @@ -387,8 +426,9 @@ def __init__(self, connection: WebsocketConnection) -> None: self.session = RealtimeSessionResource(self) self.response = RealtimeResponseResource(self) - self.conversation = RealtimeConversationResource(self) self.input_audio_buffer = RealtimeInputAudioBufferResource(self) + self.conversation = RealtimeConversationResource(self) + self.transcription_session = RealtimeTranscriptionSessionResource(self) def __iter__(self) -> Iterator[RealtimeServerEvent]: """ @@ -582,20 +622,6 @@ def update(self, *, session: session_update_event_param.Session, event_id: str | class RealtimeResponseResource(BaseRealtimeConnectionResource): - def cancel(self, *, event_id: str | NotGiven = NOT_GIVEN, response_id: str | NotGiven = NOT_GIVEN) -> None: - """Send this event to cancel an in-progress response. - - The server will respond - with a `response.cancelled` event or an error if there is no response to - cancel. 
- """ - self._connection.send( - cast( - RealtimeClientEventParam, - strip_not_given({"type": "response.cancel", "event_id": event_id, "response_id": response_id}), - ) - ) - def create( self, *, @@ -626,6 +652,70 @@ def create( ) ) + def cancel(self, *, event_id: str | NotGiven = NOT_GIVEN, response_id: str | NotGiven = NOT_GIVEN) -> None: + """Send this event to cancel an in-progress response. + + The server will respond + with a `response.cancelled` event or an error if there is no response to + cancel. + """ + self._connection.send( + cast( + RealtimeClientEventParam, + strip_not_given({"type": "response.cancel", "event_id": event_id, "response_id": response_id}), + ) + ) + + +class RealtimeInputAudioBufferResource(BaseRealtimeConnectionResource): + def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None: + """Send this event to clear the audio bytes in the buffer. + + The server will + respond with an `input_audio_buffer.cleared` event. + """ + self._connection.send( + cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.clear", "event_id": event_id})) + ) + + def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None: + """ + Send this event to commit the user input audio buffer, which will create a + new user message item in the conversation. This event will produce an error + if the input audio buffer is empty. When in Server VAD mode, the client does + not need to send this event, the server will commit the audio buffer + automatically. + + Committing the input audio buffer will trigger input audio transcription + (if enabled in session configuration), but it will not create a response + from the model. The server will respond with an `input_audio_buffer.committed` + event. + """ + self._connection.send( + cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id})) + ) + + def append(self, *, audio: str, event_id: str | NotGiven = NOT_GIVEN) -> None: + """Send this event to append audio bytes to the input audio buffer. + + The audio + buffer is temporary storage you can write to and later commit. In Server VAD + mode, the audio buffer is used to detect speech and the server will decide + when to commit. When Server VAD is disabled, you must commit the audio buffer + manually. + + The client may choose how much audio to place in each event up to a maximum + of 15 MiB, for example streaming smaller chunks from the client may allow the + VAD to be more responsive. Unlike made other client events, the server will + not send a confirmation response to this event. + """ + self._connection.send( + cast( + RealtimeClientEventParam, + strip_not_given({"type": "input_audio_buffer.append", "audio": audio, "event_id": event_id}), + ) + ) + class RealtimeConversationResource(BaseRealtimeConnectionResource): @cached_property @@ -711,53 +801,30 @@ def truncate( ) ) - -class RealtimeInputAudioBufferResource(BaseRealtimeConnectionResource): - def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None: - """Send this event to clear the audio bytes in the buffer. - - The server will - respond with an `input_audio_buffer.cleared` event. 
+ def retrieve(self, *, item_id: str, event_id: str | NotGiven = NOT_GIVEN) -> None: """ - self._connection.send( - cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.clear", "event_id": event_id})) - ) - - def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None: - """ - Send this event to commit the user input audio buffer, which will create a - new user message item in the conversation. This event will produce an error - if the input audio buffer is empty. When in Server VAD mode, the client does - not need to send this event, the server will commit the audio buffer - automatically. - - Committing the input audio buffer will trigger input audio transcription - (if enabled in session configuration), but it will not create a response - from the model. The server will respond with an `input_audio_buffer.committed` - event. + Send this event when you want to retrieve the server's representation of a specific item in the conversation history. This is useful, for example, to inspect user audio after noise cancellation and VAD. + The server will respond with a `conversation.item.retrieved` event, + unless the item does not exist in the conversation history, in which case the + server will respond with an error. """ self._connection.send( - cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id})) + cast( + RealtimeClientEventParam, + strip_not_given({"type": "conversation.item.retrieve", "item_id": item_id, "event_id": event_id}), + ) ) - def append(self, *, audio: str, event_id: str | NotGiven = NOT_GIVEN) -> None: - """Send this event to append audio bytes to the input audio buffer. - The audio - buffer is temporary storage you can write to and later commit. In Server VAD - mode, the audio buffer is used to detect speech and the server will decide - when to commit. When Server VAD is disabled, you must commit the audio buffer - manually. - - The client may choose how much audio to place in each event up to a maximum - of 15 MiB, for example streaming smaller chunks from the client may allow the - VAD to be more responsive. Unlike made other client events, the server will - not send a confirmation response to this event. - """ +class RealtimeTranscriptionSessionResource(BaseRealtimeConnectionResource): + def update( + self, *, session: transcription_session_update_param.Session, event_id: str | NotGiven = NOT_GIVEN + ) -> None: + """Send this event to update a transcription session.""" self._connection.send( cast( RealtimeClientEventParam, - strip_not_given({"type": "input_audio_buffer.append", "audio": audio, "event_id": event_id}), + strip_not_given({"type": "transcription_session.update", "session": session, "event_id": event_id}), ) ) @@ -792,20 +859,6 @@ async def update( class AsyncRealtimeResponseResource(BaseAsyncRealtimeConnectionResource): - async def cancel(self, *, event_id: str | NotGiven = NOT_GIVEN, response_id: str | NotGiven = NOT_GIVEN) -> None: - """Send this event to cancel an in-progress response. - - The server will respond - with a `response.cancelled` event or an error if there is no response to - cancel. 
- """ - await self._connection.send( - cast( - RealtimeClientEventParam, - strip_not_given({"type": "response.cancel", "event_id": event_id, "response_id": response_id}), - ) - ) - async def create( self, *, @@ -836,6 +889,70 @@ async def create( ) ) + async def cancel(self, *, event_id: str | NotGiven = NOT_GIVEN, response_id: str | NotGiven = NOT_GIVEN) -> None: + """Send this event to cancel an in-progress response. + + The server will respond + with a `response.cancelled` event or an error if there is no response to + cancel. + """ + await self._connection.send( + cast( + RealtimeClientEventParam, + strip_not_given({"type": "response.cancel", "event_id": event_id, "response_id": response_id}), + ) + ) + + +class AsyncRealtimeInputAudioBufferResource(BaseAsyncRealtimeConnectionResource): + async def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None: + """Send this event to clear the audio bytes in the buffer. + + The server will + respond with an `input_audio_buffer.cleared` event. + """ + await self._connection.send( + cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.clear", "event_id": event_id})) + ) + + async def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None: + """ + Send this event to commit the user input audio buffer, which will create a + new user message item in the conversation. This event will produce an error + if the input audio buffer is empty. When in Server VAD mode, the client does + not need to send this event, the server will commit the audio buffer + automatically. + + Committing the input audio buffer will trigger input audio transcription + (if enabled in session configuration), but it will not create a response + from the model. The server will respond with an `input_audio_buffer.committed` + event. + """ + await self._connection.send( + cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id})) + ) + + async def append(self, *, audio: str, event_id: str | NotGiven = NOT_GIVEN) -> None: + """Send this event to append audio bytes to the input audio buffer. + + The audio + buffer is temporary storage you can write to and later commit. In Server VAD + mode, the audio buffer is used to detect speech and the server will decide + when to commit. When Server VAD is disabled, you must commit the audio buffer + manually. + + The client may choose how much audio to place in each event up to a maximum + of 15 MiB, for example streaming smaller chunks from the client may allow the + VAD to be more responsive. Unlike made other client events, the server will + not send a confirmation response to this event. + """ + await self._connection.send( + cast( + RealtimeClientEventParam, + strip_not_given({"type": "input_audio_buffer.append", "audio": audio, "event_id": event_id}), + ) + ) + class AsyncRealtimeConversationResource(BaseAsyncRealtimeConnectionResource): @cached_property @@ -921,52 +1038,29 @@ async def truncate( ) ) - -class AsyncRealtimeInputAudioBufferResource(BaseAsyncRealtimeConnectionResource): - async def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None: - """Send this event to clear the audio bytes in the buffer. - - The server will - respond with an `input_audio_buffer.cleared` event. 
- """ - await self._connection.send( - cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.clear", "event_id": event_id})) - ) - - async def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None: + async def retrieve(self, *, item_id: str, event_id: str | NotGiven = NOT_GIVEN) -> None: """ - Send this event to commit the user input audio buffer, which will create a - new user message item in the conversation. This event will produce an error - if the input audio buffer is empty. When in Server VAD mode, the client does - not need to send this event, the server will commit the audio buffer - automatically. - - Committing the input audio buffer will trigger input audio transcription - (if enabled in session configuration), but it will not create a response - from the model. The server will respond with an `input_audio_buffer.committed` - event. + Send this event when you want to retrieve the server's representation of a specific item in the conversation history. This is useful, for example, to inspect user audio after noise cancellation and VAD. + The server will respond with a `conversation.item.retrieved` event, + unless the item does not exist in the conversation history, in which case the + server will respond with an error. """ await self._connection.send( - cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id})) + cast( + RealtimeClientEventParam, + strip_not_given({"type": "conversation.item.retrieve", "item_id": item_id, "event_id": event_id}), + ) ) - async def append(self, *, audio: str, event_id: str | NotGiven = NOT_GIVEN) -> None: - """Send this event to append audio bytes to the input audio buffer. - - The audio - buffer is temporary storage you can write to and later commit. In Server VAD - mode, the audio buffer is used to detect speech and the server will decide - when to commit. When Server VAD is disabled, you must commit the audio buffer - manually. - The client may choose how much audio to place in each event up to a maximum - of 15 MiB, for example streaming smaller chunks from the client may allow the - VAD to be more responsive. Unlike made other client events, the server will - not send a confirmation response to this event. 
- """ +class AsyncRealtimeTranscriptionSessionResource(BaseAsyncRealtimeConnectionResource): + async def update( + self, *, session: transcription_session_update_param.Session, event_id: str | NotGiven = NOT_GIVEN + ) -> None: + """Send this event to update a transcription session.""" await self._connection.send( cast( RealtimeClientEventParam, - strip_not_given({"type": "input_audio_buffer.append", "audio": audio, "event_id": event_id}), + strip_not_given({"type": "transcription_session.update", "session": session, "event_id": event_id}), ) ) diff --git a/src/openai/resources/beta/realtime/sessions.py b/src/openai/resources/beta/realtime/sessions.py index 4b337b7c19..5884e54de2 100644 --- a/src/openai/resources/beta/realtime/sessions.py +++ b/src/openai/resources/beta/realtime/sessions.py @@ -47,6 +47,7 @@ def create( self, *, input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN, + input_audio_noise_reduction: session_create_params.InputAudioNoiseReduction | NotGiven = NOT_GIVEN, input_audio_transcription: session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN, instructions: str | NotGiven = NOT_GIVEN, max_response_output_tokens: Union[int, Literal["inf"]] | NotGiven = NOT_GIVEN, @@ -86,14 +87,20 @@ def create( `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian byte order. + input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn + off. Noise reduction filters audio added to the input audio buffer before it is + sent to VAD and the model. Filtering the audio can improve VAD and turn + detection accuracy (reducing false positives) and model performance by improving + perception of the input audio. + input_audio_transcription: Configuration for input audio transcription, defaults to off and can be set to `null` to turn off once on. Input audio transcription is not native to the model, since the model consumes audio directly. Transcription runs asynchronously through - [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription) - and should be treated as rough guidance rather than the representation - understood by the model. The client can optionally set the language and prompt - for transcription, these fields will be passed to the Whisper API. + [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) + and should be treated as guidance of input audio content rather than precisely + what the model heard. The client can optionally set the language and prompt for + transcription, these offer additional guidance to the transcription service. instructions: The default system instructions (i.e. system message) prepended to model calls. This field allows the client to guide the model on desired responses. The model @@ -119,16 +126,24 @@ def create( output_audio_format: The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, output audio is sampled at a rate of 24kHz. - temperature: Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. + temperature: Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a + temperature of 0.8 is highly recommended for best performance. tool_choice: How the model chooses tools. Options are `auto`, `none`, `required`, or specify a function. tools: Tools (functions) available to the model. 
- turn_detection: Configuration for turn detection. Can be set to `null` to turn off. Server VAD
- means that the model will detect the start and end of speech based on audio
- volume and respond at the end of user speech.
+ turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ set to `null` to turn off, in which case the client must manually trigger model
+ response. Server VAD means that the model will detect the start and end of
+ speech based on audio volume and respond at the end of user speech. Semantic VAD
+ is more advanced and uses a turn detection model (in conjunction with VAD) to
+ semantically estimate whether the user has finished speaking, then dynamically
+ sets a timeout based on this probability. For example, if user audio trails off
+ with "uhhm", the model will score a low probability of turn end and wait longer
+ for the user to continue speaking. This can be useful for more natural
+ conversations, but may have a higher latency.
 
 voice: The voice the model uses to respond. Voice cannot be changed during the session
 once the model has responded with audio at least once. Current voice options are
@@ -148,6 +163,7 @@ def create(
 body=maybe_transform(
 {
 "input_audio_format": input_audio_format,
+ "input_audio_noise_reduction": input_audio_noise_reduction,
 "input_audio_transcription": input_audio_transcription,
 "instructions": instructions,
 "max_response_output_tokens": max_response_output_tokens,
@@ -193,6 +209,7 @@ async def create(
 self,
 *,
 input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
+ input_audio_noise_reduction: session_create_params.InputAudioNoiseReduction | NotGiven = NOT_GIVEN,
 input_audio_transcription: session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
 instructions: str | NotGiven = NOT_GIVEN,
 max_response_output_tokens: Union[int, Literal["inf"]] | NotGiven = NOT_GIVEN,
@@ -232,14 +249,20 @@ async def create(
 `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
 (mono), and little-endian byte order.
 
+ input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
+ off. Noise reduction filters audio added to the input audio buffer before it is
+ sent to VAD and the model. Filtering the audio can improve VAD and turn
+ detection accuracy (reducing false positives) and model performance by improving
+ perception of the input audio.
+
 input_audio_transcription: Configuration for input audio transcription, defaults to off and can be set to
 `null` to turn off once on. Input audio transcription is not native to the
 model, since the model consumes audio directly. Transcription runs
 asynchronously through
- [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
- and should be treated as rough guidance rather than the representation
- understood by the model. The client can optionally set the language and prompt
- for transcription, these fields will be passed to the Whisper API.
+ [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ and should be treated as guidance of input audio content rather than precisely
+ what the model heard. The client can optionally set the language and prompt for
+ transcription, these offer additional guidance to the transcription service.
 
 instructions: The default system instructions (i.e. system message) prepended to model calls.
This field allows the client to guide the model on desired responses. The model
@@ -265,16 +288,24 @@ async def create(
 output_audio_format: The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
 For `pcm16`, output audio is sampled at a rate of 24kHz.
 
- temperature: Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+ temperature: Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
+ temperature of 0.8 is highly recommended for best performance.
 
 tool_choice: How the model chooses tools. Options are `auto`, `none`, `required`, or specify
 a function.
 
 tools: Tools (functions) available to the model.
 
- turn_detection: Configuration for turn detection. Can be set to `null` to turn off. Server VAD
- means that the model will detect the start and end of speech based on audio
- volume and respond at the end of user speech.
+ turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ set to `null` to turn off, in which case the client must manually trigger model
+ response. Server VAD means that the model will detect the start and end of
+ speech based on audio volume and respond at the end of user speech. Semantic VAD
+ is more advanced and uses a turn detection model (in conjunction with VAD) to
+ semantically estimate whether the user has finished speaking, then dynamically
+ sets a timeout based on this probability. For example, if user audio trails off
+ with "uhhm", the model will score a low probability of turn end and wait longer
+ for the user to continue speaking. This can be useful for more natural
+ conversations, but may have a higher latency.
 
 voice: The voice the model uses to respond. Voice cannot be changed during the session
 once the model has responded with audio at least once. Current voice options are
@@ -294,6 +325,7 @@ async def create(
 body=await async_maybe_transform(
 {
 "input_audio_format": input_audio_format,
+ "input_audio_noise_reduction": input_audio_noise_reduction,
 "input_audio_transcription": input_audio_transcription,
 "instructions": instructions,
 "max_response_output_tokens": max_response_output_tokens,
diff --git a/src/openai/resources/beta/realtime/transcription_sessions.py b/src/openai/resources/beta/realtime/transcription_sessions.py
new file mode 100644
index 0000000000..0917da71fa
--- /dev/null
+++ b/src/openai/resources/beta/realtime/transcription_sessions.py
@@ -0,0 +1,277 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import List
+from typing_extensions import Literal
+
+import httpx
+
+from ....
import _legacy_response
+from ...._types import NOT_GIVEN, Body, Query, Headers, NotGiven
+from ...._utils import (
+ maybe_transform,
+ async_maybe_transform,
+)
+from ...._compat import cached_property
+from ...._resource import SyncAPIResource, AsyncAPIResource
+from ...._response import to_streamed_response_wrapper, async_to_streamed_response_wrapper
+from ...._base_client import make_request_options
+from ....types.beta.realtime import transcription_session_create_params
+from ....types.beta.realtime.transcription_session import TranscriptionSession
+
+__all__ = ["TranscriptionSessions", "AsyncTranscriptionSessions"]
+
+
+class TranscriptionSessions(SyncAPIResource):
+ @cached_property
+ def with_raw_response(self) -> TranscriptionSessionsWithRawResponse:
+ """
+ This property can be used as a prefix for any HTTP method call to return
+ the raw response object instead of the parsed content.
+
+ For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
+ """
+ return TranscriptionSessionsWithRawResponse(self)
+
+ @cached_property
+ def with_streaming_response(self) -> TranscriptionSessionsWithStreamingResponse:
+ """
+ An alternative to `.with_raw_response` that doesn't eagerly read the response body.
+
+ For more information, see https://www.github.com/openai/openai-python#with_streaming_response
+ """
+ return TranscriptionSessionsWithStreamingResponse(self)
+
+ def create(
+ self,
+ *,
+ include: List[str] | NotGiven = NOT_GIVEN,
+ input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
+ input_audio_noise_reduction: transcription_session_create_params.InputAudioNoiseReduction
+ | NotGiven = NOT_GIVEN,
+ input_audio_transcription: transcription_session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
+ modalities: List[Literal["text", "audio"]] | NotGiven = NOT_GIVEN,
+ turn_detection: transcription_session_create_params.TurnDetection | NotGiven = NOT_GIVEN,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+ ) -> TranscriptionSession:
+ """
+ Create an ephemeral API token for use in client-side applications with the
+ Realtime API specifically for realtime transcriptions. Can be configured with
+ the same session parameters as the `transcription_session.update` client event.
+
+ It responds with a session object, plus a `client_secret` key which contains a
+ usable ephemeral API token that can be used to authenticate browser clients for
+ the Realtime API.
+
+ Args:
+ include:
+ The set of items to include in the transcription. Currently available items are:
+
+ - `item.input_audio_transcription.logprobs`
+
+ input_audio_format: The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
+ `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
+ (mono), and little-endian byte order.
+
+ input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
+ off. Noise reduction filters audio added to the input audio buffer before it is
+ sent to VAD and the model.
Filtering the audio can improve VAD and turn
+ detection accuracy (reducing false positives) and model performance by improving
+ perception of the input audio.
+
+ input_audio_transcription: Configuration for input audio transcription. The client can optionally set the
+ language and prompt for transcription, these offer additional guidance to the
+ transcription service.
+
+ modalities: The set of modalities the model can respond with. To disable audio, set this to
+ ["text"].
+
+ turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ set to `null` to turn off, in which case the client must manually trigger model
+ response. Server VAD means that the model will detect the start and end of
+ speech based on audio volume and respond at the end of user speech. Semantic VAD
+ is more advanced and uses a turn detection model (in conjunction with VAD) to
+ semantically estimate whether the user has finished speaking, then dynamically
+ sets a timeout based on this probability. For example, if user audio trails off
+ with "uhhm", the model will score a low probability of turn end and wait longer
+ for the user to continue speaking. This can be useful for more natural
+ conversations, but may have a higher latency.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ extra_headers = {"OpenAI-Beta": "assistants=v2", **(extra_headers or {})}
+ return self._post(
+ "/realtime/transcription_sessions",
+ body=maybe_transform(
+ {
+ "include": include,
+ "input_audio_format": input_audio_format,
+ "input_audio_noise_reduction": input_audio_noise_reduction,
+ "input_audio_transcription": input_audio_transcription,
+ "modalities": modalities,
+ "turn_detection": turn_detection,
+ },
+ transcription_session_create_params.TranscriptionSessionCreateParams,
+ ),
+ options=make_request_options(
+ extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
+ ),
+ cast_to=TranscriptionSession,
+ )
+
+
+class AsyncTranscriptionSessions(AsyncAPIResource):
+ @cached_property
+ def with_raw_response(self) -> AsyncTranscriptionSessionsWithRawResponse:
+ """
+ This property can be used as a prefix for any HTTP method call to return
+ the raw response object instead of the parsed content.
+
+ For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
+ """
+ return AsyncTranscriptionSessionsWithRawResponse(self)
+
+ @cached_property
+ def with_streaming_response(self) -> AsyncTranscriptionSessionsWithStreamingResponse:
+ """
+ An alternative to `.with_raw_response` that doesn't eagerly read the response body.
+
+ For more information, see https://www.github.com/openai/openai-python#with_streaming_response
+ """
+ return AsyncTranscriptionSessionsWithStreamingResponse(self)
+
+ async def create(
+ self,
+ *,
+ include: List[str] | NotGiven = NOT_GIVEN,
+ input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
+ input_audio_noise_reduction: transcription_session_create_params.InputAudioNoiseReduction
+ | NotGiven = NOT_GIVEN,
+ input_audio_transcription: transcription_session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
+ modalities: List[Literal["text", "audio"]] | NotGiven = NOT_GIVEN,
+ turn_detection: transcription_session_create_params.TurnDetection | NotGiven = NOT_GIVEN,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+ ) -> TranscriptionSession:
+ """
+ Create an ephemeral API token for use in client-side applications with the
+ Realtime API specifically for realtime transcriptions. Can be configured with
+ the same session parameters as the `transcription_session.update` client event.
+
+ It responds with a session object, plus a `client_secret` key which contains a
+ usable ephemeral API token that can be used to authenticate browser clients for
+ the Realtime API.
+
+ Args:
+ include:
+ The set of items to include in the transcription. Currently available items are:
+
+ - `item.input_audio_transcription.logprobs`
+
+ input_audio_format: The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
+ `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
+ (mono), and little-endian byte order.
+
+ input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
+ off. Noise reduction filters audio added to the input audio buffer before it is
+ sent to VAD and the model. Filtering the audio can improve VAD and turn
+ detection accuracy (reducing false positives) and model performance by improving
+ perception of the input audio.
+
+ input_audio_transcription: Configuration for input audio transcription. The client can optionally set the
+ language and prompt for transcription, these offer additional guidance to the
+ transcription service.
+
+ modalities: The set of modalities the model can respond with. To disable audio, set this to
+ ["text"].
+
+ turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ set to `null` to turn off, in which case the client must manually trigger model
+ response. Server VAD means that the model will detect the start and end of
+ speech based on audio volume and respond at the end of user speech. Semantic VAD
+ is more advanced and uses a turn detection model (in conjunction with VAD) to
+ semantically estimate whether the user has finished speaking, then dynamically
+ sets a timeout based on this probability. For example, if user audio trails off
+ with "uhhm", the model will score a low probability of turn end and wait longer
+ for the user to continue speaking. This can be useful for more natural
+ conversations, but may have a higher latency.
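Taken together, the sync and async resources above mint ephemeral tokens for browser-side transcription clients. A sketch, assuming the returned session object carries the `client_secret` described in the docstring (parameter values are illustrative):

```python
from openai import OpenAI

client = OpenAI()

session = client.beta.realtime.transcription_sessions.create(
    input_audio_format="pcm16",
    input_audio_transcription={"model": "gpt-4o-transcribe", "language": "en"},
    turn_detection={"type": "server_vad"},
)

# Hand the short-lived secret to the browser client; the real API key
# never leaves the server.
print(session.client_secret)
```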
+ + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + extra_headers = {"OpenAI-Beta": "assistants=v2", **(extra_headers or {})} + return await self._post( + "/realtime/transcription_sessions", + body=await async_maybe_transform( + { + "include": include, + "input_audio_format": input_audio_format, + "input_audio_noise_reduction": input_audio_noise_reduction, + "input_audio_transcription": input_audio_transcription, + "modalities": modalities, + "turn_detection": turn_detection, + }, + transcription_session_create_params.TranscriptionSessionCreateParams, + ), + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=TranscriptionSession, + ) + + +class TranscriptionSessionsWithRawResponse: + def __init__(self, transcription_sessions: TranscriptionSessions) -> None: + self._transcription_sessions = transcription_sessions + + self.create = _legacy_response.to_raw_response_wrapper( + transcription_sessions.create, + ) + + +class AsyncTranscriptionSessionsWithRawResponse: + def __init__(self, transcription_sessions: AsyncTranscriptionSessions) -> None: + self._transcription_sessions = transcription_sessions + + self.create = _legacy_response.async_to_raw_response_wrapper( + transcription_sessions.create, + ) + + +class TranscriptionSessionsWithStreamingResponse: + def __init__(self, transcription_sessions: TranscriptionSessions) -> None: + self._transcription_sessions = transcription_sessions + + self.create = to_streamed_response_wrapper( + transcription_sessions.create, + ) + + +class AsyncTranscriptionSessionsWithStreamingResponse: + def __init__(self, transcription_sessions: AsyncTranscriptionSessions) -> None: + self._transcription_sessions = transcription_sessions + + self.create = async_to_streamed_response_wrapper( + transcription_sessions.create, + ) diff --git a/src/openai/types/audio/__init__.py b/src/openai/types/audio/__init__.py index 822e0f3a8d..396944ee47 100644 --- a/src/openai/types/audio/__init__.py +++ b/src/openai/types/audio/__init__.py @@ -8,9 +8,13 @@ from .transcription_word import TranscriptionWord as TranscriptionWord from .translation_verbose import TranslationVerbose as TranslationVerbose from .speech_create_params import SpeechCreateParams as SpeechCreateParams +from .transcription_include import TranscriptionInclude as TranscriptionInclude from .transcription_segment import TranscriptionSegment as TranscriptionSegment from .transcription_verbose import TranscriptionVerbose as TranscriptionVerbose from .translation_create_params import TranslationCreateParams as TranslationCreateParams +from .transcription_stream_event import TranscriptionStreamEvent as TranscriptionStreamEvent from .transcription_create_params import TranscriptionCreateParams as TranscriptionCreateParams from .translation_create_response import TranslationCreateResponse as TranslationCreateResponse from .transcription_create_response import TranscriptionCreateResponse as TranscriptionCreateResponse +from .transcription_text_done_event import TranscriptionTextDoneEvent as TranscriptionTextDoneEvent +from .transcription_text_delta_event import TranscriptionTextDeltaEvent as TranscriptionTextDeltaEvent diff --git a/src/openai/types/audio/speech_create_params.py b/src/openai/types/audio/speech_create_params.py index 
ed1a1ce748..958680710b 100644 --- a/src/openai/types/audio/speech_create_params.py +++ b/src/openai/types/audio/speech_create_params.py @@ -17,7 +17,7 @@ class SpeechCreateParams(TypedDict, total=False): model: Required[Union[str, SpeechModel]] """ One of the available [TTS models](https://platform.openai.com/docs/models#tts): - `tts-1` or `tts-1-hd` + `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`. """ voice: Required[Literal["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"]] @@ -28,6 +28,12 @@ class SpeechCreateParams(TypedDict, total=False): [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options). """ + instructions: str + """Control the voice of your generated audio with additional instructions. + + Does not work with `tts-1` or `tts-1-hd`. + """ + response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] """The format to audio in. diff --git a/src/openai/types/audio/speech_model.py b/src/openai/types/audio/speech_model.py index bd685ab34d..f004f805da 100644 --- a/src/openai/types/audio/speech_model.py +++ b/src/openai/types/audio/speech_model.py @@ -4,4 +4,4 @@ __all__ = ["SpeechModel"] -SpeechModel: TypeAlias = Literal["tts-1", "tts-1-hd"] +SpeechModel: TypeAlias = Literal["tts-1", "tts-1-hd", "gpt-4o-mini-tts"] diff --git a/src/openai/types/audio/transcription.py b/src/openai/types/audio/transcription.py index edb5f227fc..1576385404 100644 --- a/src/openai/types/audio/transcription.py +++ b/src/openai/types/audio/transcription.py @@ -1,11 +1,30 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. +from typing import List, Optional from ..._models import BaseModel -__all__ = ["Transcription"] +__all__ = ["Transcription", "Logprob"] + + +class Logprob(BaseModel): + token: Optional[str] = None + """The token in the transcription.""" + + bytes: Optional[List[float]] = None + """The bytes of the token.""" + + logprob: Optional[float] = None + """The log probability of the token.""" class Transcription(BaseModel): text: str """The transcribed text.""" + + logprobs: Optional[List[Logprob]] = None + """The log probabilities of the tokens in the transcription. + + Only returned with the models `gpt-4o-transcribe` and `gpt-4o-mini-transcribe` + if `logprobs` is added to the `include` array. + """ diff --git a/src/openai/types/audio/transcription_create_params.py b/src/openai/types/audio/transcription_create_params.py index f1779c35e6..0cda4c7907 100644 --- a/src/openai/types/audio/transcription_create_params.py +++ b/src/openai/types/audio/transcription_create_params.py @@ -2,17 +2,22 @@ from __future__ import annotations -from typing import List, Union +from typing import List, Union, Optional from typing_extensions import Literal, Required, TypedDict from ..._types import FileTypes from ..audio_model import AudioModel +from .transcription_include import TranscriptionInclude from ..audio_response_format import AudioResponseFormat -__all__ = ["TranscriptionCreateParams"] +__all__ = [ + "TranscriptionCreateParamsBase", + "TranscriptionCreateParamsNonStreaming", + "TranscriptionCreateParamsStreaming", +] -class TranscriptionCreateParams(TypedDict, total=False): +class TranscriptionCreateParamsBase(TypedDict, total=False): file: Required[FileTypes] """ The audio file object (not file name) to transcribe, in one of these formats: @@ -22,8 +27,17 @@ class TranscriptionCreateParams(TypedDict, total=False): model: Required[Union[str, AudioModel]] """ID of the model to use. 
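The new `instructions` parameter pairs with the `gpt-4o-mini-tts` model added above. A short sketch in the same style as the repository's TTS examples (output path and wording are placeholders):

```python
from pathlib import Path

from openai import OpenAI

client = OpenAI()

with client.audio.speech.with_streaming_response.create(
    model="gpt-4o-mini-tts",
    voice="coral",
    input="Thanks for calling! How can I help you today?",
    # Steers delivery; per the docstring above, this parameter does not
    # work with tts-1 or tts-1-hd.
    instructions="Speak warmly and at a relaxed pace.",
) as response:
    response.stream_to_file(Path("greeting.mp3"))
```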
- Only `whisper-1` (which is powered by our open source Whisper V2 model) is - currently available. + The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1` + (which is powered by our open source Whisper V2 model). + """ + + include: List[TranscriptionInclude] + """Additional information to include in the transcription response. + + `logprobs` will return the log probabilities of the tokens in the response to + understand the model's confidence in the transcription. `logprobs` only works + with response_format set to `json` and only with the models `gpt-4o-transcribe` + and `gpt-4o-mini-transcribe`. """ language: str @@ -45,7 +59,8 @@ class TranscriptionCreateParams(TypedDict, total=False): response_format: AudioResponseFormat """ The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. + `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, + the only supported format is `json`. """ temperature: float @@ -65,3 +80,34 @@ class TranscriptionCreateParams(TypedDict, total=False): is no additional latency for segment timestamps, but generating word timestamps incurs additional latency. """ + + +class TranscriptionCreateParamsNonStreaming(TranscriptionCreateParamsBase, total=False): + stream: Optional[Literal[False]] + """ + If set to true, the model response data will be streamed to the client as it is + generated using + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format). + See the + [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions) + for more information. + + Note: Streaming is not supported for the `whisper-1` model and will be ignored. + """ + + +class TranscriptionCreateParamsStreaming(TranscriptionCreateParamsBase): + stream: Required[Literal[True]] + """ + If set to true, the model response data will be streamed to the client as it is + generated using + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format). + See the + [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions) + for more information. + + Note: Streaming is not supported for the `whisper-1` model and will be ignored. + """ + + +TranscriptionCreateParams = Union[TranscriptionCreateParamsNonStreaming, TranscriptionCreateParamsStreaming] diff --git a/src/openai/types/audio/transcription_include.py b/src/openai/types/audio/transcription_include.py new file mode 100644 index 0000000000..0e464ac934 --- /dev/null +++ b/src/openai/types/audio/transcription_include.py @@ -0,0 +1,7 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing_extensions import Literal, TypeAlias + +__all__ = ["TranscriptionInclude"] + +TranscriptionInclude: TypeAlias = Literal["logprobs"] diff --git a/src/openai/types/audio/transcription_stream_event.py b/src/openai/types/audio/transcription_stream_event.py new file mode 100644 index 0000000000..757077a280 --- /dev/null +++ b/src/openai/types/audio/transcription_stream_event.py @@ -0,0 +1,14 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
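With the split into non-streaming and streaming param variants above, passing `stream=True` makes `transcriptions.create` yield `transcript.text.delta` events followed by a final `transcript.text.done`. A hedged sketch (the audio path is a placeholder, and streaming is ignored for `whisper-1`):

```python
from openai import OpenAI

client = OpenAI()

with open("speech.mp3", "rb") as audio_file:
    stream = client.audio.transcriptions.create(
        model="gpt-4o-mini-transcribe",
        file=audio_file,
        response_format="json",  # the only supported format for the gpt-4o transcribe models
        stream=True,
    )
    for event in stream:
        if event.type == "transcript.text.delta":
            print(event.delta, end="", flush=True)
        elif event.type == "transcript.text.done":
            print()  # the full transcript is also available as event.text
```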
+ +from typing import Union +from typing_extensions import Annotated, TypeAlias + +from ..._utils import PropertyInfo +from .transcription_text_done_event import TranscriptionTextDoneEvent +from .transcription_text_delta_event import TranscriptionTextDeltaEvent + +__all__ = ["TranscriptionStreamEvent"] + +TranscriptionStreamEvent: TypeAlias = Annotated[ + Union[TranscriptionTextDeltaEvent, TranscriptionTextDoneEvent], PropertyInfo(discriminator="type") +] diff --git a/src/openai/types/audio/transcription_text_delta_event.py b/src/openai/types/audio/transcription_text_delta_event.py new file mode 100644 index 0000000000..f8d5355491 --- /dev/null +++ b/src/openai/types/audio/transcription_text_delta_event.py @@ -0,0 +1,35 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List, Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["TranscriptionTextDeltaEvent", "Logprob"] + + +class Logprob(BaseModel): + token: Optional[str] = None + """The token that was used to generate the log probability.""" + + bytes: Optional[List[object]] = None + """The bytes that were used to generate the log probability.""" + + logprob: Optional[float] = None + """The log probability of the token.""" + + +class TranscriptionTextDeltaEvent(BaseModel): + delta: str + """The text delta that was additionally transcribed.""" + + type: Literal["transcript.text.delta"] + """The type of the event. Always `transcript.text.delta`.""" + + logprobs: Optional[List[Logprob]] = None + """The log probabilities of the delta. + + Only included if you + [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) + with the `include[]` parameter set to `logprobs`. + """ diff --git a/src/openai/types/audio/transcription_text_done_event.py b/src/openai/types/audio/transcription_text_done_event.py new file mode 100644 index 0000000000..3f1a713a52 --- /dev/null +++ b/src/openai/types/audio/transcription_text_done_event.py @@ -0,0 +1,35 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List, Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["TranscriptionTextDoneEvent", "Logprob"] + + +class Logprob(BaseModel): + token: Optional[str] = None + """The token that was used to generate the log probability.""" + + bytes: Optional[List[object]] = None + """The bytes that were used to generate the log probability.""" + + logprob: Optional[float] = None + """The log probability of the token.""" + + +class TranscriptionTextDoneEvent(BaseModel): + text: str + """The text that was transcribed.""" + + type: Literal["transcript.text.done"] + """The type of the event. Always `transcript.text.done`.""" + + logprobs: Optional[List[Logprob]] = None + """The log probabilities of the individual tokens in the transcription. + + Only included if you + [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) + with the `include[]` parameter set to `logprobs`. 
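The `Logprob` payloads on the delta and done events mirror the non-streaming path: requesting `include=["logprobs"]` populates `transcription.logprobs`. A sketch (audio path is a placeholder; only the gpt-4o transcribe models return log probabilities):

```python
from openai import OpenAI

client = OpenAI()

with open("speech.mp3", "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model="gpt-4o-transcribe",
        file=audio_file,
        response_format="json",  # logprobs only works with json output
        include=["logprobs"],
    )

print(transcription.text)
if transcription.logprobs:
    # Surface the token the model was least confident about.
    worst = min(transcription.logprobs, key=lambda lp: lp.logprob or 0.0)
    print("least confident token:", worst.token, worst.logprob)
```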
+ """ diff --git a/src/openai/types/audio/translation_create_params.py b/src/openai/types/audio/translation_create_params.py index 62f85b8757..b23a185375 100644 --- a/src/openai/types/audio/translation_create_params.py +++ b/src/openai/types/audio/translation_create_params.py @@ -3,11 +3,10 @@ from __future__ import annotations from typing import Union -from typing_extensions import Required, TypedDict +from typing_extensions import Literal, Required, TypedDict from ..._types import FileTypes from ..audio_model import AudioModel -from ..audio_response_format import AudioResponseFormat __all__ = ["TranslationCreateParams"] @@ -34,7 +33,7 @@ class TranslationCreateParams(TypedDict, total=False): should be in English. """ - response_format: AudioResponseFormat + response_format: Literal["json", "text", "srt", "verbose_json", "vtt"] """ The format of the output, in one of these options: `json`, `text`, `srt`, `verbose_json`, or `vtt`. diff --git a/src/openai/types/audio_model.py b/src/openai/types/audio_model.py index 94ae84c015..4d14d60181 100644 --- a/src/openai/types/audio_model.py +++ b/src/openai/types/audio_model.py @@ -4,4 +4,4 @@ __all__ = ["AudioModel"] -AudioModel: TypeAlias = Literal["whisper-1"] +AudioModel: TypeAlias = Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"] diff --git a/src/openai/types/beta/realtime/__init__.py b/src/openai/types/beta/realtime/__init__.py index cd0616dcfa..0374b9b457 100644 --- a/src/openai/types/beta/realtime/__init__.py +++ b/src/openai/types/beta/realtime/__init__.py @@ -15,6 +15,7 @@ from .session_create_params import SessionCreateParams as SessionCreateParams from .session_created_event import SessionCreatedEvent as SessionCreatedEvent from .session_updated_event import SessionUpdatedEvent as SessionUpdatedEvent +from .transcription_session import TranscriptionSession as TranscriptionSession from .response_created_event import ResponseCreatedEvent as ResponseCreatedEvent from .conversation_item_param import ConversationItemParam as ConversationItemParam from .realtime_connect_params import RealtimeConnectParams as RealtimeConnectParams @@ -32,6 +33,7 @@ from .realtime_client_event_param import RealtimeClientEventParam as RealtimeClientEventParam from .response_cancel_event_param import ResponseCancelEventParam as ResponseCancelEventParam from .response_create_event_param import ResponseCreateEventParam as ResponseCreateEventParam +from .transcription_session_update import TranscriptionSessionUpdate as TranscriptionSessionUpdate from .conversation_item_create_event import ConversationItemCreateEvent as ConversationItemCreateEvent from .conversation_item_delete_event import ConversationItemDeleteEvent as ConversationItemDeleteEvent from .input_audio_buffer_clear_event import InputAudioBufferClearEvent as InputAudioBufferClearEvent @@ -41,6 +43,7 @@ from .input_audio_buffer_append_event import InputAudioBufferAppendEvent as InputAudioBufferAppendEvent from .input_audio_buffer_commit_event import InputAudioBufferCommitEvent as InputAudioBufferCommitEvent from .response_output_item_done_event import ResponseOutputItemDoneEvent as ResponseOutputItemDoneEvent +from .conversation_item_retrieve_event import ConversationItemRetrieveEvent as ConversationItemRetrieveEvent from .conversation_item_truncate_event import ConversationItemTruncateEvent as ConversationItemTruncateEvent from .conversation_item_with_reference import ConversationItemWithReference as ConversationItemWithReference from .input_audio_buffer_cleared_event import 
InputAudioBufferClearedEvent as InputAudioBufferClearedEvent @@ -49,6 +52,9 @@ from .conversation_item_truncated_event import ConversationItemTruncatedEvent as ConversationItemTruncatedEvent from .response_content_part_added_event import ResponseContentPartAddedEvent as ResponseContentPartAddedEvent from .input_audio_buffer_committed_event import InputAudioBufferCommittedEvent as InputAudioBufferCommittedEvent +from .transcription_session_update_param import TranscriptionSessionUpdateParam as TranscriptionSessionUpdateParam +from .transcription_session_create_params import TranscriptionSessionCreateParams as TranscriptionSessionCreateParams +from .transcription_session_updated_event import TranscriptionSessionUpdatedEvent as TranscriptionSessionUpdatedEvent from .conversation_item_create_event_param import ConversationItemCreateEventParam as ConversationItemCreateEventParam from .conversation_item_delete_event_param import ConversationItemDeleteEventParam as ConversationItemDeleteEventParam from .input_audio_buffer_clear_event_param import InputAudioBufferClearEventParam as InputAudioBufferClearEventParam @@ -58,6 +64,9 @@ from .response_audio_transcript_delta_event import ( ResponseAudioTranscriptDeltaEvent as ResponseAudioTranscriptDeltaEvent, ) +from .conversation_item_retrieve_event_param import ( + ConversationItemRetrieveEventParam as ConversationItemRetrieveEventParam, +) from .conversation_item_truncate_event_param import ( ConversationItemTruncateEventParam as ConversationItemTruncateEventParam, ) @@ -76,6 +85,9 @@ from .response_function_call_arguments_delta_event import ( ResponseFunctionCallArgumentsDeltaEvent as ResponseFunctionCallArgumentsDeltaEvent, ) +from .conversation_item_input_audio_transcription_delta_event import ( + ConversationItemInputAudioTranscriptionDeltaEvent as ConversationItemInputAudioTranscriptionDeltaEvent, +) from .conversation_item_input_audio_transcription_failed_event import ( ConversationItemInputAudioTranscriptionFailedEvent as ConversationItemInputAudioTranscriptionFailedEvent, ) diff --git a/src/openai/types/beta/realtime/conversation_item_input_audio_transcription_completed_event.py b/src/openai/types/beta/realtime/conversation_item_input_audio_transcription_completed_event.py index ded79cc0f7..469811693c 100644 --- a/src/openai/types/beta/realtime/conversation_item_input_audio_transcription_completed_event.py +++ b/src/openai/types/beta/realtime/conversation_item_input_audio_transcription_completed_event.py @@ -1,10 +1,22 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. +from typing import List, Optional from typing_extensions import Literal from ...._models import BaseModel -__all__ = ["ConversationItemInputAudioTranscriptionCompletedEvent"] +__all__ = ["ConversationItemInputAudioTranscriptionCompletedEvent", "Logprob"] + + +class Logprob(BaseModel): + token: str + """The token that was used to generate the log probability.""" + + bytes: List[int] + """The bytes that were used to generate the log probability.""" + + logprob: float + """The log probability of the token.""" class ConversationItemInputAudioTranscriptionCompletedEvent(BaseModel): @@ -24,3 +36,6 @@ class ConversationItemInputAudioTranscriptionCompletedEvent(BaseModel): """ The event type, must be `conversation.item.input_audio_transcription.completed`. 
""" + + logprobs: Optional[List[Logprob]] = None + """The log probabilities of the transcription.""" diff --git a/src/openai/types/beta/realtime/conversation_item_input_audio_transcription_delta_event.py b/src/openai/types/beta/realtime/conversation_item_input_audio_transcription_delta_event.py new file mode 100644 index 0000000000..924d06d98a --- /dev/null +++ b/src/openai/types/beta/realtime/conversation_item_input_audio_transcription_delta_event.py @@ -0,0 +1,39 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List, Optional +from typing_extensions import Literal + +from ...._models import BaseModel + +__all__ = ["ConversationItemInputAudioTranscriptionDeltaEvent", "Logprob"] + + +class Logprob(BaseModel): + token: str + """The token that was used to generate the log probability.""" + + bytes: List[int] + """The bytes that were used to generate the log probability.""" + + logprob: float + """The log probability of the token.""" + + +class ConversationItemInputAudioTranscriptionDeltaEvent(BaseModel): + event_id: str + """The unique ID of the server event.""" + + item_id: str + """The ID of the item.""" + + type: Literal["conversation.item.input_audio_transcription.delta"] + """The event type, must be `conversation.item.input_audio_transcription.delta`.""" + + content_index: Optional[int] = None + """The index of the content part in the item's content array.""" + + delta: Optional[str] = None + """The text delta.""" + + logprobs: Optional[List[Logprob]] = None + """The log probabilities of the transcription.""" diff --git a/src/openai/types/beta/realtime/conversation_item_retrieve_event.py b/src/openai/types/beta/realtime/conversation_item_retrieve_event.py new file mode 100644 index 0000000000..822386055c --- /dev/null +++ b/src/openai/types/beta/realtime/conversation_item_retrieve_event.py @@ -0,0 +1,19 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional +from typing_extensions import Literal + +from ...._models import BaseModel + +__all__ = ["ConversationItemRetrieveEvent"] + + +class ConversationItemRetrieveEvent(BaseModel): + item_id: str + """The ID of the item to retrieve.""" + + type: Literal["conversation.item.retrieve"] + """The event type, must be `conversation.item.retrieve`.""" + + event_id: Optional[str] = None + """Optional client-generated ID used to identify this event.""" diff --git a/src/openai/types/beta/realtime/conversation_item_retrieve_event_param.py b/src/openai/types/beta/realtime/conversation_item_retrieve_event_param.py new file mode 100644 index 0000000000..71b3ffa499 --- /dev/null +++ b/src/openai/types/beta/realtime/conversation_item_retrieve_event_param.py @@ -0,0 +1,18 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+ +from __future__ import annotations + +from typing_extensions import Literal, Required, TypedDict + +__all__ = ["ConversationItemRetrieveEventParam"] + + +class ConversationItemRetrieveEventParam(TypedDict, total=False): + item_id: Required[str] + """The ID of the item to retrieve.""" + + type: Required[Literal["conversation.item.retrieve"]] + """The event type, must be `conversation.item.retrieve`.""" + + event_id: str + """Optional client-generated ID used to identify this event.""" diff --git a/src/openai/types/beta/realtime/realtime_client_event.py b/src/openai/types/beta/realtime/realtime_client_event.py index 0769184cd0..f962a505cd 100644 --- a/src/openai/types/beta/realtime/realtime_client_event.py +++ b/src/openai/types/beta/realtime/realtime_client_event.py @@ -7,26 +7,30 @@ from .session_update_event import SessionUpdateEvent from .response_cancel_event import ResponseCancelEvent from .response_create_event import ResponseCreateEvent +from .transcription_session_update import TranscriptionSessionUpdate from .conversation_item_create_event import ConversationItemCreateEvent from .conversation_item_delete_event import ConversationItemDeleteEvent from .input_audio_buffer_clear_event import InputAudioBufferClearEvent from .input_audio_buffer_append_event import InputAudioBufferAppendEvent from .input_audio_buffer_commit_event import InputAudioBufferCommitEvent +from .conversation_item_retrieve_event import ConversationItemRetrieveEvent from .conversation_item_truncate_event import ConversationItemTruncateEvent __all__ = ["RealtimeClientEvent"] RealtimeClientEvent: TypeAlias = Annotated[ Union[ - SessionUpdateEvent, - InputAudioBufferAppendEvent, - InputAudioBufferCommitEvent, - InputAudioBufferClearEvent, ConversationItemCreateEvent, - ConversationItemTruncateEvent, ConversationItemDeleteEvent, - ResponseCreateEvent, + ConversationItemRetrieveEvent, + ConversationItemTruncateEvent, + InputAudioBufferAppendEvent, + InputAudioBufferClearEvent, + InputAudioBufferCommitEvent, ResponseCancelEvent, + ResponseCreateEvent, + SessionUpdateEvent, + TranscriptionSessionUpdate, ], PropertyInfo(discriminator="type"), ] diff --git a/src/openai/types/beta/realtime/realtime_client_event_param.py b/src/openai/types/beta/realtime/realtime_client_event_param.py index 4020892c33..6fdba4b87c 100644 --- a/src/openai/types/beta/realtime/realtime_client_event_param.py +++ b/src/openai/types/beta/realtime/realtime_client_event_param.py @@ -8,23 +8,27 @@ from .session_update_event_param import SessionUpdateEventParam from .response_cancel_event_param import ResponseCancelEventParam from .response_create_event_param import ResponseCreateEventParam +from .transcription_session_update_param import TranscriptionSessionUpdateParam from .conversation_item_create_event_param import ConversationItemCreateEventParam from .conversation_item_delete_event_param import ConversationItemDeleteEventParam from .input_audio_buffer_clear_event_param import InputAudioBufferClearEventParam from .input_audio_buffer_append_event_param import InputAudioBufferAppendEventParam from .input_audio_buffer_commit_event_param import InputAudioBufferCommitEventParam +from .conversation_item_retrieve_event_param import ConversationItemRetrieveEventParam from .conversation_item_truncate_event_param import ConversationItemTruncateEventParam __all__ = ["RealtimeClientEventParam"] RealtimeClientEventParam: TypeAlias = Union[ - SessionUpdateEventParam, - InputAudioBufferAppendEventParam, - InputAudioBufferCommitEventParam, - 
InputAudioBufferClearEventParam, ConversationItemCreateEventParam, - ConversationItemTruncateEventParam, ConversationItemDeleteEventParam, - ResponseCreateEventParam, + ConversationItemRetrieveEventParam, + ConversationItemTruncateEventParam, + InputAudioBufferAppendEventParam, + InputAudioBufferClearEventParam, + InputAudioBufferCommitEventParam, ResponseCancelEventParam, + ResponseCreateEventParam, + SessionUpdateEventParam, + TranscriptionSessionUpdateParam, ] diff --git a/src/openai/types/beta/realtime/realtime_server_event.py b/src/openai/types/beta/realtime/realtime_server_event.py index 5f8ed55b13..ba1d324445 100644 --- a/src/openai/types/beta/realtime/realtime_server_event.py +++ b/src/openai/types/beta/realtime/realtime_server_event.py @@ -1,10 +1,12 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. from typing import Union -from typing_extensions import Annotated, TypeAlias +from typing_extensions import Literal, Annotated, TypeAlias from ...._utils import PropertyInfo +from ...._models import BaseModel from .error_event import ErrorEvent +from .conversation_item import ConversationItem from .response_done_event import ResponseDoneEvent from .session_created_event import SessionCreatedEvent from .session_updated_event import SessionUpdatedEvent @@ -24,49 +26,66 @@ from .conversation_item_truncated_event import ConversationItemTruncatedEvent from .response_content_part_added_event import ResponseContentPartAddedEvent from .input_audio_buffer_committed_event import InputAudioBufferCommittedEvent +from .transcription_session_updated_event import TranscriptionSessionUpdatedEvent from .response_audio_transcript_done_event import ResponseAudioTranscriptDoneEvent from .response_audio_transcript_delta_event import ResponseAudioTranscriptDeltaEvent from .input_audio_buffer_speech_started_event import InputAudioBufferSpeechStartedEvent from .input_audio_buffer_speech_stopped_event import InputAudioBufferSpeechStoppedEvent from .response_function_call_arguments_done_event import ResponseFunctionCallArgumentsDoneEvent from .response_function_call_arguments_delta_event import ResponseFunctionCallArgumentsDeltaEvent +from .conversation_item_input_audio_transcription_delta_event import ConversationItemInputAudioTranscriptionDeltaEvent from .conversation_item_input_audio_transcription_failed_event import ConversationItemInputAudioTranscriptionFailedEvent from .conversation_item_input_audio_transcription_completed_event import ( ConversationItemInputAudioTranscriptionCompletedEvent, ) -__all__ = ["RealtimeServerEvent"] +__all__ = ["RealtimeServerEvent", "ConversationItemRetrieved"] + + +class ConversationItemRetrieved(BaseModel): + event_id: str + """The unique ID of the server event.""" + + item: ConversationItem + """The item to add to the conversation.""" + + type: Literal["conversation.item.retrieved"] + """The event type, must be `conversation.item.retrieved`.""" + RealtimeServerEvent: TypeAlias = Annotated[ Union[ - ErrorEvent, - SessionCreatedEvent, - SessionUpdatedEvent, ConversationCreatedEvent, - InputAudioBufferCommittedEvent, - InputAudioBufferClearedEvent, - InputAudioBufferSpeechStartedEvent, - InputAudioBufferSpeechStoppedEvent, ConversationItemCreatedEvent, + ConversationItemDeletedEvent, ConversationItemInputAudioTranscriptionCompletedEvent, + ConversationItemInputAudioTranscriptionDeltaEvent, ConversationItemInputAudioTranscriptionFailedEvent, + ConversationItemRetrieved, ConversationItemTruncatedEvent, - ConversationItemDeletedEvent, + 
ErrorEvent, + InputAudioBufferClearedEvent, + InputAudioBufferCommittedEvent, + InputAudioBufferSpeechStartedEvent, + InputAudioBufferSpeechStoppedEvent, + RateLimitsUpdatedEvent, + ResponseAudioDeltaEvent, + ResponseAudioDoneEvent, + ResponseAudioTranscriptDeltaEvent, + ResponseAudioTranscriptDoneEvent, + ResponseContentPartAddedEvent, + ResponseContentPartDoneEvent, ResponseCreatedEvent, ResponseDoneEvent, + ResponseFunctionCallArgumentsDeltaEvent, + ResponseFunctionCallArgumentsDoneEvent, ResponseOutputItemAddedEvent, ResponseOutputItemDoneEvent, - ResponseContentPartAddedEvent, - ResponseContentPartDoneEvent, ResponseTextDeltaEvent, ResponseTextDoneEvent, - ResponseAudioTranscriptDeltaEvent, - ResponseAudioTranscriptDoneEvent, - ResponseAudioDeltaEvent, - ResponseAudioDoneEvent, - ResponseFunctionCallArgumentsDeltaEvent, - ResponseFunctionCallArgumentsDoneEvent, - RateLimitsUpdatedEvent, + SessionCreatedEvent, + SessionUpdatedEvent, + TranscriptionSessionUpdatedEvent, ], PropertyInfo(discriminator="type"), ] diff --git a/src/openai/types/beta/realtime/session.py b/src/openai/types/beta/realtime/session.py index aee20fa906..3ed53ff5f8 100644 --- a/src/openai/types/beta/realtime/session.py +++ b/src/openai/types/beta/realtime/session.py @@ -5,14 +5,40 @@ from ...._models import BaseModel -__all__ = ["Session", "InputAudioTranscription", "Tool", "TurnDetection"] +__all__ = ["Session", "InputAudioNoiseReduction", "InputAudioTranscription", "Tool", "TurnDetection"] + + +class InputAudioNoiseReduction(BaseModel): + type: Optional[Literal["near_field", "far_field"]] = None + """Type of noise reduction. + + `near_field` is for close-talking microphones such as headphones, `far_field` is + for far-field microphones such as laptop or conference room microphones. + """ class InputAudioTranscription(BaseModel): + language: Optional[str] = None + """The language of the input audio. + + Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + format will improve accuracy and latency. + """ + model: Optional[str] = None """ - The model to use for transcription, `whisper-1` is the only currently supported - model. + The model to use for transcription, current options are `gpt-4o-transcribe`, + `gpt-4o-mini-transcribe`, and `whisper-1`. + """ + + prompt: Optional[str] = None + """ + An optional text to guide the model's style or continue a previous audio + segment. For `whisper-1`, the + [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). + For `gpt-4o-transcribe` models, the prompt is a free text string, for example + "expect words related to technology". """ @@ -35,46 +61,56 @@ class Tool(BaseModel): class TurnDetection(BaseModel): create_response: Optional[bool] = None - """Whether or not to automatically generate a response when a VAD stop event + """ + Whether or not to automatically generate a response when a VAD stop event occurs. + """ + + eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None + """Used only for `semantic_vad` mode. - `true` by default. + The eagerness of the model to respond. `low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. """ interrupt_response: Optional[bool] = None """ Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. `conversation` of `auto`) when a VAD start event - occurs. `true` by default. 
+ occurs. """ prefix_padding_ms: Optional[int] = None - """Amount of audio to include before the VAD detected speech (in milliseconds). + """Used only for `server_vad` mode. + Amount of audio to include before the VAD detected speech (in milliseconds). Defaults to 300ms. """ silence_duration_ms: Optional[int] = None - """Duration of silence to detect speech stop (in milliseconds). + """Used only for `server_vad` mode. - Defaults to 500ms. With shorter values the model will respond more quickly, but - may jump in on short pauses from the user. + Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + With shorter values the model will respond more quickly, but may jump in on + short pauses from the user. """ threshold: Optional[float] = None - """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. + """Used only for `server_vad` mode. - A higher threshold will require louder audio to activate the model, and thus - might perform better in noisy environments. + Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + threshold will require louder audio to activate the model, and thus might + perform better in noisy environments. """ - type: Optional[Literal["server_vad"]] = None - """Type of turn detection, only `server_vad` is currently supported.""" + type: Optional[Literal["server_vad", "semantic_vad"]] = None + """Type of turn detection.""" class Session(BaseModel): id: Optional[str] = None - """Unique identifier for the session object.""" + """Unique identifier for the session that looks like `sess_1234567890abcdef`.""" input_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None """The format of input audio. @@ -84,13 +120,25 @@ class Session(BaseModel): byte order. """ + input_audio_noise_reduction: Optional[InputAudioNoiseReduction] = None + """Configuration for input audio noise reduction. + + This can be set to `null` to turn off. Noise reduction filters audio added to + the input audio buffer before it is sent to VAD and the model. Filtering the + audio can improve VAD and turn detection accuracy (reducing false positives) and + model performance by improving perception of the input audio. + """ + input_audio_transcription: Optional[InputAudioTranscription] = None """ Configuration for input audio transcription, defaults to off and can be set to `null` to turn off once on. Input audio transcription is not native to the model, since the model consumes audio directly. Transcription runs - asynchronously through Whisper and should be treated as rough guidance rather - than the representation understood by the model. + asynchronously through + [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) + and should be treated as guidance of input audio content rather than precisely + what the model heard. The client can optionally set the language and prompt for + transcription, these offer additional guidance to the transcription service. """ instructions: Optional[str] = None @@ -122,16 +170,14 @@ class Session(BaseModel): To disable audio, set this to ["text"]. 
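The new `TurnDetection` and `InputAudioNoiseReduction` shapes above combine naturally when creating a session. A sketch, assuming the create params mirror the model fields shown here (the `eagerness` value and model name are illustrative):

```python
from openai import OpenAI

client = OpenAI()

session = client.beta.realtime.sessions.create(
    model="gpt-4o-realtime-preview",
    # near_field suits headset microphones; far_field suits laptop or
    # conference-room microphones.
    input_audio_noise_reduction={"type": "near_field"},
    # Semantic VAD waits longer when the user trails off mid-thought.
    turn_detection={"type": "semantic_vad", "eagerness": "low"},
)
print(session.client_secret)
```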
""" - model: Union[ - str, + model: Optional[ Literal[ "gpt-4o-realtime-preview", "gpt-4o-realtime-preview-2024-10-01", "gpt-4o-realtime-preview-2024-12-17", "gpt-4o-mini-realtime-preview", "gpt-4o-mini-realtime-preview-2024-12-17", - ], - None, + ] ] = None """The Realtime model used for this session.""" @@ -143,7 +189,11 @@ class Session(BaseModel): """ temperature: Optional[float] = None - """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.""" + """Sampling temperature for the model, limited to [0.6, 1.2]. + + For audio models a temperature of 0.8 is highly recommended for best + performance. + """ tool_choice: Optional[str] = None """How the model chooses tools. @@ -155,11 +205,17 @@ class Session(BaseModel): """Tools (functions) available to the model.""" turn_detection: Optional[TurnDetection] = None - """Configuration for turn detection. - - Can be set to `null` to turn off. Server VAD means that the model will detect - the start and end of speech based on audio volume and respond at the end of user - speech. + """Configuration for turn detection, ether Server VAD or Semantic VAD. + + This can be set to `null` to turn off, in which case the client must manually + trigger model response. Server VAD means that the model will detect the start + and end of speech based on audio volume and respond at the end of user speech. + Semantic VAD is more advanced and uses a turn detection model (in conjuction + with VAD) to semantically estimate whether the user has finished speaking, then + dynamically sets a timeout based on this probability. For example, if user audio + trails off with "uhhm", the model will score a low probability of turn end and + wait longer for the user to continue speaking. This can be useful for more + natural conversations, but may have a higher latency. """ voice: Optional[Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]] = None diff --git a/src/openai/types/beta/realtime/session_create_params.py b/src/openai/types/beta/realtime/session_create_params.py index bbc86d7c7d..fe4a1c8636 100644 --- a/src/openai/types/beta/realtime/session_create_params.py +++ b/src/openai/types/beta/realtime/session_create_params.py @@ -5,7 +5,7 @@ from typing import List, Union, Iterable from typing_extensions import Literal, TypedDict -__all__ = ["SessionCreateParams", "InputAudioTranscription", "Tool", "TurnDetection"] +__all__ = ["SessionCreateParams", "InputAudioNoiseReduction", "InputAudioTranscription", "Tool", "TurnDetection"] class SessionCreateParams(TypedDict, total=False): @@ -17,16 +17,25 @@ class SessionCreateParams(TypedDict, total=False): byte order. """ + input_audio_noise_reduction: InputAudioNoiseReduction + """Configuration for input audio noise reduction. + + This can be set to `null` to turn off. Noise reduction filters audio added to + the input audio buffer before it is sent to VAD and the model. Filtering the + audio can improve VAD and turn detection accuracy (reducing false positives) and + model performance by improving perception of the input audio. + """ + input_audio_transcription: InputAudioTranscription """ Configuration for input audio transcription, defaults to off and can be set to `null` to turn off once on. Input audio transcription is not native to the model, since the model consumes audio directly. 
Transcription runs
     asynchronously through
-    [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
-    and should be treated as rough guidance rather than the representation
-    understood by the model. The client can optionally set the language and prompt
-    for transcription, these fields will be passed to the Whisper API.
+    [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+    and should be treated as guidance of input audio content rather than precisely
+    what the model heard. The client can optionally set the language and prompt for
+    transcription, these offer additional guidance to the transcription service.
     """

     instructions: str
@@ -75,7 +84,11 @@ class SessionCreateParams(TypedDict, total=False):
     """

     temperature: float
-    """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8."""
+    """Sampling temperature for the model, limited to [0.6, 1.2].
+
+    For audio models a temperature of 0.8 is highly recommended for best
+    performance.
+    """

     tool_choice: str
     """How the model chooses tools.
@@ -87,11 +100,17 @@ class SessionCreateParams(TypedDict, total=False):
     """Tools (functions) available to the model."""

     turn_detection: TurnDetection
-    """Configuration for turn detection.
+    """Configuration for turn detection, either Server VAD or Semantic VAD.

-    Can be set to `null` to turn off. Server VAD means that the model will detect
-    the start and end of speech based on audio volume and respond at the end of user
-    speech.
+    This can be set to `null` to turn off, in which case the client must manually
+    trigger model response. Server VAD means that the model will detect the start
+    and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+    with VAD) to semantically estimate whether the user has finished speaking, then
+    dynamically sets a timeout based on this probability. For example, if user audio
+    trails off with "uhhm", the model will score a low probability of turn end and
+    wait longer for the user to continue speaking. This can be useful for more
+    natural conversations, but may have a higher latency.
     """

     voice: Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]
@@ -103,6 +122,15 @@ class SessionCreateParams(TypedDict, total=False):
     """


+class InputAudioNoiseReduction(TypedDict, total=False):
+    type: Literal["near_field", "far_field"]
+    """Type of noise reduction.
+
+    `near_field` is for close-talking microphones such as headphones, `far_field` is
+    for far-field microphones such as laptop or conference room microphones.
+    """
+
+
 class InputAudioTranscription(TypedDict, total=False):
     language: str
     """The language of the input audio.
@@ -114,16 +142,17 @@ class InputAudioTranscription(TypedDict, total=False):

     model: str
     """
-    The model to use for transcription, `whisper-1` is the only currently supported
-    model.
+    The model to use for transcription, current options are `gpt-4o-transcribe`,
+    `gpt-4o-mini-transcribe`, and `whisper-1`.
     """

     prompt: str
-    """An optional text to guide the model's style or continue a previous audio
-    segment.
-
-    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-    should match the audio language.
+    """
+    An optional text to guide the model's style or continue a previous audio
+    segment.
For `whisper-1`, the + [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). + For `gpt-4o-transcribe` models, the prompt is a free text string, for example + "expect words related to technology". """ @@ -146,38 +175,48 @@ class Tool(TypedDict, total=False): class TurnDetection(TypedDict, total=False): create_response: bool - """Whether or not to automatically generate a response when a VAD stop event + """ + Whether or not to automatically generate a response when a VAD stop event occurs. + """ + + eagerness: Literal["low", "medium", "high", "auto"] + """Used only for `semantic_vad` mode. - `true` by default. + The eagerness of the model to respond. `low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. """ interrupt_response: bool """ Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. `conversation` of `auto`) when a VAD start event - occurs. `true` by default. + occurs. """ prefix_padding_ms: int - """Amount of audio to include before the VAD detected speech (in milliseconds). + """Used only for `server_vad` mode. + Amount of audio to include before the VAD detected speech (in milliseconds). Defaults to 300ms. """ silence_duration_ms: int - """Duration of silence to detect speech stop (in milliseconds). + """Used only for `server_vad` mode. - Defaults to 500ms. With shorter values the model will respond more quickly, but - may jump in on short pauses from the user. + Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + With shorter values the model will respond more quickly, but may jump in on + short pauses from the user. """ threshold: float - """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. + """Used only for `server_vad` mode. - A higher threshold will require louder audio to activate the model, and thus - might perform better in noisy environments. + Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + threshold will require louder audio to activate the model, and thus might + perform better in noisy environments. """ - type: str - """Type of turn detection, only `server_vad` is currently supported.""" + type: Literal["server_vad", "semantic_vad"] + """Type of turn detection.""" diff --git a/src/openai/types/beta/realtime/session_update_event.py b/src/openai/types/beta/realtime/session_update_event.py index 999cd8d660..00180f593d 100644 --- a/src/openai/types/beta/realtime/session_update_event.py +++ b/src/openai/types/beta/realtime/session_update_event.py @@ -5,7 +5,23 @@ from ...._models import BaseModel -__all__ = ["SessionUpdateEvent", "Session", "SessionInputAudioTranscription", "SessionTool", "SessionTurnDetection"] +__all__ = [ + "SessionUpdateEvent", + "Session", + "SessionInputAudioNoiseReduction", + "SessionInputAudioTranscription", + "SessionTool", + "SessionTurnDetection", +] + + +class SessionInputAudioNoiseReduction(BaseModel): + type: Optional[Literal["near_field", "far_field"]] = None + """Type of noise reduction. + + `near_field` is for close-talking microphones such as headphones, `far_field` is + for far-field microphones such as laptop or conference room microphones. 
+ """ class SessionInputAudioTranscription(BaseModel): @@ -19,16 +35,17 @@ class SessionInputAudioTranscription(BaseModel): model: Optional[str] = None """ - The model to use for transcription, `whisper-1` is the only currently supported - model. + The model to use for transcription, current options are `gpt-4o-transcribe`, + `gpt-4o-mini-transcribe`, and `whisper-1`. """ prompt: Optional[str] = None - """An optional text to guide the model's style or continue a previous audio - segment. - - The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. + """ + An optional text to guide the model's style or continue a previous audio + segment. For `whisper-1`, the + [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). + For `gpt-4o-transcribe` models, the prompt is a free text string, for example + "expect words related to technology". """ @@ -51,41 +68,51 @@ class SessionTool(BaseModel): class SessionTurnDetection(BaseModel): create_response: Optional[bool] = None - """Whether or not to automatically generate a response when a VAD stop event + """ + Whether or not to automatically generate a response when a VAD stop event occurs. + """ - `true` by default. + eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None + """Used only for `semantic_vad` mode. + + The eagerness of the model to respond. `low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. """ interrupt_response: Optional[bool] = None """ Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. `conversation` of `auto`) when a VAD start event - occurs. `true` by default. + occurs. """ prefix_padding_ms: Optional[int] = None - """Amount of audio to include before the VAD detected speech (in milliseconds). + """Used only for `server_vad` mode. + Amount of audio to include before the VAD detected speech (in milliseconds). Defaults to 300ms. """ silence_duration_ms: Optional[int] = None - """Duration of silence to detect speech stop (in milliseconds). + """Used only for `server_vad` mode. - Defaults to 500ms. With shorter values the model will respond more quickly, but - may jump in on short pauses from the user. + Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + With shorter values the model will respond more quickly, but may jump in on + short pauses from the user. """ threshold: Optional[float] = None - """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. + """Used only for `server_vad` mode. - A higher threshold will require louder audio to activate the model, and thus - might perform better in noisy environments. + Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + threshold will require louder audio to activate the model, and thus might + perform better in noisy environments. """ - type: Optional[str] = None - """Type of turn detection, only `server_vad` is currently supported.""" + type: Optional[Literal["server_vad", "semantic_vad"]] = None + """Type of turn detection.""" class Session(BaseModel): @@ -97,16 +124,25 @@ class Session(BaseModel): byte order. """ + input_audio_noise_reduction: Optional[SessionInputAudioNoiseReduction] = None + """Configuration for input audio noise reduction. + + This can be set to `null` to turn off. 
Noise reduction filters audio added to
+    the input audio buffer before it is sent to VAD and the model. Filtering the
+    audio can improve VAD and turn detection accuracy (reducing false positives) and
+    model performance by improving perception of the input audio.
+    """
+
     input_audio_transcription: Optional[SessionInputAudioTranscription] = None
     """
     Configuration for input audio transcription, defaults to off and can be set to
     `null` to turn off once on. Input audio transcription is not native to the
     model, since the model consumes audio directly. Transcription runs
     asynchronously through
-    [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
-    and should be treated as rough guidance rather than the representation
-    understood by the model. The client can optionally set the language and prompt
-    for transcription, these fields will be passed to the Whisper API.
+    [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+    and should be treated as guidance of input audio content rather than precisely
+    what the model heard. The client can optionally set the language and prompt for
+    transcription, these offer additional guidance to the transcription service.
     """

     instructions: Optional[str] = None
@@ -157,7 +193,11 @@ class Session(BaseModel):
     """

     temperature: Optional[float] = None
-    """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8."""
+    """Sampling temperature for the model, limited to [0.6, 1.2].
+
+    For audio models a temperature of 0.8 is highly recommended for best
+    performance.
+    """

     tool_choice: Optional[str] = None
     """How the model chooses tools.
@@ -169,11 +209,17 @@ class Session(BaseModel):
     """Tools (functions) available to the model."""

     turn_detection: Optional[SessionTurnDetection] = None
-    """Configuration for turn detection.
-
-    Can be set to `null` to turn off. Server VAD means that the model will detect
-    the start and end of speech based on audio volume and respond at the end of user
-    speech.
+    """Configuration for turn detection, either Server VAD or Semantic VAD.
+
+    This can be set to `null` to turn off, in which case the client must manually
+    trigger model response. Server VAD means that the model will detect the start
+    and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+    with VAD) to semantically estimate whether the user has finished speaking, then
+    dynamically sets a timeout based on this probability. For example, if user audio
+    trails off with "uhhm", the model will score a low probability of turn end and
+    wait longer for the user to continue speaking. This can be useful for more
+    natural conversations, but may have a higher latency.
""" voice: Optional[Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]] = None diff --git a/src/openai/types/beta/realtime/session_update_event_param.py b/src/openai/types/beta/realtime/session_update_event_param.py index 07fdba9d85..b8bce8fbd0 100644 --- a/src/openai/types/beta/realtime/session_update_event_param.py +++ b/src/openai/types/beta/realtime/session_update_event_param.py @@ -8,12 +8,22 @@ __all__ = [ "SessionUpdateEventParam", "Session", + "SessionInputAudioNoiseReduction", "SessionInputAudioTranscription", "SessionTool", "SessionTurnDetection", ] +class SessionInputAudioNoiseReduction(TypedDict, total=False): + type: Literal["near_field", "far_field"] + """Type of noise reduction. + + `near_field` is for close-talking microphones such as headphones, `far_field` is + for far-field microphones such as laptop or conference room microphones. + """ + + class SessionInputAudioTranscription(TypedDict, total=False): language: str """The language of the input audio. @@ -25,16 +35,17 @@ class SessionInputAudioTranscription(TypedDict, total=False): model: str """ - The model to use for transcription, `whisper-1` is the only currently supported - model. + The model to use for transcription, current options are `gpt-4o-transcribe`, + `gpt-4o-mini-transcribe`, and `whisper-1`. """ prompt: str - """An optional text to guide the model's style or continue a previous audio - segment. - - The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. + """ + An optional text to guide the model's style or continue a previous audio + segment. For `whisper-1`, the + [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). + For `gpt-4o-transcribe` models, the prompt is a free text string, for example + "expect words related to technology". """ @@ -57,41 +68,51 @@ class SessionTool(TypedDict, total=False): class SessionTurnDetection(TypedDict, total=False): create_response: bool - """Whether or not to automatically generate a response when a VAD stop event + """ + Whether or not to automatically generate a response when a VAD stop event occurs. + """ - `true` by default. + eagerness: Literal["low", "medium", "high", "auto"] + """Used only for `semantic_vad` mode. + + The eagerness of the model to respond. `low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. """ interrupt_response: bool """ Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. `conversation` of `auto`) when a VAD start event - occurs. `true` by default. + occurs. """ prefix_padding_ms: int - """Amount of audio to include before the VAD detected speech (in milliseconds). + """Used only for `server_vad` mode. + Amount of audio to include before the VAD detected speech (in milliseconds). Defaults to 300ms. """ silence_duration_ms: int - """Duration of silence to detect speech stop (in milliseconds). + """Used only for `server_vad` mode. - Defaults to 500ms. With shorter values the model will respond more quickly, but - may jump in on short pauses from the user. + Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + With shorter values the model will respond more quickly, but may jump in on + short pauses from the user. """ threshold: float - """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. 
+ """Used only for `server_vad` mode. - A higher threshold will require louder audio to activate the model, and thus - might perform better in noisy environments. + Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + threshold will require louder audio to activate the model, and thus might + perform better in noisy environments. """ - type: str - """Type of turn detection, only `server_vad` is currently supported.""" + type: Literal["server_vad", "semantic_vad"] + """Type of turn detection.""" class Session(TypedDict, total=False): @@ -103,16 +124,25 @@ class Session(TypedDict, total=False): byte order. """ + input_audio_noise_reduction: SessionInputAudioNoiseReduction + """Configuration for input audio noise reduction. + + This can be set to `null` to turn off. Noise reduction filters audio added to + the input audio buffer before it is sent to VAD and the model. Filtering the + audio can improve VAD and turn detection accuracy (reducing false positives) and + model performance by improving perception of the input audio. + """ + input_audio_transcription: SessionInputAudioTranscription """ Configuration for input audio transcription, defaults to off and can be set to `null` to turn off once on. Input audio transcription is not native to the model, since the model consumes audio directly. Transcription runs asynchronously through - [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription) - and should be treated as rough guidance rather than the representation - understood by the model. The client can optionally set the language and prompt - for transcription, these fields will be passed to the Whisper API. + [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) + and should be treated as guidance of input audio content rather than precisely + what the model heard. The client can optionally set the language and prompt for + transcription, these offer additional guidance to the transcription service. """ instructions: str @@ -161,7 +191,11 @@ class Session(TypedDict, total=False): """ temperature: float - """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.""" + """Sampling temperature for the model, limited to [0.6, 1.2]. + + For audio models a temperature of 0.8 is highly recommended for best + performance. + """ tool_choice: str """How the model chooses tools. @@ -173,11 +207,17 @@ class Session(TypedDict, total=False): """Tools (functions) available to the model.""" turn_detection: SessionTurnDetection - """Configuration for turn detection. - - Can be set to `null` to turn off. Server VAD means that the model will detect - the start and end of speech based on audio volume and respond at the end of user - speech. + """Configuration for turn detection, ether Server VAD or Semantic VAD. + + This can be set to `null` to turn off, in which case the client must manually + trigger model response. Server VAD means that the model will detect the start + and end of speech based on audio volume and respond at the end of user speech. + Semantic VAD is more advanced and uses a turn detection model (in conjuction + with VAD) to semantically estimate whether the user has finished speaking, then + dynamically sets a timeout based on this probability. For example, if user audio + trails off with "uhhm", the model will score a low probability of turn end and + wait longer for the user to continue speaking. 
This can be useful for more + natural conversations, but may have a higher latency. """ voice: Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"] diff --git a/src/openai/types/beta/realtime/transcription_session.py b/src/openai/types/beta/realtime/transcription_session.py new file mode 100644 index 0000000000..7c7abf37b6 --- /dev/null +++ b/src/openai/types/beta/realtime/transcription_session.py @@ -0,0 +1,100 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List, Optional +from typing_extensions import Literal + +from ...._models import BaseModel + +__all__ = ["TranscriptionSession", "ClientSecret", "InputAudioTranscription", "TurnDetection"] + + +class ClientSecret(BaseModel): + expires_at: int + """Timestamp for when the token expires. + + Currently, all tokens expire after one minute. + """ + + value: str + """ + Ephemeral key usable in client environments to authenticate connections to the + Realtime API. Use this in client-side environments rather than a standard API + token, which should only be used server-side. + """ + + +class InputAudioTranscription(BaseModel): + language: Optional[str] = None + """The language of the input audio. + + Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + format will improve accuracy and latency. + """ + + model: Optional[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]] = None + """The model to use for transcription. + + Can be `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, or `whisper-1`. + """ + + prompt: Optional[str] = None + """An optional text to guide the model's style or continue a previous audio + segment. + + The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) + should match the audio language. + """ + + +class TurnDetection(BaseModel): + prefix_padding_ms: Optional[int] = None + """Amount of audio to include before the VAD detected speech (in milliseconds). + + Defaults to 300ms. + """ + + silence_duration_ms: Optional[int] = None + """Duration of silence to detect speech stop (in milliseconds). + + Defaults to 500ms. With shorter values the model will respond more quickly, but + may jump in on short pauses from the user. + """ + + threshold: Optional[float] = None + """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. + + A higher threshold will require louder audio to activate the model, and thus + might perform better in noisy environments. + """ + + type: Optional[str] = None + """Type of turn detection, only `server_vad` is currently supported.""" + + +class TranscriptionSession(BaseModel): + client_secret: ClientSecret + """Ephemeral key returned by the API. + + Only present when the session is created on the server via REST API. + """ + + input_audio_format: Optional[str] = None + """The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.""" + + input_audio_transcription: Optional[InputAudioTranscription] = None + """Configuration of the transcription model.""" + + modalities: Optional[List[Literal["text", "audio"]]] = None + """The set of modalities the model can respond with. + + To disable audio, set this to ["text"]. + """ + + turn_detection: Optional[TurnDetection] = None + """Configuration for turn detection. + + Can be set to `null` to turn off. Server VAD means that the model will detect + the start and end of speech based on audio volume and respond at the end of user + speech. 
+ """ diff --git a/src/openai/types/beta/realtime/transcription_session_create_params.py b/src/openai/types/beta/realtime/transcription_session_create_params.py new file mode 100644 index 0000000000..4066dc4c5d --- /dev/null +++ b/src/openai/types/beta/realtime/transcription_session_create_params.py @@ -0,0 +1,143 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import List +from typing_extensions import Literal, TypedDict + +__all__ = ["TranscriptionSessionCreateParams", "InputAudioNoiseReduction", "InputAudioTranscription", "TurnDetection"] + + +class TranscriptionSessionCreateParams(TypedDict, total=False): + include: List[str] + """The set of items to include in the transcription. Current available items are: + + - `item.input_audio_transcription.logprobs` + """ + + input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] + """The format of input audio. + + Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must + be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian + byte order. + """ + + input_audio_noise_reduction: InputAudioNoiseReduction + """Configuration for input audio noise reduction. + + This can be set to `null` to turn off. Noise reduction filters audio added to + the input audio buffer before it is sent to VAD and the model. Filtering the + audio can improve VAD and turn detection accuracy (reducing false positives) and + model performance by improving perception of the input audio. + """ + + input_audio_transcription: InputAudioTranscription + """Configuration for input audio transcription. + + The client can optionally set the language and prompt for transcription, these + offer additional guidance to the transcription service. + """ + + modalities: List[Literal["text", "audio"]] + """The set of modalities the model can respond with. + + To disable audio, set this to ["text"]. + """ + + turn_detection: TurnDetection + """Configuration for turn detection, ether Server VAD or Semantic VAD. + + This can be set to `null` to turn off, in which case the client must manually + trigger model response. Server VAD means that the model will detect the start + and end of speech based on audio volume and respond at the end of user speech. + Semantic VAD is more advanced and uses a turn detection model (in conjuction + with VAD) to semantically estimate whether the user has finished speaking, then + dynamically sets a timeout based on this probability. For example, if user audio + trails off with "uhhm", the model will score a low probability of turn end and + wait longer for the user to continue speaking. This can be useful for more + natural conversations, but may have a higher latency. + """ + + +class InputAudioNoiseReduction(TypedDict, total=False): + type: Literal["near_field", "far_field"] + """Type of noise reduction. + + `near_field` is for close-talking microphones such as headphones, `far_field` is + for far-field microphones such as laptop or conference room microphones. + """ + + +class InputAudioTranscription(TypedDict, total=False): + language: str + """The language of the input audio. + + Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + format will improve accuracy and latency. 
+ """ + + model: Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"] + """ + The model to use for transcription, current options are `gpt-4o-transcribe`, + `gpt-4o-mini-transcribe`, and `whisper-1`. + """ + + prompt: str + """ + An optional text to guide the model's style or continue a previous audio + segment. For `whisper-1`, the + [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). + For `gpt-4o-transcribe` models, the prompt is a free text string, for example + "expect words related to technology". + """ + + +class TurnDetection(TypedDict, total=False): + create_response: bool + """ + Whether or not to automatically generate a response when a VAD stop event + occurs. + """ + + eagerness: Literal["low", "medium", "high", "auto"] + """Used only for `semantic_vad` mode. + + The eagerness of the model to respond. `low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. + """ + + interrupt_response: bool + """ + Whether or not to automatically interrupt any ongoing response with output to + the default conversation (i.e. `conversation` of `auto`) when a VAD start event + occurs. + """ + + prefix_padding_ms: int + """Used only for `server_vad` mode. + + Amount of audio to include before the VAD detected speech (in milliseconds). + Defaults to 300ms. + """ + + silence_duration_ms: int + """Used only for `server_vad` mode. + + Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + With shorter values the model will respond more quickly, but may jump in on + short pauses from the user. + """ + + threshold: float + """Used only for `server_vad` mode. + + Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + threshold will require louder audio to activate the model, and thus might + perform better in noisy environments. + """ + + type: Literal["server_vad", "semantic_vad"] + """Type of turn detection.""" diff --git a/src/openai/types/beta/realtime/transcription_session_update.py b/src/openai/types/beta/realtime/transcription_session_update.py new file mode 100644 index 0000000000..043ac02e07 --- /dev/null +++ b/src/openai/types/beta/realtime/transcription_session_update.py @@ -0,0 +1,160 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List, Optional +from typing_extensions import Literal + +from ...._models import BaseModel + +__all__ = [ + "TranscriptionSessionUpdate", + "Session", + "SessionInputAudioNoiseReduction", + "SessionInputAudioTranscription", + "SessionTurnDetection", +] + + +class SessionInputAudioNoiseReduction(BaseModel): + type: Optional[Literal["near_field", "far_field"]] = None + """Type of noise reduction. + + `near_field` is for close-talking microphones such as headphones, `far_field` is + for far-field microphones such as laptop or conference room microphones. + """ + + +class SessionInputAudioTranscription(BaseModel): + language: Optional[str] = None + """The language of the input audio. + + Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + format will improve accuracy and latency. + """ + + model: Optional[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]] = None + """ + The model to use for transcription, current options are `gpt-4o-transcribe`, + `gpt-4o-mini-transcribe`, and `whisper-1`. 
+ """ + + prompt: Optional[str] = None + """ + An optional text to guide the model's style or continue a previous audio + segment. For `whisper-1`, the + [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). + For `gpt-4o-transcribe` models, the prompt is a free text string, for example + "expect words related to technology". + """ + + +class SessionTurnDetection(BaseModel): + create_response: Optional[bool] = None + """ + Whether or not to automatically generate a response when a VAD stop event + occurs. + """ + + eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None + """Used only for `semantic_vad` mode. + + The eagerness of the model to respond. `low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. + """ + + interrupt_response: Optional[bool] = None + """ + Whether or not to automatically interrupt any ongoing response with output to + the default conversation (i.e. `conversation` of `auto`) when a VAD start event + occurs. + """ + + prefix_padding_ms: Optional[int] = None + """Used only for `server_vad` mode. + + Amount of audio to include before the VAD detected speech (in milliseconds). + Defaults to 300ms. + """ + + silence_duration_ms: Optional[int] = None + """Used only for `server_vad` mode. + + Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + With shorter values the model will respond more quickly, but may jump in on + short pauses from the user. + """ + + threshold: Optional[float] = None + """Used only for `server_vad` mode. + + Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + threshold will require louder audio to activate the model, and thus might + perform better in noisy environments. + """ + + type: Optional[Literal["server_vad", "semantic_vad"]] = None + """Type of turn detection.""" + + +class Session(BaseModel): + include: Optional[List[str]] = None + """The set of items to include in the transcription. Current available items are: + + - `item.input_audio_transcription.logprobs` + """ + + input_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None + """The format of input audio. + + Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must + be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian + byte order. + """ + + input_audio_noise_reduction: Optional[SessionInputAudioNoiseReduction] = None + """Configuration for input audio noise reduction. + + This can be set to `null` to turn off. Noise reduction filters audio added to + the input audio buffer before it is sent to VAD and the model. Filtering the + audio can improve VAD and turn detection accuracy (reducing false positives) and + model performance by improving perception of the input audio. + """ + + input_audio_transcription: Optional[SessionInputAudioTranscription] = None + """Configuration for input audio transcription. + + The client can optionally set the language and prompt for transcription, these + offer additional guidance to the transcription service. + """ + + modalities: Optional[List[Literal["text", "audio"]]] = None + """The set of modalities the model can respond with. + + To disable audio, set this to ["text"]. + """ + + turn_detection: Optional[SessionTurnDetection] = None + """Configuration for turn detection, ether Server VAD or Semantic VAD. 
+
+    This can be set to `null` to turn off, in which case the client must manually
+    trigger model response. Server VAD means that the model will detect the start
+    and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+    with VAD) to semantically estimate whether the user has finished speaking, then
+    dynamically sets a timeout based on this probability. For example, if user audio
+    trails off with "uhhm", the model will score a low probability of turn end and
+    wait longer for the user to continue speaking. This can be useful for more
+    natural conversations, but may have a higher latency.
+    """
+
+
+class TranscriptionSessionUpdate(BaseModel):
+    session: Session
+    """Realtime transcription session object configuration."""
+
+    type: Literal["transcription_session.update"]
+    """The event type, must be `transcription_session.update`."""
+
+    event_id: Optional[str] = None
+    """Optional client-generated ID used to identify this event."""
diff --git a/src/openai/types/beta/realtime/transcription_session_update_param.py b/src/openai/types/beta/realtime/transcription_session_update_param.py
new file mode 100644
index 0000000000..997a36d77b
--- /dev/null
+++ b/src/openai/types/beta/realtime/transcription_session_update_param.py
@@ -0,0 +1,160 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import List
+from typing_extensions import Literal, Required, TypedDict
+
+__all__ = [
+    "TranscriptionSessionUpdateParam",
+    "Session",
+    "SessionInputAudioNoiseReduction",
+    "SessionInputAudioTranscription",
+    "SessionTurnDetection",
+]
+
+
+class SessionInputAudioNoiseReduction(TypedDict, total=False):
+    type: Literal["near_field", "far_field"]
+    """Type of noise reduction.
+
+    `near_field` is for close-talking microphones such as headphones, `far_field` is
+    for far-field microphones such as laptop or conference room microphones.
+    """
+
+
+class SessionInputAudioTranscription(TypedDict, total=False):
+    language: str
+    """The language of the input audio.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+    format will improve accuracy and latency.
+    """
+
+    model: Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]
+    """
+    The model to use for transcription, current options are `gpt-4o-transcribe`,
+    `gpt-4o-mini-transcribe`, and `whisper-1`.
+    """
+
+    prompt: str
+    """
+    An optional text to guide the model's style or continue a previous audio
+    segment. For `whisper-1`, the
+    [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+    For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+    "expect words related to technology".
+    """
+
+
+class SessionTurnDetection(TypedDict, total=False):
+    create_response: bool
+    """
+    Whether or not to automatically generate a response when a VAD stop event
+    occurs.
+    """
+
+    eagerness: Literal["low", "medium", "high", "auto"]
+    """Used only for `semantic_vad` mode.
+
+    The eagerness of the model to respond. `low` will wait longer for the user to
+    continue speaking, `high` will respond more quickly. `auto` is the default and
+    is equivalent to `medium`.
+    """
+
+    interrupt_response: bool
+    """
+    Whether or not to automatically interrupt any ongoing response with output to
+    the default conversation (i.e.
`conversation` of `auto`) when a VAD start event
+    occurs.
+    """
+
+    prefix_padding_ms: int
+    """Used only for `server_vad` mode.
+
+    Amount of audio to include before the VAD detected speech (in milliseconds).
+    Defaults to 300ms.
+    """
+
+    silence_duration_ms: int
+    """Used only for `server_vad` mode.
+
+    Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+    With shorter values the model will respond more quickly, but may jump in on
+    short pauses from the user.
+    """
+
+    threshold: float
+    """Used only for `server_vad` mode.
+
+    Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
+    threshold will require louder audio to activate the model, and thus might
+    perform better in noisy environments.
+    """
+
+    type: Literal["server_vad", "semantic_vad"]
+    """Type of turn detection."""
+
+
+class Session(TypedDict, total=False):
+    include: List[str]
+    """The set of items to include in the transcription. Current available items are:
+
+    - `item.input_audio_transcription.logprobs`
+    """
+
+    input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"]
+    """The format of input audio.
+
+    Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
+    be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
+    byte order.
+    """
+
+    input_audio_noise_reduction: SessionInputAudioNoiseReduction
+    """Configuration for input audio noise reduction.
+
+    This can be set to `null` to turn off. Noise reduction filters audio added to
+    the input audio buffer before it is sent to VAD and the model. Filtering the
+    audio can improve VAD and turn detection accuracy (reducing false positives) and
+    model performance by improving perception of the input audio.
+    """
+
+    input_audio_transcription: SessionInputAudioTranscription
+    """Configuration for input audio transcription.
+
+    The client can optionally set the language and prompt for transcription, these
+    offer additional guidance to the transcription service.
+    """
+
+    modalities: List[Literal["text", "audio"]]
+    """The set of modalities the model can respond with.
+
+    To disable audio, set this to ["text"].
+    """
+
+    turn_detection: SessionTurnDetection
+    """Configuration for turn detection, either Server VAD or Semantic VAD.
+
+    This can be set to `null` to turn off, in which case the client must manually
+    trigger model response. Server VAD means that the model will detect the start
+    and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+    with VAD) to semantically estimate whether the user has finished speaking, then
+    dynamically sets a timeout based on this probability. For example, if user audio
+    trails off with "uhhm", the model will score a low probability of turn end and
+    wait longer for the user to continue speaking. This can be useful for more
+    natural conversations, but may have a higher latency.
+ """ + + +class TranscriptionSessionUpdateParam(TypedDict, total=False): + session: Required[Session] + """Realtime transcription session object configuration.""" + + type: Required[Literal["transcription_session.update"]] + """The event type, must be `transcription_session.update`.""" + + event_id: str + """Optional client-generated ID used to identify this event.""" diff --git a/src/openai/types/beta/realtime/transcription_session_updated_event.py b/src/openai/types/beta/realtime/transcription_session_updated_event.py new file mode 100644 index 0000000000..ffc100bcc2 --- /dev/null +++ b/src/openai/types/beta/realtime/transcription_session_updated_event.py @@ -0,0 +1,24 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing_extensions import Literal + +from ...._models import BaseModel +from .transcription_session import TranscriptionSession + +__all__ = ["TranscriptionSessionUpdatedEvent"] + + +class TranscriptionSessionUpdatedEvent(BaseModel): + event_id: str + """The unique ID of the server event.""" + + session: TranscriptionSession + """A new Realtime transcription session configuration. + + When a session is created on the server via REST API, the session object also + contains an ephemeral key. Default TTL for keys is one minute. This property is + not present when a session is updated via the WebSocket API. + """ + + type: Literal["transcription_session.updated"] + """The event type, must be `transcription_session.updated`.""" diff --git a/tests/api_resources/audio/test_speech.py b/tests/api_resources/audio/test_speech.py index 781ebeceb9..808f6ef66c 100644 --- a/tests/api_resources/audio/test_speech.py +++ b/tests/api_resources/audio/test_speech.py @@ -41,6 +41,7 @@ def test_method_create_with_all_params(self, client: OpenAI, respx_mock: MockRou input="string", model="string", voice="alloy", + instructions="instructions", response_format="mp3", speed=0.25, ) @@ -104,6 +105,7 @@ async def test_method_create_with_all_params(self, async_client: AsyncOpenAI, re input="string", model="string", voice="alloy", + instructions="instructions", response_format="mp3", speed=0.25, ) diff --git a/tests/api_resources/audio/test_transcriptions.py b/tests/api_resources/audio/test_transcriptions.py index bdb7e0dfb6..19215e11df 100644 --- a/tests/api_resources/audio/test_transcriptions.py +++ b/tests/api_resources/audio/test_transcriptions.py @@ -18,31 +18,33 @@ class TestTranscriptions: parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"]) @parametrize - def test_method_create(self, client: OpenAI) -> None: + def test_method_create_overload_1(self, client: OpenAI) -> None: transcription = client.audio.transcriptions.create( file=b"raw file contents", - model="whisper-1", + model="gpt-4o-transcribe", ) assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"]) @parametrize - def test_method_create_with_all_params(self, client: OpenAI) -> None: + def test_method_create_with_all_params_overload_1(self, client: OpenAI) -> None: transcription = client.audio.transcriptions.create( file=b"raw file contents", - model="whisper-1", - language="string", - prompt="string", + model="gpt-4o-transcribe", + include=["logprobs"], + language="language", + prompt="prompt", response_format="json", + stream=False, temperature=0, timestamp_granularities=["word"], ) assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"]) @parametrize - def test_raw_response_create(self, 
client: OpenAI) -> None: + def test_raw_response_create_overload_1(self, client: OpenAI) -> None: response = client.audio.transcriptions.with_raw_response.create( file=b"raw file contents", - model="whisper-1", + model="gpt-4o-transcribe", ) assert response.is_closed is True @@ -51,10 +53,10 @@ def test_raw_response_create(self, client: OpenAI) -> None: assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"]) @parametrize - def test_streaming_response_create(self, client: OpenAI) -> None: + def test_streaming_response_create_overload_1(self, client: OpenAI) -> None: with client.audio.transcriptions.with_streaming_response.create( file=b"raw file contents", - model="whisper-1", + model="gpt-4o-transcribe", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -64,36 +66,89 @@ def test_streaming_response_create(self, client: OpenAI) -> None: assert cast(Any, response.is_closed) is True + @parametrize + def test_method_create_overload_2(self, client: OpenAI) -> None: + transcription_stream = client.audio.transcriptions.create( + file=b"raw file contents", + model="gpt-4o-transcribe", + stream=True, + ) + transcription_stream.response.close() + + @parametrize + def test_method_create_with_all_params_overload_2(self, client: OpenAI) -> None: + transcription_stream = client.audio.transcriptions.create( + file=b"raw file contents", + model="gpt-4o-transcribe", + stream=True, + include=["logprobs"], + language="language", + prompt="prompt", + response_format="json", + temperature=0, + timestamp_granularities=["word"], + ) + transcription_stream.response.close() + + @parametrize + def test_raw_response_create_overload_2(self, client: OpenAI) -> None: + response = client.audio.transcriptions.with_raw_response.create( + file=b"raw file contents", + model="gpt-4o-transcribe", + stream=True, + ) + + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + stream = response.parse() + stream.close() + + @parametrize + def test_streaming_response_create_overload_2(self, client: OpenAI) -> None: + with client.audio.transcriptions.with_streaming_response.create( + file=b"raw file contents", + model="gpt-4o-transcribe", + stream=True, + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + stream = response.parse() + stream.close() + + assert cast(Any, response.is_closed) is True + class TestAsyncTranscriptions: parametrize = pytest.mark.parametrize("async_client", [False, True], indirect=True, ids=["loose", "strict"]) @parametrize - async def test_method_create(self, async_client: AsyncOpenAI) -> None: + async def test_method_create_overload_1(self, async_client: AsyncOpenAI) -> None: transcription = await async_client.audio.transcriptions.create( file=b"raw file contents", - model="whisper-1", + model="gpt-4o-transcribe", ) assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"]) @parametrize - async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> None: + async def test_method_create_with_all_params_overload_1(self, async_client: AsyncOpenAI) -> None: transcription = await async_client.audio.transcriptions.create( file=b"raw file contents", - model="whisper-1", - language="string", - prompt="string", + model="gpt-4o-transcribe", + include=["logprobs"], + language="language", + prompt="prompt", response_format="json", + stream=False, temperature=0, 
timestamp_granularities=["word"], ) assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"]) @parametrize - async def test_raw_response_create(self, async_client: AsyncOpenAI) -> None: + async def test_raw_response_create_overload_1(self, async_client: AsyncOpenAI) -> None: response = await async_client.audio.transcriptions.with_raw_response.create( file=b"raw file contents", - model="whisper-1", + model="gpt-4o-transcribe", ) assert response.is_closed is True @@ -102,10 +157,10 @@ async def test_raw_response_create(self, async_client: AsyncOpenAI) -> None: assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"]) @parametrize - async def test_streaming_response_create(self, async_client: AsyncOpenAI) -> None: + async def test_streaming_response_create_overload_1(self, async_client: AsyncOpenAI) -> None: async with async_client.audio.transcriptions.with_streaming_response.create( file=b"raw file contents", - model="whisper-1", + model="gpt-4o-transcribe", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -114,3 +169,54 @@ async def test_streaming_response_create(self, async_client: AsyncOpenAI) -> Non assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"]) assert cast(Any, response.is_closed) is True + + @parametrize + async def test_method_create_overload_2(self, async_client: AsyncOpenAI) -> None: + transcription_stream = await async_client.audio.transcriptions.create( + file=b"raw file contents", + model="gpt-4o-transcribe", + stream=True, + ) + await transcription_stream.response.aclose() + + @parametrize + async def test_method_create_with_all_params_overload_2(self, async_client: AsyncOpenAI) -> None: + transcription_stream = await async_client.audio.transcriptions.create( + file=b"raw file contents", + model="gpt-4o-transcribe", + stream=True, + include=["logprobs"], + language="language", + prompt="prompt", + response_format="json", + temperature=0, + timestamp_granularities=["word"], + ) + await transcription_stream.response.aclose() + + @parametrize + async def test_raw_response_create_overload_2(self, async_client: AsyncOpenAI) -> None: + response = await async_client.audio.transcriptions.with_raw_response.create( + file=b"raw file contents", + model="gpt-4o-transcribe", + stream=True, + ) + + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + stream = response.parse() + await stream.close() + + @parametrize + async def test_streaming_response_create_overload_2(self, async_client: AsyncOpenAI) -> None: + async with async_client.audio.transcriptions.with_streaming_response.create( + file=b"raw file contents", + model="gpt-4o-transcribe", + stream=True, + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + stream = await response.parse() + await stream.close() + + assert cast(Any, response.is_closed) is True diff --git a/tests/api_resources/beta/realtime/test_sessions.py b/tests/api_resources/beta/realtime/test_sessions.py index 5ea308ca0d..c0a426a417 100644 --- a/tests/api_resources/beta/realtime/test_sessions.py +++ b/tests/api_resources/beta/realtime/test_sessions.py @@ -26,6 +26,7 @@ def test_method_create(self, client: OpenAI) -> None: def test_method_create_with_all_params(self, client: OpenAI) -> None: session = client.beta.realtime.sessions.create( input_audio_format="pcm16", + input_audio_noise_reduction={"type": "near_field"}, 
input_audio_transcription={ "language": "language", "model": "model", @@ -48,11 +49,12 @@ def test_method_create_with_all_params(self, client: OpenAI) -> None: ], turn_detection={ "create_response": True, + "eagerness": "low", "interrupt_response": True, "prefix_padding_ms": 0, "silence_duration_ms": 0, "threshold": 0, - "type": "type", + "type": "server_vad", }, voice="alloy", ) @@ -91,6 +93,7 @@ async def test_method_create(self, async_client: AsyncOpenAI) -> None: async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> None: session = await async_client.beta.realtime.sessions.create( input_audio_format="pcm16", + input_audio_noise_reduction={"type": "near_field"}, input_audio_transcription={ "language": "language", "model": "model", @@ -113,11 +116,12 @@ async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> ], turn_detection={ "create_response": True, + "eagerness": "low", "interrupt_response": True, "prefix_padding_ms": 0, "silence_duration_ms": 0, "threshold": 0, - "type": "type", + "type": "server_vad", }, voice="alloy", ) diff --git a/tests/api_resources/beta/realtime/test_transcription_sessions.py b/tests/api_resources/beta/realtime/test_transcription_sessions.py new file mode 100644 index 0000000000..4826185bea --- /dev/null +++ b/tests/api_resources/beta/realtime/test_transcription_sessions.py @@ -0,0 +1,120 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +import os +from typing import Any, cast + +import pytest + +from openai import OpenAI, AsyncOpenAI +from tests.utils import assert_matches_type +from openai.types.beta.realtime import TranscriptionSession + +base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010") + + +class TestTranscriptionSessions: + parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"]) + + @parametrize + def test_method_create(self, client: OpenAI) -> None: + transcription_session = client.beta.realtime.transcription_sessions.create() + assert_matches_type(TranscriptionSession, transcription_session, path=["response"]) + + @parametrize + def test_method_create_with_all_params(self, client: OpenAI) -> None: + transcription_session = client.beta.realtime.transcription_sessions.create( + include=["string"], + input_audio_format="pcm16", + input_audio_noise_reduction={"type": "near_field"}, + input_audio_transcription={ + "language": "language", + "model": "gpt-4o-transcribe", + "prompt": "prompt", + }, + modalities=["text"], + turn_detection={ + "create_response": True, + "eagerness": "low", + "interrupt_response": True, + "prefix_padding_ms": 0, + "silence_duration_ms": 0, + "threshold": 0, + "type": "server_vad", + }, + ) + assert_matches_type(TranscriptionSession, transcription_session, path=["response"]) + + @parametrize + def test_raw_response_create(self, client: OpenAI) -> None: + response = client.beta.realtime.transcription_sessions.with_raw_response.create() + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + transcription_session = response.parse() + assert_matches_type(TranscriptionSession, transcription_session, path=["response"]) + + @parametrize + def test_streaming_response_create(self, client: OpenAI) -> None: + with client.beta.realtime.transcription_sessions.with_streaming_response.create() as response: + assert not response.is_closed + assert 
response.http_request.headers.get("X-Stainless-Lang") == "python" + + transcription_session = response.parse() + assert_matches_type(TranscriptionSession, transcription_session, path=["response"]) + + assert cast(Any, response.is_closed) is True + + +class TestAsyncTranscriptionSessions: + parametrize = pytest.mark.parametrize("async_client", [False, True], indirect=True, ids=["loose", "strict"]) + + @parametrize + async def test_method_create(self, async_client: AsyncOpenAI) -> None: + transcription_session = await async_client.beta.realtime.transcription_sessions.create() + assert_matches_type(TranscriptionSession, transcription_session, path=["response"]) + + @parametrize + async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> None: + transcription_session = await async_client.beta.realtime.transcription_sessions.create( + include=["string"], + input_audio_format="pcm16", + input_audio_noise_reduction={"type": "near_field"}, + input_audio_transcription={ + "language": "language", + "model": "gpt-4o-transcribe", + "prompt": "prompt", + }, + modalities=["text"], + turn_detection={ + "create_response": True, + "eagerness": "low", + "interrupt_response": True, + "prefix_padding_ms": 0, + "silence_duration_ms": 0, + "threshold": 0, + "type": "server_vad", + }, + ) + assert_matches_type(TranscriptionSession, transcription_session, path=["response"]) + + @parametrize + async def test_raw_response_create(self, async_client: AsyncOpenAI) -> None: + response = await async_client.beta.realtime.transcription_sessions.with_raw_response.create() + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + transcription_session = response.parse() + assert_matches_type(TranscriptionSession, transcription_session, path=["response"]) + + @parametrize + async def test_streaming_response_create(self, async_client: AsyncOpenAI) -> None: + async with async_client.beta.realtime.transcription_sessions.with_streaming_response.create() as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + transcription_session = await response.parse() + assert_matches_type(TranscriptionSession, transcription_session, path=["response"]) + + assert cast(Any, response.is_closed) is True diff --git a/tests/lib/test_audio.py b/tests/lib/test_audio.py index 0f53b316ba..ff8dba4714 100644 --- a/tests/lib/test_audio.py +++ b/tests/lib/test_audio.py @@ -26,7 +26,7 @@ def test_translation_create_overloads_in_sync(sync: bool, client: OpenAI, async_ assert_signatures_in_sync( fn, overload, - exclude_params={"response_format"}, + exclude_params={"response_format", "stream"}, description=f" for overload {i}", ) @@ -60,7 +60,7 @@ def test_transcription_create_overloads_in_sync(sync: bool, client: OpenAI, asyn assert_signatures_in_sync( fn, overload, - exclude_params={"response_format"}, + exclude_params={"response_format", "stream"}, description=f" for overload {i}", )
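
Taken together, this release adds streaming speech-to-text via the new `gpt-4o-transcribe` models and ephemeral realtime transcription sessions. A minimal sketch of how the new surface might be exercised, based only on the parameters covered by the tests above (the audio file path and the chosen option values are illustrative assumptions, not part of this diff):

#!/usr/bin/env rye run python

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# Stream transcription events as they arrive; `stream=True` and
# `include=["logprobs"]` mirror the options exercised by the new
# overload tests in tests/api_resources/audio/test_transcriptions.py.
with open("speech.mp3", "rb") as audio_file:  # hypothetical local file
    stream = client.audio.transcriptions.create(
        file=audio_file,
        model="gpt-4o-transcribe",
        stream=True,
        include=["logprobs"],
    )
    # Yields transcription stream events (text deltas, then a done event).
    for event in stream:
        print(event)

# Mint an ephemeral client secret for a realtime transcription session,
# configured with the new semantic VAD and noise reduction options.
session = client.beta.realtime.transcription_sessions.create(
    input_audio_format="pcm16",
    input_audio_noise_reduction={"type": "near_field"},
    input_audio_transcription={"model": "gpt-4o-transcribe", "language": "en"},
    turn_detection={"type": "semantic_vad", "eagerness": "medium"},
)
# Hand the short-lived key to a client-side environment; per the new
# TranscriptionSession type, tokens currently expire after one minute.
print(session.client_secret.value)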