Skip to content

Commit

Permalink
feat: Add audio toolkit from owl project (#1744)
Browse files Browse the repository at this point in the history
Co-authored-by: Wendong <[email protected]>
Co-authored-by: Wendong-Fan <[email protected]>
  • Loading branch information
3 people authored Mar 9, 2025
1 parent fee5905 commit 67a0608
Show file tree
Hide file tree
Showing 11 changed files with 717 additions and 15 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ repos:
- id: mypy
name: Check mypy
entry: mypy --namespace-packages -p camel -p test -p apps
language: system
language: python
types: [python]
pass_filenames: false
require_serial: true
Expand Down
2 changes: 2 additions & 0 deletions camel/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .aiml_model import AIMLModel
from .anthropic_model import AnthropicModel
from .azure_openai_model import AzureOpenAIModel
from .base_audio_model import BaseAudioModel
from .base_model import BaseModelBackend
from .cohere_model import CohereModel
from .deepseek_model import DeepSeekModel
Expand Down Expand Up @@ -74,4 +75,5 @@
'InternLMModel',
'MoonshotModel',
'AIMLModel',
'BaseAudioModel',
]
92 changes: 92 additions & 0 deletions camel/models/base_audio_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

import os
from abc import ABC, abstractmethod
from typing import Any, Optional


class BaseAudioModel(ABC):
    r"""Base class for audio models providing Text-to-Speech (TTS) and
    Speech-to-Text (STT) functionality.

    Concrete subclasses must implement :meth:`text_to_speech` and
    :meth:`speech_to_text`.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        url: Optional[str] = None,
    ) -> None:
        r"""Initialize an instance of BaseAudioModel.

        The base class only stores the values it is given; any fallback to
        environment variables or default URLs is left to the concrete
        implementation.

        Args:
            api_key (Optional[str]): API key for the audio service.
                (default: :obj:`None`)
            url (Optional[str]): Base URL for the audio API.
                (default: :obj:`None`)
        """
        self._api_key = api_key
        self._url = url

    @abstractmethod
    def text_to_speech(
        self,
        input: str,
        *,
        storage_path: str,
        **kwargs: Any,
    ) -> Any:
        r"""Convert text to speech.

        Args:
            input (str): The text to be converted to speech.
            storage_path (str): The local path to store the
                generated speech file.
            **kwargs (Any): Extra kwargs passed to the TTS API.

        Returns:
            Any: The response from the TTS API, which may vary by
                implementation.
        """
        pass

    @abstractmethod
    def speech_to_text(
        self,
        audio_file_path: str,
        **kwargs: Any,
    ) -> str:
        r"""Convert speech audio to text.

        Args:
            audio_file_path (str): The audio file path to transcribe.
            **kwargs (Any): Extra keyword arguments passed to the
                Speech-to-Text (STT) API.

        Returns:
            str: The transcribed text.
        """
        pass

    def _ensure_directory_exists(self, file_path: str) -> None:
        r"""Ensure the directory for the given file path exists.

        Nothing is created when ``file_path`` has no directory component
        (e.g. a bare file name relative to the current directory).

        Args:
            file_path (str): The file path for which to ensure the directory
                exists.

        Raises:
            OSError: If the directory cannot be created.
        """
        directory = os.path.dirname(file_path)
        if directory:
            # `exist_ok=True` replaces the previous "check, then create"
            # sequence, which could fail with FileExistsError when another
            # writer created the directory between the two calls.
            os.makedirs(directory, exist_ok=True)
26 changes: 18 additions & 8 deletions camel/models/fish_audio_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
import os
from typing import Any, Optional

from camel.models.base_audio_model import BaseAudioModel

class FishAudioModel:

class FishAudioModel(BaseAudioModel):
r"""Provides access to FishAudio's Text-to-Speech (TTS) and Speech_to_Text
(STT) models.
"""
Expand All @@ -37,6 +39,7 @@ def __init__(
"""
from fish_audio_sdk import Session

super().__init__(api_key, url)
self._api_key = api_key or os.environ.get("FISHAUDIO_API_KEY")
self._url = url or os.environ.get(
"FISHAUDIO_API_BASE_URL", "https://api.fish.audio"
Expand All @@ -46,7 +49,8 @@ def __init__(
def text_to_speech(
self,
input: str,
storage_path: str,
*,
storage_path: Optional[str] = None,
reference_id: Optional[str] = None,
reference_audio: Optional[str] = None,
reference_audio_text: Optional[str] = None,
Expand All @@ -55,9 +59,9 @@ def text_to_speech(
r"""Convert text to speech and save the output to a file.
Args:
input_text (str): The text to convert to speech.
storage_path (str): The file path where the resulting speech will
be saved.
input (str): The text to convert to speech.
storage_path (Optional[str]): The file path where the resulting
speech will be saved. (default: :obj:`None`)
reference_id (Optional[str]): An optional reference ID to
associate with the request. (default: :obj:`None`)
reference_audio (Optional[str]): Path to an audio file for
Expand All @@ -68,12 +72,18 @@ def text_to_speech(
Raises:
FileNotFoundError: If the reference audio file cannot be found.
ValueError: If storage_path is not provided or if reference_audio
is provided without reference_audio_text.
"""
from fish_audio_sdk import ReferenceAudio, TTSRequest

directory = os.path.dirname(storage_path)
if directory and not os.path.exists(directory):
os.makedirs(directory)
if storage_path is None:
raise ValueError(
"storage_path must be provided for "
"FishAudioModel.text_to_speech"
)

self._ensure_directory_exists(storage_path)

if not reference_audio:
with open(f"{storage_path}", "wb") as f:
Expand Down
81 changes: 80 additions & 1 deletion camel/models/openai_audio_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import base64
import os
from typing import Any, List, Optional, Union

from openai import AsyncOpenAI, OpenAI, _legacy_response

from camel.models.base_audio_model import BaseAudioModel
from camel.types import AudioModelType, VoiceType


class OpenAIAudioModels:
class OpenAIAudioModels(BaseAudioModel):
r"""Provides access to OpenAI's Text-to-Speech (TTS) and Speech_to_Text
(STT) models."""

Expand All @@ -29,6 +31,7 @@ def __init__(
url: Optional[str] = None,
) -> None:
r"""Initialize an instance of OpenAI."""
super().__init__(api_key, url)
self._url = url or os.environ.get("OPENAI_API_BASE_URL")
self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
self._client = OpenAI(
Expand All @@ -47,6 +50,7 @@ def __init__(
def text_to_speech(
self,
input: str,
*,
model_type: AudioModelType = AudioModelType.TTS_1,
voice: VoiceType = VoiceType.ALLOY,
storage_path: Optional[str] = None,
Expand Down Expand Up @@ -111,6 +115,8 @@ def text_to_speech(
new_storage_path = (
f"{file_name}_{chunk_index}{file_extension}"
)
# Ensure directory exists
self._ensure_directory_exists(new_storage_path)
response.write_to_file(new_storage_path)
chunk_index += 1
except Exception as e:
Expand All @@ -131,6 +137,8 @@ def text_to_speech(

if storage_path:
try:
# Ensure directory exists
self._ensure_directory_exists(storage_path)
response.write_to_file(storage_path)
except Exception as e:
raise Exception("Error during write the file") from e
Expand Down Expand Up @@ -263,3 +271,74 @@ def speech_to_text(
return transcription.text
except Exception as e:
raise Exception("Error during STT API call") from e

def audio_question_answering(
    self,
    audio_file_path: str,
    question: str,
    model: str = "gpt-4o-mini-audio-preview",
    **kwargs: Any,
) -> str:
    r"""Answer a question directly using the audio content.

    The audio file is read, base64-encoded and sent together with the
    question to the chat completions API as an ``input_audio`` part.

    Args:
        audio_file_path (str): The path to the audio file. The audio
            format sent to the API is taken from the file extension
            (lower-cased, without the leading dot).
        question (str): The question to ask about the audio content.
        model (str, optional): The model to use for audio question
            answering. (default: :obj:`"gpt-4o-mini-audio-preview"`)
        **kwargs (Any): Extra keyword arguments passed to the chat
            completions API.

    Returns:
        str: The model's response to the question.

    Raises:
        Exception: If reading the audio file or the API call fails. The
            original error is attached as the cause.
    """
    try:
        # Read and encode the audio file
        with open(audio_file_path, "rb") as audio_file:
            audio_data = audio_file.read()

        encoded_string = base64.b64encode(audio_data).decode('utf-8')

        # Get file format
        file_suffix = os.path.splitext(audio_file_path)[1]
        file_format = file_suffix[1:].lower()

        # Prepare the prompt. The parentheses are required: without them
        # the f-string on the second line is a separate no-op statement
        # and the question is silently dropped from the prompt.
        text_prompt = (
            "Answer the following question based on the "
            f"given audio information:\n\n{question}"
        )

        # Call the OpenAI API
        completion = self._client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant "
                    "specializing in audio analysis.",
                },
                {  # type: ignore[misc, list-item]
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text_prompt},
                        {
                            "type": "input_audio",
                            "input_audio": {
                                "data": encoded_string,
                                "format": file_format,
                            },
                        },
                    ],
                },
            ],
            **kwargs,
        )

        response = str(completion.choices[0].message.content)
        return response
    except Exception as e:
        raise Exception(
            "Error during audio question answering API call"
        ) from e
2 changes: 2 additions & 0 deletions camel/toolkits/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
from .zapier_toolkit import ZapierToolkit
from .sympy_toolkit import SymPyToolkit
from .mineru_toolkit import MinerUToolkit
from .audio_analysis_toolkit import AudioAnalysisToolkit
from .excel_toolkit import ExcelToolkit
from .video_analysis_toolkit import VideoAnalysisToolkit
from .image_analysis_toolkit import ImageAnalysisToolkit
Expand Down Expand Up @@ -91,6 +92,7 @@
'ZapierToolkit',
'SymPyToolkit',
'MinerUToolkit',
'AudioAnalysisToolkit',
'ExcelToolkit',
'VideoAnalysisToolkit',
'ImageAnalysisToolkit',
Expand Down
Loading

0 comments on commit 67a0608

Please sign in to comment.