feat: Add audio toolkit from owl project (#1744)

Co-authored-by: Wendong <[email protected]> Co-authored-by: Wendong-Fan <[email protected]>
camel-ai · Mar 9, 2025 · 67a0608 · 67a0608
1 parent fee5905
commit 67a0608
Show file tree

Hide file tree

Showing 11 changed files with 717 additions and 15 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
       - id: mypy
         name: Check mypy
         entry: mypy --namespace-packages -p camel -p test -p apps
-        language: system
+        language: python
         types: [python]
         pass_filenames: false
         require_serial: true

diff --git a/camel/models/__init__.py b/camel/models/__init__.py
@@ -14,6 +14,7 @@
 from .aiml_model import AIMLModel
 from .anthropic_model import AnthropicModel
 from .azure_openai_model import AzureOpenAIModel
+from .base_audio_model import BaseAudioModel
 from .base_model import BaseModelBackend
 from .cohere_model import CohereModel
 from .deepseek_model import DeepSeekModel
@@ -74,4 +75,5 @@
     'InternLMModel',
     'MoonshotModel',
     'AIMLModel',
+    'BaseAudioModel',
 ]
diff --git a/camel/models/base_audio_model.py b/camel/models/base_audio_model.py
@@ -0,0 +1,92 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+
+import os
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+
+
+class BaseAudioModel(ABC):
+    r"""Base class for audio models providing Text-to-Speech (TTS) and
+    Speech-to-Text (STT) functionality.
+    """
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        url: Optional[str] = None,
+    ) -> None:
+        r"""Initialize an instance of BaseAudioModel.
+
+        Args:
+            api_key (Optional[str]): API key for the audio service. Stored
+                as given; subclasses may fall back to an environment
+                variable of their own. (default: :obj:`None`)
+            url (Optional[str]): Base URL for the audio API. Stored as
+                given; subclasses may fall back to an environment variable
+                or a default URL of their own. (default: :obj:`None`)
+        """
+        self._api_key = api_key
+        self._url = url
+
+    @abstractmethod
+    def text_to_speech(
+        self,
+        input: str,
+        *,
+        storage_path: str,
+        **kwargs: Any,
+    ) -> Any:
+        r"""Convert text to speech.
+
+        Args:
+            input (str): The text to be converted to speech.
+            storage_path (str): The local path to store the
+                generated speech file.
+            **kwargs (Any): Extra kwargs passed to the TTS API.
+
+        Returns:
+            Any: The response from the TTS API, which may vary by
+                implementation.
+        """
+        pass
+
+    @abstractmethod
+    def speech_to_text(
+        self,
+        audio_file_path: str,
+        **kwargs: Any,
+    ) -> str:
+        r"""Convert speech audio to text.
+
+        Args:
+            audio_file_path (str): The audio file path to transcribe.
+            **kwargs (Any): Extra keyword arguments passed to the
+                Speech-to-Text (STT) API.
+
+        Returns:
+            str: The transcribed text.
+        """
+        pass
+
+    def _ensure_directory_exists(self, file_path: str) -> None:
+        r"""Create the parent directory of ``file_path`` if it is missing.
+
+        Args:
+            file_path (str): Path of the file that is about to be written.
+        """
+        directory = os.path.dirname(file_path)
+        # `exist_ok=True` avoids the check-then-create race of `exists()`.
+        if directory:
+            os.makedirs(directory, exist_ok=True)
diff --git a/camel/models/fish_audio_model.py b/camel/models/fish_audio_model.py
@@ -15,8 +15,10 @@
 import os
 from typing import Any, Optional
 
+from camel.models.base_audio_model import BaseAudioModel
 
-class FishAudioModel:
+
+class FishAudioModel(BaseAudioModel):
     r"""Provides access to FishAudio's Text-to-Speech (TTS) and Speech_to_Text
     (STT) models.
     """
@@ -37,6 +39,7 @@ def __init__(
         """
         from fish_audio_sdk import Session
 
+        super().__init__(api_key, url)
         self._api_key = api_key or os.environ.get("FISHAUDIO_API_KEY")
         self._url = url or os.environ.get(
             "FISHAUDIO_API_BASE_URL", "https://api.fish.audio"
@@ -46,7 +49,8 @@ def __init__(
     def text_to_speech(
         self,
         input: str,
-        storage_path: str,
+        *,
+        storage_path: Optional[str] = None,
         reference_id: Optional[str] = None,
         reference_audio: Optional[str] = None,
         reference_audio_text: Optional[str] = None,
@@ -55,9 +59,9 @@ def text_to_speech(
         r"""Convert text to speech and save the output to a file.
 
         Args:
-            input_text (str): The text to convert to speech.
-            storage_path (str): The file path where the resulting speech will
-                be saved.
+            input (str): The text to convert to speech.
+            storage_path (Optional[str]): The file path where the resulting
+                speech will be saved. (default: :obj:`None`)
             reference_id (Optional[str]): An optional reference ID to
                 associate with the request. (default: :obj:`None`)
             reference_audio (Optional[str]): Path to an audio file for
@@ -68,12 +72,18 @@ def text_to_speech(
 
         Raises:
             FileNotFoundError: If the reference audio file cannot be found.
+            ValueError: If storage_path is not provided or if reference_audio
+                is provided without reference_audio_text.
         """
         from fish_audio_sdk import ReferenceAudio, TTSRequest
 
-        directory = os.path.dirname(storage_path)
-        if directory and not os.path.exists(directory):
-            os.makedirs(directory)
+        if storage_path is None:
+            raise ValueError(
+                "storage_path must be provided for "
+                "FishAudioModel.text_to_speech"
+            )
+
+        self._ensure_directory_exists(storage_path)
 
         if not reference_audio:
             with open(f"{storage_path}", "wb") as f:

diff --git a/camel/models/openai_audio_models.py b/camel/models/openai_audio_models.py
@@ -11,15 +11,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+import base64
 import os
 from typing import Any, List, Optional, Union
 
 from openai import AsyncOpenAI, OpenAI, _legacy_response
 
+from camel.models.base_audio_model import BaseAudioModel
 from camel.types import AudioModelType, VoiceType
 
 
-class OpenAIAudioModels:
+class OpenAIAudioModels(BaseAudioModel):
     r"""Provides access to OpenAI's Text-to-Speech (TTS) and Speech_to_Text
     (STT) models."""
 
@@ -29,6 +31,7 @@ def __init__(
         url: Optional[str] = None,
     ) -> None:
         r"""Initialize an instance of OpenAI."""
+        super().__init__(api_key, url)
         self._url = url or os.environ.get("OPENAI_API_BASE_URL")
         self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
         self._client = OpenAI(
@@ -47,6 +50,7 @@ def __init__(
     def text_to_speech(
         self,
         input: str,
+        *,
         model_type: AudioModelType = AudioModelType.TTS_1,
         voice: VoiceType = VoiceType.ALLOY,
         storage_path: Optional[str] = None,
@@ -111,6 +115,8 @@ def text_to_speech(
                             new_storage_path = (
                                 f"{file_name}_{chunk_index}{file_extension}"
                             )
+                            # Ensure directory exists
+                            self._ensure_directory_exists(new_storage_path)
                             response.write_to_file(new_storage_path)
                             chunk_index += 1
                         except Exception as e:
@@ -131,6 +137,8 @@ def text_to_speech(
 
             if storage_path:
                 try:
+                    # Ensure directory exists
+                    self._ensure_directory_exists(storage_path)
                     response.write_to_file(storage_path)
                 except Exception as e:
                     raise Exception("Error during write the file") from e
@@ -263,3 +271,74 @@ def speech_to_text(
                     return transcription.text
         except Exception as e:
             raise Exception("Error during STT API call") from e
+
+    def audio_question_answering(
+        self,
+        audio_file_path: str,
+        question: str,
+        model: str = "gpt-4o-mini-audio-preview",
+        **kwargs: Any,
+    ) -> str:
+        r"""Answer a question directly using the audio content.
+
+        Args:
+            audio_file_path (str): The path to the audio file.
+            question (str): The question to ask about the audio content.
+            model (str, optional): The model to use for audio question
+                answering. (default: :obj:`"gpt-4o-mini-audio-preview"`)
+            **kwargs (Any): Extra keyword arguments passed to the chat
+                completions API.
+
+        Returns:
+            str: The model's response to the question.
+
+        Raises:
+            Exception: If reading the audio file or the API call fails.
+        """
+        try:
+            # Read and encode the audio file
+            with open(audio_file_path, "rb") as audio_file:
+                audio_data = audio_file.read()
+
+            encoded_string = base64.b64encode(audio_data).decode('utf-8')
+
+            # Extension without the dot, lower-cased (".WAV" -> "wav")
+            file_format = os.path.splitext(audio_file_path)[1][1:].lower()
+
+            # Parenthesized so the f-string joins the prompt (was a no-op)
+            text_prompt = (
+                "Answer the following question based on the "
+                f"given audio information:\n\n{question}"
+            )
+
+            # Call the OpenAI API
+            completion = self._client.chat.completions.create(
+                model=model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are a helpful assistant "
+                        "specializing in audio analysis.",
+                    },
+                    {  # type: ignore[misc, list-item]
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": text_prompt},
+                            {
+                                "type": "input_audio",
+                                "input_audio": {
+                                    "data": encoded_string,
+                                    "format": file_format,
+                                },
+                            },
+                        ],
+                    },
+                ],
+                **kwargs,
+            )
+
+            return str(completion.choices[0].message.content)
+        except Exception as e:
+            raise Exception(
+                "Error during audio question answering API call"
+            ) from e
diff --git a/camel/toolkits/__init__.py b/camel/toolkits/__init__.py
@@ -50,6 +50,7 @@
 from .zapier_toolkit import ZapierToolkit
 from .sympy_toolkit import SymPyToolkit
 from .mineru_toolkit import MinerUToolkit
+from .audio_analysis_toolkit import AudioAnalysisToolkit
 from .excel_toolkit import ExcelToolkit
 from .video_analysis_toolkit import VideoAnalysisToolkit
 from .image_analysis_toolkit import ImageAnalysisToolkit
@@ -91,6 +92,7 @@
     'ZapierToolkit',
     'SymPyToolkit',
     'MinerUToolkit',
+    'AudioAnalysisToolkit',
     'ExcelToolkit',
     'VideoAnalysisToolkit',
     'ImageAnalysisToolkit',