Enhance model documentation across audio and text tasks to clarify default model usage
900miles committed Nov 24, 2024
1 parent 7604dba commit b378be5
Showing 18 changed files with 43 additions and 27 deletions.
8 changes: 5 additions & 3 deletions src/senselab/audio/tasks/speaker_diarization/api.py
@@ -19,8 +19,8 @@ def diarize_audios(
Args:
audios (List[Audio]): The list of audio objects to be diarized.
-    model (SenselabModel): The model used for diarization
-        (default is "pyannote/speaker-diarization-3.1").
+    model (SenselabModel): The model used for diarization.
+        If None, the default model "pyannote/speaker-diarization-3.1" is used.
device (Optional[DeviceType]): The device to run the model on (default is None).
num_speakers (Optional[int]): The number of speakers (default is None).
min_speakers (Optional[int]): The minimum number of speakers (default is None).
@@ -42,4 +42,6 @@ def diarize_audios(
max_speakers=max_speakers,
)
else:
raise NotImplementedError("Only Pyannote models are supported for now.")
raise NotImplementedError(
"Only Pyannote models are supported for now. We aim to support more models in the future."
)
1 change: 1 addition & 0 deletions src/senselab/audio/tasks/speaker_diarization/pyannote.py
@@ -57,6 +57,7 @@ def diarize_audios_with_pyannote(
Args:
audios (List[Audio]): A list of audio files.
model (PyannoteAudioModel): The model to use for diarization.
If None, the default model "pyannote/speaker-diarization-3.1" is used.
device (Optional[DeviceType]): The device to use for diarization.
num_speakers (Optional[int]): Number of speakers, when known.
min_speakers (Optional[int]): Minimum number of speakers. Has no effect when `num_speakers` is provided.
8 changes: 5 additions & 3 deletions src/senselab/audio/tasks/speaker_embeddings/api.py
@@ -18,8 +18,8 @@ def extract_speaker_embeddings_from_audios(
Args:
audios (List[Audio]): A list of Audio objects containing the audio signals and their properties.
-    model (SpeechBrainModel): The model used to compute the embeddings
-        (default is "speechbrain/spkrec-ecapa-voxceleb").
+    model (SpeechBrainModel): The model used to compute the embeddings.
+        If None, the default model "speechbrain/spkrec-ecapa-voxceleb" is used.
device (Optional[DeviceType]): The device to run the model on (default is None).
Returns:
@@ -43,4 +43,6 @@ def extract_speaker_embeddings_from_audios(
audios=audios, model=model, device=device
)
else:
raise NotImplementedError("The specified model is not supported for now.")
raise NotImplementedError(
"Only SpeechBrain models are supported for now. We aim to support more models in the future."
)
4 changes: 2 additions & 2 deletions src/senselab/audio/tasks/speaker_embeddings/speechbrain.py
@@ -55,8 +55,8 @@ def extract_speechbrain_speaker_embeddings_from_audios(
Args:
audios (List[Audio]): A list of Audio objects containing the audio signals and their properties.
-    model (SpeechBrainModel): The model used to compute the embeddings
-        (default is "speechbrain/spkrec-ecapa-voxceleb").
+    model (SpeechBrainModel): The model used to compute the embeddings.
+        If None, the default model "speechbrain/spkrec-ecapa-voxceleb" is used.
device (Optional[DeviceType]): The device to run the model on (default is None).
Only CPU and CUDA are supported.
@@ -27,7 +27,8 @@ def verify_speaker(
Args:
audios (List[Tuple[Audio, Audio]]): A list of tuples, where each tuple contains
two audio samples to be compared.
-    model (SpeechBrainModel, optional): The model for speaker verification.
+    model (SpeechBrainModel, optional): The model for speaker verification.
+        If None, the default model "speechbrain/spkrec-ecapa-voxceleb" is used.
device (DeviceType, optional): The device to run the model on. Defaults to CPU.
threshold (float, optional): The threshold to determine same speaker.
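
A sketch of the pairwise input shape that `verify_speaker` documents. The module path is an assumption, since this diff view omits the file name, and the threshold value is purely illustrative.

```python
# Sketch: each element of `audios` is an (Audio, Audio) tuple to compare.
from senselab.audio.data_structures.audio import Audio  # assumed import path
from senselab.audio.tasks.speaker_verification.speaker_verification import verify_speaker  # assumed path

enrollment = Audio.from_filepath("enrolled.wav")  # hypothetical constructor
probe = Audio.from_filepath("unknown.wav")

# model=None falls back to "speechbrain/spkrec-ecapa-voxceleb" per the docstring.
decisions = verify_speaker(audios=[(enrollment, probe)], model=None, threshold=0.25)
```
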
8 changes: 5 additions & 3 deletions src/senselab/audio/tasks/speech_enhancement/api.py
@@ -16,8 +16,8 @@ def enhance_audios(
Args:
audios (List[Audio]): The list of audio objects to be enhanced.
-    model (SenselabModel): The model used for enhancement
-        (default is "speechbrain/sepformer-wham16k-enhancement").
+    model (SenselabModel): The model used for enhancement.
+        If None, the default model "speechbrain/sepformer-wham16k-enhancement" is used.
device (Optional[DeviceType]): The device to run the model on (default is None).
Returns:
@@ -29,4 +29,6 @@ def enhance_audios(
if isinstance(model, SpeechBrainModel):
return SpeechBrainEnhancer.enhance_audios_with_speechbrain(audios=audios, model=model, device=device)
else:
raise NotImplementedError("Only SpeechBrain models are supported for now.")
raise NotImplementedError(
"Only SpeechBrain models are supported for now. We aim to support more models in the future."
)
1 change: 1 addition & 0 deletions src/senselab/audio/tasks/speech_enhancement/speechbrain.py
@@ -55,6 +55,7 @@ def enhance_audios_with_speechbrain(
Args:
audios (List[Audio]): The list of audio objects to be enhanced.
model (SpeechBrainModel): The SpeechBrain model used for enhancement.
If None, the default model "speechbrain/sepformer-wham16k-enhancement" is used.
device (Optional[DeviceType]): The device to run the model on (default is None).
batch_size (int): The size of batches to use when processing on a GPU.
4 changes: 3 additions & 1 deletion src/senselab/audio/tasks/speech_to_text/api.py
@@ -40,6 +40,8 @@ def transcribe_audios(
audios=audios, model=model, language=language, device=device, **kwargs
)
else:
raise NotImplementedError("Only Hugging Face models are supported for now.")
raise NotImplementedError(
"Only Hugging Face models are supported for now. We aim to support more models in the future."
)
except TypeError as e:
raise TypeError(e) # noqa: W0707
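
A sketch of the transcription entry point with the documented default; the `Language` import path and constructor signature are assumptions.

```python
# Sketch: model=None falls back to "openai/whisper-tiny" per the docstring.
from senselab.audio.data_structures.audio import Audio  # assumed import path
from senselab.audio.tasks.speech_to_text.api import transcribe_audios
from senselab.utils.data_structures.language import Language  # assumed import path

audio = Audio.from_filepath("speech.wav")  # hypothetical constructor

transcripts = transcribe_audios(
    audios=[audio],
    model=None,
    language=Language(language_code="en"),  # assumed constructor signature
)
```
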
3 changes: 2 additions & 1 deletion src/senselab/audio/tasks/speech_to_text/huggingface.py
@@ -80,7 +80,8 @@ def transcribe_audios_with_transformers(
Args:
audios (List[Audio]): The list of audio objects to be transcribed.
-    model (HFModel): The Hugging Face model used for transcription. (default is `openai/whisper-tiny`).
+    model (HFModel): The Hugging Face model used for transcription.
+        If None, the default model "openai/whisper-tiny" is used.
language (Optional[Language]): The language of the audio (default is None).
return_timestamps (Optional[str]): The level of timestamp details (default is "word").
max_new_tokens (int): The maximum number of new tokens (default is 128).
2 changes: 1 addition & 1 deletion src/senselab/audio/tasks/text_to_speech/api.py
@@ -24,7 +24,7 @@ def synthesize_texts(
Args:
texts (List[str]): The list of text strings to be synthesized.
model (SenselabModel): The model used for synthesis.
-        Defaults to HFModel(path_or_uri="suno/bark", revision="main").
+        If None, the default model "suno/bark" is used.
language (Optional[Language]): The language of the text
(default is None).
device (Optional[DeviceType]): The device to run the model on
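
A sketch of the synthesis entry point relying on the documented default:

```python
# Sketch: model=None falls back to "suno/bark" per the updated docstring.
from senselab.audio.tasks.text_to_speech.api import synthesize_texts

speech = synthesize_texts(texts=["Hello from senselab."], model=None)
```
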
3 changes: 2 additions & 1 deletion src/senselab/audio/tasks/text_to_speech/huggingface.py
@@ -57,7 +57,8 @@ def synthesize_texts_with_transformers(
Args:
texts (List[str]): The list of text strings to be synthesized.
-    model (HFModel): The Hugging Face model used for synthesis (default is `suno/bark`).
+    model (HFModel): The Hugging Face model used for synthesis.
+        If None, the default model "suno/bark" is used.
device (Optional[DeviceType]): The device to run the model on (default is None).
forward_params (Optional[Dict[str, Any]]): Additional parameters to pass to the forward function.
3 changes: 2 additions & 1 deletion src/senselab/audio/tasks/text_to_speech/marstts.py
@@ -75,7 +75,8 @@ def synthesize_texts_with_mars5tts(
texts (List[str]): The list of text strings to be synthesized.
targets (List[Tuple[Audio, str]]):
The list of tuples containing audio objects and transcripts.
model (TorchModel): The Torch model (default is "Camb-ai/mars5-tts").
model (TorchModel): The Torch model.
If None, the default model "Camb-ai/mars5-tts" is used.
language (Optional[Language]): The language of the text (default is None).
The only supported language is "en" for now.
device (DeviceType): The device to run the model on (default is None). Supported devices are CPU and CUDA.
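
A sketch of the MARS5 interface documented here, where each target pairs a reference Audio with its transcript; the `Audio` loader is an assumption.

```python
# Sketch: targets supply the cloning reference as (Audio, transcript) tuples.
from senselab.audio.data_structures.audio import Audio  # assumed import path
from senselab.audio.tasks.text_to_speech.marstts import synthesize_texts_with_mars5tts

reference = Audio.from_filepath("reference.wav")  # hypothetical constructor

cloned_speech = synthesize_texts_with_mars5tts(
    texts=["A new sentence in the reference voice."],
    targets=[(reference, "Transcript of the reference audio.")],
    model=None,  # falls back to "Camb-ai/mars5-tts" per the docstring
)
```
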
6 changes: 4 additions & 2 deletions src/senselab/audio/tasks/text_to_speech/styletts2.py
@@ -41,7 +41,8 @@ def _get_style_tts_2_model(
"""Get or create a StyleTTS2 model.
Args:
model (TorchModel): The Torch model (default is "wilke0818/StyleTTS2-TorchHub:main").
model (TorchModel): The Torch model.
If None, the default model "wilke0818/StyleTTS2-TorchHub:main" is used.
language (Optional[Language]): The language of the text (default is None).
The only supported language is "en" for now.
device (DeviceType): The device to run the model on (default is None). Supported devices are CPU and CUDA.
@@ -101,7 +102,8 @@ def synthesize_texts_with_style_tts_2(
The list of audio objects to reference.
target_transcripts (List[Optional[str]]):
Transcript for each target audio
-    model (TorchModel): The Torch model (default is "wilke0818/StyleTTS2-TorchHub").
+    model (TorchModel): The Torch model.
+        If None, the default model "wilke0818/StyleTTS2-TorchHub" is used.
language (Optional[Language]): The language of the text (default is None).
The only supported language is "en" for now.
device (Optional[DeviceType]): device to run model on
4 changes: 2 additions & 2 deletions src/senselab/audio/tasks/voice_activity_detection/api.py
@@ -16,8 +16,8 @@ def detect_human_voice_activity_in_audios(
Args:
audios (List[Audio]): The list of audio objects to be processed.
-    model (Optional[PyannoteAudioModel]): The model used for voice activity detection
-        (default is `pyannote/speaker-diarization-3.1`).
+    model (Optional[PyannoteAudioModel]): The model used for voice activity detection.
+        If None, the default model "pyannote/speaker-diarization-3.1" is used.
device (Optional[DeviceType]): The device to run the model on (default is None).
Returns:
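
A sketch showing that voice activity detection shares its default with diarization:

```python
# Sketch: model=None falls back to "pyannote/speaker-diarization-3.1".
from senselab.audio.data_structures.audio import Audio  # assumed import path
from senselab.audio.tasks.voice_activity_detection.api import detect_human_voice_activity_in_audios

audio = Audio.from_filepath("interview.wav")  # hypothetical constructor
voice_segments = detect_human_voice_activity_in_audios(audios=[audio], model=None)
```
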
6 changes: 3 additions & 3 deletions src/senselab/audio/tasks/voice_cloning/api.py
@@ -31,8 +31,8 @@ def clone_voices(
model (TorchModel, optional): The model to use for voice cloning. Currently,
only KNNVC (K-Nearest Neighbors Voice Conversion) is supported, encapsulated
by the `TorchModel` class. `TorchModel` is a child class of `SenselabModel`
-        and specifies the model and revision for cloning. Defaults to
-        `TorchModel(path_or_uri="bshall/knn-vc", revision="master")`.
+        and specifies the model and revision for cloning.
+        If None, the default model "bshall/knn-vc" is used.
device (Optional[DeviceType], optional): The device to run the model on (e.g., CPU or GPU).
Defaults to None.
**kwargs: Additional keyword arguments for model-specific parameters that will
@@ -63,4 +63,4 @@ def clone_voices(
source_audios=source_audios, target_audios=target_audios, model=model, device=device, **kwargs
)
else:
raise NotImplementedError("Only KNNVC is supported for now.")
raise NotImplementedError("Only KNNVC is supported for now. We aim to support more models in the future.")
2 changes: 1 addition & 1 deletion src/senselab/audio/tasks/voice_cloning/knnvc.py
@@ -64,7 +64,7 @@ def clone_voices_with_knn_vc(
source_audios (List[Audio]): List of source audio objects.
target_audios (List[Audio]): List of target audio objects.
model (TorchModel, optional): The Torch model to use for the KNNVC pipeline.
-        Defaults to TorchModel(path_or_uri="bshall/knn-vc", revision="master").
+        If None, the default model "bshall/knn-vc" is used.
prematched_vocoder (bool, optional): Flag indicating whether to use a pre-matched vocoder. Defaults to True.
topk (int, optional): The number of top matches to consider. Defaults to 4.
device (Optional[DeviceType], optional): The device to run the pipeline on. Defaults to None.
@@ -67,7 +67,7 @@ def extract_text_embeddings(
Args:
pieces_of_text (List[str]): A list of strings to extract embeddings from.
model (HFModel, optional): A Hugging Face model configuration.
Defaults to HFModel(path_or_uri="sentence-transformers/all-MiniLM-L6-v2").
If None, the default model "sentence-transformers/all-MiniLM-L6-v2" is used.
device (Optional[DeviceType], optional): The device to run the model on.
Defaults to None.
@@ -52,7 +52,7 @@ def extract_text_embeddings(
Args:
pieces_of_text (List[str]): A list of strings to extract embeddings from.
model (SentenceTransformersModel, optional): A Hugging Face model configuration.
-        Defaults to SentenceTransformersModel(path_or_uri="sentence-transformers/all-MiniLM-L6-v2").
+        If None, the default model "sentence-transformers/all-MiniLM-L6-v2" is used.
device (Optional[DeviceType], optional): The device to run the model on.
Defaults to None.
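
Finally, a sketch of text-embedding extraction with the shared default; the module path is an assumption, since this diff view omits the file names for the two embeddings backends.

```python
# Sketch: model=None falls back to "sentence-transformers/all-MiniLM-L6-v2".
from senselab.text.tasks.embeddings_extraction.api import extract_text_embeddings  # assumed path

vectors = extract_text_embeddings(
    pieces_of_text=["senselab wraps audio and text models behind one API."],
    model=None,
)
```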
