From b378be5378bf59308dbf59bbdb5600b020583bc8 Mon Sep 17 00:00:00 2001
From: 900miles
Date: Sun, 24 Nov 2024 17:13:48 -0500
Subject: [PATCH] Enhance model documentation across audio and text tasks to
 clarify default model usage

---
 src/senselab/audio/tasks/speaker_diarization/api.py      | 8 +++++---
 src/senselab/audio/tasks/speaker_diarization/pyannote.py | 1 +
 src/senselab/audio/tasks/speaker_embeddings/api.py       | 8 +++++---
 .../audio/tasks/speaker_embeddings/speechbrain.py        | 4 ++--
 .../tasks/speaker_verification/speaker_verification.py   | 3 ++-
 src/senselab/audio/tasks/speech_enhancement/api.py       | 8 +++++---
 .../audio/tasks/speech_enhancement/speechbrain.py        | 1 +
 src/senselab/audio/tasks/speech_to_text/api.py           | 4 +++-
 src/senselab/audio/tasks/speech_to_text/huggingface.py   | 3 ++-
 src/senselab/audio/tasks/text_to_speech/api.py           | 2 +-
 src/senselab/audio/tasks/text_to_speech/huggingface.py   | 3 ++-
 src/senselab/audio/tasks/text_to_speech/marstts.py       | 3 ++-
 src/senselab/audio/tasks/text_to_speech/styletts2.py     | 6 ++++--
 src/senselab/audio/tasks/voice_activity_detection/api.py | 4 ++--
 src/senselab/audio/tasks/voice_cloning/api.py            | 6 +++---
 src/senselab/audio/tasks/voice_cloning/knnvc.py          | 2 +-
 .../text/tasks/embeddings_extraction/huggingface.py      | 2 +-
 .../tasks/embeddings_extraction/sentence_transformers.py | 2 +-
 18 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/src/senselab/audio/tasks/speaker_diarization/api.py b/src/senselab/audio/tasks/speaker_diarization/api.py
index 45e62395..bab7456b 100644
--- a/src/senselab/audio/tasks/speaker_diarization/api.py
+++ b/src/senselab/audio/tasks/speaker_diarization/api.py
@@ -19,8 +19,8 @@ def diarize_audios(
 
     Args:
         audios (List[Audio]): The list of audio objects to be diarized.
-        model (SenselabModel): The model used for diarization
-            (default is "pyannote/speaker-diarization-3.1").
+        model (SenselabModel): The model used for diarization.
+            If None, the default model "pyannote/speaker-diarization-3.1" is used.
         device (Optional[DeviceType]): The device to run the model on (default is None).
         num_speakers (Optional[int]): The number of speakers (default is None).
         min_speakers (Optional[int]): The minimum number of speakers (default is None).
@@ -42,4 +42,6 @@ def diarize_audios(
             max_speakers=max_speakers,
         )
     else:
-        raise NotImplementedError("Only Pyannote models are supported for now.")
+        raise NotImplementedError(
+            "Only Pyannote models are supported for now. We aim to support more models in the future."
+        )
diff --git a/src/senselab/audio/tasks/speaker_diarization/pyannote.py b/src/senselab/audio/tasks/speaker_diarization/pyannote.py
index 7422623d..5ce6d0a7 100644
--- a/src/senselab/audio/tasks/speaker_diarization/pyannote.py
+++ b/src/senselab/audio/tasks/speaker_diarization/pyannote.py
@@ -57,6 +57,7 @@ def diarize_audios_with_pyannote(
     Args:
         audios (List[Audio]): A list of audio files.
         model (PyannoteAudioModel): The model to use for diarization.
+            If None, the default model "pyannote/speaker-diarization-3.1" is used.
         device (Optional[DeviceType]): The device to use for diarization.
         num_speakers (Optional[int]): Number of speakers, when known.
         min_speakers (Optional[int]): Minimum number of speakers. Has no effect when `num_speakers` is provided.
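[Editor's example] A minimal usage sketch for the diarization entry point patched above; it is not part of the patch. `diarize_audios` and its default-model behavior come from the docstring changes shown. The `Audio` import path and constructor are assumptions about senselab's public API, and "speech.wav" is a placeholder.

    from senselab.audio.data_structures import Audio  # assumed module path
    from senselab.audio.tasks.speaker_diarization.api import diarize_audios

    audio = Audio(filepath="speech.wav")  # assumed constructor; placeholder file
    # With model=None, the documented default "pyannote/speaker-diarization-3.1"
    # is loaded for diarization.
    results = diarize_audios(audios=[audio], model=None)
    print(results)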
diff --git a/src/senselab/audio/tasks/speaker_embeddings/api.py b/src/senselab/audio/tasks/speaker_embeddings/api.py
index 040b9496..66cd8056 100644
--- a/src/senselab/audio/tasks/speaker_embeddings/api.py
+++ b/src/senselab/audio/tasks/speaker_embeddings/api.py
@@ -18,8 +18,8 @@ def extract_speaker_embeddings_from_audios(
 
     Args:
         audios (List[Audio]): A list of Audio objects containing the audio signals and their properties.
-        model (SpeechBrainModel): The model used to compute the embeddings
-            (default is "speechbrain/spkrec-ecapa-voxceleb").
+        model (SpeechBrainModel): The model used to compute the embeddings.
+            If None, the default model "speechbrain/spkrec-ecapa-voxceleb" is used.
         device (Optional[DeviceType]): The device to run the model on (default is None).
 
     Returns:
@@ -43,4 +43,6 @@ def extract_speaker_embeddings_from_audios(
             audios=audios, model=model, device=device
         )
     else:
-        raise NotImplementedError("The specified model is not supported for now.")
+        raise NotImplementedError(
+            "Only SpeechBrain models are supported for now. We aim to support more models in the future."
+        )
diff --git a/src/senselab/audio/tasks/speaker_embeddings/speechbrain.py b/src/senselab/audio/tasks/speaker_embeddings/speechbrain.py
index f5b215c3..ba53dda8 100644
--- a/src/senselab/audio/tasks/speaker_embeddings/speechbrain.py
+++ b/src/senselab/audio/tasks/speaker_embeddings/speechbrain.py
@@ -55,8 +55,8 @@ def extract_speechbrain_speaker_embeddings_from_audios(
 
     Args:
         audios (List[Audio]): A list of Audio objects containing the audio signals and their properties.
-        model (SpeechBrainModel): The model used to compute the embeddings
-            (default is "speechbrain/spkrec-ecapa-voxceleb").
+        model (SpeechBrainModel): The model used to compute the embeddings.
+            If None, the default model "speechbrain/spkrec-ecapa-voxceleb" is used.
         device (Optional[DeviceType]): The device to run the model on (default is None).
             Only CPU and CUDA are supported.
diff --git a/src/senselab/audio/tasks/speaker_verification/speaker_verification.py b/src/senselab/audio/tasks/speaker_verification/speaker_verification.py
index 01ced32c..e39aa606 100644
--- a/src/senselab/audio/tasks/speaker_verification/speaker_verification.py
+++ b/src/senselab/audio/tasks/speaker_verification/speaker_verification.py
@@ -27,7 +27,8 @@ def verify_speaker(
     Args:
         audios (List[Tuple[Audio, Audio]]): A list of tuples, where each tuple contains
             two audio samples to be compared.
-        model (SpeechBrainModel, optional): The model for speaker verification.
+        model (SpeechBrainModel, optional): The model for speaker verification.
+            If None, the default model "speechbrain/spkrec-ecapa-voxceleb" is used.
         device (DeviceType, optional): The device to run the model on.
             Defaults to CPU.
         threshold (float, optional): The threshold to determine same speaker.
diff --git a/src/senselab/audio/tasks/speech_enhancement/api.py b/src/senselab/audio/tasks/speech_enhancement/api.py
index 5125dc2f..cd5597ea 100644
--- a/src/senselab/audio/tasks/speech_enhancement/api.py
+++ b/src/senselab/audio/tasks/speech_enhancement/api.py
@@ -16,8 +16,8 @@ def enhance_audios(
 
     Args:
         audios (List[Audio]): The list of audio objects to be enhanced.
-        model (SenselabModel): The model used for enhancement
-            (default is "speechbrain/sepformer-wham16k-enhancement").
+        model (SenselabModel): The model used for enhancement.
+            If None, the default model "speechbrain/sepformer-wham16k-enhancement" is used.
         device (Optional[DeviceType]): The device to run the model on (default is None).
 
     Returns:
@@ -29,4 +29,6 @@ def enhance_audios(
     if isinstance(model, SpeechBrainModel):
         return SpeechBrainEnhancer.enhance_audios_with_speechbrain(audios=audios, model=model, device=device)
     else:
-        raise NotImplementedError("Only SpeechBrain models are supported for now.")
+        raise NotImplementedError(
+            "Only SpeechBrain models are supported for now. We aim to support more models in the future."
+        )
diff --git a/src/senselab/audio/tasks/speech_enhancement/speechbrain.py b/src/senselab/audio/tasks/speech_enhancement/speechbrain.py
index 2624b6b3..f8d55718 100644
--- a/src/senselab/audio/tasks/speech_enhancement/speechbrain.py
+++ b/src/senselab/audio/tasks/speech_enhancement/speechbrain.py
@@ -55,6 +55,7 @@ def enhance_audios_with_speechbrain(
 
     Args:
        audios (List[Audio]): The list of audio objects to be enhanced.
        model (SpeechBrainModel): The SpeechBrain model used for enhancement.
+           If None, the default model "speechbrain/sepformer-wham16k-enhancement" is used.
        device (Optional[DeviceType]): The device to run the model on (default is None).
        batch_size (int): The size of batches to use when processing on a GPU.
diff --git a/src/senselab/audio/tasks/speech_to_text/api.py b/src/senselab/audio/tasks/speech_to_text/api.py
index 276a5794..a7f0eecd 100644
--- a/src/senselab/audio/tasks/speech_to_text/api.py
+++ b/src/senselab/audio/tasks/speech_to_text/api.py
@@ -40,6 +40,8 @@ def transcribe_audios(
                 audios=audios, model=model, language=language, device=device, **kwargs
             )
         else:
-            raise NotImplementedError("Only Hugging Face models are supported for now.")
+            raise NotImplementedError(
+                "Only Hugging Face models are supported for now. We aim to support more models in the future."
+            )
     except TypeError as e:
         raise TypeError(e)  # noqa: W0707
diff --git a/src/senselab/audio/tasks/speech_to_text/huggingface.py b/src/senselab/audio/tasks/speech_to_text/huggingface.py
index ca8c89ad..484897d7 100644
--- a/src/senselab/audio/tasks/speech_to_text/huggingface.py
+++ b/src/senselab/audio/tasks/speech_to_text/huggingface.py
@@ -80,7 +80,8 @@ def transcribe_audios_with_transformers(
 
     Args:
         audios (List[Audio]): The list of audio objects to be transcribed.
-        model (HFModel): The Hugging Face model used for transcription. (default is `openai/whisper-tiny`).
+        model (HFModel): The Hugging Face model used for transcription.
+            If None, the default model "openai/whisper-tiny" is used.
         language (Optional[Language]): The language of the audio (default is None).
         return_timestamps (Optional[str]): The level of timestamp details (default is "word").
         max_new_tokens (int): The maximum number of new tokens (default is 128).
diff --git a/src/senselab/audio/tasks/text_to_speech/api.py b/src/senselab/audio/tasks/text_to_speech/api.py
index 708ea39d..376552b3 100644
--- a/src/senselab/audio/tasks/text_to_speech/api.py
+++ b/src/senselab/audio/tasks/text_to_speech/api.py
@@ -24,7 +24,7 @@ def synthesize_texts(
 
     Args:
         texts (List[str]): The list of text strings to be synthesized.
         model (SenselabModel): The model used for synthesis.
-            Defaults to HFModel(path_or_uri="suno/bark", revision="main").
+            If None, the default model "suno/bark" is used.
         language (Optional[Language]): The language of the text (default is None).
         device (Optional[DeviceType]): The device to run the model on
diff --git a/src/senselab/audio/tasks/text_to_speech/huggingface.py b/src/senselab/audio/tasks/text_to_speech/huggingface.py
index 84e2aff7..268e3c9d 100644
--- a/src/senselab/audio/tasks/text_to_speech/huggingface.py
+++ b/src/senselab/audio/tasks/text_to_speech/huggingface.py
@@ -57,7 +57,8 @@ def synthesize_texts_with_transformers(
 
     Args:
         texts (List[str]): The list of text strings to be synthesized.
-        model (HFModel): The Hugging Face model used for synthesis (default is `suno/bark`).
+        model (HFModel): The Hugging Face model used for synthesis.
+            If None, the default model "suno/bark" is used.
         device (Optional[DeviceType]): The device to run the model on (default is None).
         forward_params (Optional[Dict[str, Any]]): Additional parameters to pass to the forward function.
 
diff --git a/src/senselab/audio/tasks/text_to_speech/marstts.py b/src/senselab/audio/tasks/text_to_speech/marstts.py
index e7945d63..7d99c980 100644
--- a/src/senselab/audio/tasks/text_to_speech/marstts.py
+++ b/src/senselab/audio/tasks/text_to_speech/marstts.py
@@ -75,7 +75,8 @@ def synthesize_texts_with_mars5tts(
         texts (List[str]): The list of text strings to be synthesized.
         targets (List[Tuple[Audio, str]]): The list of tuples containing audio objects and transcripts.
-        model (TorchModel): The Torch model (default is "Camb-ai/mars5-tts").
+        model (TorchModel): The Torch model.
+            If None, the default model "Camb-ai/mars5-tts" is used.
         language (Optional[Language]): The language of the text (default is None).
             The only supported language is "en" for now.
         device (DeviceType): The device to run the model on (default is None).
             Supported devices are CPU and CUDA.
diff --git a/src/senselab/audio/tasks/text_to_speech/styletts2.py b/src/senselab/audio/tasks/text_to_speech/styletts2.py
index 32e83a58..6e6ce7b9 100644
--- a/src/senselab/audio/tasks/text_to_speech/styletts2.py
+++ b/src/senselab/audio/tasks/text_to_speech/styletts2.py
@@ -41,7 +41,8 @@ def _get_style_tts_2_model(
     """Get or create a StyleTTS2 model.
 
     Args:
-        model (TorchModel): The Torch model (default is "wilke0818/StyleTTS2-TorchHub:main").
+        model (TorchModel): The Torch model.
+            If None, the default model "wilke0818/StyleTTS2-TorchHub:main" is used.
         language (Optional[Language]): The language of the text (default is None).
             The only supported language is "en" for now.
         device (DeviceType): The device to run the model on (default is None).
             Supported devices are CPU and CUDA.
@@ -101,7 +102,8 @@ def synthesize_texts_with_style_tts_2(
             The list of audio objects to reference.
         target_transcripts (List[Optional[str]]): Transcript for each target audio
-        model (TorchModel): The Torch model (default is "wilke0818/StyleTTS2-TorchHub").
+        model (TorchModel): The Torch model.
+            If None, the default model "wilke0818/StyleTTS2-TorchHub" is used.
         language (Optional[Language]): The language of the text (default is None).
             The only supported language is "en" for now.
         device (Optional[DeviceType]): device to run model on
diff --git a/src/senselab/audio/tasks/voice_activity_detection/api.py b/src/senselab/audio/tasks/voice_activity_detection/api.py
index 587806dd..ecd45c89 100644
--- a/src/senselab/audio/tasks/voice_activity_detection/api.py
+++ b/src/senselab/audio/tasks/voice_activity_detection/api.py
@@ -16,8 +16,8 @@ def detect_human_voice_activity_in_audios(
 
     Args:
         audios (List[Audio]): The list of audio objects to be processed.
-        model (Optional[PyannoteAudioModel]): The model used for voice activity detection
-            (default is `pyannote/speaker-diarization-3.1`).
+        model (Optional[PyannoteAudioModel]): The model used for voice activity detection.
+            If None, the default model "pyannote/speaker-diarization-3.1" is used.
         device (Optional[DeviceType]): The device to run the model on (default is None).
 
     Returns:
diff --git a/src/senselab/audio/tasks/voice_cloning/api.py b/src/senselab/audio/tasks/voice_cloning/api.py
index d251f470..8a73c982 100644
--- a/src/senselab/audio/tasks/voice_cloning/api.py
+++ b/src/senselab/audio/tasks/voice_cloning/api.py
@@ -31,8 +31,8 @@ def clone_voices(
         model (TorchModel, optional): The model to use for voice cloning. Currently,
             only KNNVC (K-Nearest Neighbors Voice Conversion) is supported,
             encapsulated by the `TorchModel` class. `TorchModel` is a child class of `SenselabModel`
-            and specifies the model and revision for cloning. Defaults to
-            `TorchModel(path_or_uri="bshall/knn-vc", revision="master")`.
+            and specifies the model and revision for cloning.
+            If None, the default model "bshall/knn-vc" is used.
         device (Optional[DeviceType], optional): The device to run the model on (e.g., CPU or GPU).
             Defaults to None.
         **kwargs: Additional keyword arguments for model-specific parameters that will
@@ -63,4 +63,4 @@ def clone_voices(
             source_audios=source_audios, target_audios=target_audios, model=model, device=device, **kwargs
         )
     else:
-        raise NotImplementedError("Only KNNVC is supported for now.")
+        raise NotImplementedError("Only KNNVC is supported for now. We aim to support more models in the future.")
diff --git a/src/senselab/audio/tasks/voice_cloning/knnvc.py b/src/senselab/audio/tasks/voice_cloning/knnvc.py
index ee3e3d15..45181bba 100644
--- a/src/senselab/audio/tasks/voice_cloning/knnvc.py
+++ b/src/senselab/audio/tasks/voice_cloning/knnvc.py
@@ -64,7 +64,7 @@ def clone_voices_with_knn_vc(
         source_audios (List[Audio]): List of source audio objects.
         target_audios (List[Audio]): List of target audio objects.
         model (TorchModel, optional): The Torch model to use for the KNNVC pipeline.
-            Defaults to TorchModel(path_or_uri="bshall/knn-vc", revision="master").
+            If None, the default model "bshall/knn-vc" is used.
         prematched_vocoder (bool, optional): Flag indicating whether to use a pre-matched vocoder. Defaults to True.
         topk (int, optional): The number of top matches to consider. Defaults to 4.
         device (Optional[DeviceType], optional): The device to run the pipeline on. Defaults to None.
diff --git a/src/senselab/text/tasks/embeddings_extraction/huggingface.py b/src/senselab/text/tasks/embeddings_extraction/huggingface.py
index 6f308de8..f3ad4f85 100644
--- a/src/senselab/text/tasks/embeddings_extraction/huggingface.py
+++ b/src/senselab/text/tasks/embeddings_extraction/huggingface.py
@@ -67,7 +67,7 @@ def extract_text_embeddings(
 
     Args:
         pieces_of_text (List[str]): A list of strings to extract embeddings from.
         model (HFModel, optional): A Hugging Face model configuration.
-            Defaults to HFModel(path_or_uri="sentence-transformers/all-MiniLM-L6-v2").
+            If None, the default model "sentence-transformers/all-MiniLM-L6-v2" is used.
         device (Optional[DeviceType], optional): The device to run the model on.
             Defaults to None.
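[Editor's example] A minimal sketch of the text-embeddings helper documented above; not part of the patch. It assumes `extract_text_embeddings` is importable at module level from the file just patched.

    from senselab.text.tasks.embeddings_extraction.huggingface import extract_text_embeddings

    # With model=None, the documented default
    # "sentence-transformers/all-MiniLM-L6-v2" is used.
    embeddings = extract_text_embeddings(pieces_of_text=["hello world"], model=None)
    print(len(embeddings))  # one embedding per input string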
diff --git a/src/senselab/text/tasks/embeddings_extraction/sentence_transformers.py b/src/senselab/text/tasks/embeddings_extraction/sentence_transformers.py
index 587ae3c2..6201640d 100644
--- a/src/senselab/text/tasks/embeddings_extraction/sentence_transformers.py
+++ b/src/senselab/text/tasks/embeddings_extraction/sentence_transformers.py
@@ -52,7 +52,7 @@ def extract_text_embeddings(
 
     Args:
         pieces_of_text (List[str]): A list of strings to extract embeddings from.
         model (SentenceTransformersModel, optional): A Hugging Face model configuration.
-            Defaults to SentenceTransformersModel(path_or_uri="sentence-transformers/all-MiniLM-L6-v2").
+            If None, the default model "sentence-transformers/all-MiniLM-L6-v2" is used.
         device (Optional[DeviceType], optional): The device to run the model on.
             Defaults to None.
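[Editor's example] A closing sketch contrasting the default-model path with an explicit model, using the SentenceTransformers variant patched above; not part of the patch. The import path for `SentenceTransformersModel` is an assumption; its `path_or_uri` argument appears in the docstring text removed above.

    from senselab.text.tasks.embeddings_extraction.sentence_transformers import extract_text_embeddings
    from senselab.utils.data_structures import SentenceTransformersModel  # assumed module path

    texts = ["hello world"]
    # Default: model=None resolves to "sentence-transformers/all-MiniLM-L6-v2".
    default_embeddings = extract_text_embeddings(pieces_of_text=texts, model=None)
    # Explicit: pass a configured model to override the default.
    custom_model = SentenceTransformersModel(path_or_uri="sentence-transformers/all-MiniLM-L6-v2")
    explicit_embeddings = extract_text_embeddings(pieces_of_text=texts, model=custom_model)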