Enhance model documentation across audio and text tasks to clarify default model usage
900miles committed Nov 24, 2024
1 parent 7604dba commit b378be5
Showing 18 changed files with 43 additions and 27 deletions.
8 changes: 5 additions & 3 deletions src/senselab/audio/tasks/speaker_diarization/api.py
@@ -19,8 +19,8 @@ def diarize_audios(
Args:
audios (List[Audio]): The list of audio objects to be diarized.
-    model (SenselabModel): The model used for diarization
-        (default is "pyannote/speaker-diarization-3.1").
+    model (SenselabModel): The model used for diarization.
+        If None, the default model "pyannote/speaker-diarization-3.1" is used.
device (Optional[DeviceType]): The device to run the model on (default is None).
num_speakers (Optional[int]): The number of speakers (default is None).
min_speakers (Optional[int]): The minimum number of speakers (default is None).
@@ -42,4 +42,6 @@ def diarize_audios(
max_speakers=max_speakers,
)
else:
raise NotImplementedError("Only Pyannote models are supported for now.")
raise NotImplementedError(
"Only Pyannote models are supported for now. We aim to support more models in the future."
)
1 change: 1 addition & 0 deletions src/senselab/audio/tasks/speaker_diarization/pyannote.py
@@ -57,6 +57,7 @@ def diarize_audios_with_pyannote(
Args:
audios (List[Audio]): A list of audio files.
model (PyannoteAudioModel): The model to use for diarization.
If None, the default model "pyannote/speaker-diarization-3.1" is used.
device (Optional[DeviceType]): The device to use for diarization.
num_speakers (Optional[int]): Number of speakers, when known.
min_speakers (Optional[int]): Minimum number of speakers. Has no effect when `num_speakers` is provided.
8 changes: 5 additions & 3 deletions src/senselab/audio/tasks/speaker_embeddings/api.py
@@ -18,8 +18,8 @@ def extract_speaker_embeddings_from_audios(
Args:
audios (List[Audio]): A list of Audio objects containing the audio signals and their properties.
-    model (SpeechBrainModel): The model used to compute the embeddings
-        (default is "speechbrain/spkrec-ecapa-voxceleb").
+    model (SpeechBrainModel): The model used to compute the embeddings.
+        If None, the default model "speechbrain/spkrec-ecapa-voxceleb" is used.
device (Optional[DeviceType]): The device to run the model on (default is None).
Returns:
@@ -43,4 +43,6 @@ def extract_speaker_embeddings_from_audios(
audios=audios, model=model, device=device
)
else:
raise NotImplementedError("The specified model is not supported for now.")
raise NotImplementedError(
"Only SpeechBrain models are supported for now. We aim to support more models in the future."
)
4 changes: 2 additions & 2 deletions src/senselab/audio/tasks/speaker_embeddings/speechbrain.py
@@ -55,8 +55,8 @@ def extract_speechbrain_speaker_embeddings_from_audios(
Args:
audios (List[Audio]): A list of Audio objects containing the audio signals and their properties.
-    model (SpeechBrainModel): The model used to compute the embeddings
-        (default is "speechbrain/spkrec-ecapa-voxceleb").
+    model (SpeechBrainModel): The model used to compute the embeddings.
+        If None, the default model "speechbrain/spkrec-ecapa-voxceleb" is used.
device (Optional[DeviceType]): The device to run the model on (default is None).
Only CPU and CUDA are supported.
@@ -27,7 +27,8 @@ def verify_speaker(
Args:
audios (List[Tuple[Audio, Audio]]): A list of tuples, where each tuple contains
two audio samples to be compared.
-    model (SpeechBrainModel, optional): The model for speaker verification.
+    model (SpeechBrainModel, optional): The model for speaker verification.
+        If None, the default model "speechbrain/spkrec-ecapa-voxceleb" is used.
device (DeviceType, optional): The device to run the model on. Defaults to CPU.
threshold (float, optional): The threshold to determine same speaker.
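
A sketch of the pairwise input shape that `verify_speaker` documents. The module path is an assumption, since this diff view omits the file name, and the threshold value is purely illustrative.

```python
# Sketch: each element of `audios` is an (Audio, Audio) tuple to compare.
from senselab.audio.data_structures.audio import Audio  # assumed import path
from senselab.audio.tasks.speaker_verification.speaker_verification import verify_speaker  # assumed path

enrollment = Audio.from_filepath("enrolled.wav")  # hypothetical constructor
probe = Audio.from_filepath("unknown.wav")

# model=None falls back to "speechbrain/spkrec-ecapa-voxceleb" per the docstring.
decisions = verify_speaker(audios=[(enrollment, probe)], model=None, threshold=0.25)
```
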
8 changes: 5 additions & 3 deletions src/senselab/audio/tasks/speech_enhancement/api.py
@@ -16,8 +16,8 @@ def enhance_audios(
Args:
audios (List[Audio]): The list of audio objects to be enhanced.
-    model (SenselabModel): The model used for enhancement
-        (default is "speechbrain/sepformer-wham16k-enhancement").
+    model (SenselabModel): The model used for enhancement.
+        If None, the default model "speechbrain/sepformer-wham16k-enhancement" is used.
device (Optional[DeviceType]): The device to run the model on (default is None).
Returns:
@@ -29,4 +29,6 @@ def enhance_audios(
if isinstance(model, SpeechBrainModel):
return SpeechBrainEnhancer.enhance_audios_with_speechbrain(audios=audios, model=model, device=device)
else:
raise NotImplementedError("Only SpeechBrain models are supported for now.")
raise NotImplementedError(
"Only SpeechBrain models are supported for now. We aim to support more models in the future."
)
1 change: 1 addition & 0 deletions src/senselab/audio/tasks/speech_enhancement/speechbrain.py
@@ -55,6 +55,7 @@ def enhance_audios_with_speechbrain(
Args:
audios (List[Audio]): The list of audio objects to be enhanced.
model (SpeechBrainModel): The SpeechBrain model used for enhancement.
If None, the default model "speechbrain/sepformer-wham16k-enhancement" is used.
device (Optional[DeviceType]): The device to run the model on (default is None).
batch_size (int): The size of batches to use when processing on a GPU.
4 changes: 3 additions & 1 deletion src/senselab/audio/tasks/speech_to_text/api.py
@@ -40,6 +40,8 @@ def transcribe_audios(
audios=audios, model=model, language=language, device=device, **kwargs
)
else:
raise NotImplementedError("Only Hugging Face models are supported for now.")
raise NotImplementedError(
"Only Hugging Face models are supported for now. We aim to support more models in the future."
)
except TypeError as e:
raise TypeError(e) # noqa: W0707
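
A sketch of the transcription entry point with the documented default; the `Language` import path and constructor signature are assumptions.

```python
# Sketch: model=None falls back to "openai/whisper-tiny" per the docstring.
from senselab.audio.data_structures.audio import Audio  # assumed import path
from senselab.audio.tasks.speech_to_text.api import transcribe_audios
from senselab.utils.data_structures.language import Language  # assumed import path

audio = Audio.from_filepath("speech.wav")  # hypothetical constructor

transcripts = transcribe_audios(
    audios=[audio],
    model=None,
    language=Language(language_code="en"),  # assumed constructor signature
)
```
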
3 changes: 2 additions & 1 deletion src/senselab/audio/tasks/speech_to_text/huggingface.py
@@ -80,7 +80,8 @@ def transcribe_audios_with_transformers(
Args:
audios (List[Audio]): The list of audio objects to be transcribed.
-    model (HFModel): The Hugging Face model used for transcription. (default is `openai/whisper-tiny`).
+    model (HFModel): The Hugging Face model used for transcription.
+        If None, the default model "openai/whisper-tiny" is used.
language (Optional[Language]): The language of the audio (default is None).
return_timestamps (Optional[str]): The level of timestamp details (default is "word").
max_new_tokens (int): The maximum number of new tokens (default is 128).
2 changes: 1 addition & 1 deletion src/senselab/audio/tasks/text_to_speech/api.py
@@ -24,7 +24,7 @@ def synthesize_texts(
Args:
texts (List[str]): The list of text strings to be synthesized.
model (SenselabModel): The model used for synthesis.
-        Defaults to HFModel(path_or_uri="suno/bark", revision="main").
+        If None, the default model "suno/bark" is used.
language (Optional[Language]): The language of the text
(default is None).
device (Optional[DeviceType]): The device to run the model on
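
A sketch of the synthesis entry point relying on the documented default:

```python
# Sketch: model=None falls back to "suno/bark" per the updated docstring.
from senselab.audio.tasks.text_to_speech.api import synthesize_texts

speech = synthesize_texts(texts=["Hello from senselab."], model=None)
```
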
3 changes: 2 additions & 1 deletion src/senselab/audio/tasks/text_to_speech/huggingface.py
@@ -57,7 +57,8 @@ def synthesize_texts_with_transformers(
Args:
texts (List[str]): The list of text strings to be synthesized.
-    model (HFModel): The Hugging Face model used for synthesis (default is `suno/bark`).
+    model (HFModel): The Hugging Face model used for synthesis.
+        If None, the default model "suno/bark" is used.
device (Optional[DeviceType]): The device to run the model on (default is None).
forward_params (Optional[Dict[str, Any]]): Additional parameters to pass to the forward function.
3 changes: 2 additions & 1 deletion src/senselab/audio/tasks/text_to_speech/marstts.py
@@ -75,7 +75,8 @@ def synthesize_texts_with_mars5tts(
texts (List[str]): The list of text strings to be synthesized.
targets (List[Tuple[Audio, str]]):
The list of tuples containing audio objects and transcripts.
model (TorchModel): The Torch model (default is "Camb-ai/mars5-tts").
model (TorchModel): The Torch model.
If None, the default model "Camb-ai/mars5-tts" is used.
language (Optional[Language]): The language of the text (default is None).
The only supported language is "en" for now.
device (DeviceType): The device to run the model on (default is None). Supported devices are CPU and CUDA.
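
A sketch of the MARS5 interface documented here, where each target pairs a reference Audio with its transcript; the `Audio` loader is an assumption.

```python
# Sketch: targets supply the cloning reference as (Audio, transcript) tuples.
from senselab.audio.data_structures.audio import Audio  # assumed import path
from senselab.audio.tasks.text_to_speech.marstts import synthesize_texts_with_mars5tts

reference = Audio.from_filepath("reference.wav")  # hypothetical constructor

cloned_speech = synthesize_texts_with_mars5tts(
    texts=["A new sentence in the reference voice."],
    targets=[(reference, "Transcript of the reference audio.")],
    model=None,  # falls back to "Camb-ai/mars5-tts" per the docstring
)
```
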
6 changes: 4 additions & 2 deletions src/senselab/audio/tasks/text_to_speech/styletts2.py
@@ -41,7 +41,8 @@ def _get_style_tts_2_model(
"""Get or create a StyleTTS2 model.
Args:
model (TorchModel): The Torch model (default is "wilke0818/StyleTTS2-TorchHub:main").
model (TorchModel): The Torch model.
If None, the default model "wilke0818/StyleTTS2-TorchHub:main" is used.
language (Optional[Language]): The language of the text (default is None).
The only supported language is "en" for now.
device (DeviceType): The device to run the model on (default is None). Supported devices are CPU and CUDA.
@@ -101,7 +102,8 @@ def synthesize_texts_with_style_tts_2(
The list of audio objects to reference.
target_transcripts (List[Optional[str]]):
Transcript for each target audio
-    model (TorchModel): The Torch model (default is "wilke0818/StyleTTS2-TorchHub").
+    model (TorchModel): The Torch model.
+        If None, the default model "wilke0818/StyleTTS2-TorchHub" is used.
language (Optional[Language]): The language of the text (default is None).
The only supported language is "en" for now.
device (Optional[DeviceType]): device to run model on
4 changes: 2 additions & 2 deletions src/senselab/audio/tasks/voice_activity_detection/api.py
@@ -16,8 +16,8 @@ def detect_human_voice_activity_in_audios(
Args:
audios (List[Audio]): The list of audio objects to be processed.
-    model (Optional[PyannoteAudioModel]): The model used for voice activity detection
-        (default is `pyannote/speaker-diarization-3.1`).
+    model (Optional[PyannoteAudioModel]): The model used for voice activity detection.
+        If None, the default model "pyannote/speaker-diarization-3.1" is used.
device (Optional[DeviceType]): The device to run the model on (default is None).
Returns:
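
A sketch showing that voice activity detection shares its default with diarization:

```python
# Sketch: model=None falls back to "pyannote/speaker-diarization-3.1".
from senselab.audio.data_structures.audio import Audio  # assumed import path
from senselab.audio.tasks.voice_activity_detection.api import detect_human_voice_activity_in_audios

audio = Audio.from_filepath("interview.wav")  # hypothetical constructor
voice_segments = detect_human_voice_activity_in_audios(audios=[audio], model=None)
```
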
6 changes: 3 additions & 3 deletions src/senselab/audio/tasks/voice_cloning/api.py
@@ -31,8 +31,8 @@ def clone_voices(
model (TorchModel, optional): The model to use for voice cloning. Currently,
only KNNVC (K-Nearest Neighbors Voice Conversion) is supported, encapsulated
by the `TorchModel` class. `TorchModel` is a child class of `SenselabModel`
-        and specifies the model and revision for cloning. Defaults to
-        `TorchModel(path_or_uri="bshall/knn-vc", revision="master")`.
+        and specifies the model and revision for cloning.
+        If None, the default model "bshall/knn-vc" is used.
device (Optional[DeviceType], optional): The device to run the model on (e.g., CPU or GPU).
Defaults to None.
**kwargs: Additional keyword arguments for model-specific parameters that will
@@ -63,4 +63,4 @@ def clone_voices(
source_audios=source_audios, target_audios=target_audios, model=model, device=device, **kwargs
)
else:
raise NotImplementedError("Only KNNVC is supported for now.")
raise NotImplementedError("Only KNNVC is supported for now. We aim to support more models in the future.")
2 changes: 1 addition & 1 deletion src/senselab/audio/tasks/voice_cloning/knnvc.py
@@ -64,7 +64,7 @@ def clone_voices_with_knn_vc(
source_audios (List[Audio]): List of source audio objects.
target_audios (List[Audio]): List of target audio objects.
model (TorchModel, optional): The Torch model to use for the KNNVC pipeline.
-        Defaults to TorchModel(path_or_uri="bshall/knn-vc", revision="master").
+        If None, the default model "bshall/knn-vc" is used.
prematched_vocoder (bool, optional): Flag indicating whether to use a pre-matched vocoder. Defaults to True.
topk (int, optional): The number of top matches to consider. Defaults to 4.
device (Optional[DeviceType], optional): The device to run the pipeline on. Defaults to None.
@@ -67,7 +67,7 @@ def extract_text_embeddings(
Args:
pieces_of_text (List[str]): A list of strings to extract embeddings from.
model (HFModel, optional): A Hugging Face model configuration.
Defaults to HFModel(path_or_uri="sentence-transformers/all-MiniLM-L6-v2").
If None, the default model "sentence-transformers/all-MiniLM-L6-v2" is used.
device (Optional[DeviceType], optional): The device to run the model on.
Defaults to None.
@@ -52,7 +52,7 @@ def extract_text_embeddings(
Args:
pieces_of_text (List[str]): A list of strings to extract embeddings from.
model (SentenceTransformersModel, optional): A Hugging Face model configuration.
-        Defaults to SentenceTransformersModel(path_or_uri="sentence-transformers/all-MiniLM-L6-v2").
+        If None, the default model "sentence-transformers/all-MiniLM-L6-v2" is used.
device (Optional[DeviceType], optional): The device to run the model on.
Defaults to None.
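
Finally, a sketch of text-embedding extraction with the shared default; the module path is an assumption, since this diff view omits the file names for the two embeddings backends.

```python
# Sketch: model=None falls back to "sentence-transformers/all-MiniLM-L6-v2".
from senselab.text.tasks.embeddings_extraction.api import extract_text_embeddings  # assumed path

vectors = extract_text_embeddings(
    pieces_of_text=["senselab wraps audio and text models behind one API."],
    model=None,
)
```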
