diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 0c068eb6..38c86c19 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -64,7 +64,7 @@ jobs: --verbose shell: bash - name: Upload coverage to Codecov - uses: codecov/codecov-action@v4 + uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 848910cb..4105ceba 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -37,6 +37,8 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.6.0 hooks: + - id: check-added-large-files + args: [--maxkb=15000] - id: check-case-conflict - id: end-of-file-fixer - id: trailing-whitespace diff --git a/CHANGELOG.md b/CHANGELOG.md index 188db5f5..a89d6b11 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,38 @@ +# 0.26.1 (Mon Dec 30 2024) + +#### 🐛 Bug Fix + +- Update model.py [#228](https://github.com/sensein/senselab/pull/228) ([@fabiocat93](https://github.com/fabiocat93)) + +#### 📝 Documentation + +- Updating tutorial files [#210](https://github.com/sensein/senselab/pull/210) ([@900miles](https://github.com/900miles) [@fabiocat93](https://github.com/fabiocat93)) + +#### 🔩 Dependency Updates + +- Bump codecov/codecov-action from 4 to 5 [#211](https://github.com/sensein/senselab/pull/211) ([@dependabot[bot]](https://github.com/dependabot[bot]) [@fabiocat93](https://github.com/fabiocat93)) + +#### Authors: 3 + +- [@900miles](https://github.com/900miles) +- [@dependabot[bot]](https://github.com/dependabot[bot]) +- Fabio Catania ([@fabiocat93](https://github.com/fabiocat93)) + +--- + +# 0.26.0 (Mon Dec 23 2024) + +#### 🚀 Enhancement + +- Bump the production-dependencies group across 1 directory with 3 updates [#218](https://github.com/sensein/senselab/pull/218) ([@dependabot[bot]](https://github.com/dependabot[bot]) [@fabiocat93](https://github.com/fabiocat93)) + +#### Authors: 2 + +- [@dependabot[bot]](https://github.com/dependabot[bot]) +- Fabio Catania ([@fabiocat93](https://github.com/fabiocat93)) + +--- + # 0.25.0 (Tue Dec 10 2024) #### 🚀 Enhancement diff --git a/pyproject.toml b/pyproject.toml index 35ffc155..f8edda4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ datasets = "~=3" torch = "~=2.5" torchvision = "~=0.20" torchaudio = "~=2.5" -transformers = "~=4.46.2" +transformers = "~=4.47" pydra = "~=0.25" pydantic = "~=2.7" accelerate = "*" @@ -43,7 +43,7 @@ torch-audiomentations = "~=0.11" sentence-transformers = "~=3.1" jiwer = "~=3.0" speechbrain = "~=1" -pyav = "~=13" +pyav = "~=14.0" pyannote-audio = "~=3.3" pycountry = "~=24.6" types-requests = "~=2.32" @@ -130,10 +130,7 @@ target-version = "py310" [tool.ruff.lint] select = ["ANN", "D", "E", "F", "I"] -ignore = [ - "ANN101", # self should not be annotated. - "ANN102" # cls should not be annotated. -] +ignore = [] fixable = ["ALL"] unfixable = [] dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" diff --git a/src/senselab/text/tasks/evaluate_conversation/metrics.py b/src/senselab/text/tasks/evaluate_conversation/metrics.py index 774d2559..2a949c0f 100644 --- a/src/senselab/text/tasks/evaluate_conversation/metrics.py +++ b/src/senselab/text/tasks/evaluate_conversation/metrics.py @@ -27,9 +27,9 @@ def Rouge(*args: List, **kwargs: Dict) -> rouge_scorer.RougeScorer: Rouge.__doc__ = rouge_scorer.RougeScorer.__doc__ -def sentence_bleu_sacre(*args: List, **kwargs: Dict) -> BLEUScore: +def sentence_bleu_sacre(*args: List) -> List[BLEUScore]: """Wrapper for sacrebleu's sentence_bleu function.""" - return sb.sentence_bleu(*args, **kwargs) + return [sb.sentence_bleu(str(item), str(ref)) for item, ref in args] sentence_bleu_sacre.__doc__ = sb.sentence_bleu.__doc__ diff --git a/src/senselab/text/tasks/llms/transcript_manager.py b/src/senselab/text/tasks/llms/transcript_manager.py index 729c7840..ef807a1d 100644 --- a/src/senselab/text/tasks/llms/transcript_manager.py +++ b/src/senselab/text/tasks/llms/transcript_manager.py @@ -27,7 +27,7 @@ class Transcript: convert_json_to_scriptlines(json_obj: Dict) -> List[ScriptLine]: Converts transcript format to LLM format. """ - def __init__(self, transcript_path: Path) -> None: + def __init__(self: "Transcript", transcript_path: Path) -> None: """Initializes the manager with a transcript file path. Args: @@ -38,12 +38,12 @@ def __init__(self, transcript_path: Path) -> None: json_obj = self._load_transcript(transcript_path) self.scriptlines = self.convert_json_to_scriptlines(json_obj) - def print_human_readable(self) -> None: + def print_human_readable(self: "Transcript") -> None: """Prints the stored scriptlines in a human-readable format.""" for message in self.scriptlines: print(f"{message.speaker}:\t\t{message.text}\n") - def get_num_tokens(self) -> int: + def get_num_tokens(self: "Transcript") -> int: """Returns the total number of OpenAI tokens in the conversation. Returns: @@ -56,7 +56,7 @@ def get_num_tokens(self) -> int: c += len(encoding.encode(message.text)) return c - def extract_response_opportunities(self) -> List[List[ScriptLine]]: + def extract_response_opportunities(self: "Transcript") -> List[List[ScriptLine]]: """Extract consecutive sublists from the messages list, ending after every 'user' response. This is used to compare AI responses to a human's response @@ -71,7 +71,7 @@ def extract_response_opportunities(self) -> List[List[ScriptLine]]: sublists = [] for i, message in enumerate(self.scriptlines): - if message.speaker == "user" and i > 0: + if message.speaker == "user" and i >= 0: sublist = self.scriptlines[0 : i + 1] sublists.append(sublist) diff --git a/src/senselab/utils/data_structures/device.py b/src/senselab/utils/data_structures/device.py index 71fc2619..e917b606 100644 --- a/src/senselab/utils/data_structures/device.py +++ b/src/senselab/utils/data_structures/device.py @@ -9,9 +9,9 @@ class DeviceType(Enum): """Device types for PyTorch operations.""" - CPU: str = "cpu" - CUDA: str = "cuda" - MPS: str = "mps" + CPU = "cpu" + CUDA = "cuda" + MPS = "mps" DTYPE_MAP = {DeviceType.CPU: torch.float32, DeviceType.CUDA: torch.float16, DeviceType.MPS: torch.float32} diff --git a/src/senselab/utils/data_structures/model.py b/src/senselab/utils/data_structures/model.py index 2111b03f..000af2a2 100644 --- a/src/senselab/utils/data_structures/model.py +++ b/src/senselab/utils/data_structures/model.py @@ -153,7 +153,10 @@ def check_hf_repo_exists(repo_id: str, revision: str = "main", repo_type: str = """Private function to check if a Hugging Face repository exists.""" api = HfApi() try: - api.list_repo_commits(repo_id=repo_id, revision=revision, repo_type=repo_type) + if repo_type == "model": + api.model_info(repo_id=repo_id, revision=revision) + else: + api.list_repo_commits(repo_id=repo_id, revision=revision, repo_type=repo_type) return True except Exception: # raise RuntimeError(f"An error occurred: {e}") diff --git a/src/tests/text/tasks/transcript_manager_test.py b/src/tests/text/tasks/transcript_manager_test.py index f49b2407..55ffe47a 100644 --- a/src/tests/text/tasks/transcript_manager_test.py +++ b/src/tests/text/tasks/transcript_manager_test.py @@ -94,7 +94,9 @@ def test_get_num_tokens(sample_transcript: Path) -> None: def test_response_opportunities_extraction(sample_transcript: Path) -> None: """Test the extraction of response opportunities.""" transcript = Transcript(sample_transcript) + print(transcript) opportunities = transcript.extract_response_opportunities() + print(opportunities) assert len(opportunities) == 2, "Expected two response opportunities" assert opportunities[0][-1].speaker == "user", "Expected last message to be first message from user" diff --git a/src/tests/utils/data_structures/model_test.py b/src/tests/utils/data_structures/model_test.py index 5ee14b6e..d240cc2c 100644 --- a/src/tests/utils/data_structures/model_test.py +++ b/src/tests/utils/data_structures/model_test.py @@ -9,7 +9,7 @@ def test_check_hf_repo_exists_true() -> None: """Test HF repo exists.""" - with patch("huggingface_hub.HfApi.list_repo_commits") as mock_list_repo_commits: + with patch("huggingface_hub.HfApi.model_info") as mock_list_repo_commits: mock_list_repo_commits.return_value = True assert check_hf_repo_exists("valid_repo") is True diff --git a/tutorials/audio/00_getting_started.ipynb b/tutorials/audio/00_getting_started.ipynb index b3d431e2..12e57f5f 100644 --- a/tutorials/audio/00_getting_started.ipynb +++ b/tutorials/audio/00_getting_started.ipynb @@ -6,7 +6,7 @@ "source": [ "# Getting Started with ```senselab```\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sensein/senselab/blob/main/tutorials/audiogetting_started.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sensein/senselab/blob/main/tutorials/audio/00_getting_started.ipynb)\n", "\n", "\n", "Welcome to the `senselab` quick start tutorial! \n", @@ -35,7 +35,7 @@ }, "outputs": [], "source": [ - "pip install senselab" + "%pip install senselab" ] }, { @@ -48,14 +48,17 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from senselab.audio.data_structures import Audio\n", + "!mkdir -p tutorial_audio_files\n", + "!wget -O tutorial_audio_files/audio_48khz_mono_16bits.wav https://github.com/sensein/senselab/raw/main/src/tests/data_for_testing/audio_48khz_mono_16bits.wav\n", + "!wget -O tutorial_audio_files/audio_48khz_stereo_16bits.wav https://github.com/sensein/senselab/raw/main/src/tests/data_for_testing/audio_48khz_stereo_16bits.wav\n", "\n", - "MONO_AUDIO_PATH = \"../../src/tests/data_for_testing/audio_48khz_mono_16bits.wav\"\n", - "STEREO_AUDIO_PATH = \"../../src/tests/data_for_testing/audio_48khz_stereo_16bits.wav\"\n", + "MONO_AUDIO_PATH = \"tutorial_audio_files/audio_48khz_mono_16bits.wav\"\n", + "STEREO_AUDIO_PATH = \"tutorial_audio_files/audio_48khz_stereo_16bits.wav\"\n", "\n", "audio1 = Audio.from_filepath(MONO_AUDIO_PATH)\n", "audio2 = Audio.from_filepath(STEREO_AUDIO_PATH)" @@ -71,9 +74,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The original audio has 2 channels.\n", + "The downmixed audio has 1 channels.\n" + ] + } + ], "source": [ "from senselab.audio.tasks.preprocessing import downmix_audios_to_mono\n", "\n", @@ -331,7 +343,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.12.0" } }, "nbformat": 4, diff --git a/tutorials/audio/audio_data_augmentation.ipynb b/tutorials/audio/audio_data_augmentation.ipynb index 589ba656..3da79bb5 100644 --- a/tutorials/audio/audio_data_augmentation.ipynb +++ b/tutorials/audio/audio_data_augmentation.ipynb @@ -16,7 +16,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We start by importing the modules required for the augmentation, plotting, and audio processing tasks." + "First, we should install senselab if it has not already been installed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install senselab" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we start by importing the modules required for the augmentation, plotting, and audio processing tasks." ] }, { @@ -78,7 +94,10 @@ "outputs": [], "source": [ "# Load an audio file\n", - "audio = Audio.from_filepath(\"../../src/tests/data_for_testing/audio_48khz_mono_16bits.wav\")\n", + "!mkdir -p tutorial_audio_files\n", + "!wget -O tutorial_audio_files/audio_48khz_mono_16bits.wav https://github.com/sensein/senselab/raw/main/src/tests/data_for_testing/audio_48khz_mono_16bits.wav\n", + "\n", + "audio = Audio.from_filepath(\"tutorial_audio_files/audio_48khz_mono_16bits.wav\")\n", "\n", "# Play the audio\n", "play_audio(audio)\n", @@ -145,7 +164,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.12.0" } }, "nbformat": 4, diff --git a/tutorials/audio/extract_speaker_embeddings.ipynb b/tutorials/audio/extract_speaker_embeddings.ipynb index 2ea3c0aa..5b7b4e92 100644 --- a/tutorials/audio/extract_speaker_embeddings.ipynb +++ b/tutorials/audio/extract_speaker_embeddings.ipynb @@ -17,6 +17,15 @@ "First, let's import the necessary libraries and the function we'll be using." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install senselab" + ] + }, { "cell_type": "code", "execution_count": null, @@ -45,12 +54,16 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "audio1 = Audio.from_filepath(\"../../src/tests/data_for_testing/audio_48khz_mono_16bits.wav\")\n", - "audio2 = Audio.from_filepath(\"../../src/tests/data_for_testing/audio_48khz_stereo_16bits.wav\")\n", + "!mkdir -p tutorial_audio_files\n", + "!wget -O tutorial_audio_files/audio_48khz_mono_16bits.wav https://github.com/sensein/senselab/raw/main/src/tests/data_for_testing/audio_48khz_mono_16bits.wav\n", + "!wget -O tutorial_audio_files/audio_48khz_stereo_16bits.wav https://github.com/sensein/senselab/raw/main/src/tests/data_for_testing/audio_48khz_stereo_16bits.wav\n", + "\n", + "audio1 = Audio.from_filepath(\"tutorial_audio_files/audio_48khz_mono_16bits.wav\")\n", + "audio2 = Audio.from_filepath(\"tutorial_audio_files/audio_48khz_stereo_16bits.wav\")\n", "\n", "# Downmix to mono\n", "audio2 = downmix_audios_to_mono([audio2])[0]\n", @@ -99,7 +112,7 @@ "metadata": {}, "outputs": [], "source": [ - "from senselab.utils.tasks.cosine_similarity import cosine_similarity\n", + "from senselab.utils.tasks.cosine_similarity import compute_cosine_similarity\n", "\n", "\n", "# DIRECTLY PLOT THE EMBEDDINGS FOR THE TWO FILES\n", @@ -129,7 +142,7 @@ " \n", " for i in range(n):\n", " for j in range(n):\n", - " similarity_matrix[i, j] = cosine_similarity(embeddings[i], embeddings[j])\n", + " similarity_matrix[i, j] = compute_cosine_similarity(embeddings[i], embeddings[j])\n", " \n", " fig, ax = plt.subplots(figsize=(8, 6))\n", " im = ax.imshow(similarity_matrix, cmap='coolwarm', vmin=-1, vmax=1)\n", @@ -174,7 +187,7 @@ ], "metadata": { "kernelspec": { - "display_name": "senselab", + "display_name": "senselab-KP8v1V64-py3.10", "language": "python", "name": "python3" }, @@ -188,7 +201,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.0" + "version": "3.10.10" } }, "nbformat": 4, diff --git a/tutorials/audio/features_extraction.ipynb b/tutorials/audio/features_extraction.ipynb index 737dfdaf..147c11d6 100644 --- a/tutorials/audio/features_extraction.ipynb +++ b/tutorials/audio/features_extraction.ipynb @@ -12,6 +12,15 @@ "In this tutorial, we will explore how to extract some audio descriptors with the `senselab` package. Descriptors include acoustic and quality measures and are extracted with different libraries. " ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install senselab" + ] + }, { "cell_type": "code", "execution_count": null, @@ -30,8 +39,11 @@ "metadata": {}, "outputs": [], "source": [ + "!mkdir -p tutorial_audio_files\n", + "!wget -O tutorial_audio_files/audio_48khz_stereo_16bits.wav https://github.com/sensein/senselab/raw/main/src/tests/data_for_testing/audio_48khz_stereo_16bits.wav\n", + "\n", "# Load audio\n", - "audio2 = Audio.from_filepath(\"../../src/tests/data_for_testing/audio_48khz_stereo_16bits.wav\")\n", + "audio2 = Audio.from_filepath(\"tutorial_audio_files/audio_48khz_stereo_16bits.wav\")\n", "\n", "# Downmix to mono\n", "audio2 = downmix_audios_to_mono([audio2])[0]\n", diff --git a/tutorials/audio/speaker_diarization.ipynb b/tutorials/audio/speaker_diarization.ipynb index c6f2f5a4..1b7a03ff 100644 --- a/tutorials/audio/speaker_diarization.ipynb +++ b/tutorials/audio/speaker_diarization.ipynb @@ -11,6 +11,15 @@ "This tutorial demonstrates how to use the `diarize_audios` function to perform speaker diarization on some audio files, which means to segment the audio into multiple speakers." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install senselab" + ] + }, { "cell_type": "code", "execution_count": null, @@ -46,7 +55,10 @@ "outputs": [], "source": [ "# Load an audio file from the specified file path into an Audio object.\n", - "audio = Audio.from_filepath(\"../../src/tests/data_for_testing/audio_48khz_mono_16bits.wav\")\n", + "!mkdir -p tutorial_audio_files\n", + "!wget -O tutorial_audio_files/audio_48khz_mono_16bits.wav https://github.com/sensein/senselab/raw/main/src/tests/data_for_testing/audio_48khz_mono_16bits.wav\n", + "\n", + "audio = Audio.from_filepath(\"tutorial_audio_files/audio_48khz_mono_16bits.wav\")\n", "\n", "# Resample the audio to 16kHz, as this is the expected input format for the model.\n", "# The resample_audios function returns a list, so we take the first (and only) element.\n", diff --git a/tutorials/audio/speaker_verification.ipynb b/tutorials/audio/speaker_verification.ipynb index e4ec22d6..bb588ee4 100644 --- a/tutorials/audio/speaker_verification.ipynb +++ b/tutorials/audio/speaker_verification.ipynb @@ -4,8 +4,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "# Speaker Verification\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sensein/senselab/blob/main/tutorials/audio/speaker_verification.ipynb)\n", "\n", - "#### Speaker Verification\n", "Speaker Verification is a process in which an audio system determines whether a given set of speech samples are from the same speaker. This technology is widely used in various applications such as security systems, authentication processes, and personalized user experiences. The core concept revolves around comparing voice characteristics extracted from speech samples to verify the identity of the speaker.\n", "\n", "Speaker verification can be done in SenseLab as follows:" @@ -15,28 +17,67 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mThe kernel failed to start as '/Users/isaacbevers/Library/Python/3.10/lib/python/site-packages/psutil/_psutil_osx.abi3.so' could not be imported from '/Users/isaacbevers/Library/Python/3.10/lib/python/site-packages/psutil/_psutil_osx.abi3.so, 0x0002'.\n", - "\u001b[1;31mClick here for more info." - ] - } - ], + "outputs": [], + "source": [ + "%pip install senselab" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Import necessary libraries\n", "from senselab.audio.data_structures import Audio\n", "from senselab.audio.tasks.speaker_verification.speaker_verification import verify_speaker\n", + "from senselab.audio.tasks.preprocessing import extract_segments, resample_audios\n", + "from senselab.audio.tasks.plotting import play_audio\n", "\n", - "# Create two audio samples (dummy data for illustration purposes)\n", - "audio1 = Audio(signal=[0.1, 0.2, 0.3], sampling_rate=16000)\n", - "audio2 = Audio(signal=[0.1, 0.2, 0.3], sampling_rate=16000)\n", + "# Download an audio file for testing\n", + "!mkdir -p tutorial_audio_files\n", + "!wget -O tutorial_audio_files/audio_48khz_mono_16bits.wav https://github.com/sensein/senselab/raw/main/src/tests/data_for_testing/audio_48khz_mono_16bits.wav\n", "\n", + "# Load an audio file from the specified file path into an Audio object.\n", + "audio = Audio.from_filepath(\"tutorial_audio_files/audio_48khz_mono_16bits.wav\")\n", + "\n", + "# Resample the audio to 16 kHz\n", + "audio = resample_audios([audio], 16000)[0]\n", + "\n", + "# Clip the audio into two different speakers\n", + "audio_clips = extract_segments([(audio, [(0, 0.8), (0.9, 1.8)])])\n", + "audio1 = audio_clips[0][0]\n", + "audio2 = audio_clips[0][1]\n", + "\n", + "# Display the audio clips\n", + "play_audio(audio1)\n", + "play_audio(audio2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can ask senselab to verify whether the two clips are of the same speaker or not. If we give it the same audio clip twice, we should expect a high verification score, and with the two different clips, we should expect a low verification score. Let's see what happens." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Verification Score: 0.9999999403953552, Same Speaker: True\n", + "Verification Score: -0.06414839625358582, Same Speaker: False\n" + ] + } + ], + "source": [ "# List of audio pairs to compare\n", - "audio_pairs = [(audio1, audio2)]\n", + "audio_pairs = [(audio1, audio1), (audio1, audio2)]\n", "\n", "# Verify if the audios are from the same speaker\n", "results = verify_speaker(audio_pairs)\n", @@ -75,8 +116,16 @@ "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.10.11" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" } }, "nbformat": 4, diff --git a/tutorials/audio/speech_enhancement.ipynb b/tutorials/audio/speech_enhancement.ipynb index a8dc459a..2673b2aa 100644 --- a/tutorials/audio/speech_enhancement.ipynb +++ b/tutorials/audio/speech_enhancement.ipynb @@ -13,6 +13,15 @@ "We will show you how to use the [Speformer model (speechbrain/sepformer-wham16k-enhancement)](https://huggingface.co/speechbrain/sepformer-wham16k-enhancement)." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install senselab" + ] + }, { "cell_type": "code", "execution_count": null, @@ -33,8 +42,11 @@ "metadata": {}, "outputs": [], "source": [ + "!mkdir -p tutorial_audio_files\n", + "!wget -O tutorial_audio_files/audio_48khz_mono_16bits.wav https://github.com/sensein/senselab/raw/main/src/tests/data_for_testing/audio_48khz_mono_16bits.wav\n", + "\n", "# Load an audio file from the specified file path\n", - "audio = Audio.from_filepath(\"../../src/tests/data_for_testing/audio_48khz_mono_16bits.wav\")\n", + "audio = Audio.from_filepath(\"tutorial_audio_files/audio_48khz_mono_16bits.wav\")\n", "\n", "# Resample the audio to 16kHz to match the model's expected input format\n", "audio = resample_audios([audio], 16000)[0]\n", @@ -45,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -72,13 +84,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Play the enhanced audio to hear the result after speech enhancement\n", "play_audio(enhanced_audio)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -97,7 +134,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.12.0" } }, "nbformat": 4, diff --git a/tutorials/audio/speech_to_text.ipynb b/tutorials/audio/speech_to_text.ipynb index 21034303..b4f8b49b 100644 --- a/tutorials/audio/speech_to_text.ipynb +++ b/tutorials/audio/speech_to_text.ipynb @@ -19,6 +19,15 @@ "First, let's import the necessary libraries and the function we'll be using." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install senselab" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -61,12 +70,16 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "audio1 = Audio.from_filepath(\"../../src/tests/data_for_testing/audio_48khz_mono_16bits.wav\")\n", - "audio2 = Audio.from_filepath(\"../../src/tests/data_for_testing/audio_48khz_stereo_16bits.wav\")" + "!mkdir -p tutorial_audio_files\n", + "!wget -O tutorial_audio_files/audio_48khz_mono_16bits.wav https://github.com/sensein/senselab/raw/main/src/tests/data_for_testing/audio_48khz_mono_16bits.wav\n", + "!wget -O tutorial_audio_files/audio_48khz_stereo_16bits.wav https://github.com/sensein/senselab/raw/main/src/tests/data_for_testing/audio_48khz_stereo_16bits.wav\n", + "\n", + "audio1 = Audio.from_filepath(\"tutorial_audio_files/audio_48khz_mono_16bits.wav\")\n", + "audio2 = Audio.from_filepath(\"tutorial_audio_files/audio_48khz_stereo_16bits.wav\")" ] }, { diff --git a/tutorials/audio/text_to_speech.ipynb b/tutorials/audio/text_to_speech.ipynb index 3e8dff0d..3e050565 100644 --- a/tutorials/audio/text_to_speech.ipynb +++ b/tutorials/audio/text_to_speech.ipynb @@ -21,6 +21,15 @@ "The very first example shows how to use ```facebook/mms-tts-eng``` which just requires as input the list of pieces of text that you want to synthetize." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install senselab" + ] + }, { "cell_type": "code", "execution_count": null, @@ -180,7 +189,10 @@ "metadata": {}, "outputs": [], "source": [ - "audio = Audio.from_filepath(\"../../src/tests/data_for_testing/audio_48khz_mono_16bits.wav\")\n", + "!mkdir -p tutorial_audio_files\n", + "!wget -O tutorial_audio_files/audio_48khz_mono_16bits.wav https://github.com/sensein/senselab/raw/main/src/tests/data_for_testing/audio_48khz_mono_16bits.wav\n", + "\n", + "audio = Audio.from_filepath(\"tutorial_audio_files/audio_48khz_mono_16bits.wav\")\n", "ground_truth = \"This is Peter.\"\n", "audio = extract_segments([(audio, [(0.0, 1.0)])])[0][0]" ] diff --git a/tutorials/audio/transcribe_timestamped.ipynb b/tutorials/audio/transcribe_timestamped.ipynb index ce9da3b4..daa18363 100644 --- a/tutorials/audio/transcribe_timestamped.ipynb +++ b/tutorials/audio/transcribe_timestamped.ipynb @@ -12,6 +12,15 @@ "The transcribe_timestamped function processes audio files, transcribes the spoken content, and aligns the transcriptions with the audio to provide timestamps for each segment of text. This can be particularly useful for creating subtitles, indexing audio content, or analyzing speech patterns." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install senselab" + ] + }, { "cell_type": "code", "execution_count": null, @@ -40,8 +49,11 @@ "metadata": {}, "outputs": [], "source": [ + "!mkdir -p tutorial_audio_files\n", + "!wget -O tutorial_audio_files/audio_48khz_mono_16bits.wav https://github.com/sensein/senselab/raw/main/src/tests/data_for_testing/audio_48khz_mono_16bits.wav\n", + "\n", "# Load an audio file from the specified file path\n", - "audio = Audio.from_filepath(\"../../src/tests/data_for_testing/audio_48khz_mono_16bits.wav\")\n", + "audio = Audio.from_filepath(\"tutorial_audio_files/audio_48khz_mono_16bits.wav\")\n", "\n", "# Resample the audio to 16kHz to match the model's expected input format\n", "audio = resample_audios([audio], 16000)[0]\n", diff --git a/tutorials/audio/tutorial_audio_files/audio_48khz_mono_16bits.wav b/tutorials/audio/tutorial_audio_files/audio_48khz_mono_16bits.wav new file mode 100644 index 00000000..3d3f3f70 Binary files /dev/null and b/tutorials/audio/tutorial_audio_files/audio_48khz_mono_16bits.wav differ diff --git a/tutorials/audio/tutorial_audio_files/audio_48khz_stereo_16bits.wav b/tutorials/audio/tutorial_audio_files/audio_48khz_stereo_16bits.wav new file mode 100644 index 00000000..9a578f60 Binary files /dev/null and b/tutorials/audio/tutorial_audio_files/audio_48khz_stereo_16bits.wav differ diff --git a/tutorials/audio/voice_activity_detection.ipynb b/tutorials/audio/voice_activity_detection.ipynb index 7e05a534..b07aecfb 100644 --- a/tutorials/audio/voice_activity_detection.ipynb +++ b/tutorials/audio/voice_activity_detection.ipynb @@ -13,6 +13,15 @@ "Specifically, will show you how to use the `pyannote/speaker-diarization-3.1` model (https://huggingface.co/pyannote/speaker-diarization-3.1)." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install senselab" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -47,8 +56,11 @@ "metadata": {}, "outputs": [], "source": [ + "!mkdir -p tutorial_audio_files\n", + "!wget -O tutorial_audio_files/audio_48khz_mono_16bits.wav https://github.com/sensein/senselab/raw/main/src/tests/data_for_testing/audio_48khz_mono_16bits.wav\n", + "\n", "# Load an audio file from the specified file path into an Audio object.\n", - "audio = Audio.from_filepath(\"../../src/tests/data_for_testing/audio_48khz_mono_16bits.wav\")\n", + "audio = Audio.from_filepath(\"tutorial_audio_files/audio_48khz_mono_16bits.wav\")\n", "\n", "# Resample the audio to 16kHz, as this is the expected input format for the model.\n", "# The resample_audios function returns a list, so we take the first (and only) element.\n", diff --git a/tutorials/audio/voice_cloning.ipynb b/tutorials/audio/voice_cloning.ipynb index b8c3901b..0dcde037 100644 --- a/tutorials/audio/voice_cloning.ipynb +++ b/tutorials/audio/voice_cloning.ipynb @@ -5,7 +5,7 @@ "metadata": {}, "source": [ "# Voice cloning\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sensein/senselab/blob/main/tutorials/audiovoice_cloning.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sensein/senselab/blob/main/tutorials/audio/voice_cloning.ipynb)\n", "\n", "This tutorial demonstrates how to use the `clone_voices` function from the `senselab` library to convert someone's speech into another person's voice. Currently, `senselab` includes the `KNNVC` model for voice cloning. In this tutorial, we will see how to use it." ] @@ -18,6 +18,15 @@ "First, we need to import the necessary modules and classes from the `senselab` package." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install senselab" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -62,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -101,7 +110,10 @@ } ], "source": [ - "audio = Audio.from_filepath(\"../../src/tests/data_for_testing/audio_48khz_mono_16bits.wav\")\n", + "!mkdir -p tutorial_audio_files\n", + "!wget -O tutorial_audio_files/audio_48khz_mono_16bits.wav https://github.com/sensein/senselab/raw/main/src/tests/data_for_testing/audio_48khz_mono_16bits.wav\n", + "\n", + "audio = Audio.from_filepath(\"tutorial_audio_files/audio_48khz_mono_16bits.wav\")\n", "\n", "# Resample the audio to 16kHz\n", "audio = resample_audios([audio], 16000)[0]\n",