restructuring audio feats extraction and relative docs #2
fabiocat93 authored and brukew committed Nov 19, 2024
1 parent e8b7e11 commit b0e7692
Showing 11 changed files with 1,221 additions and 2,567 deletions.
487 changes: 449 additions & 38 deletions src/senselab/audio/tasks/features_extraction/api.py

Large diffs are not rendered by default.

8 changes: 2 additions & 6 deletions src/senselab/audio/tasks/features_extraction/doc.md
@@ -6,10 +6,6 @@

## Task Overview

- This module provides the API of the senselab audio features extraction.
+ This module provides the API of the `senselab` audio features extraction.

- Features can be extracted using `opensmile`, `praat-parselmouth`, `torchaudio`, and `torchaudio-squim`.
- We are working to facilitate the way to extract features in a meaningful way.
- Also, we are working to optimize these utilities.
-
- **STAY TUNED**.
+ Features can be extracted using `opensmile`, `praat-parselmouth`, `torchaudio`, and `torchaudio-squim`. More details are in the functions and in the tutorial.
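
A minimal sketch of the calling convention shared by the extractors in this package (extractor names and the `Audio` import are taken from the diffs below; the `Audio` constructor shown is an assumption, since the `Audio` API is not part of this commit):

```python
from senselab.audio.data_structures import Audio
from senselab.audio.tasks.features_extraction.opensmile import (
    extract_opensmile_features_from_audios,
)

# Hypothetical constructor: the exact Audio API is not shown in this commit.
audios = [Audio(filepath="sample.wav")]

# Each extractor takes a List[Audio] and returns one feature dict per audio.
features = extract_opensmile_features_from_audios(audios)
print(features[0])
```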
22 changes: 15 additions & 7 deletions src/senselab/audio/tasks/features_extraction/opensmile.py
@@ -3,6 +3,7 @@
from typing import Any, Dict, List

import opensmile
+ import torch

from senselab.audio.data_structures import Audio

@@ -41,8 +42,10 @@ def extract_opensmile_features_from_audios(
Args:
audios (List[Audio]): The list of audio objects to extract features from.
- feature_set (str): The openSMILE feature set (default is "eGeMAPSv02").
- feature_level (str): The openSMILE feature level (default is "Functionals").
+ feature_set (str): The openSMILE feature set
+ (default is "eGeMAPSv02". The alternatives include "ComParE_2016").
+ feature_level (str): The openSMILE feature level
+ (default is "Functionals". The alternative is "LowLevelDescriptors").
Returns:
List[Dict[str, Any]]: The list of feature dictionaries for each audio.
@@ -60,11 +63,16 @@ def _extract_feats_from_audio(sample: Audio, smile: opensmile.Smile) -> Dict[str
"""
audio_array = sample.waveform.squeeze().numpy()
sampling_rate = sample.sampling_rate
- sample_features = smile.process_signal(audio_array, sampling_rate)
- # Convert to a dictionary with float values and return it
- return {
- k: v[0] if isinstance(v, list) and len(v) == 1 else v for k, v in sample_features.to_dict("list").items()
- }
+ try:
+ sample_features = smile.process_signal(audio_array, sampling_rate)
+ # Convert to a dictionary with float values and return it
+ return {
+ k: v[0] if isinstance(v, list) and len(v) == 1 else v
+ for k, v in sample_features.to_dict("list").items()
+ }
+ except Exception as e:
+ print(f"Error processing sample {sample.orig_path_or_id}: {e}")
+ return {feature: torch.nan for feature in smile.feature_names}

smile = OpenSmileFeatureExtractorFactory.get_opensmile_extractor(feature_set, feature_level)
features = [_extract_feats_from_audio(audio, smile) for audio in audios]
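With the new `try`/`except` guard, one failing sample no longer aborts the batch; it comes back as a dict mapping every feature name to NaN. A sketch of how a caller might use this, reusing `audios` from the earlier sketch:

```python
import math

from senselab.audio.tasks.features_extraction.opensmile import (
    extract_opensmile_features_from_audios,
)

features = extract_opensmile_features_from_audios(
    audios,  # assumed List[Audio], as in the earlier sketch
    feature_set="eGeMAPSv02",     # alternative: "ComParE_2016"
    feature_level="Functionals",  # alternative: "LowLevelDescriptors"
)

# Failed samples map every feature to NaN, so they can be filtered out
# instead of crashing the pipeline.
valid = [
    f for f in features
    if not all(isinstance(v, float) and math.isnan(v) for v in f.values())
]
```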
510 changes: 313 additions & 197 deletions src/senselab/audio/tasks/features_extraction/praat_parselmouth.py

Large diffs are not rendered by default.

156 changes: 117 additions & 39 deletions src/senselab/audio/tasks/features_extraction/torchaudio.py
@@ -37,7 +37,10 @@ def extract_spectrogram_from_audios(
)
spectrograms = []
for audio in audios:
- spectrograms.append({"spectrogram": spectrogram(audio.waveform).squeeze(0)})
+ try:
+ spectrograms.append({"spectrogram": spectrogram(audio.waveform).squeeze(0)})
+ except RuntimeError:
+ spectrograms.append({"spectrogram": torch.nan})
return spectrograms


@@ -69,14 +72,17 @@ def extract_mel_spectrogram_from_audios(
raise ValueError("win_length cannot be None")
mel_spectrograms = []
for audio in audios:
- mel_spectrogram = torchaudio.transforms.MelSpectrogram(
- sample_rate=audio.sampling_rate,
- n_fft=n_fft,
- win_length=win_length,
- hop_length=hop_length,
- n_mels=n_mels,
- )(audio.waveform)
- mel_spectrograms.append({"mel_spectrogram": mel_spectrogram.squeeze(0)})
+ try:
+ mel_spectrogram = torchaudio.transforms.MelSpectrogram(
+ sample_rate=audio.sampling_rate,
+ n_fft=n_fft,
+ win_length=win_length,
+ hop_length=hop_length,
+ n_mels=n_mels,
+ )(audio.waveform)
+ mel_spectrograms.append({"mel_spectrogram": mel_spectrogram.squeeze(0)})
+ except RuntimeError:
+ mel_spectrograms.append({"mel_spectrogram": torch.nan})
return mel_spectrograms


@@ -110,20 +116,22 @@ def extract_mfcc_from_audios(
raise ValueError("win_length cannot be None")
mfccs = []
for audio in audios:
- mfcc_transform = torchaudio.transforms.MFCC(
- sample_rate=audio.sampling_rate,
- n_mfcc=n_mfcc,
- melkwargs={"n_fft": n_ftt, "win_length": win_length, "hop_length": hop_length, "n_mels": n_mels},
- )
- mfccs.append({"mfcc": mfcc_transform(audio.waveform).squeeze(0)})
+ try:
+ mfcc_transform = torchaudio.transforms.MFCC(
+ sample_rate=audio.sampling_rate,
+ n_mfcc=n_mfcc,
+ melkwargs={"n_fft": n_ftt, "win_length": win_length, "hop_length": hop_length, "n_mels": n_mels},
+ )
+ mfccs.append({"mfcc": mfcc_transform(audio.waveform).squeeze(0)})
+ except RuntimeError:
+ mfccs.append({"mfcc": torch.nan})
return mfccs


def extract_mel_filter_bank_from_audios(
audios: List[Audio],
n_mels: int = 128,
- n_stft: int = 201,
- n_fft: int = 400,
+ n_fft: int = 1024,
win_length: Optional[int] = None,
hop_length: Optional[int] = None,
) -> List[Dict[str, torch.Tensor]]:
@@ -132,8 +140,7 @@ def extract_mel_filter_bank_from_audios(
Args:
audios (List[Audio]): List of Audio objects.
n_mels (int): Number of mel filter banks. Default is 128.
- n_stft (int): Number of bins in STFT. Default is 201.
- n_fft (int): Size of FFT, creates n_fft // 2 + 1 bins. Default is 400.
+ n_fft (int): Size of FFT, creates n_fft // 2 + 1 bins. Default is 1024.
win_length (int): Window size. Default is None, using n_fft.
hop_length (int): Length of hop between STFT windows. Default is None, using win_length // 2.
@@ -144,20 +151,24 @@
win_length = n_fft
if hop_length is None:
hop_length = win_length // 2
+ n_stft = n_fft // 2 + 1

spectrograms = extract_spectrogram_from_audios(audios, n_fft, win_length, hop_length)

mel_filter_banks = []
for i, audio in enumerate(audios):
- melscale_transform = torchaudio.transforms.MelScale(
- sample_rate=audio.sampling_rate, n_mels=n_mels, n_stft=n_stft
- )
- mel_filter_banks.append({"mel_filter_bank": melscale_transform(spectrograms[i]["spectrogram"]).squeeze(0)})
+ try:
+ melscale_transform = torchaudio.transforms.MelScale(
+ sample_rate=audio.sampling_rate, n_mels=n_mels, n_stft=n_stft
+ )
+ mel_filter_banks.append({"mel_filter_bank": melscale_transform(spectrograms[i]["spectrogram"]).squeeze(0)})
+ except RuntimeError:
+ mel_filter_banks.append({"mel_filter_bank": torch.nan})
return mel_filter_banks
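
Dropping the `n_stft` parameter removes a footgun: an STFT of size `n_fft` produces `n_fft // 2 + 1` frequency bins, and `MelScale` must be built with exactly that bin count. A quick shape check of the relationship (a standalone sketch with arbitrary values, using plain `torchaudio`):

```python
import torch
import torchaudio

n_fft = 1024
waveform = torch.randn(1, 16000)  # 1 s of mono audio at 16 kHz

spec = torchaudio.transforms.Spectrogram(n_fft=n_fft)(waveform)
assert spec.shape[-2] == n_fft // 2 + 1  # 513 frequency bins

mel = torchaudio.transforms.MelScale(
    sample_rate=16000, n_mels=128, n_stft=n_fft // 2 + 1
)(spec)
print(mel.shape)  # (1, 128, num_frames)
```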


def extract_pitch_from_audios(
- audios: List[Audio], freq_low: int = 85, freq_high: int = 3400
+ audios: List[Audio], freq_low: int = 80, freq_high: int = 500
) -> List[Dict[str, torch.Tensor]]:
"""Extract pitch from a list of audio objects.
@@ -166,29 +177,56 @@
Args:
audios (List[Audio]): List of Audio objects.
- freq_low (int): Lowest frequency that can be detected (Hz). Default is 85.
- freq_high (int): Highest frequency that can be detected (Hz). Default is 3400.
+ freq_low (int): Lowest frequency that can be detected (Hz). Should be bigger than 0.
+ (Default is 80).
+ freq_high (int): Highest frequency that can be detected (Hz).
+ (Default is 500).
Returns:
List[Dict[str, torch.Tensor]]: List of Dict objects containing pitches.
"""
+ if freq_low <= 0:
+ raise ValueError("freq_low should be bigger than 0")

pitches = []
for audio in audios:
- pitches.append(
- {
- "pitch": torchaudio.functional.detect_pitch_frequency(
- audio.waveform, sample_rate=audio.sampling_rate, freq_low=freq_low, freq_high=freq_high
- ).squeeze(0)
- }
- )
+ try:
+ pitches.append(
+ {
+ "pitch": torchaudio.functional.detect_pitch_frequency(
+ audio.waveform, sample_rate=audio.sampling_rate, freq_low=freq_low, freq_high=freq_high
+ ).squeeze(0)
+ }
+ )
+ except RuntimeError:
+ pitches.append({"pitch": torch.nan})
return pitches
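
The new defaults track the typical range of speech F0 (roughly 80–500 Hz), and the explicit `ValueError` surfaces a bad `freq_low` early instead of failing deep inside `torchaudio`. A sketch, reusing `audios` from the earlier sketch:

```python
from senselab.audio.tasks.features_extraction.torchaudio import (
    extract_pitch_from_audios,
)

pitches = extract_pitch_from_audios(audios, freq_low=80, freq_high=500)
# Each entry holds a per-frame pitch tensor, or torch.nan if detection failed.
print(pitches[0]["pitch"])

# freq_low must be positive; this would raise ValueError:
# extract_pitch_from_audios(audios, freq_low=0)
```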


- def extract_torchaudio_features_from_audios(audios: List[Audio], plugin: str = "cf") -> List[Dict[str, Any]]:
+ def extract_torchaudio_features_from_audios(
+ audios: List[Audio],
+ freq_low: int = 80,
+ freq_high: int = 500,
+ n_fft: int = 1024,
+ n_mels: int = 128,
+ n_mfcc: int = 40,
+ win_length: Optional[int] = None,
+ hop_length: Optional[int] = None,
+ plugin: str = "cf",
+ ) -> List[Dict[str, Any]]:
"""Extract torchaudio features from a list of audio objects.
Args:
audios (List[Audio]): The list of audio objects to extract features from.
+ freq_low (int): Lowest frequency that can be detected (Hz). Should be bigger than 0.
+ (Default is 80).
+ freq_high (int): Highest frequency that can be detected (Hz).
+ (Default is 500).
+ n_fft (int): Size of FFT, creates n_fft // 2 + 1 bins. Default is 1024.
+ n_mels (int): Number of mel filter banks. Default is 128.
+ n_mfcc (int): Number of MFCCs. Default is 40.
+ win_length (int): Window size. Default is None, using n_fft.
+ hop_length (int): Length of hop between STFT windows. Default is None, using win_length // 2.
plugin (str): The plugin to use. Default is "cf".
Returns:
@@ -203,11 +241,51 @@ def extract_torchaudio_features_from_audios(audios: List[Audio], plugin: str = "
formatted_audios = [[audio] for audio in audios]
wf = pydra.Workflow(name="wf", input_spec=["x"])
wf.split("x", x=formatted_audios)
- wf.add(extract_pitch_from_audios_pt(name="extract_pitch_from_audios_pt", audios=wf.lzin.x))
- wf.add(extract_mel_filter_bank_from_audios_pt(name="extract_mel_filter_bank_from_audios_pt", audios=wf.lzin.x))
- wf.add(extract_mfcc_from_audios_pt(name="extract_mfcc_from_audios_pt", audios=wf.lzin.x))
- wf.add(extract_mel_spectrogram_from_audios_pt(name="extract_mel_spectrogram_from_audios_pt", audios=wf.lzin.x))
- wf.add(extract_spectrogram_from_audios_pt(name="extract_spectrogram_from_audios_pt", audios=wf.lzin.x))
+ wf.add(
+ extract_pitch_from_audios_pt(
+ name="extract_pitch_from_audios_pt", audios=wf.lzin.x, freq_low=freq_low, freq_high=freq_high
+ )
+ )
+ wf.add(
+ extract_mel_filter_bank_from_audios_pt(
+ name="extract_mel_filter_bank_from_audios_pt",
+ audios=wf.lzin.x,
+ n_mels=n_mels,
+ n_fft=n_fft,
+ win_length=win_length,
+ hop_length=hop_length,
+ )
+ )
+ wf.add(
+ extract_mfcc_from_audios_pt(
+ name="extract_mfcc_from_audios_pt",
+ audios=wf.lzin.x,
+ n_mfcc=n_mfcc,
+ n_fft=n_fft,
+ n_mels=n_mels,
+ win_length=win_length,
+ hop_length=hop_length,
+ )
+ )
+ wf.add(
+ extract_mel_spectrogram_from_audios_pt(
+ name="extract_mel_spectrogram_from_audios_pt",
+ audios=wf.lzin.x,
+ n_mels=n_mels,
+ n_fft=n_fft,
+ win_length=win_length,
+ hop_length=hop_length,
+ )
+ )
+ wf.add(
+ extract_spectrogram_from_audios_pt(
+ name="extract_spectrogram_from_audios_pt",
+ audios=wf.lzin.x,
+ n_fft=n_fft,
+ win_length=win_length,
+ hop_length=hop_length,
+ )
+ )

# setting multiple workflow outputs
wf.set_output(
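The new keyword arguments thread the spectral and pitch settings through every pydra task in the workflow. A sketch of a call that overrides the defaults (parameter names are taken from this diff; `audios` as in the earlier sketches):

```python
from senselab.audio.tasks.features_extraction.torchaudio import (
    extract_torchaudio_features_from_audios,
)

features = extract_torchaudio_features_from_audios(
    audios,
    freq_low=80,    # pitch search floor (Hz)
    freq_high=500,  # pitch search ceiling (Hz)
    n_fft=1024,     # -> 513 STFT bins
    n_mels=128,
    n_mfcc=40,
    plugin="cf",    # pydra's concurrent-futures worker
)
```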
23 changes: 15 additions & 8 deletions src/senselab/audio/tasks/features_extraction/torchaudio_squim.py
@@ -2,6 +2,7 @@

from typing import Any, Dict, List

+ import torch
from torchaudio.pipelines import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE

from senselab.audio.data_structures import Audio
@@ -30,11 +31,15 @@ def extract_objective_quality_features_from_audios(audio_list: List[Audio]) -> D
features: Dict[str, Any] = {"stoi": [], "pesq": [], "si_sdr": []}

for audio in audio_list:
- stoi, pesq, si_sdr = objective_model(audio.waveform)
- features["stoi"].append(stoi.item())
- features["pesq"].append(pesq.item())
- features["si_sdr"].append(si_sdr.item())
-
+ try:
+ stoi, pesq, si_sdr = objective_model(audio.waveform)
+ features["stoi"].append(stoi.item())
+ features["pesq"].append(pesq.item())
+ features["si_sdr"].append(si_sdr.item())
+ except ValueError:
+ features["stoi"].append(torch.nan)
+ features["pesq"].append(torch.nan)
+ features["si_sdr"].append(torch.nan)
return features


@@ -67,7 +72,9 @@ def extract_subjective_quality_features_from_audios(
features: Dict[str, Any] = {"mos": []}

for i, audio in enumerate(audio_list):
- mos = subjective_model(audio.waveform, non_matching_references[i].waveform)
- features["mos"].append(mos.item())
-
+ try:
+ mos = subjective_model(audio.waveform, non_matching_references[i].waveform)
+ features["mos"].append(mos.item())
+ except ValueError:
+ features["mos"].append(torch.nan)
return features
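
Both SQUIM extractors now degrade gracefully: a sample the model cannot score yields NaN in the corresponding slot. A sketch of the objective-metrics call (function and key names are taken from this diff; `audio_list` is assumed to be a `List[Audio]`):

```python
from senselab.audio.tasks.features_extraction.torchaudio_squim import (
    extract_objective_quality_features_from_audios,
)

# Returns parallel lists, one entry per input audio (NaN where scoring failed).
quality = extract_objective_quality_features_from_audios(audio_list)
print(quality["stoi"], quality["pesq"], quality["si_sdr"])
```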
2 changes: 1 addition & 1 deletion src/senselab/audio/workflows/__init__.py
@@ -1 +1 @@
"""Workflow for timestamped transcription."""
"""Workflows and pipelines for audio processing and analysis."""
1 change: 1 addition & 0 deletions src/senselab/audio/workflows/health_measurements/__init__.py
@@ -0,0 +1 @@
""".. include:: ./doc.md""" # noqa: D415
9 changes: 9 additions & 0 deletions src/senselab/audio/workflows/health_measurements/doc.md
@@ -0,0 +1,9 @@
+ # Measuring Health Using Speech and Voice Metrics
+
+ ## Overview
+
+ Health measurement through speech and voice analysis is an emerging interdisciplinary field with significant potential in clinical and remote health monitoring. By leveraging advances in machine learning, deep learning, and signal processing, speech and voice data can reveal a range of health indicators, and they are particularly useful for assessing mental and neurological health conditions. These metrics capture both vocal and linguistic features that can reflect emotional, cognitive, and physical states, offering insights through means less invasive than traditional methods.
+
+ In this section, you can find workflows and pipelines that aim to extract and refine useful measurements and explore their potential for quick, non-intrusive health assessments and monitoring. Integrating these speech-derived measures with broader health monitoring systems could contribute significantly to early diagnosis, personalized treatment, and ongoing health monitoring across various clinical applications.
+
+ **This piece of documentation is in progress. STAY TUNED!**