From d278fe4290bcbc66183e1b02765c21423b3388ba Mon Sep 17 00:00:00 2001 From: sammlapp Date: Tue, 23 Jan 2024 22:35:37 -0500 Subject: [PATCH 1/6] resolves #942 helpful message when index not set Also somewhat related to #865 --- opensoundscape/ml/dataloaders.py | 23 +++++++++- tests/test_dataloaders.py | 79 ++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 tests/test_dataloaders.py diff --git a/opensoundscape/ml/dataloaders.py b/opensoundscape/ml/dataloaders.py index 35e57cbe..0828b38d 100644 --- a/opensoundscape/ml/dataloaders.py +++ b/opensoundscape/ml/dataloaders.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd import warnings +from pathlib import Path from opensoundscape.utils import identity from opensoundscape.ml.safe_dataset import SafeDataset @@ -70,8 +71,15 @@ def __init__( type(samples) == pd.DataFrame and type(samples.index) == pd.core.indexes.multi.MultiIndex ): # c1 user provided multi-index df with file,start_time,end_time of clips + # raise AssertionError if first item of multi-index is not str or Path + _check_is_path(samples.index.values[0][0]) dataset = AudioFileDataset(samples=samples, preprocessor=preprocessor) - elif split_files_into_clips: # c2 user provided file list; split into + elif ( + split_files_into_clips + ): # c2 user provided file list; split each file into appropriate length clips + # raise AssertionError if first item is not str or Path + _check_is_path(samples.index.values[0]) + dataset = AudioSplittingDataset( samples=samples, preprocessor=preprocessor, @@ -79,7 +87,13 @@ def __init__( final_clip=final_clip, ) else: # c3 split_files_into_clips=False -> one sample & one prediction per file provided + # eg, each file is a 5 second clips and the model expects 5 second clips + + # raise AssertionError if first item is not str or Path + _check_is_path(samples.index.values[0]) + dataset = AudioFileDataset(samples=samples, preprocessor=preprocessor) + dataset.bypass_augmentations = bypass_augmentations if len(dataset) < 1: @@ -110,3 +124,10 @@ def __init__( self.dataset._invalid_samples = self.dataset._invalid_samples.union( dataset.invalid_samples ) + + +def _check_is_path(path): + assert isinstance(path, str) or isinstance(path, Path), ( + f"First item of multi-index must be str or Path, got {type(path)}. " + "Did you set the index correctly?" + ) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py new file mode 100644 index 00000000..62b750d0 --- /dev/null +++ b/tests/test_dataloaders.py @@ -0,0 +1,79 @@ +import pytest +import numpy as np +import pandas as pd +from opensoundscape.preprocess.preprocessors import SpectrogramPreprocessor +from opensoundscape.ml.dataloaders import SafeAudioDataloader + + +@pytest.fixture() +def dataset_df(): + paths = ["tests/audio/silence_10s.mp3", "tests/audio/silence_10s.mp3"] + labels = [[1, 0], [0, 1]] + return pd.DataFrame(index=paths, data=labels, columns=[0, 1]) + + +@pytest.fixture() +def bad_dataset_df(): + labels = [[1, 0], [0, 1]] + return pd.DataFrame(index=range(len(labels)), data=labels, columns=[0, 1]) + + +@pytest.fixture() +def dataset_df_multiindex(): + paths = ["tests/audio/silence_10s.mp3", "tests/audio/silence_10s.mp3"] + start_times = [0, 1] + end_times = [1, 2] + return pd.DataFrame( + { + "file": paths, + "start_time": start_times, + "end_time": end_times, + "A": [0, 1], + "B": [1, 0], + } + ).set_index(["file", "start_time", "end_time"]) + + +@pytest.fixture() +def bad_dataset_df_multiindex(): + paths = ["tests/audio/silence_10s.mp3", "tests/audio/silence_10s.mp3"] + start_times = [0, 1] + end_times = [1, 2] + return pd.DataFrame( + { + "file": paths, + "start_time": start_times, + "end_time": end_times, + "A": [0, 1], + "B": [1, 0], + } + ) # .set_index(["file", "start_time", "end_time"]) + + +@pytest.fixture() +def bad_dataset_df(): + labels = [[1, 0], [0, 1]] + return pd.DataFrame(index=range(len(labels)), data=labels, columns=[0, 1]) + + +@pytest.fixture() +def pre(): + return SpectrogramPreprocessor(sample_duration=1) + + +def test_helpful_error_if_index_is_integer(bad_dataset_df, pre): + with pytest.raises(AssertionError): + SafeAudioDataloader(bad_dataset_df, pre) + + +def test_init(dataset_df, pre): + SafeAudioDataloader(dataset_df, pre) + + +def test_init_multiindex(dataset_df, pre): + SafeAudioDataloader(dataset_df, pre) + + +def test_catch_index_not_set(bad_dataset_df_multiindex, pre): + with pytest.raises(AssertionError): + SafeAudioDataloader(bad_dataset_df_multiindex, pre) From 9d7dc5eabd7b8dd3f90551725f200fad1441beef Mon Sep 17 00:00:00 2001 From: sammlapp Date: Tue, 23 Jan 2024 22:47:56 -0500 Subject: [PATCH 2/6] resolve 803 useful error if preprocessor.forward gets df gives user advice if preprocessor.forward() gets pd.dataframe instead of pd.series --- opensoundscape/preprocess/preprocessors.py | 8 ++++++++ tests/test_preprocessors.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/opensoundscape/preprocess/preprocessors.py b/opensoundscape/preprocess/preprocessors.py index 25fb6f94..c3aa8a70 100644 --- a/opensoundscape/preprocess/preprocessors.py +++ b/opensoundscape/preprocess/preprocessors.py @@ -119,6 +119,7 @@ def forward( sample (instance of AudioSample class) """ + # validate input if break_on_key is not None: assert ( break_on_key in self.pipeline @@ -196,6 +197,13 @@ def _generate_sample(self, sample): path, (str, Path) ), "if passing a series, series.name must contain (path, start_time, end_time)" sample = AudioSample(path, start_time=start, duration=self.sample_duration) + elif isinstance(sample, pd.DataFrame): + raise AssertionError( + "sample must be AudioSample, tuple of (path, start_time), " + "or pd.Series with (path, start_time, end_time) as .name. " + f"was {type(sample)}. " + "Perhaps a dataset was accessed like dataset[[0,1,2]] instead of dataset[0]?" + ) else: assert isinstance(sample, AudioSample), ( "sample must be AudioSample, tuple of (path, start_time), " diff --git a/tests/test_preprocessors.py b/tests/test_preprocessors.py index 18d195e1..de58f324 100644 --- a/tests/test_preprocessors.py +++ b/tests/test_preprocessors.py @@ -85,4 +85,12 @@ def test_trace_output(preprocessor, sample): assert isinstance(sample.trace["load_audio"], Audio) +def test_catch_input_to_forward_is_dataframe(preprocessor): + # raises AssertionError if samples arg in forward is not pd.Series + # which might occur if user accessed a dataset with a list instead of a single index + # see issue 803: https://github.com/kitzeslab/opensoundscape/issues/803 + with pytest.raises(AssertionError): + preprocessor.forward(pd.DataFrame({0: ["a"]})) + + # several specific scenarios are tested using DataSets in test_datasets.py From 7ddc1baf8aab992cbf469839b4ec99e2864e6236 Mon Sep 17 00:00:00 2001 From: sammlapp Date: Tue, 23 Jan 2024 23:11:32 -0500 Subject: [PATCH 3/6] resolve #865 helpful error for wrong train_df format modifies input validation for SafeAudioDataloader --- opensoundscape/ml/cnn.py | 5 ++++- opensoundscape/ml/dataloaders.py | 9 +-------- opensoundscape/utils.py | 12 +++++++----- tests/test_cnn.py | 21 +++++++++++++++++++++ 4 files changed, 33 insertions(+), 14 deletions(-) diff --git a/opensoundscape/ml/cnn.py b/opensoundscape/ml/cnn.py index 9659e004..afff3270 100644 --- a/opensoundscape/ml/cnn.py +++ b/opensoundscape/ml/cnn.py @@ -30,7 +30,7 @@ from opensoundscape.ml.datasets import AudioFileDataset from opensoundscape.ml.cnn_architectures import inception_v3 from opensoundscape.sample import collate_audio_samples_to_dict -from opensoundscape.utils import identity +from opensoundscape.utils import identity, _check_is_path from opensoundscape.logging import wandb_table from opensoundscape.ml.cam import CAM @@ -786,6 +786,7 @@ def train( """ ### Input Validation ### + # Validation of class list class_err = """ Train and validation datasets must have same classes and class order as model object. Consider using @@ -803,6 +804,8 @@ def train( "evaluated using the performance on the training set." ) + ## Initialization ## + # Initialize attributes self.log_interval = log_interval self.save_interval = save_interval diff --git a/opensoundscape/ml/dataloaders.py b/opensoundscape/ml/dataloaders.py index 0828b38d..adf894ad 100644 --- a/opensoundscape/ml/dataloaders.py +++ b/opensoundscape/ml/dataloaders.py @@ -4,7 +4,7 @@ import warnings from pathlib import Path -from opensoundscape.utils import identity +from opensoundscape.utils import identity, _check_is_path from opensoundscape.ml.safe_dataset import SafeDataset from opensoundscape.ml.datasets import AudioFileDataset, AudioSplittingDataset @@ -124,10 +124,3 @@ def __init__( self.dataset._invalid_samples = self.dataset._invalid_samples.union( dataset.invalid_samples ) - - -def _check_is_path(path): - assert isinstance(path, str) or isinstance(path, Path), ( - f"First item of multi-index must be str or Path, got {type(path)}. " - "Did you set the index correctly?" - ) diff --git a/opensoundscape/utils.py b/opensoundscape/utils.py index a28b9ae1..3a75a257 100644 --- a/opensoundscape/utils.py +++ b/opensoundscape/utils.py @@ -1,14 +1,10 @@ """Utilities for opensoundscape""" -import datetime -import warnings - import numpy as np import pandas as pd -import pytz -import soundfile import librosa from matplotlib.colors import LinearSegmentedColormap +from pathlib import Path class GetDurationError(ValueError): @@ -329,3 +325,9 @@ def generate_opacity_colormaps( colormaps.append(cmap) return colormaps + + +def _check_is_path(path): + assert isinstance(path, str) or isinstance( + path, Path + ), f"Expected str or Path, got {type(path)}. Did you set the index correctly?" diff --git a/tests/test_cnn.py b/tests/test_cnn.py index 8305ddc8..ed09c1a7 100644 --- a/tests/test_cnn.py +++ b/tests/test_cnn.py @@ -305,6 +305,27 @@ def test_train_on_clip_df(train_df): ) +def test_train_bad_index(train_df): + """ + AssertionError catches case where index is not one of the allowed formats + """ + model = cnn.CNN("resnet18", [0, 1], sample_duration=2) + # reset the index so that train_df index is integers (not an allowed format) + train_df = make_clip_df(train_df.index.values, clip_duration=2).reset_index() + train_df[0] = np.random.choice([0, 1], size=10) + train_df[1] = np.random.choice([0, 1], size=10) + with pytest.raises(AssertionError): + model.train( + train_df, + train_df, + save_path="tests/models/", + epochs=1, + batch_size=2, + save_interval=10, + num_workers=0, + ) + + def test_predict_without_splitting(test_df): model = cnn.CNN("resnet18", classes=[0, 1], sample_duration=5.0) scores = model.predict(test_df, split_files_into_clips=False) From 212e7d4b0d9de00be02928062e65207c4a6c39d2 Mon Sep 17 00:00:00 2001 From: sammlapp Date: Tue, 23 Jan 2024 23:11:59 -0500 Subject: [PATCH 4/6] handle 0-length samples scenario --- opensoundscape/ml/dataloaders.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/opensoundscape/ml/dataloaders.py b/opensoundscape/ml/dataloaders.py index adf894ad..32420000 100644 --- a/opensoundscape/ml/dataloaders.py +++ b/opensoundscape/ml/dataloaders.py @@ -72,26 +72,28 @@ def __init__( and type(samples.index) == pd.core.indexes.multi.MultiIndex ): # c1 user provided multi-index df with file,start_time,end_time of clips # raise AssertionError if first item of multi-index is not str or Path - _check_is_path(samples.index.values[0][0]) + if len(samples) > 0: + _check_is_path(samples.index.values[0][0]) dataset = AudioFileDataset(samples=samples, preprocessor=preprocessor) elif ( split_files_into_clips ): # c2 user provided file list; split each file into appropriate length clips # raise AssertionError if first item is not str or Path - _check_is_path(samples.index.values[0]) - + if len(samples) > 0: + _check_is_path(samples[0]) dataset = AudioSplittingDataset( samples=samples, preprocessor=preprocessor, overlap_fraction=overlap_fraction, final_clip=final_clip, ) - else: # c3 split_files_into_clips=False -> one sample & one prediction per file provided + else: # c3 samples is list of files and + # split_files_into_clips=False -> one sample & one prediction per file provided # eg, each file is a 5 second clips and the model expects 5 second clips # raise AssertionError if first item is not str or Path - _check_is_path(samples.index.values[0]) - + if len(samples) > 0: + _check_is_path(samples[0]) dataset = AudioFileDataset(samples=samples, preprocessor=preprocessor) dataset.bypass_augmentations = bypass_augmentations From b89d4aabdfe812178d1ad53a48390c981988b02b Mon Sep 17 00:00:00 2001 From: sammlapp Date: Tue, 23 Jan 2024 23:20:24 -0500 Subject: [PATCH 5/6] fix input validation for SafeAudioDataloader --- opensoundscape/ml/dataloaders.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/opensoundscape/ml/dataloaders.py b/opensoundscape/ml/dataloaders.py index 32420000..6ae41402 100644 --- a/opensoundscape/ml/dataloaders.py +++ b/opensoundscape/ml/dataloaders.py @@ -63,6 +63,18 @@ def __init__( "(c) (file,start_time,end_time) as MultiIndex" ) + # validate that file paths are correctly placed in the input index or list + if len(samples) > 0: + if isinstance(samples, pd.DataFrame): # samples is a pd.DataFrame + if isinstance(samples.index, pd.core.indexes.multi.MultiIndex): + # index is (file, start_time, end_time) + first_path = samples.index.values[0][0] + else: # index of df is just file path + first_path = samples.index.values[0] + else: # samples is a list of file path + first_path = samples[0] + _check_is_path(first_path) + # set up prediction Dataset, considering three possible cases: # (c1) user provided multi-index df with file,start_time,end_time of clips # (c2) user provided file list and wants clips to be split out automatically @@ -72,15 +84,11 @@ def __init__( and type(samples.index) == pd.core.indexes.multi.MultiIndex ): # c1 user provided multi-index df with file,start_time,end_time of clips # raise AssertionError if first item of multi-index is not str or Path - if len(samples) > 0: - _check_is_path(samples.index.values[0][0]) dataset = AudioFileDataset(samples=samples, preprocessor=preprocessor) elif ( split_files_into_clips ): # c2 user provided file list; split each file into appropriate length clips # raise AssertionError if first item is not str or Path - if len(samples) > 0: - _check_is_path(samples[0]) dataset = AudioSplittingDataset( samples=samples, preprocessor=preprocessor, @@ -90,10 +98,6 @@ def __init__( else: # c3 samples is list of files and # split_files_into_clips=False -> one sample & one prediction per file provided # eg, each file is a 5 second clips and the model expects 5 second clips - - # raise AssertionError if first item is not str or Path - if len(samples) > 0: - _check_is_path(samples[0]) dataset = AudioFileDataset(samples=samples, preprocessor=preprocessor) dataset.bypass_augmentations = bypass_augmentations From 52f7d7677dfa510a143c7738e000d3391106b31c Mon Sep 17 00:00:00 2001 From: sammlapp Date: Tue, 10 Sep 2024 15:45:23 -0400 Subject: [PATCH 6/6] clean up imports --- opensoundscape/ml/cnn.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/opensoundscape/ml/cnn.py b/opensoundscape/ml/cnn.py index e6a6d006..fdcd42b2 100644 --- a/opensoundscape/ml/cnn.py +++ b/opensoundscape/ml/cnn.py @@ -27,15 +27,13 @@ from opensoundscape.preprocess import io from opensoundscape.ml.datasets import AudioFileDataset from opensoundscape.ml.cnn_architectures import inception_v3 -from opensoundscape.ml.loss import ResampleLoss -from opensoundscape.sample import collate_audio_samples, collate_audio_samples_to_dict -from opensoundscape.utils import identity, _check_is_path +from opensoundscape.sample import collate_audio_samples +from opensoundscape.utils import identity from opensoundscape.logging import wandb_table from opensoundscape.ml.cam import CAM import pytorch_grad_cam from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget -from lightning.pytorch.callbacks import ModelCheckpoint from torchmetrics.classification import ( @@ -2189,9 +2187,7 @@ class BaseClassifier(SpectrogramClassifier): """ -def use_resample_loss( - model, train_df -): # TODO revisit how this work. Should be able to set loss_cls=ResampleLoss() +def use_resample_loss(model, train_df): """Modify a model to use ResampleLoss for multi-target training ResampleLoss may perform better than BCE Loss for multitarget problems