From d278fe4290bcbc66183e1b02765c21423b3388ba Mon Sep 17 00:00:00 2001
From: sammlapp <sammlapp@gmail.com>
Date: Tue, 23 Jan 2024 22:35:37 -0500
Subject: [PATCH 1/6] resolves #942 helpful message when index not set

Also somewhat related to #865
---
 opensoundscape/ml/dataloaders.py | 23 +++++++++-
 tests/test_dataloaders.py        | 79 ++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_dataloaders.py

diff --git a/opensoundscape/ml/dataloaders.py b/opensoundscape/ml/dataloaders.py
index 35e57cbe..0828b38d 100644
--- a/opensoundscape/ml/dataloaders.py
+++ b/opensoundscape/ml/dataloaders.py
@@ -2,6 +2,7 @@
 import numpy as np
 import pandas as pd
 import warnings
+from pathlib import Path
 
 from opensoundscape.utils import identity
 from opensoundscape.ml.safe_dataset import SafeDataset
@@ -70,8 +71,15 @@ def __init__(
             type(samples) == pd.DataFrame
             and type(samples.index) == pd.core.indexes.multi.MultiIndex
         ):  # c1 user provided multi-index df with file,start_time,end_time of clips
+            # raise AssertionError if first item of multi-index is not str or Path
+            _check_is_path(samples.index.values[0][0])
             dataset = AudioFileDataset(samples=samples, preprocessor=preprocessor)
-        elif split_files_into_clips:  # c2 user provided file list; split into
+        elif (
+            split_files_into_clips
+        ):  # c2 user provided file list; split each file into appropriate length clips
+            # raise AssertionError if first item is not str or Path
+            _check_is_path(samples.index.values[0])
+
             dataset = AudioSplittingDataset(
                 samples=samples,
                 preprocessor=preprocessor,
@@ -79,7 +87,13 @@ def __init__(
                 final_clip=final_clip,
             )
         else:  # c3 split_files_into_clips=False -> one sample & one prediction per file provided
+            # eg, each file is a 5 second clips and the model expects 5 second clips
+
+            # raise AssertionError if first item is not str or Path
+            _check_is_path(samples.index.values[0])
+
             dataset = AudioFileDataset(samples=samples, preprocessor=preprocessor)
+
         dataset.bypass_augmentations = bypass_augmentations
 
         if len(dataset) < 1:
@@ -110,3 +124,10 @@ def __init__(
         self.dataset._invalid_samples = self.dataset._invalid_samples.union(
             dataset.invalid_samples
         )
+
+
+def _check_is_path(path):
+    assert isinstance(path, str) or isinstance(path, Path), (
+        f"First item of multi-index must be str or Path, got {type(path)}. "
+        "Did you set the index correctly?"
+    )
diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py
new file mode 100644
index 00000000..62b750d0
--- /dev/null
+++ b/tests/test_dataloaders.py
@@ -0,0 +1,79 @@
+import pytest
+import numpy as np
+import pandas as pd
+from opensoundscape.preprocess.preprocessors import SpectrogramPreprocessor
+from opensoundscape.ml.dataloaders import SafeAudioDataloader
+
+
+@pytest.fixture()
+def dataset_df():
+    paths = ["tests/audio/silence_10s.mp3", "tests/audio/silence_10s.mp3"]
+    labels = [[1, 0], [0, 1]]
+    return pd.DataFrame(index=paths, data=labels, columns=[0, 1])
+
+
+@pytest.fixture()
+def bad_dataset_df():
+    labels = [[1, 0], [0, 1]]
+    return pd.DataFrame(index=range(len(labels)), data=labels, columns=[0, 1])
+
+
+@pytest.fixture()
+def dataset_df_multiindex():
+    paths = ["tests/audio/silence_10s.mp3", "tests/audio/silence_10s.mp3"]
+    start_times = [0, 1]
+    end_times = [1, 2]
+    return pd.DataFrame(
+        {
+            "file": paths,
+            "start_time": start_times,
+            "end_time": end_times,
+            "A": [0, 1],
+            "B": [1, 0],
+        }
+    ).set_index(["file", "start_time", "end_time"])
+
+
+@pytest.fixture()
+def bad_dataset_df_multiindex():
+    paths = ["tests/audio/silence_10s.mp3", "tests/audio/silence_10s.mp3"]
+    start_times = [0, 1]
+    end_times = [1, 2]
+    return pd.DataFrame(
+        {
+            "file": paths,
+            "start_time": start_times,
+            "end_time": end_times,
+            "A": [0, 1],
+            "B": [1, 0],
+        }
+    )  # .set_index(["file", "start_time", "end_time"])
+
+
+@pytest.fixture()
+def bad_dataset_df():
+    labels = [[1, 0], [0, 1]]
+    return pd.DataFrame(index=range(len(labels)), data=labels, columns=[0, 1])
+
+
+@pytest.fixture()
+def pre():
+    return SpectrogramPreprocessor(sample_duration=1)
+
+
+def test_helpful_error_if_index_is_integer(bad_dataset_df, pre):
+    with pytest.raises(AssertionError):
+        SafeAudioDataloader(bad_dataset_df, pre)
+
+
+def test_init(dataset_df, pre):
+    SafeAudioDataloader(dataset_df, pre)
+
+
+def test_init_multiindex(dataset_df, pre):
+    SafeAudioDataloader(dataset_df, pre)
+
+
+def test_catch_index_not_set(bad_dataset_df_multiindex, pre):
+    with pytest.raises(AssertionError):
+        SafeAudioDataloader(bad_dataset_df_multiindex, pre)

From 9d7dc5eabd7b8dd3f90551725f200fad1441beef Mon Sep 17 00:00:00 2001
From: sammlapp <sammlapp@gmail.com>
Date: Tue, 23 Jan 2024 22:47:56 -0500
Subject: [PATCH 2/6] resolve 803 useful error if preprocessor.forward gets df

gives user advice if preprocessor.forward() gets pd.dataframe instead of pd.series
---
 opensoundscape/preprocess/preprocessors.py | 8 ++++++++
 tests/test_preprocessors.py                | 8 ++++++++
 2 files changed, 16 insertions(+)

diff --git a/opensoundscape/preprocess/preprocessors.py b/opensoundscape/preprocess/preprocessors.py
index 25fb6f94..c3aa8a70 100644
--- a/opensoundscape/preprocess/preprocessors.py
+++ b/opensoundscape/preprocess/preprocessors.py
@@ -119,6 +119,7 @@ def forward(
             sample (instance of AudioSample class)
 
         """
+        # validate input
         if break_on_key is not None:
             assert (
                 break_on_key in self.pipeline
@@ -196,6 +197,13 @@ def _generate_sample(self, sample):
                 path, (str, Path)
             ), "if passing a series, series.name must contain (path, start_time, end_time)"
             sample = AudioSample(path, start_time=start, duration=self.sample_duration)
+        elif isinstance(sample, pd.DataFrame):
+            raise AssertionError(
+                "sample must be AudioSample, tuple of (path, start_time), "
+                "or pd.Series with (path, start_time, end_time) as .name. "
+                f"was {type(sample)}. "
+                "Perhaps a dataset was accessed like dataset[[0,1,2]] instead of dataset[0]?"
+            )
         else:
             assert isinstance(sample, AudioSample), (
                 "sample must be AudioSample, tuple of (path, start_time), "
diff --git a/tests/test_preprocessors.py b/tests/test_preprocessors.py
index 18d195e1..de58f324 100644
--- a/tests/test_preprocessors.py
+++ b/tests/test_preprocessors.py
@@ -85,4 +85,12 @@ def test_trace_output(preprocessor, sample):
     assert isinstance(sample.trace["load_audio"], Audio)
 
 
+def test_catch_input_to_forward_is_dataframe(preprocessor):
+    # raises AssertionError if samples arg in forward is not pd.Series
+    # which might occur if user accessed a dataset with a list instead of a single index
+    # see issue 803: https://github.com/kitzeslab/opensoundscape/issues/803
+    with pytest.raises(AssertionError):
+        preprocessor.forward(pd.DataFrame({0: ["a"]}))
+
+
 # several specific scenarios are tested using DataSets in test_datasets.py

From 7ddc1baf8aab992cbf469839b4ec99e2864e6236 Mon Sep 17 00:00:00 2001
From: sammlapp <sammlapp@gmail.com>
Date: Tue, 23 Jan 2024 23:11:32 -0500
Subject: [PATCH 3/6] resolve #865 helpful error for wrong train_df format

modifies input validation for SafeAudioDataloader
---
 opensoundscape/ml/cnn.py         |  5 ++++-
 opensoundscape/ml/dataloaders.py |  9 +--------
 opensoundscape/utils.py          | 12 +++++++-----
 tests/test_cnn.py                | 21 +++++++++++++++++++++
 4 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/opensoundscape/ml/cnn.py b/opensoundscape/ml/cnn.py
index 9659e004..afff3270 100644
--- a/opensoundscape/ml/cnn.py
+++ b/opensoundscape/ml/cnn.py
@@ -30,7 +30,7 @@
 from opensoundscape.ml.datasets import AudioFileDataset
 from opensoundscape.ml.cnn_architectures import inception_v3
 from opensoundscape.sample import collate_audio_samples_to_dict
-from opensoundscape.utils import identity
+from opensoundscape.utils import identity, _check_is_path
 from opensoundscape.logging import wandb_table
 
 from opensoundscape.ml.cam import CAM
@@ -786,6 +786,7 @@ def train(
         """
 
         ### Input Validation ###
+        # Validation of class list
         class_err = """
             Train and validation datasets must have same classes
             and class order as model object. Consider using
@@ -803,6 +804,8 @@ def train(
                 "evaluated using the performance on the training set."
             )
 
+        ## Initialization ##
+
         # Initialize attributes
         self.log_interval = log_interval
         self.save_interval = save_interval
diff --git a/opensoundscape/ml/dataloaders.py b/opensoundscape/ml/dataloaders.py
index 0828b38d..adf894ad 100644
--- a/opensoundscape/ml/dataloaders.py
+++ b/opensoundscape/ml/dataloaders.py
@@ -4,7 +4,7 @@
 import warnings
 from pathlib import Path
 
-from opensoundscape.utils import identity
+from opensoundscape.utils import identity, _check_is_path
 from opensoundscape.ml.safe_dataset import SafeDataset
 from opensoundscape.ml.datasets import AudioFileDataset, AudioSplittingDataset
 
@@ -124,10 +124,3 @@ def __init__(
         self.dataset._invalid_samples = self.dataset._invalid_samples.union(
             dataset.invalid_samples
         )
-
-
-def _check_is_path(path):
-    assert isinstance(path, str) or isinstance(path, Path), (
-        f"First item of multi-index must be str or Path, got {type(path)}. "
-        "Did you set the index correctly?"
-    )
diff --git a/opensoundscape/utils.py b/opensoundscape/utils.py
index a28b9ae1..3a75a257 100644
--- a/opensoundscape/utils.py
+++ b/opensoundscape/utils.py
@@ -1,14 +1,10 @@
 """Utilities for opensoundscape"""
 
-import datetime
-import warnings
-
 import numpy as np
 import pandas as pd
-import pytz
-import soundfile
 import librosa
 from matplotlib.colors import LinearSegmentedColormap
+from pathlib import Path
 
 
 class GetDurationError(ValueError):
@@ -329,3 +325,9 @@ def generate_opacity_colormaps(
         colormaps.append(cmap)
 
     return colormaps
+
+
+def _check_is_path(path):
+    assert isinstance(path, str) or isinstance(
+        path, Path
+    ), f"Expected str or Path, got {type(path)}. Did you set the index correctly?"
diff --git a/tests/test_cnn.py b/tests/test_cnn.py
index 8305ddc8..ed09c1a7 100644
--- a/tests/test_cnn.py
+++ b/tests/test_cnn.py
@@ -305,6 +305,27 @@ def test_train_on_clip_df(train_df):
     )
 
 
+def test_train_bad_index(train_df):
+    """
+    AssertionError catches case where index is not one of the allowed formats
+    """
+    model = cnn.CNN("resnet18", [0, 1], sample_duration=2)
+    # reset the index so that train_df index is integers (not an allowed format)
+    train_df = make_clip_df(train_df.index.values, clip_duration=2).reset_index()
+    train_df[0] = np.random.choice([0, 1], size=10)
+    train_df[1] = np.random.choice([0, 1], size=10)
+    with pytest.raises(AssertionError):
+        model.train(
+            train_df,
+            train_df,
+            save_path="tests/models/",
+            epochs=1,
+            batch_size=2,
+            save_interval=10,
+            num_workers=0,
+        )
+
+
 def test_predict_without_splitting(test_df):
     model = cnn.CNN("resnet18", classes=[0, 1], sample_duration=5.0)
     scores = model.predict(test_df, split_files_into_clips=False)

From 212e7d4b0d9de00be02928062e65207c4a6c39d2 Mon Sep 17 00:00:00 2001
From: sammlapp <sammlapp@gmail.com>
Date: Tue, 23 Jan 2024 23:11:59 -0500
Subject: [PATCH 4/6] handle 0-length samples scenario

---
 opensoundscape/ml/dataloaders.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/opensoundscape/ml/dataloaders.py b/opensoundscape/ml/dataloaders.py
index adf894ad..32420000 100644
--- a/opensoundscape/ml/dataloaders.py
+++ b/opensoundscape/ml/dataloaders.py
@@ -72,26 +72,28 @@ def __init__(
             and type(samples.index) == pd.core.indexes.multi.MultiIndex
         ):  # c1 user provided multi-index df with file,start_time,end_time of clips
             # raise AssertionError if first item of multi-index is not str or Path
-            _check_is_path(samples.index.values[0][0])
+            if len(samples) > 0:
+                _check_is_path(samples.index.values[0][0])
             dataset = AudioFileDataset(samples=samples, preprocessor=preprocessor)
         elif (
             split_files_into_clips
         ):  # c2 user provided file list; split each file into appropriate length clips
             # raise AssertionError if first item is not str or Path
-            _check_is_path(samples.index.values[0])
-
+            if len(samples) > 0:
+                _check_is_path(samples[0])
             dataset = AudioSplittingDataset(
                 samples=samples,
                 preprocessor=preprocessor,
                 overlap_fraction=overlap_fraction,
                 final_clip=final_clip,
             )
-        else:  # c3 split_files_into_clips=False -> one sample & one prediction per file provided
+        else:  # c3 samples is list of files and
+            # split_files_into_clips=False -> one sample & one prediction per file provided
             # eg, each file is a 5 second clips and the model expects 5 second clips
 
             # raise AssertionError if first item is not str or Path
-            _check_is_path(samples.index.values[0])
-
+            if len(samples) > 0:
+                _check_is_path(samples[0])
             dataset = AudioFileDataset(samples=samples, preprocessor=preprocessor)
 
         dataset.bypass_augmentations = bypass_augmentations

From b89d4aabdfe812178d1ad53a48390c981988b02b Mon Sep 17 00:00:00 2001
From: sammlapp <sammlapp@gmail.com>
Date: Tue, 23 Jan 2024 23:20:24 -0500
Subject: [PATCH 5/6] fix input validation for SafeAudioDataloader

---
 opensoundscape/ml/dataloaders.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/opensoundscape/ml/dataloaders.py b/opensoundscape/ml/dataloaders.py
index 32420000..6ae41402 100644
--- a/opensoundscape/ml/dataloaders.py
+++ b/opensoundscape/ml/dataloaders.py
@@ -63,6 +63,18 @@ def __init__(
             "(c) (file,start_time,end_time) as MultiIndex"
         )
 
+        # validate that file paths are correctly placed in the input index or list
+        if len(samples) > 0:
+            if isinstance(samples, pd.DataFrame):  # samples is a pd.DataFrame
+                if isinstance(samples.index, pd.core.indexes.multi.MultiIndex):
+                    # index is (file, start_time, end_time)
+                    first_path = samples.index.values[0][0]
+                else:  # index of df is just file path
+                    first_path = samples.index.values[0]
+            else:  # samples is a list of file path
+                first_path = samples[0]
+            _check_is_path(first_path)
+
         # set up prediction Dataset, considering three possible cases:
         # (c1) user provided multi-index df with file,start_time,end_time of clips
         # (c2) user provided file list and wants clips to be split out automatically
@@ -72,15 +84,11 @@ def __init__(
             and type(samples.index) == pd.core.indexes.multi.MultiIndex
         ):  # c1 user provided multi-index df with file,start_time,end_time of clips
             # raise AssertionError if first item of multi-index is not str or Path
-            if len(samples) > 0:
-                _check_is_path(samples.index.values[0][0])
             dataset = AudioFileDataset(samples=samples, preprocessor=preprocessor)
         elif (
             split_files_into_clips
         ):  # c2 user provided file list; split each file into appropriate length clips
             # raise AssertionError if first item is not str or Path
-            if len(samples) > 0:
-                _check_is_path(samples[0])
             dataset = AudioSplittingDataset(
                 samples=samples,
                 preprocessor=preprocessor,
@@ -90,10 +98,6 @@ def __init__(
         else:  # c3 samples is list of files and
             # split_files_into_clips=False -> one sample & one prediction per file provided
             # eg, each file is a 5 second clips and the model expects 5 second clips
-
-            # raise AssertionError if first item is not str or Path
-            if len(samples) > 0:
-                _check_is_path(samples[0])
             dataset = AudioFileDataset(samples=samples, preprocessor=preprocessor)
 
         dataset.bypass_augmentations = bypass_augmentations

From 52f7d7677dfa510a143c7738e000d3391106b31c Mon Sep 17 00:00:00 2001
From: sammlapp <sammlapp@gmail.com>
Date: Tue, 10 Sep 2024 15:45:23 -0400
Subject: [PATCH 6/6] clean up imports

---
 opensoundscape/ml/cnn.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/opensoundscape/ml/cnn.py b/opensoundscape/ml/cnn.py
index e6a6d006..fdcd42b2 100644
--- a/opensoundscape/ml/cnn.py
+++ b/opensoundscape/ml/cnn.py
@@ -27,15 +27,13 @@
 from opensoundscape.preprocess import io
 from opensoundscape.ml.datasets import AudioFileDataset
 from opensoundscape.ml.cnn_architectures import inception_v3
-from opensoundscape.ml.loss import ResampleLoss
-from opensoundscape.sample import collate_audio_samples, collate_audio_samples_to_dict
-from opensoundscape.utils import identity, _check_is_path
+from opensoundscape.sample import collate_audio_samples
+from opensoundscape.utils import identity
 from opensoundscape.logging import wandb_table
 
 from opensoundscape.ml.cam import CAM
 import pytorch_grad_cam
 from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
-from lightning.pytorch.callbacks import ModelCheckpoint
 
 
 from torchmetrics.classification import (
@@ -2189,9 +2187,7 @@ class BaseClassifier(SpectrogramClassifier):
     """
 
 
-def use_resample_loss(
-    model, train_df
-):  # TODO revisit how this work. Should be able to set loss_cls=ResampleLoss()
+def use_resample_loss(model, train_df):
     """Modify a model to use ResampleLoss for multi-target training
 
     ResampleLoss may perform better than BCE Loss for multitarget problems