Merge pull request #1048 from kitzeslab/feat_350_pcen

Feat 350 pcen
kitzeslab · Sep 9, 2024 · a3d4001 · a3d4001
2 parents d39833e + b89be05
commit a3d4001
Show file tree

Hide file tree

Showing 4 changed files with 67 additions and 20 deletions.
diff --git a/opensoundscape/preprocess/action_functions.py b/opensoundscape/preprocess/action_functions.py
@@ -5,6 +5,7 @@
 
 import random
 
+import librosa
 import torch
 import torchvision
 
@@ -262,3 +263,8 @@ def tensor_add_noise(tensor, std=1):
     """
     noise = torch.empty_like(tensor).normal_(mean=0, std=std)
     return tensor + noise
+
+
+@register_action_fn
+def pcen(s, **kwargs):
+    return s._spawn(spectrogram=librosa.pcen(S=s.spectrogram, **kwargs))
diff --git a/opensoundscape/preprocess/preprocessors.py b/opensoundscape/preprocess/preprocessors.py
@@ -23,6 +23,7 @@
 from opensoundscape.sample import AudioSample
 from opensoundscape.preprocess.actions import ACTION_CLS_DICT
 from opensoundscape.preprocess import io
+from opensoundscape.preprocess import preprocessors, actions
 
 
 PREPROCESSOR_CLS_DICT = {}
@@ -579,6 +580,47 @@ def _generate_sample(self, sample):
         return sample
 
 
+class PCENPreprocessor(preprocessors.SpectrogramPreprocessor):
+    def __init__(self, *args, **kwargs):
+        """same arguments as SpectrogramPreprocessor
+
+        Adds an action that performs PCEN after the spectrogram is created.
+        PCEN is Per-Channel Energy Normalization; see https://arxiv.org/abs/1607.05666
+
+        The other differences from SpectrogramPreprocessor are that dB_scale is set to False
+        when generating the Spectrogram, because PCEN expects a linear-scale spectrogram; and that
+        the output of PCEN is normalized to [0,1], with range=[0,1] then used for spec.to_tensor()
+
+        note: after instantiating this class, the user should set self.pipeline['pcen'].params['sr']
+        and self.pipeline['pcen'].params['hop_length'] to match the audio/spectrogram settings
+
+        User can modify parameters, in particular setting PCEN parameters via self.pipeline['pcen'].params
+        """
+        super().__init__(*args, **kwargs)
+
+        # need to pass linear-value spectrogram to pcen
+        self.pipeline["to_spec"].set(dB_scale=False)
+
+        # use the librosa implementation of PCEN (a PyTorch implementation could be used in the future to make it trainable)
+        pcen_action = actions.Action(fn=action_functions.pcen, is_augmentation=False)
+        self.insert_action(action_index="pcen", action=pcen_action, after_key="to_spec")
+
+        # normalize PCEN output to [0,1]
+        def normalize_to_01(s):
+            new_s = (s.spectrogram - s.spectrogram.min()) / (
+                s.spectrogram.max() - s.spectrogram.min()
+            )
+            return s._spawn(spectrogram=new_s)
+
+        self.insert_action(
+            action_index="normalize",
+            action=actions.Action(fn=normalize_to_01, is_augmentation=False),
+            after_key="pcen",
+        )
+
+        self.pipeline.to_tensor.set(range=[0, 1])
+
+
 @register_preprocessor_cls
 class AudioPreprocessor(BasePreprocessor):
     """Child of BasePreprocessor that only loads audio and resamples

diff --git a/tests/test_actions.py b/tests/test_actions.py
@@ -312,3 +312,11 @@ def test_overlay_to_from_dict(sample_df):
     # new action will have empty overlay_df and will be bypassed
     assert action2.bypass == True
     assert action2.overlay_df.empty
+
+
+def test_pcen(sample_audio):
+    sample_audio.data = Spectrogram.from_audio(sample_audio.data, dB_scale=False)
+    action = actions.Action(action_functions.pcen)
+    original_spec = copy.copy(sample_audio.data.spectrogram)
+    action(sample_audio)
+    assert not np.array_equal(sample_audio.data.spectrogram, original_spec)
diff --git a/tests/test_preprocessors.py b/tests/test_preprocessors.py
@@ -1,9 +1,11 @@
+from copy import copy
+from pathlib import Path
+
 import pytest
 import math
 import numpy as np
 import pandas as pd
-from copy import copy
-from pathlib import Path
+import torch
 
 from opensoundscape.preprocess import preprocessors, actions, action_functions
 from opensoundscape.preprocess.preprocessors import (
@@ -230,26 +232,15 @@ def test_noisereducespectrogrampreprocessor(short_sample):
     assert s1.mean() < s2.mean()
 
 
-def test_audiopreprocessor(audiopreprocessor, sample):
-    """should retain original sample rate"""
-    s = audiopreprocessor.forward(sample).data
-    assert type(s) == Audio
-    assert math.isclose(s.duration, 2.0, abs_tol=1e-9)
-    assert s.sample_rate == 22050
-
+def test_pcenpreprocessor(sample):
 
-def test_audiopreprocessor_extend(audiopreprocessor, short_sample):
-    """should retain original sample rate"""
-    s = audiopreprocessor.forward(short_sample).data
-    assert type(s) == Audio
-    assert math.isclose(s.duration, 2.0, abs_tol=1e-9)
-    assert s.sample_rate == 22050
+    p1 = preprocessors.PCENPreprocessor(sample_duration=1)
+    s1 = p1.forward(sample, bypass_augmentations=True).data
+    assert isinstance(s1, torch.Tensor)
 
-    # when trim_audio.extend is False, should raise an error
-    # if the input is too short
-    audiopreprocessor.pipeline.trim_audio.set(extend=False)
-    with pytest.raises(PreprocessingError):
-        s = audiopreprocessor.forward(short_sample).data
+    # try using some different settings
+    p1.pipeline.pcen.set(gain=0.5)
+    s2 = p1.forward(sample, bypass_augmentations=True).data
 
 
 def test_preprocessor_to_from_dict(preprocessor, sample):