embed(): return dataframe rather than matrix

the dataframe index is needed to associate embeddings with specific audio files and time ranges
kitzeslab · Sep 6, 2024 · 51263bc · 51263bc
1 parent a10df73
commit 51263bc
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 2 deletions.
diff --git a/opensoundscape/ml/cnn.py b/opensoundscape/ml/cnn.py
@@ -2125,9 +2125,18 @@ def embed(
             avgpool_intermediates=avgpool,
         )
 
+        # put embeddings in a DataFrame indexed by the dataset's label_df index
+        embeddings = pd.DataFrame(
+            data=embeddings[0], index=dataloader.dataset.dataset.label_df.index
+        )
+
         if return_preds:
-            return embeddings[0], preds
-        return embeddings[0]
+            # put predictions in a DataFrame with same index as embeddings
+            preds = pd.DataFrame(
+                data=preds, index=dataloader.dataset.dataset.label_df.index
+            )
+            return embeddings, preds
+        return embeddings
 
     @property
     def device(self):

diff --git a/tests/test_cnn.py b/tests/test_cnn.py
@@ -831,6 +831,7 @@ def test_embed(test_df):
             embeddings = m.embed(samples=test_df, avgpool=True, progress_bar=False)
             assert embeddings.shape[0] == 2
             assert len(embeddings.shape) == 2
+            assert isinstance(embeddings, pd.DataFrame)
         except Exception as e:
             raise Exception(f"{arch} failed") from e