only use internal pytorch architectures #1004

Merged (4 commits) on May 24, 2024
16 changes: 9 additions & 7 deletions opensoundscape/ml/cnn.py
@@ -481,9 +481,9 @@ def __init__(
                 len(classes), num_channels=num_channels
             )
         else:
-            assert issubclass(
-                type(architecture), torch.nn.Module
-            ), "architecture must be a string or an instance of a subclass of torch.nn.Module"
+            assert isinstance(
+                architecture, torch.nn.Module
+            ), "architecture must be a string or an instance of (a subclass of) torch.nn.Module"
             if num_channels != 3:
                 warnings.warn(
                     f"Make sure your architecture expects the number of channels in "
@@ -1203,7 +1203,7 @@ def generate_cams(
             str can be any of the following:
                 "gradcam": pytorch_grad_cam.GradCAM,
                 "hirescam": pytorch_grad_cam.HiResCAM,
-                "scorecam": opensoundscape.ml.utils.ScoreCAM, #pytorch_grad_cam.ScoreCAM,
+                "scorecam": pytorch_grad_cam.ScoreCAM,
                 "gradcam++": pytorch_grad_cam.GradCAMPlusPlus,
                 "ablationcam": pytorch_grad_cam.AblationCAM,
                 "xgradcam": pytorch_grad_cam.XGradCAM,
@@ -1264,7 +1264,7 @@ def generate_cams(
         methods_dict = {
             "gradcam": pytorch_grad_cam.GradCAM,
             "hirescam": pytorch_grad_cam.HiResCAM,
-            "scorecam": opensoundscape.ml.utils.ScoreCAM,  # pytorch_grad_cam.ScoreCAM,
+            "scorecam": pytorch_grad_cam.ScoreCAM,
             "gradcam++": pytorch_grad_cam.GradCAMPlusPlus,
             "ablationcam": pytorch_grad_cam.AblationCAM,
             "xgradcam": pytorch_grad_cam.XGradCAM,
@@ -1277,11 +1277,13 @@
         if isinstance(method, str) and method in methods_dict:
             # get cam class based on string name and create instance
             cam = methods_dict[method](model=self.network, target_layers=target_layers)
+            cam.device = self.device
         elif method is None:
             cam = None
         elif issubclass(method, pytorch_grad_cam.base_cam.BaseCAM):
             # generate instance of cam from class
             cam = method(model=self.network, target_layers=target_layers)
+            cam.device = self.device
         else:
             raise ValueError(
                 f"`method` {method} not supported. "
@@ -1292,8 +1294,8 @@
         # initialize guided back propagation object
         if guided_backprop:
             gb_model = pytorch_grad_cam.GuidedBackpropReLUModel(
-                model=self.network, use_cuda=False
-            )  # TODO cuda usage - expose? use model setting?
+                model=self.network, device=self.device
+            )
 
         # create dataloader to generate batches of AudioSamples
         dataloader = self.inference_dataloader_cls(
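Taken together, the cnn.py changes route every CAM object and the guided-backprop model onto `self.device` instead of the old hard-coded `use_cuda=False`. A hedged usage sketch; `generate_cams` and its `samples`/`method`/`guided_backprop` parameters appear in the diff above, but the full call signature should be checked against the docs:

```python
import torch

# sketch: CAMs and guided backprop now follow the model's device setting
model.device = torch.device("cuda:0")  # assumes `model` is an opensoundscape CNN
samples = model.generate_cams(
    samples=audio_file_paths,  # stand-in: list of audio files to analyze
    method="scorecam",         # resolves to pytorch_grad_cam.ScoreCAM via methods_dict
    guided_backprop=True,      # GuidedBackpropReLUModel is built with device=self.device
)
```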
133 changes: 18 additions & 115 deletions opensoundscape/ml/cnn_architectures.py
@@ -29,6 +29,7 @@
 architectures - the easiest way is to simply use the InceptionV3 class in
 opensoundscape.ml.cnn.
 """
+
 import warnings
 
 import torch
@@ -490,30 +491,28 @@ def efficientnet_b0(
         num_channels:
             specify channels in input sample, eg [channels h,w] sample shape
 
+    Note: in v0.10.2, changed from using the NVIDIA/DeepLearningExamples:torchhub repo
+    implementation to the native pytorch implementation
+
     """
-    torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
-    architecture_ft = torch.hub.load(
-        "NVIDIA/DeepLearningExamples:torchhub",
-        "nvidia_efficientnet_b0",
-        pretrained=weights,
-    )
+    architecture_ft = torchvision.models.efficientnet_b0(weights=weights)
 
     # prevent weights of feature extractor from being trained, if desired
     if freeze_feature_extractor:
         freeze_params(architecture_ft)
 
     # change number of output nodes
-    architecture_ft.classifier.fc = change_fc_output_size(
-        architecture_ft.classifier.fc, num_classes
+    architecture_ft.classifier[1] = change_fc_output_size(
+        architecture_ft.classifier[1], num_classes=num_classes
     )
 
     # change input shape num_channels
-    architecture_ft.stem.conv = change_conv2d_channels(
-        architecture_ft.stem.conv, num_channels
+    architecture_ft.features[0][0] = change_conv2d_channels(
+        architecture_ft.features[0][0], num_channels
     )
 
     # default target layers for activation maps like GradCAM and guided backpropagation
-    architecture_ft.cam_target_layers = [architecture_ft.layers[-1][-1]]
+    architecture_ft.cam_target_layers = [architecture_ft.features[-1]]
 
     return architecture_ft
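The index changes above follow from the different module layout of torchvision's EfficientNet versus the NVIDIA hub implementation: torchvision exposes the head as `classifier = Sequential(Dropout, Linear)` and the stem convolution as `features[0][0]`, rather than `classifier.fc` and `stem.conv`. A short sketch of the equivalent layer surgery done by hand; the replacement layers are illustrative, while the real code delegates to `change_fc_output_size` and `change_conv2d_channels`:

```python
import torch
import torchvision

net = torchvision.models.efficientnet_b0(weights="DEFAULT")
assert isinstance(net.classifier[1], torch.nn.Linear)   # final fully-connected layer
assert isinstance(net.features[0][0], torch.nn.Conv2d)  # stem convolution

# resize the output layer, e.g. for 10 classes
net.classifier[1] = torch.nn.Linear(net.classifier[1].in_features, 10)

# accept 1-channel (e.g. spectrogram) input instead of RGB
old = net.features[0][0]
net.features[0][0] = torch.nn.Conv2d(
    1, old.out_channels, old.kernel_size, old.stride, old.padding, bias=False
)
```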

@@ -537,124 +536,28 @@ def efficientnet_b4(
         num_channels:
             specify channels in input sample, eg [channels h,w] sample shape
 
+    Note: in v0.10.2, changed from using the NVIDIA/DeepLearningExamples:torchhub repo
+    implementation to the native pytorch implementation
+
     """
-    torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
-    architecture_ft = torch.hub.load(
-        "NVIDIA/DeepLearningExamples:torchhub",
-        "nvidia_efficientnet_b4",
-        pretrained=weights,
-    )
+    architecture_ft = torchvision.models.efficientnet_b4(weights=weights)
 
     # prevent weights of feature extractor from being trained, if desired
     if freeze_feature_extractor:
         freeze_params(architecture_ft)
 
     # change number of output nodes
-    architecture_ft.classifier.fc = change_fc_output_size(
-        architecture_ft.classifier.fc, num_classes
+    architecture_ft.classifier[1] = change_fc_output_size(
+        architecture_ft.classifier[1], num_classes=num_classes
     )
 
     # change input shape num_channels
-    architecture_ft.stem.conv = change_conv2d_channels(
-        architecture_ft.stem.conv, num_channels
+    architecture_ft.features[0][0] = change_conv2d_channels(
+        architecture_ft.features[0][0], num_channels
     )
 
     # default target layers for activation maps like GradCAM and guided backpropagation
-    architecture_ft.cam_target_layers = [architecture_ft.layers[-1][-1]]
+    architecture_ft.cam_target_layers = [architecture_ft.features[-1]]
 
     return architecture_ft
-
-
-@register_arch
-def efficientnet_widese_b0(
-    num_classes, freeze_feature_extractor=False, weights="DEFAULT", num_channels=3
-):
-    """Wrapper for efficientnet_widese_b0 architecture
-
-    Args:
-        num_classes:
-            number of output nodes for the final layer
-        freeze_feature_extractor:
-            if False (default), entire network will have gradients and can train
-            if True, feature block is frozen and only final layer is trained
-        weights:
-            string containing version name of the pre-trained classification weights to use for this architecture.
-            if 'DEFAULT', model is loaded with best available weights (note that these may change across versions).
-            Pre-trained weights available for each architecture are listed at https://pytorch.org/vision/stable/models.html
-        num_channels:
-            specify channels in input sample, eg [channels h,w] sample shape
-
-    """
-    torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
-    architecture_ft = torch.hub.load(
-        "NVIDIA/DeepLearningExamples:torchhub",
-        "nvidia_efficientnet_widese_b0",
-        pretrained=weights,
-    )
-
-    # prevent weights of feature extractor from being trained, if desired
-    if freeze_feature_extractor:
-        freeze_params(architecture_ft)
-
-    # change number of output nodes
-    architecture_ft.classifier.fc = change_fc_output_size(
-        architecture_ft.classifier.fc, num_classes
-    )
-
-    # change input shape num_channels
-    architecture_ft.stem.conv = change_conv2d_channels(
-        architecture_ft.stem.conv, num_channels
-    )
-
-    # default target layers for activation maps like GradCAM and guided backpropagation
-    architecture_ft.cam_target_layers = [architecture_ft.layers[-1][-1]]
-
-    return architecture_ft
-
-
-@register_arch
-def efficientnet_widese_b4(
-    num_classes, freeze_feature_extractor=False, weights="DEFAULT", num_channels=3
-):
-    """Wrapper for efficientnet_widese_b4 architecture
-
-    Args:
-        num_classes:
-            number of output nodes for the final layer
-        freeze_feature_extractor:
-            if False (default), entire network will have gradients and can train
-            if True, feature block is frozen and only final layer is trained
-        weights:
-            string containing version name of the pre-trained classification weights to use for this architecture.
-            if 'DEFAULT', model is loaded with best available weights (note that these may change across versions).
-            Pre-trained weights available for each architecture are listed at https://pytorch.org/vision/stable/models.html
-        num_channels:
-            specify channels in input sample, eg [channels h,w] sample shape
-
-    """
-    torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
-    architecture_ft = torch.hub.load(
-        "NVIDIA/DeepLearningExamples:torchhub",
-        "nvidia_efficientnet_widese_b4",
-        pretrained=weights,
-    )
-
-    # prevent weights of feature extractor from being trained, if desired
-    if freeze_feature_extractor:
-        freeze_params(architecture_ft)
-
-    # change number of output nodes
-    architecture_ft.classifier.fc = change_fc_output_size(
-        architecture_ft.classifier.fc, num_classes
-    )
-
-    # change input shape num_channels
-    architecture_ft.stem.conv = change_conv2d_channels(
-        architecture_ft.stem.conv, num_channels
-    )
-
-    # default target layers for activation maps like GradCAM and guided backpropagation
-    architecture_ft.cam_target_layers = [architecture_ft.layers[-1][-1]]
-
-    return architecture_ft
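Note that the `efficientnet_widese_b0` and `efficientnet_widese_b4` wrappers are deleted outright, since torchvision has no wide-SE variants. A sketch of how a widese model could still be loaded directly, based on the `torch.hub.load` call the deleted wrappers used (treating `pretrained` as the usual torch.hub boolean flag is an assumption):

```python
import torch

# load a wide-SE EfficientNet from the NVIDIA hub repo, as the removed wrappers did
net = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_efficientnet_widese_b0",
    pretrained=True,
)
```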

52 changes: 1 addition & 51 deletions opensoundscape/ml/utils.py
@@ -1,4 +1,5 @@
 """Utilities for .ml"""
+
 import warnings
 import pandas as pd
 import numpy as np
@@ -131,57 +132,6 @@ def apply_activation_layer(x, activation_layer=None):
     return x
 
 
-# override pytorch_grad_cam's score cam class because it has a bug
-# with device mismatch of upsampled (cpu) vs input_tensor (may be mps, cuda, etc)
-class ScoreCAM(pytorch_grad_cam.base_cam.BaseCAM):
-    def __init__(self, model, target_layers, use_cuda=False, reshape_transform=None):
-        super(ScoreCAM, self).__init__(
-            model,
-            target_layers,
-            use_cuda,
-            reshape_transform=reshape_transform,
-            uses_gradients=False,
-        )
-
-    def get_cam_weights(self, input_tensor, target_layer, targets, activations, grads):
-        with torch.no_grad():
-            upsample = torch.nn.UpsamplingBilinear2d(size=input_tensor.shape[-2:])
-            activation_tensor = torch.from_numpy(activations)
-            if self.cuda:
-                activation_tensor = activation_tensor.cuda()
-
-            upsampled = upsample(activation_tensor)
-
-            maxs = upsampled.view(upsampled.size(0), upsampled.size(1), -1).max(dim=-1)[
-                0
-            ]
-            mins = upsampled.view(upsampled.size(0), upsampled.size(1), -1).min(dim=-1)[
-                0
-            ]
-
-            maxs, mins = maxs[:, :, None, None], mins[:, :, None, None]
-            upsampled = (upsampled - mins) / (maxs - mins)
-
-            upsampled = upsampled.to(input_tensor.device)
-            input_tensors = input_tensor[:, None, :, :] * upsampled[:, :, None, :, :]
-
-            if hasattr(self, "batch_size"):
-                BATCH_SIZE = self.batch_size
-            else:
-                BATCH_SIZE = 16
-
-            scores = []
-            for target, tensor in zip(targets, input_tensors):
-                for i in tqdm.tqdm(range(0, tensor.size(0), BATCH_SIZE)):
-                    batch = tensor[i : i + BATCH_SIZE, :]
-                    outputs = [target(o).cpu().item() for o in self.model(batch)]
-                    scores.extend(outputs)
-            scores = torch.Tensor(scores)
-            scores = scores.view(activations.shape[0], activations.shape[1])
-            weights = torch.nn.Softmax(dim=-1)(scores).numpy()
-            return weights
-
-
 def collate_audio_samples_to_tensors(batch):
     """
     takes a list of AudioSample objects, returns batched tensors
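With the local `ScoreCAM` override removed, opensoundscape relies on the upstream `pytorch_grad_cam.ScoreCAM`, and cnn.py compensates for device placement by assigning `cam.device` after construction (see the cnn.py diff above). A minimal sketch of the direct upstream usage this enables; `network` and `target_layers` are stand-ins for a real model and its CAM target layers:

```python
import torch
import pytorch_grad_cam

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
network = network.to(device)  # stand-in: any torch.nn.Module classifier

cam = pytorch_grad_cam.ScoreCAM(model=network, target_layers=target_layers)
cam.device = device  # mirrors the new `cam.device = self.device` in cnn.py
```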