only use internal pytorch architectures #1004

Merged (4 commits) on May 24, 2024
16 changes: 9 additions & 7 deletions opensoundscape/ml/cnn.py
@@ -481,9 +481,9 @@ def __init__(
                 len(classes), num_channels=num_channels
             )
         else:
-            assert issubclass(
-                type(architecture), torch.nn.Module
-            ), "architecture must be a string or an instance of a subclass of torch.nn.Module"
+            assert isinstance(
+                architecture, torch.nn.Module
+            ), "architecture must be a string or an instance of (a subclass of) torch.nn.Module"
             if num_channels != 3:
                 warnings.warn(
                     f"Make sure your architecture expects the number of channels in "
@@ -1203,7 +1203,7 @@ def generate_cams(
             str can be any of the following:
                 "gradcam": pytorch_grad_cam.GradCAM,
                 "hirescam": pytorch_grad_cam.HiResCAM,
-                "scorecam": opensoundscape.ml.utils.ScoreCAM, #pytorch_grad_cam.ScoreCAM,
+                "scorecam": pytorch_grad_cam.ScoreCAM,
                 "gradcam++": pytorch_grad_cam.GradCAMPlusPlus,
                 "ablationcam": pytorch_grad_cam.AblationCAM,
                 "xgradcam": pytorch_grad_cam.XGradCAM,
@@ -1264,7 +1264,7 @@ def generate_cams(
         methods_dict = {
             "gradcam": pytorch_grad_cam.GradCAM,
             "hirescam": pytorch_grad_cam.HiResCAM,
-            "scorecam": opensoundscape.ml.utils.ScoreCAM,  # pytorch_grad_cam.ScoreCAM,
+            "scorecam": pytorch_grad_cam.ScoreCAM,
             "gradcam++": pytorch_grad_cam.GradCAMPlusPlus,
             "ablationcam": pytorch_grad_cam.AblationCAM,
             "xgradcam": pytorch_grad_cam.XGradCAM,
@@ -1277,11 +1277,13 @@
         if isinstance(method, str) and method in methods_dict:
             # get cam class based on string name and create instance
             cam = methods_dict[method](model=self.network, target_layers=target_layers)
+            cam.device = self.device
         elif method is None:
             cam = None
         elif issubclass(method, pytorch_grad_cam.base_cam.BaseCAM):
             # generate instance of cam from class
             cam = method(model=self.network, target_layers=target_layers)
+            cam.device = self.device
         else:
             raise ValueError(
                 f"`method` {method} not supported. "
@@ -1292,8 +1294,8 @@
         # initialize guided back propagation object
         if guided_backprop:
             gb_model = pytorch_grad_cam.GuidedBackpropReLUModel(
-                model=self.network, use_cuda=False
-            )  # TODO cuda usage - expose? use model setting?
+                model=self.network, device=self.device
+            )
 
         # create dataloader to generate batches of AudioSamples
         dataloader = self.inference_dataloader_cls(
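Taken together, the cnn.py changes route every CAM object and the guided-backprop model onto `self.device` instead of the old hard-coded `use_cuda=False`. A hedged usage sketch; `generate_cams` and its `samples`/`method`/`guided_backprop` parameters appear in the diff above, but the full call signature should be checked against the docs:

```python
import torch

# sketch: CAMs and guided backprop now follow the model's device setting
model.device = torch.device("cuda:0")  # assumes `model` is an opensoundscape CNN
samples = model.generate_cams(
    samples=audio_file_paths,  # stand-in: list of audio files to analyze
    method="scorecam",         # resolves to pytorch_grad_cam.ScoreCAM via methods_dict
    guided_backprop=True,      # GuidedBackpropReLUModel is built with device=self.device
)
```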
133 changes: 18 additions & 115 deletions opensoundscape/ml/cnn_architectures.py
@@ -29,6 +29,7 @@
 architectures - the easiest way is to simply use the InceptionV3 class in
 opensoundscape.ml.cnn.
 """
+
 import warnings
 
 import torch
@@ -490,30 +491,28 @@ def efficientnet_b0(
         num_channels:
             specify channels in input sample, eg [channels h,w] sample shape
 
+    Note: in v0.10.2, changed from using the NVIDIA/DeepLearningExamples:torchhub repo
+    implementation to the native pytorch implementation
+
     """
-    torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
-    architecture_ft = torch.hub.load(
-        "NVIDIA/DeepLearningExamples:torchhub",
-        "nvidia_efficientnet_b0",
-        pretrained=weights,
-    )
+    architecture_ft = torchvision.models.efficientnet_b0(weights=weights)
 
     # prevent weights of feature extractor from being trained, if desired
     if freeze_feature_extractor:
         freeze_params(architecture_ft)
 
     # change number of output nodes
-    architecture_ft.classifier.fc = change_fc_output_size(
-        architecture_ft.classifier.fc, num_classes
+    architecture_ft.classifier[1] = change_fc_output_size(
+        architecture_ft.classifier[1], num_classes=num_classes
     )
 
     # change input shape num_channels
-    architecture_ft.stem.conv = change_conv2d_channels(
-        architecture_ft.stem.conv, num_channels
+    architecture_ft.features[0][0] = change_conv2d_channels(
+        architecture_ft.features[0][0], num_channels
     )
 
     # default target layers for activation maps like GradCAM and guided backpropagation
-    architecture_ft.cam_target_layers = [architecture_ft.layers[-1][-1]]
+    architecture_ft.cam_target_layers = [architecture_ft.features[-1]]
 
     return architecture_ft
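The index changes above follow from the different module layout of torchvision's EfficientNet versus the NVIDIA hub implementation: torchvision exposes the head as `classifier = Sequential(Dropout, Linear)` and the stem convolution as `features[0][0]`, rather than `classifier.fc` and `stem.conv`. A short sketch of the equivalent layer surgery done by hand; the replacement layers are illustrative, while the real code delegates to `change_fc_output_size` and `change_conv2d_channels`:

```python
import torch
import torchvision

net = torchvision.models.efficientnet_b0(weights="DEFAULT")
assert isinstance(net.classifier[1], torch.nn.Linear)   # final fully-connected layer
assert isinstance(net.features[0][0], torch.nn.Conv2d)  # stem convolution

# resize the output layer, e.g. for 10 classes
net.classifier[1] = torch.nn.Linear(net.classifier[1].in_features, 10)

# accept 1-channel (e.g. spectrogram) input instead of RGB
old = net.features[0][0]
net.features[0][0] = torch.nn.Conv2d(
    1, old.out_channels, old.kernel_size, old.stride, old.padding, bias=False
)
```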

@@ -537,124 +536,28 @@ def efficientnet_b4(
         num_channels:
             specify channels in input sample, eg [channels h,w] sample shape
 
+    Note: in v0.10.2, changed from using the NVIDIA/DeepLearningExamples:torchhub repo
+    implementation to the native pytorch implementation
+
     """
-    torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
-    architecture_ft = torch.hub.load(
-        "NVIDIA/DeepLearningExamples:torchhub",
-        "nvidia_efficientnet_b4",
-        pretrained=weights,
-    )
+    architecture_ft = torchvision.models.efficientnet_b4(weights=weights)
 
     # prevent weights of feature extractor from being trained, if desired
     if freeze_feature_extractor:
         freeze_params(architecture_ft)
 
     # change number of output nodes
-    architecture_ft.classifier.fc = change_fc_output_size(
-        architecture_ft.classifier.fc, num_classes
+    architecture_ft.classifier[1] = change_fc_output_size(
+        architecture_ft.classifier[1], num_classes=num_classes
     )
 
     # change input shape num_channels
-    architecture_ft.stem.conv = change_conv2d_channels(
-        architecture_ft.stem.conv, num_channels
+    architecture_ft.features[0][0] = change_conv2d_channels(
+        architecture_ft.features[0][0], num_channels
     )
 
     # default target layers for activation maps like GradCAM and guided backpropagation
-    architecture_ft.cam_target_layers = [architecture_ft.layers[-1][-1]]
+    architecture_ft.cam_target_layers = [architecture_ft.features[-1]]
 
     return architecture_ft
-
-
-@register_arch
-def efficientnet_widese_b0(
-    num_classes, freeze_feature_extractor=False, weights="DEFAULT", num_channels=3
-):
-    """Wrapper for efficientnet_widese_b0 architecture
-
-    Args:
-        num_classes:
-            number of output nodes for the final layer
-        freeze_feature_extractor:
-            if False (default), entire network will have gradients and can train
-            if True, feature block is frozen and only final layer is trained
-        weights:
-            string containing version name of the pre-trained classification weights to use for this architecture.
-            if 'DEFAULT', model is loaded with best available weights (note that these may change across versions).
-            Pre-trained weights available for each architecture are listed at https://pytorch.org/vision/stable/models.html
-        num_channels:
-            specify channels in input sample, eg [channels h,w] sample shape
-
-    """
-    torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
-    architecture_ft = torch.hub.load(
-        "NVIDIA/DeepLearningExamples:torchhub",
-        "nvidia_efficientnet_widese_b0",
-        pretrained=weights,
-    )
-
-    # prevent weights of feature extractor from being trained, if desired
-    if freeze_feature_extractor:
-        freeze_params(architecture_ft)
-
-    # change number of output nodes
-    architecture_ft.classifier.fc = change_fc_output_size(
-        architecture_ft.classifier.fc, num_classes
-    )
-
-    # change input shape num_channels
-    architecture_ft.stem.conv = change_conv2d_channels(
-        architecture_ft.stem.conv, num_channels
-    )
-
-    # default target layers for activation maps like GradCAM and guided backpropagation
-    architecture_ft.cam_target_layers = [architecture_ft.layers[-1][-1]]
-
-    return architecture_ft
-
-
-@register_arch
-def efficientnet_widese_b4(
-    num_classes, freeze_feature_extractor=False, weights="DEFAULT", num_channels=3
-):
-    """Wrapper for efficientnet_widese_b4 architecture
-
-    Args:
-        num_classes:
-            number of output nodes for the final layer
-        freeze_feature_extractor:
-            if False (default), entire network will have gradients and can train
-            if True, feature block is frozen and only final layer is trained
-        weights:
-            string containing version name of the pre-trained classification weights to use for this architecture.
-            if 'DEFAULT', model is loaded with best available weights (note that these may change across versions).
-            Pre-trained weights available for each architecture are listed at https://pytorch.org/vision/stable/models.html
-        num_channels:
-            specify channels in input sample, eg [channels h,w] sample shape
-
-    """
-    torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
-    architecture_ft = torch.hub.load(
-        "NVIDIA/DeepLearningExamples:torchhub",
-        "nvidia_efficientnet_widese_b4",
-        pretrained=weights,
-    )
-
-    # prevent weights of feature extractor from being trained, if desired
-    if freeze_feature_extractor:
-        freeze_params(architecture_ft)
-
-    # change number of output nodes
-    architecture_ft.classifier.fc = change_fc_output_size(
-        architecture_ft.classifier.fc, num_classes
-    )
-
-    # change input shape num_channels
-    architecture_ft.stem.conv = change_conv2d_channels(
-        architecture_ft.stem.conv, num_channels
-    )
-
-    # default target layers for activation maps like GradCAM and guided backpropagation
-    architecture_ft.cam_target_layers = [architecture_ft.layers[-1][-1]]
-
-    return architecture_ft
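Note that the `efficientnet_widese_b0` and `efficientnet_widese_b4` wrappers are deleted outright, since torchvision has no wide-SE variants. A sketch of how a widese model could still be loaded directly, based on the `torch.hub.load` call the deleted wrappers used (treating `pretrained` as the usual torch.hub boolean flag is an assumption):

```python
import torch

# load a wide-SE EfficientNet from the NVIDIA hub repo, as the removed wrappers did
net = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_efficientnet_widese_b0",
    pretrained=True,
)
```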

52 changes: 1 addition & 51 deletions opensoundscape/ml/utils.py
@@ -1,4 +1,5 @@
 """Utilities for .ml"""
+
 import warnings
 import pandas as pd
 import numpy as np
@@ -131,57 +132,6 @@ def apply_activation_layer(x, activation_layer=None):
     return x
 
 
-# override pytorch_grad_cam's score cam class because it has a bug
-# with device mismatch of upsampled (cpu) vs input_tensor (may be mps, cuda, etc)
-class ScoreCAM(pytorch_grad_cam.base_cam.BaseCAM):
-    def __init__(self, model, target_layers, use_cuda=False, reshape_transform=None):
-        super(ScoreCAM, self).__init__(
-            model,
-            target_layers,
-            use_cuda,
-            reshape_transform=reshape_transform,
-            uses_gradients=False,
-        )
-
-    def get_cam_weights(self, input_tensor, target_layer, targets, activations, grads):
-        with torch.no_grad():
-            upsample = torch.nn.UpsamplingBilinear2d(size=input_tensor.shape[-2:])
-            activation_tensor = torch.from_numpy(activations)
-            if self.cuda:
-                activation_tensor = activation_tensor.cuda()
-
-            upsampled = upsample(activation_tensor)
-
-            maxs = upsampled.view(upsampled.size(0), upsampled.size(1), -1).max(dim=-1)[
-                0
-            ]
-            mins = upsampled.view(upsampled.size(0), upsampled.size(1), -1).min(dim=-1)[
-                0
-            ]
-
-            maxs, mins = maxs[:, :, None, None], mins[:, :, None, None]
-            upsampled = (upsampled - mins) / (maxs - mins)
-
-            upsampled = upsampled.to(input_tensor.device)
-            input_tensors = input_tensor[:, None, :, :] * upsampled[:, :, None, :, :]
-
-            if hasattr(self, "batch_size"):
-                BATCH_SIZE = self.batch_size
-            else:
-                BATCH_SIZE = 16
-
-            scores = []
-            for target, tensor in zip(targets, input_tensors):
-                for i in tqdm.tqdm(range(0, tensor.size(0), BATCH_SIZE)):
-                    batch = tensor[i : i + BATCH_SIZE, :]
-                    outputs = [target(o).cpu().item() for o in self.model(batch)]
-                    scores.extend(outputs)
-            scores = torch.Tensor(scores)
-            scores = scores.view(activations.shape[0], activations.shape[1])
-            weights = torch.nn.Softmax(dim=-1)(scores).numpy()
-            return weights
-
-
 def collate_audio_samples_to_tensors(batch):
     """
     takes a list of AudioSample objects, returns batched tensors
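With the local `ScoreCAM` override removed, opensoundscape relies on the upstream `pytorch_grad_cam.ScoreCAM`, and cnn.py compensates for device placement by assigning `cam.device` after construction (see the cnn.py diff above). A minimal sketch of the direct upstream usage this enables; `network` and `target_layers` are stand-ins for a real model and its CAM target layers:

```python
import torch
import pytorch_grad_cam

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
network = network.to(device)  # stand-in: any torch.nn.Module classifier

cam = pytorch_grad_cam.ScoreCAM(model=network, target_layers=target_layers)
cam.device = device  # mirrors the new `cam.device = self.device` in cnn.py
```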