switch to swish activation
peterdsharpe committed Mar 30, 2024
1 parent c5dd618 commit 63783eb
Showing 2 changed files with 28 additions and 31 deletions.
10 changes: 5 additions & 5 deletions neuralfoil/gen2_5_architecture/main.py
@@ -109,7 +109,7 @@ def net(x: np.ndarray):
x = w @ x + np.reshape(b, (-1, 1))

if len(layer_indices_to_iterate) != 0: # Don't apply the activation function on the last layer
- x = np.softplus(x)
+ x = np.swish(x)
x = np.transpose(x)
return x

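For reference: the `np` module used here is an array-API-agnostic wrapper rather than vanilla NumPy (which has neither `softplus` nor `swish`). The two activations themselves are standard; below is a minimal plain-NumPy sketch of what is being swapped, written as free functions purely for illustration:

    import numpy as np

    def softplus(x):
        # Old activation: smooth approximation of ReLU, softplus(x) = log(1 + e^x).
        return np.log1p(np.exp(-np.abs(x))) + np.maximum(x, 0.0)  # numerically stable form

    def swish(x):
        # New activation (a.k.a. SiLU): swish(x) = x * sigmoid(x).
        return x / (1.0 + np.exp(-x))

    x = np.linspace(-4.0, 4.0, 9)
    print(softplus(x))
    print(swish(x))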
@@ -124,8 +124,8 @@ def net(x: np.ndarray):
x_flipped = x + 0. # This is an array-api-agnostic way to force a memory copy of the array to be made.
x_flipped[:, :8] = x[:, 8:16] * -1 # switch kulfan_lower with a flipped kulfan_upper
x_flipped[:, 8:16] = x[:, :8] * -1 # switch kulfan_upper with a flipped kulfan_lower
- x_flipped[:, 16] *= -1 # flip kulfan_LE_weight
- x_flipped[:, 18] *= -1 # flip sin(2a)
+ x_flipped[:, 16] = -1 * x[:, 16] # flip kulfan_LE_weight
+ x_flipped[:, 18] = -1 * x[:, 18] # flip sin(2a)
x_flipped[:, 23] = x[:, 24] # flip xtr_upper with xtr_lower
x_flipped[:, 24] = x[:, 23] # flip xtr_lower with xtr_upper

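The `x + 0.` idiom above forces a copy without calling any backend-specific `.copy()` method, so the later in-place writes to `x_flipped` leave `x` untouched. A small NumPy-only illustration (NumPy is just one of the array backends this code is meant to support):

    import numpy as np

    x = np.array([1.0, 2.0, 3.0])
    x_flipped = x + 0.      # arithmetic allocates a fresh array, not a view
    x_flipped[0] = -99.0    # write to the copy...
    print(x)                # ...and the original is unchanged: [1. 2. 3.]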
@@ -135,8 +135,8 @@ def net(x: np.ndarray):

### The resulting outputs will also be flipped, so we need to flip them back to their normal orientation
y_unflipped = y_flipped + 0. # This is an array-api-agnostic way to force a memory copy of the array to be made.
- y_unflipped[:, 1] *= -1 # CL
- y_unflipped[:, 3] *= -1 # CM
+ y_unflipped[:, 1] = y_flipped[:, 1] * -1 # CL
+ y_unflipped[:, 3] = y_flipped[:, 3] * -1 # CM
y_unflipped[:, 4] = y_flipped[:, 5] # switch Top_Xtr with Bot_Xtr
y_unflipped[:, 5] = y_flipped[:, 4] # switch Bot_Xtr with Top_Xtr

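The flip/unflip bookkeeping above encodes the physical symmetry: mirroring the airfoil and negating the angle of attack must negate CL and CM and swap the upper/lower transition locations. A minimal sketch of how such a symmetrized evaluation can be assembled; the `flip_inputs`/`unflip_outputs` helpers are hypothetical stand-ins for the index shuffling shown in this diff, and averaging the two passes is an assumption about how the results are fused:

    def symmetric_net(x, net, flip_inputs, unflip_outputs):
        y = net(x)                                         # evaluate the problem as given
        y_mirrored = unflip_outputs(net(flip_inputs(x)))   # evaluate the mirrored problem
        return 0.5 * (y + y_mirrored)                      # fuse so the output is flip-invariant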
49 changes: 23 additions & 26 deletions training/gen2_architecture/train_blind_neural_network.py
@@ -21,6 +21,8 @@
N_outputs = len(df_train_outputs_scaled.columns)

cache_file = Path(__file__).parent / "nn-xxxlarge.pth"
+ n_hidden_layers = 5
+ width = 512
print("Cache file: ", cache_file)


@@ -29,30 +31,26 @@ class Net(torch.nn.Module):
def __init__(self, mean_inputs_scaled, cov_inputs_scaled):
super().__init__()

- width = 512
-
self.mean_inputs_scaled = mean_inputs_scaled
self.cov_inputs_scaled = cov_inputs_scaled
self.inv_cov_inputs_scaled = torch.inverse(cov_inputs_scaled)
self.N_inputs = len(mean_inputs_scaled)

- self.net = torch.nn.Sequential(
+ layers = [
torch.nn.Linear(N_inputs, width),
- torch.nn.Tanh(),
-
- torch.nn.Linear(width, width),
- torch.nn.Tanh(),
- torch.nn.Linear(width, width),
- torch.nn.Tanh(),
- torch.nn.Linear(width, width),
- torch.nn.Tanh(),
- torch.nn.Linear(width, width),
- torch.nn.Tanh(),
- torch.nn.Linear(width, width),
- torch.nn.Tanh(),
-
+ torch.nn.SiLU(),
+ ]
+ for i in range(n_hidden_layers):
+     layers += [
+         torch.nn.Linear(width, width),
+         torch.nn.SiLU(),
+     ]
+
+ layers += [
torch.nn.Linear(width, N_outputs),
- )
+ ]

+ self.net = torch.nn.Sequential(*layers)

def squared_mahalanobis_distance(self, x: torch.Tensor):
return torch.sum(
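In the rewritten constructor, `torch.nn.SiLU` is PyTorch's built-in swish activation (x * sigmoid(x)), and the hidden stack is now built in a loop driven by the new `n_hidden_layers` and `width` constants. A standalone sketch of the same pattern, with illustrative input/output sizes that are not taken from the repository:

    import torch

    N_inputs, N_outputs = 26, 198        # placeholder sizes, purely illustrative
    n_hidden_layers, width = 5, 512

    layers = [
        torch.nn.Linear(N_inputs, width),
        torch.nn.SiLU(),
    ]
    for _ in range(n_hidden_layers):
        layers += [
            torch.nn.Linear(width, width),
            torch.nn.SiLU(),
        ]
    layers += [torch.nn.Linear(width, N_outputs)]

    net = torch.nn.Sequential(*layers)
    print(sum(p.numel() for p in net.parameters()))  # total parameter count of the MLP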
@@ -67,15 +65,14 @@ def forward(self, x: torch.Tensor):
### Add in the squared Mahalanobis distance to the analysis_confidence logit, to ensure it
# asymptotes to untrustworthy as the inputs get further from the training data


### Then, flip the inputs and evaluate the network again.
# The goal here is to embed the invariant of "symmetry across alpha" into the network evaluation.

x_flipped = x.clone()
x_flipped[:, :8] = -1 * x[:, 8:16] # switch kulfan_lower with a flipped kulfan_upper
x_flipped[:, 8:16] = -1 * x[:, :8] # switch kulfan_upper with a flipped kulfan_lower
x_flipped[:, 16] = -1 * x[:, 16] # flip kulfan_LE_weight
- x_flipped[:, 18] = -1 * x[:, 18]  # flip sin(2a)
+ x_flipped[:, 18] = -1 * x[:, 18]  # flip sin(2a)
x_flipped[:, 23] = x[:, 24] # flip xtr_upper with xtr_lower
x_flipped[:, 24] = x[:, 23] # flip xtr_lower with xtr_upper

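The squared Mahalanobis distance mentioned at the top of this hunk measures how far an input lies from the training distribution, d²(x) = (x − μ)ᵀ Σ⁻¹ (x − μ), using the stored mean and inverse covariance of the scaled training inputs. A minimal sketch of that computation; exactly how the distance is scaled into the confidence logit is not visible in this hunk, so the commented usage line is an assumption:

    import torch

    def squared_mahalanobis_distance(x, mean, inv_cov):
        # Row-wise d^2 = (x - mu)^T Sigma^{-1} (x - mu) for a batch of inputs.
        d = x - mean
        return torch.sum((d @ inv_cov) * d, dim=1)

    # Hypothetical use: push the analysis_confidence logit toward "untrustworthy"
    # as inputs drift away from the training data.
    # y[:, 0] = y[:, 0] - squared_mahalanobis_distance(x, mean, inv_cov)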
@@ -93,9 +90,9 @@ def forward(self, x: torch.Tensor):

# switch upper and lower Ret, H
y_unflipped[:, 6 + 32 * 0: 6 + 32 * 2] = y_flipped[:, 6 + 32 * 3: 6 + 32 * 5]
- y_unflipped[:, 6 + 32 * 2: 6 + 32 * 3] = y_flipped[:, 6 + 32 * 5: 6 + 32 * 6] * -1  # ue/vinf
+ y_unflipped[:, 6 + 32 * 2: 6 + 32 * 3] = y_flipped[:, 6 + 32 * 5: 6 + 32 * 6] * -1  # ue/vinf
y_unflipped[:, 6 + 32 * 3: 6 + 32 * 5] = y_flipped[:, 6 + 32 * 0: 6 + 32 * 2]
- y_unflipped[:, 6 + 32 * 5: 6 + 32 * 6] = y_flipped[:, 6 + 32 * 2: 6 + 32 * 3] * -1  # ue/vinf
+ y_unflipped[:, 6 + 32 * 5: 6 + 32 * 6] = y_flipped[:, 6 + 32 * 2: 6 + 32 * 3] * -1  # ue/vinf

# switch upper_bl_ue/vinf with lower_bl_ue/vinf

@@ -117,15 +114,14 @@ def forward(self, x: torch.Tensor):
cov_inputs_scaled=torch.tensor(cov_inputs_scaled, dtype=torch.float32).to(device),
).to(device)


# Define the optimizer
learning_rate = 1e-4
optimizer = torch.optim.RAdam(net.parameters(), lr=learning_rate, weight_decay=3e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
optimizer,
- patience=10,
factor=0.5,
+ patience=50,
verbose=True,
min_lr=0,
)

try:
@@ -182,6 +178,7 @@ def forward(self, x: torch.Tensor):

loss_weights = loss_weights / torch.sum(loss_weights) * 1000


def loss_function(y_pred, y_data, return_individual_loss_components=False):
# For data with NaN, overwrite the data with the prediction. This essentially makes the model ignore NaN data,
# since the gradient of the loss with respect to parameters is zero when the data is NaN.
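The NaN-masking trick described in this comment can be sketched as follows: wherever the target is NaN, the target is replaced by the prediction itself, so that term contributes exactly zero loss and zero gradient. This is a minimal sketch only; the actual loss in this file also applies per-output `loss_weights` and other terms not shown in this hunk:

    import torch

    def nan_tolerant_mse(y_pred, y_data):
        # Substitute the prediction wherever the data is NaN: that term becomes
        # (y_pred - y_pred)^2 == 0, so it adds nothing to the loss or its gradient.
        y_data = torch.where(torch.isnan(y_data), y_pred, y_data)
        return torch.mean((y_pred - y_data) ** 2)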
@@ -286,7 +283,7 @@ def loss_function(y_pred, y_data, return_individual_loss_components=False):

loss_components_from_each_test_batch.append(loss_components)

- y_pred[:, 0] = torch.sigmoid(y_pred[:, 0])  # Analysis confidence, a binary variable
+ y_pred[:, 0] = torch.sigmoid(y_pred[:, 0])  # Analysis confidence, a binary variable

mae_from_each_test_batch.append(
torch.nanmean(torch.abs(y_pred - y_data), dim=0)
@@ -321,6 +318,6 @@ def loss_function(y_pred, y_data, return_individual_loss_components=False):
scheduler.step(test_loss)

torch.save({
- 'model_state_dict': net.state_dict(),
+ 'model_state_dict' : net.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
}, cache_file)
