From cbd9b95b8063c919cf78ba9290bd663b41fe5f3e Mon Sep 17 00:00:00 2001
From: Damian Rouson
Date: Wed, 17 Jan 2024 21:10:53 -0800
Subject: [PATCH 1/2] build(setup): use OpenMP, work around compiler bug

- Edit setup.sh script to compile with OpenMP support
- Work around a compiler bug triggered by enabling OpenMP

In a future commit, we will research the utility of OpenMP for GPU offloading.
---
 setup.sh                         | 21 ++++++++++++--------
 test/trainable_engine_test_m.f90 | 34 +++++++++++++++++++++++---------
 2 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/setup.sh b/setup.sh
index 65de6f1ef..f270d4396 100755
--- a/setup.sh
+++ b/setup.sh
@@ -59,7 +59,7 @@ install_fpm_from_source()
   fi
 }
 
-# if no fpm, install either through homebrew, or gfortran compiling fpm.F90
+# if no fpm, install either through homebrew or by compiling fpm.F90 with gfortran
 if ! command -v fpm > /dev/null ; then
   if ! command -v brew > /dev/null ; then
     if ! command -v gfortran > /dev/null ; then
@@ -77,15 +77,20 @@
 FPM_FC=${FC:-"gfortran-13"}
 FPM_CC=${CC:-"gcc-13"}
 
-mkdir -p build
-
-fpm test
+fpm test --profile release --flag "-fopenmp"
 
 echo ""
 echo "____________________ Inference-Engine has been set up! _______________________"
 echo ""
-echo "To run one of the programs in the example subdirectory, enter a command of the"
-echo "following form at a shell command prompt after replacing <example-name>"
-echo "with the base name of a file in the example/ subdirectory:"
+echo "Enter the command below to see the names of example use cases that you can run:"
+echo ""
+echo "fpm run --example"
+echo ""
+echo "To run an example, execute the following command after replacing <example-name> with"
+echo "one of the names listed by the above command:"
+echo ""
+echo "fpm run --profile release --flag \"-fopenmp\" --example <example-name>"
+echo ""
+echo "where the '--profile release' and '--flag \"-fopenmp\"' flags might reduce run times."
+echo "Example programs print usage information if additional arguments are required."
 echo ""
-echo "fpm run --example <example-name> --profile release"

diff --git a/test/trainable_engine_test_m.f90 b/test/trainable_engine_test_m.f90
index bc1409033..ad45735e1 100644
--- a/test/trainable_engine_test_m.f90
+++ b/test/trainable_engine_test_m.f90
@@ -281,32 +281,47 @@ function xor_gate_with_random_weights() result(test_passes)
     integer, parameter :: num_inputs=2, mini_batch_size = 1, num_iterations=500000
       !! Depending on where in the random-number sequence the weights start, this test can pass for lower
       !! numbers of iterations, e.g., 400000. Using more iterations gives more robust convergence.
-    integer batch, iter, i
+    integer batch, iter
 
     allocate(harvest(num_inputs, mini_batch_size, num_iterations))
     call random_number(harvest)
 
-    ! The following temporary copies are required by gfortran bug 100650 and possibly 49324
+    ! The following temporary copies, tmp and tmp2, are required by gfortran bug 100650 and possibly 49324
     ! See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100650 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49324
-    tmp = [([(tensor_t(merge(true, false, harvest(:,batch,iter) < 0.5E0)), batch=1, mini_batch_size)], iter=1, num_iterations)]
+
+    allocate(tmp(mini_batch_size*num_iterations))
+    do concurrent(batch = 1: mini_batch_size, iter = 1:num_iterations)
+      tmp((iter-1)*mini_batch_size + 1) = tensor_t(merge(true, false, harvest(:,batch,iter) < 0.5E0))
+    end do
     training_inputs = reshape(tmp, [mini_batch_size, num_iterations])
 
-    tmp2 = [([(xor(training_inputs(batch, iter)), batch = 1, mini_batch_size)], iter = 1, num_iterations )]
+    allocate(tmp2(size(tmp)))
+    do concurrent(batch = 1: mini_batch_size, iter = 1:num_iterations)
+      tmp2((iter-1)*mini_batch_size + 1) = xor(training_inputs(batch, iter))
+    end do
     training_outputs = reshape(tmp2, [mini_batch_size, num_iterations])
 
-    mini_batches = [(mini_batch_t(input_output_pair_t(training_inputs(:,iter), training_outputs(:,iter))), iter=1, num_iterations)]
+    allocate(mini_batches(size(training_inputs,1)*num_iterations))
+    do concurrent(iter=1:num_iterations)
+      mini_batches(iter) = mini_batch_t(input_output_pair_t(training_inputs(:,iter), training_outputs(:,iter)))
+    end do
+
     trainable_engine = two_random_hidden_layers()
     call trainable_engine%train(mini_batches, adam=.false., learning_rate=1.5)
 
     test_inputs = [tensor_t([true,true]), tensor_t([false,true]), tensor_t([true,false]), tensor_t([false,false])]
 
-    expected_test_outputs = [(xor(test_inputs(i)), i=1, size(test_inputs))]
-    actual_outputs = trainable_engine%infer(test_inputs)
-    test_passes = [(abs(actual_outputs(i)%values() - expected_test_outputs(i)%values()) < tolerance, i=1, size(actual_outputs))]
+    block
+      integer i
+
+      expected_test_outputs = [(xor(test_inputs(i)), i=1, size(test_inputs))]
+      actual_outputs = trainable_engine%infer(test_inputs)
+      test_passes = [(abs(actual_outputs(i)%values() - expected_test_outputs(i)%values()) < tolerance, i=1, size(actual_outputs))]
+    end block
 
   contains
 
-    function xor(inputs) result(expected_outputs)
+    pure function xor(inputs) result(expected_outputs)
       type(tensor_t), intent(in) :: inputs
       type(tensor_t) expected_outputs
       associate(sum_inputs => sum(inputs%values()))
@@ -396,6 +411,7 @@ function perturbed_identity_converges() result(test_passes)
     integer, parameter :: num_epochs = 148
     integer, parameter :: num_bins = 5
     integer i, bin, epoch
+
     trainable_engine = perturbed_identity_network(perturbation_magnitude=0.1)
 
     associate(num_inputs => trainable_engine%num_inputs(), num_outputs => trainable_engine%num_outputs())

From 0d0311d837038642ebb13b8d41838a3d13ea437c Mon Sep 17 00:00:00 2001
From: Damian Rouson
Date: Wed, 17 Jan 2024 21:11:24 -0800
Subject: [PATCH 2/2] doc(example): adjust usage info

Add flags to each example's usage output so that the recommended commands
match the arguments employed in setup.sh. This prevents unnecessary
rebuilding of the software stack.
---
 example/concurrent-inferences.f90         | 2 +-
 example/learn-addition.f90                | 2 +-
 example/learn-exponentiation.f90          | 2 +-
 example/learn-microphysics-procedures.f90 | 2 +-
 example/learn-multiplication.f90          | 2 +-
 example/learn-power-series.f90            | 2 +-
 example/learn-saturated-mixing-ratio.f90  | 2 +-
 example/train-and-write.f90               | 2 +-
 example/write-read-infer.f90              | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/example/concurrent-inferences.f90 b/example/concurrent-inferences.f90
index b2620ac24..65e35d0f8 100644
--- a/example/concurrent-inferences.f90
+++ b/example/concurrent-inferences.f90
@@ -16,7 +16,7 @@ program concurrent_inferences
 
   if (len(network_file_name%string())==0) then
     error stop new_line('a') // new_line('a') // &
-      'Usage: ./build/run-fpm.sh run --example concurrent-inferences -- --network "<file-name>"'
+      'Usage: fpm run --example concurrent-inferences --profile release --flag "-fopenmp" -- --network "<file-name>"'
   end if
 
   block
diff --git a/example/learn-addition.f90 b/example/learn-addition.f90
index 3358a800f..f94e089e5 100644
--- a/example/learn-addition.f90
+++ b/example/learn-addition.f90
@@ -34,7 +34,7 @@ program learn_addition
 
   if (len(final_network_file%string())==0) then
     error stop new_line('a') // new_line('a') // &
-      'Usage: ./build/run-fpm.sh run --example learn-addition -- --output-file "<file-name>"'
+      'Usage: fpm run --example learn-addition --profile release --flag "-fopenmp" -- --output-file "<file-name>"'
   end if
 
   block
diff --git a/example/learn-exponentiation.f90 b/example/learn-exponentiation.f90
index 404095038..411a42ea5 100644
--- a/example/learn-exponentiation.f90
+++ b/example/learn-exponentiation.f90
@@ -34,7 +34,7 @@ program learn_exponentiation
 
   if (len(final_network_file%string())==0) then
     error stop new_line('a') // new_line('a') // &
-      'Usage: ./build/run-fpm.sh run --example train-polynomials -- --output-file "<file-name>"'
+      'Usage: fpm run --example train-polynomials --profile release --flag "-fopenmp" -- --output-file "<file-name>"'
   end if
 
   block
diff --git a/example/learn-microphysics-procedures.f90 b/example/learn-microphysics-procedures.f90
index eec4cef8f..5a115c690 100644
--- a/example/learn-microphysics-procedures.f90
+++ b/example/learn-microphysics-procedures.f90
@@ -19,7 +19,7 @@ program learn_microphysics_procedures
 
   if (len(network_file%string())==0) then
     error stop new_line('a') // new_line('a') // &
-      'Usage: ./build/run-fpm.sh run learn-microphysics-procedures -- --output-file "<file-name>"'
+      'Usage: fpm run learn-microphysics-procedures --profile release --flag "-fopenmp" -- --output-file "<file-name>"'
   end if
 
   call system_clock(counter_start, clock_rate)
diff --git a/example/learn-multiplication.f90 b/example/learn-multiplication.f90
index 18ba510fc..28baca85b 100644
--- a/example/learn-multiplication.f90
+++ b/example/learn-multiplication.f90
@@ -34,7 +34,7 @@ program learn_multiplication
 
   if (len(final_network_file%string())==0) then
     error stop new_line('a') // new_line('a') // &
-      'Usage: ./build/run-fpm.sh run --example learn-multiplication -- --output-file "<file-name>"'
+      'Usage: fpm run --example learn-multiplication --profile release --flag "-fopenmp" -- --output-file "<file-name>"'
   end if
 
   block
diff --git a/example/learn-power-series.f90 b/example/learn-power-series.f90
index b10cfc2c3..c73b97a0c 100644
--- a/example/learn-power-series.f90
+++ b/example/learn-power-series.f90
@@ -34,7 +34,7 @@ program learn_power_series
 
   if (len(final_network_file%string())==0) then
     error stop new_line('a') // new_line('a') // &
-      'Usage: ./build/run-fpm.sh run --example learn-power-series -- --output-file "<file-name>"'
+      'Usage: fpm run --example learn-power-series --profile release --flag "-fopenmp" -- --output-file "<file-name>"'
   end if
 
   block
diff --git a/example/learn-saturated-mixing-ratio.f90 b/example/learn-saturated-mixing-ratio.f90
index fe30d982e..472653cb1 100644
--- a/example/learn-saturated-mixing-ratio.f90
+++ b/example/learn-saturated-mixing-ratio.f90
@@ -18,7 +18,7 @@ program train_saturated_mixture_ratio
 
   if (len(network_file%string())==0) then
     error stop new_line('a') // new_line('a') // &
-      'Usage: ./build/run-fpm.sh run --example learn-saturated-mixing-ratio -- --output-file "<file-name>"'
+      'Usage: fpm run --example learn-saturated-mixing-ratio --profile release --flag "-fopenmp" -- --output-file "<file-name>"'
   end if
 
   call system_clock(counter_start, clock_rate)
diff --git a/example/train-and-write.f90 b/example/train-and-write.f90
index 81de22dac..a2f9ce313 100644
--- a/example/train-and-write.f90
+++ b/example/train-and-write.f90
@@ -21,7 +21,7 @@ program train_and_write
 
   if (len(final_network_file%string())==0) then
     error stop new_line('a') // new_line('a') // &
-      'Usage: ./build/run-fpm.sh run --example train-and-write -- --output-file "<file-name>"'
+      'Usage: fpm run --example train-and-write --profile release --flag "-fopenmp" -- --output-file "<file-name>"'
   end if
 
   block
diff --git a/example/write-read-infer.f90 b/example/write-read-infer.f90
index 1a9854333..cbfd0d686 100644
--- a/example/write-read-infer.f90
+++ b/example/write-read-infer.f90
@@ -19,7 +19,7 @@ program write_read_infer
 
   if (len(file_name%string())==0) then
     error stop new_line('a') // new_line('a') // &
-      'Usage: ./build/run-fpm.sh run --example write-read-infer -- --output-file "<file-name>"'
+      'Usage: fpm run --example write-read-infer --profile release --flag "-fopenmp" -- --output-file "<file-name>"'
   end if
 
   call write_read_query_infer(file_name)
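
A minimal usage sketch, not taken from the patches above: the OpenMP-enabled commands recommended in setup.sh can be combined with the standard OpenMP environment variable OMP_NUM_THREADS to control the thread count at run time. The thread count of 8 is an arbitrary illustration, and <example-name> is a placeholder for a name listed by 'fpm run --example'; whether multithreading helps depends on the compiler's OpenMP runtime and the example being run.

    # run the test suite with OpenMP enabled, using 8 threads
    OMP_NUM_THREADS=8 fpm test --profile release --flag "-fopenmp"

    # run one example program the same way
    OMP_NUM_THREADS=8 fpm run --profile release --flag "-fopenmp" --example <example-name>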