From 4f7d797f5ec14f5c81b23c1cc7510237a9de1f81 Mon Sep 17 00:00:00 2001
From: mpudig <mattpudig1998@gmail.com>
Date: Fri, 26 Jul 2024 13:33:48 -0400
Subject: [PATCH 01/19] First attempt at writing kernel and workspace following
 Greg suggestions on issue #112.

---
 src/multilayerqg.jl | 92 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 83 insertions(+), 9 deletions(-)

diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl
index 4e35d679..5a10f890 100644
--- a/src/multilayerqg.jl
+++ b/src/multilayerqg.jl
@@ -18,11 +18,13 @@ using
   LinearAlgebra,
   StaticArrays,
   Reexport,
-  DocStringExtensions
+  DocStringExtensions,
+  KernelAbstractions
 
 @reexport using FourierFlows
 
 using FourierFlows: parsevalsum, parsevalsum2, superzeros, plan_flows_rfft
+using KernelAbstractions.Extras.LoopInfo: @unroll
 
 nothingfunction(args...) = nothing
 
@@ -533,16 +535,52 @@ Compute the inverse Fourier transform of `varh` and store it in `var`.
 """
 invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, varh)
 
+"""
+    @kernel function pvfromstreamfunction_kernel!(qh, ψh, params)
+
+Kernel for GPU acceleration of PV from streamfunction calculation, i.e., obtaining the Fourier
+transform of the PV from the streamfunction `ψh` in each layer using `qh = params.S * ψh`.
+"""
+@kernel function pvfromstreamfunction_kernel!(qh, ψh, params)
+    i, j = @index(Global, NTuple)
+    nlayers, S = params.nlayers, params.S
+
+    @unroll for k = 1:nlayers
+
+        @inbounds qh[i, j, k] = 0
+
+        @unroll for m = 1:nlayers
+            @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m]
+        end
+    
+    end
+end
+
 """
     pvfromstreamfunction!(qh, ψh, params, grid)
 
 Obtain the Fourier transform of the PV from the streamfunction `ψh` in each layer using
-`qh = params.S * ψh`.
+`qh = params.S * ψh`. We use a work layout over which the above-defined kernel is launched.
 """
 function pvfromstreamfunction!(qh, ψh, params, grid)
-  for j=1:grid.nl, i=1:grid.nkr
-    CUDA.@allowscalar @views qh[i, j, :] .= params.S[i, j] * ψh[i, j, :]
-  end
+  # Larger workgroups are generally more efficient. For more generality, we could put an if statement that incurs
+  # different behavior when either nkl or nl are less than 16
+  workgroup = 16, 16
+
+  # The worksize determines how many times the kernel is run
+  worksize = grid.nkr, grid.nl
+
+  # This (and its usage below) will ensure the kernel is not run _before_ the data in ψh is available
+  barrier = Event(grid.device)
+
+  # Creates a loop over the specified worksize, using workgroup to organize the computation
+  loop_kernel! = pvfromstreamfunction_kernel!(grid.device, workgroup, worksize)
+
+  # Launch the kernel
+  event = loop_kernel!(qh, ψh, params, dependencies = barrier)
+
+  # This will ensure that no other operations occur until the kernel has finished
+  wait(event)
 
   return nothing
 end
@@ -587,16 +625,52 @@ function pvfromstreamfunction!(qh, ψh, params::TwoLayerParams, grid)
   return nothing
 end
 
+"""
+    @kernel function streamfunctionfrompv_kernel!(ψh, qh, params)
+
+Kernel for GPU acceleration of streamfunction from PV calculation, i.e., invert the PV to obtain
+the Fourier transform of the streamfunction `ψh` in each layer from `qh` using `ψh = params.S⁻¹ qh`.
+"""
+@kernel function streamfunctionfrompv_kernel!(ψh, qh, params)
+    i, j = @index(Global, NTuple)
+    nlayers, S = params.nlayers, params.S
+
+    @unroll for k = 1:nlayers
+
+        @inbounds ψh[i, j, k] = 0
+
+        @unroll for m = 1:nlayers
+            @inbounds ψh[i, j, k] += S⁻¹[i, j][k, m] * qh[i, j, m]
+        end
+    
+    end
+end
+
 """
     streamfunctionfrompv!(ψh, qh, params, grid)
 
 Invert the PV to obtain the Fourier transform of the streamfunction `ψh` in each layer from
-`qh` using `ψh = params.S⁻¹ qh`.
+`qh` using `ψh = params.S⁻¹ qh`. We use a work layout over which the above-defined kernel is launched.
 """
 function streamfunctionfrompv!(ψh, qh, params, grid)
-  for j=1:grid.nl, i=1:grid.nkr
-    CUDA.@allowscalar @views ψh[i, j, :] .= params.S⁻¹[i, j] * qh[i, j, :]
-  end
+  # Larger workgroups are generally more efficient. For more generality, we could put an if statement that incurs
+  # different behavior when either nkl or nl are less than 16
+  workgroup = 16, 16
+
+  # The worksize determines how many times the kernel is run
+  worksize = grid.nkr, grid.nl
+
+  # This (and its usage below) will ensure the kernel is not run _before_ the data in qh is available
+  barrier = Event(grid.device)
+
+  # Creates a loop over the specified worksize, using workgroup to organize the computation
+  loop_kernel! = streamfunctionfrompv_kernel!(grid.device, workgroup, worksize)
+
+  # Launch the kernel
+  event = loop_kernel!(ψh, qh, params, dependencies = barrier)
+
+  # This will ensure that no other operations occur until the kernel has finished
+  wait(event)
 
   return nothing
 end

From 114977cd28e34ea8cf5f4d98025081eb5cba4b54 Mon Sep 17 00:00:00 2001
From: mpudig <mattpudig1998@gmail.com>
Date: Sun, 28 Jul 2024 14:08:50 -0400
Subject: [PATCH 02/19] Change wait to depend on GPU device as per example
 video on youtube.

---
 src/multilayerqg.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl
index 5a10f890..080a9de6 100644
--- a/src/multilayerqg.jl
+++ b/src/multilayerqg.jl
@@ -580,7 +580,7 @@ function pvfromstreamfunction!(qh, ψh, params, grid)
   event = loop_kernel!(qh, ψh, params, dependencies = barrier)
 
   # This will ensure that no other operations occur until the kernel has finished
-  wait(event)
+  wait(grid.device, event)
 
   return nothing
 end
@@ -670,7 +670,7 @@ function streamfunctionfrompv!(ψh, qh, params, grid)
   event = loop_kernel!(ψh, qh, params, dependencies = barrier)
 
   # This will ensure that no other operations occur until the kernel has finished
-  wait(event)
+  wait(grid.device, event)
 
   return nothing
 end

From 76c772882bbdd4cfda03467c857d72fe7f7cf038 Mon Sep 17 00:00:00 2001
From: mpudig <mattpudig1998@gmail.com>
Date: Sun, 28 Jul 2024 16:42:09 -0400
Subject: [PATCH 03/19] Rewrote kernel for more recent version of
 KernelAbstraction syntax. Also explicitly import CPU, GPU from FourierFlows
 as there were conflicts with KernelAbstractions.

---
 src/multilayerqg.jl | 62 +++++++++++++++++++++------------------------
 1 file changed, 29 insertions(+), 33 deletions(-)

diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl
index 080a9de6..06ac4283 100644
--- a/src/multilayerqg.jl
+++ b/src/multilayerqg.jl
@@ -23,7 +23,7 @@ using
 
 @reexport using FourierFlows
 
-using FourierFlows: parsevalsum, parsevalsum2, superzeros, plan_flows_rfft
+using FourierFlows: parsevalsum, parsevalsum2, superzeros, plan_flows_rfft, CPU, GPU
 using KernelAbstractions.Extras.LoopInfo: @unroll
 
 nothingfunction(args...) = nothing
@@ -536,24 +536,23 @@ Compute the inverse Fourier transform of `varh` and store it in `var`.
 invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, varh)
 
 """
-    @kernel function pvfromstreamfunction_kernel!(qh, ψh, params)
+    @kernel function pvfromstreamfunction_kernel!(qh, ψh, S, nlayers)
 
 Kernel for GPU acceleration of PV from streamfunction calculation, i.e., obtaining the Fourier
 transform of the PV from the streamfunction `ψh` in each layer using `qh = params.S * ψh`.
 """
-@kernel function pvfromstreamfunction_kernel!(qh, ψh, params)
-    i, j = @index(Global, NTuple)
-    nlayers, S = params.nlayers, params.S
+@kernel function pvfromstreamfunction_kernel!(qh, ψh, S, nlayers)
+  i, j = @index(Global, NTuple)
 
-    @unroll for k = 1:nlayers
+  @unroll for k = 1:nlayers
 
-        @inbounds qh[i, j, k] = 0
+      @inbounds qh[i, j, k] = 0
 
-        @unroll for m = 1:nlayers
-            @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m]
-        end
-    
-    end
+      @unroll for m = 1:nlayers
+          @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m]
+      end
+  
+  end
 end
 
 """
@@ -563,24 +562,23 @@ Obtain the Fourier transform of the PV from the streamfunction `ψh` in each lay
 `qh = params.S * ψh`. We use a work layout over which the above-defined kernel is launched.
 """
 function pvfromstreamfunction!(qh, ψh, params, grid)
-  # Larger workgroups are generally more efficient. For more generality, we could put an if statement that incurs
-  # different behavior when either nkl or nl are less than 16
+  # Larger workgroups are generally more efficient. For more generality, we could put an 
+  # if statement that incurs different behavior when either nkl or nl are less than 16
   workgroup = 16, 16
 
   # The worksize determines how many times the kernel is run
   worksize = grid.nkr, grid.nl
 
-  # This (and its usage below) will ensure the kernel is not run _before_ the data in ψh is available
-  barrier = Event(grid.device)
-
-  # Creates a loop over the specified worksize, using workgroup to organize the computation
-  loop_kernel! = pvfromstreamfunction_kernel!(grid.device, workgroup, worksize)
+  # Instantiates the kernel for relevant backend device
+  backend = KernelAbstractions.get_backend(qh)
+  kernel! = test_pvfromstreamfunction_kernel!(backend, workgroup, worksize)
 
   # Launch the kernel
-  event = loop_kernel!(qh, ψh, params, dependencies = barrier)
+  S, nlayers = params.S, params.nlayers
+  kernel!(qh, ψh, S, nlayers)
 
   # This will ensure that no other operations occur until the kernel has finished
-  wait(grid.device, event)
+  KernelAbstractions.synchronize(backend)
 
   return nothing
 end
@@ -626,14 +624,13 @@ function pvfromstreamfunction!(qh, ψh, params::TwoLayerParams, grid)
 end
 
 """
-    @kernel function streamfunctionfrompv_kernel!(ψh, qh, params)
+    @kernel function streamfunctionfrompv_kernel!(ψh, qh, S⁻¹, nlayers)
 
 Kernel for GPU acceleration of streamfunction from PV calculation, i.e., invert the PV to obtain
 the Fourier transform of the streamfunction `ψh` in each layer from `qh` using `ψh = params.S⁻¹ qh`.
 """
-@kernel function streamfunctionfrompv_kernel!(ψh, qh, params)
+@kernel function streamfunctionfrompv_kernel!(ψh, qh, S⁻¹, nlayers)
     i, j = @index(Global, NTuple)
-    nlayers, S = params.nlayers, params.S
 
     @unroll for k = 1:nlayers
 
@@ -653,24 +650,23 @@ Invert the PV to obtain the Fourier transform of the streamfunction `ψh` in eac
 `qh` using `ψh = params.S⁻¹ qh`. We use a work layout over which the above-defined kernel is launched.
 """
 function streamfunctionfrompv!(ψh, qh, params, grid)
-  # Larger workgroups are generally more efficient. For more generality, we could put an if statement that incurs
-  # different behavior when either nkl or nl are less than 16
+# Larger workgroups are generally more efficient. For more generality, we could put an 
+  # if statement that incurs different behavior when either nkl or nl are less than 16
   workgroup = 16, 16
 
   # The worksize determines how many times the kernel is run
   worksize = grid.nkr, grid.nl
 
-  # This (and its usage below) will ensure the kernel is not run _before_ the data in qh is available
-  barrier = Event(grid.device)
-
-  # Creates a loop over the specified worksize, using workgroup to organize the computation
-  loop_kernel! = streamfunctionfrompv_kernel!(grid.device, workgroup, worksize)
+  # Instantiates the kernel for relevant backend device
+  backend = KernelAbstractions.get_backend(ψh)
+  kernel! = streamfunctionfrompv_kernel!(backend, workgroup, worksize)
 
   # Launch the kernel
-  event = loop_kernel!(ψh, qh, params, dependencies = barrier)
+  S⁻¹, nlayers = params.S⁻¹, params.nlayers
+  kernel!(ψh, qh, S⁻¹, nlayers)
 
   # This will ensure that no other operations occur until the kernel has finished
-  wait(grid.device, event)
+  KernelAbstractions.synchronize(backend)
 
   return nothing
 end

From e8cd9d7c3ad1df63fce449cc4d6fe94e958484c1 Mon Sep 17 00:00:00 2001
From: mpudig <mattpudig1998@gmail.com>
Date: Sun, 28 Jul 2024 16:56:33 -0400
Subject: [PATCH 04/19] Remove GPU warning for more than two layers.

---
 src/multilayerqg.jl | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl
index 06ac4283..949270f1 100644
--- a/src/multilayerqg.jl
+++ b/src/multilayerqg.jl
@@ -113,16 +113,16 @@ function Problem(nlayers::Int,                             # number of fluid lay
              aliased_fraction = 1/3,
                             T = Float64)
 
-  if dev == GPU() && nlayers > 2
-    @warn """MultiLayerQG module is not optimized on the GPU yet for configurations with
-    3 fluid layers or more!
+  # if dev == GPU() && nlayers > 2
+  #   @warn """MultiLayerQG module is not optimized on the GPU yet for configurations with
+  #   3 fluid layers or more!
 
-    See issues on Github at https://github.com/FourierFlows/GeophysicalFlows.jl/issues/112
-    and https://github.com/FourierFlows/GeophysicalFlows.jl/issues/267.
+  #   See issues on Github at https://github.com/FourierFlows/GeophysicalFlows.jl/issues/112
+  #   and https://github.com/FourierFlows/GeophysicalFlows.jl/issues/267.
 
-    To use MultiLayerQG with 3 fluid layers or more we suggest, for now, to restrict running
-    on CPU."""
-  end
+  #   To use MultiLayerQG with 3 fluid layers or more we suggest, for now, to restrict running
+  #   on CPU."""
+  # end
 
   if nlayers == 1
     @warn """MultiLayerQG module does work for single-layer configuration but may not be as 
@@ -541,7 +541,7 @@ invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, v
 Kernel for GPU acceleration of PV from streamfunction calculation, i.e., obtaining the Fourier
 transform of the PV from the streamfunction `ψh` in each layer using `qh = params.S * ψh`.
 """
-@kernel function pvfromstreamfunction_kernel!(qh, ψh, S, nlayers)
+@kernel function test_pvfromstreamfunction_kernel!(qh, ψh, S, nlayers)
   i, j = @index(Global, NTuple)
 
   @unroll for k = 1:nlayers
@@ -549,7 +549,7 @@ transform of the PV from the streamfunction `ψh` in each layer using `qh = para
       @inbounds qh[i, j, k] = 0
 
       @unroll for m = 1:nlayers
-          @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m]
+          @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m] + 1 + 2*im
       end
   
   end
@@ -561,7 +561,7 @@ end
 Obtain the Fourier transform of the PV from the streamfunction `ψh` in each layer using
 `qh = params.S * ψh`. We use a work layout over which the above-defined kernel is launched.
 """
-function pvfromstreamfunction!(qh, ψh, params, grid)
+function test_pvfromstreamfunction!(qh, ψh, params, grid)
   # Larger workgroups are generally more efficient. For more generality, we could put an 
   # if statement that incurs different behavior when either nkl or nl are less than 16
   workgroup = 16, 16
@@ -575,7 +575,8 @@ function pvfromstreamfunction!(qh, ψh, params, grid)
 
   # Launch the kernel
   S, nlayers = params.S, params.nlayers
-  kernel!(qh, ψh, S, nlayers)
+  #kernel!(qh, ψh, S, nlayers)
+  kernel!(qh, psih, S, nlayers)
 
   # This will ensure that no other operations occur until the kernel has finished
   KernelAbstractions.synchronize(backend)

From 633797bdea5ff4ab6d38334cff76d4e0c4cac02d Mon Sep 17 00:00:00 2001
From: mpudig <mattpudig1998@gmail.com>
Date: Sun, 28 Jul 2024 17:05:29 -0400
Subject: [PATCH 05/19] Added KernelAbstractions to dependency list.

---
 Project.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Project.toml b/Project.toml
index a49c334c..1aa677dd 100644
--- a/Project.toml
+++ b/Project.toml
@@ -16,6 +16,7 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 
 [compat]
 CUDA = "1, 2.4.2, 3.0.0 - 3.6.4, 3.7.1, 4, 5"
@@ -30,6 +31,7 @@ SpecialFunctions = "0.10, 1, 2"
 StaticArrays = "0.12, 1"
 Statistics = "1.6"
 julia = "1.6"
+KernelAbstractions = "0.9"
 
 [extras]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"

From 916653990ea358d2969e3870bd3e217704da9ec1 Mon Sep 17 00:00:00 2001
From: mpudig <mattpudig1998@gmail.com>
Date: Sun, 28 Jul 2024 17:14:28 -0400
Subject: [PATCH 06/19] Fix typos.

---
 src/multilayerqg.jl | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl
index 949270f1..9225d516 100644
--- a/src/multilayerqg.jl
+++ b/src/multilayerqg.jl
@@ -541,7 +541,7 @@ invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, v
 Kernel for GPU acceleration of PV from streamfunction calculation, i.e., obtaining the Fourier
 transform of the PV from the streamfunction `ψh` in each layer using `qh = params.S * ψh`.
 """
-@kernel function test_pvfromstreamfunction_kernel!(qh, ψh, S, nlayers)
+@kernel function pvfromstreamfunction_kernel!(qh, ψh, S, nlayers)
   i, j = @index(Global, NTuple)
 
   @unroll for k = 1:nlayers
@@ -561,7 +561,7 @@ end
 Obtain the Fourier transform of the PV from the streamfunction `ψh` in each layer using
 `qh = params.S * ψh`. We use a work layout over which the above-defined kernel is launched.
 """
-function test_pvfromstreamfunction!(qh, ψh, params, grid)
+function pvfromstreamfunction!(qh, ψh, params, grid)
   # Larger workgroups are generally more efficient. For more generality, we could put an 
   # if statement that incurs different behavior when either nkl or nl are less than 16
   workgroup = 16, 16
@@ -575,8 +575,7 @@ function test_pvfromstreamfunction!(qh, ψh, params, grid)
 
   # Launch the kernel
   S, nlayers = params.S, params.nlayers
-  #kernel!(qh, ψh, S, nlayers)
-  kernel!(qh, psih, S, nlayers)
+  kernel!(qh, ψh, S, nlayers)
 
   # This will ensure that no other operations occur until the kernel has finished
   KernelAbstractions.synchronize(backend)

From 04298c4a5db978291bb5fa0eac06e64a7ad06bae Mon Sep 17 00:00:00 2001
From: mpudig <mattpudig1998@gmail.com>
Date: Sun, 28 Jul 2024 17:19:00 -0400
Subject: [PATCH 07/19] Fix more typos.

---
 src/multilayerqg.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl
index 9225d516..8511db6d 100644
--- a/src/multilayerqg.jl
+++ b/src/multilayerqg.jl
@@ -571,7 +571,7 @@ function pvfromstreamfunction!(qh, ψh, params, grid)
 
   # Instantiates the kernel for relevant backend device
   backend = KernelAbstractions.get_backend(qh)
-  kernel! = test_pvfromstreamfunction_kernel!(backend, workgroup, worksize)
+  kernel! = pvfromstreamfunction_kernel!(backend, workgroup, worksize)
 
   # Launch the kernel
   S, nlayers = params.S, params.nlayers

From b3161d8945239a4a7ffdd0c9ce4aae064c0e0658 Mon Sep 17 00:00:00 2001
From: mpudig <mattpudig1998@gmail.com>
Date: Mon, 29 Jul 2024 11:27:11 -0400
Subject: [PATCH 08/19] Some simple tests changing the workgroup size showed 8
 was often better than 16.

---
 src/multilayerqg.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl
index 8511db6d..91ef062d 100644
--- a/src/multilayerqg.jl
+++ b/src/multilayerqg.jl
@@ -563,8 +563,8 @@ Obtain the Fourier transform of the PV from the streamfunction `ψh` in each lay
 """
 function pvfromstreamfunction!(qh, ψh, params, grid)
   # Larger workgroups are generally more efficient. For more generality, we could put an 
-  # if statement that incurs different behavior when either nkl or nl are less than 16
-  workgroup = 16, 16
+  # if statement that incurs different behavior when either nkl or nl are less than 8
+  workgroup = 8, 8
 
   # The worksize determines how many times the kernel is run
   worksize = grid.nkr, grid.nl
@@ -651,8 +651,8 @@ Invert the PV to obtain the Fourier transform of the streamfunction `ψh` in eac
 """
 function streamfunctionfrompv!(ψh, qh, params, grid)
 # Larger workgroups are generally more efficient. For more generality, we could put an 
-  # if statement that incurs different behavior when either nkl or nl are less than 16
-  workgroup = 16, 16
+  # if statement that incurs different behavior when either nkl or nl are less than 8
+  workgroup = 8, 8
 
   # The worksize determines how many times the kernel is run
   worksize = grid.nkr, grid.nl

From 880f4f623845d27c423c0e929db5828a7658365c Mon Sep 17 00:00:00 2001
From: mpudig <mattpudig1998@gmail.com>
Date: Mon, 29 Jul 2024 12:11:57 -0400
Subject: [PATCH 09/19] Fix typos.

---
 src/multilayerqg.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl
index 91ef062d..fb7f9781 100644
--- a/src/multilayerqg.jl
+++ b/src/multilayerqg.jl
@@ -549,7 +549,7 @@ transform of the PV from the streamfunction `ψh` in each layer using `qh = para
       @inbounds qh[i, j, k] = 0
 
       @unroll for m = 1:nlayers
-          @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m] + 1 + 2*im
+          @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m]
       end
   
   end

From a04c5b0da6a551c0cda6b3e0f634ffc38723afd4 Mon Sep 17 00:00:00 2001
From: mpudig <mattpudig1998@gmail.com>
Date: Tue, 29 Oct 2024 13:28:29 -0400
Subject: [PATCH 10/19] Changes following suggestions of Navid and Greg.

---
 src/multilayerqg.jl | 59 +++++++++++----------------------------------
 1 file changed, 14 insertions(+), 45 deletions(-)

diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl
index fb7f9781..345d931c 100644
--- a/src/multilayerqg.jl
+++ b/src/multilayerqg.jl
@@ -113,17 +113,6 @@ function Problem(nlayers::Int,                             # number of fluid lay
              aliased_fraction = 1/3,
                             T = Float64)
 
-  # if dev == GPU() && nlayers > 2
-  #   @warn """MultiLayerQG module is not optimized on the GPU yet for configurations with
-  #   3 fluid layers or more!
-
-  #   See issues on Github at https://github.com/FourierFlows/GeophysicalFlows.jl/issues/112
-  #   and https://github.com/FourierFlows/GeophysicalFlows.jl/issues/267.
-
-  #   To use MultiLayerQG with 3 fluid layers or more we suggest, for now, to restrict running
-  #   on CPU."""
-  # end
-
   if nlayers == 1
     @warn """MultiLayerQG module does work for single-layer configuration but may not be as 
     optimized. We suggest using SingleLayerQG module for single-layer QG simulation unless
@@ -536,20 +525,20 @@ Compute the inverse Fourier transform of `varh` and store it in `var`.
 invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, varh)
 
 """
-    @kernel function pvfromstreamfunction_kernel!(qh, ψh, S, nlayers)
+    @kernel function PVinversion_kernel!(a, b, M, nlayers)
 
-Kernel for GPU acceleration of PV from streamfunction calculation, i.e., obtaining the Fourier
-transform of the PV from the streamfunction `ψh` in each layer using `qh = params.S * ψh`.
+Kernel for the PV/stream function inversion step,
+i.e., `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`.
 """
-@kernel function pvfromstreamfunction_kernel!(qh, ψh, S, nlayers)
+@kernel function PVinversion_kernel!(a, b, M, ::Val{nlayers})
   i, j = @index(Global, NTuple)
 
   @unroll for k = 1:nlayers
 
-      @inbounds qh[i, j, k] = 0
+      @inbounds a[i, j, k] = 0
 
       @unroll for m = 1:nlayers
-          @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m]
+          @inbounds a[i, j, k] += M[i, j][k, m] * b[i, j, m]
       end
   
   end
@@ -559,7 +548,7 @@ end
     pvfromstreamfunction!(qh, ψh, params, grid)
 
 Obtain the Fourier transform of the PV from the streamfunction `ψh` in each layer using
-`qh = params.S * ψh`. We use a work layout over which the above-defined kernel is launched.
+`qh = params.S * ψh`. We use a work layout over which the PV inversion kernel is launched.
 """
 function pvfromstreamfunction!(qh, ψh, params, grid)
   # Larger workgroups are generally more efficient. For more generality, we could put an 
@@ -571,13 +560,13 @@ function pvfromstreamfunction!(qh, ψh, params, grid)
 
   # Instantiates the kernel for relevant backend device
   backend = KernelAbstractions.get_backend(qh)
-  kernel! = pvfromstreamfunction_kernel!(backend, workgroup, worksize)
+  kernel! = PVinversion_kernel!(backend, workgroup, worksize)
 
   # Launch the kernel
   S, nlayers = params.S, params.nlayers
-  kernel!(qh, ψh, S, nlayers)
+  kernel!(qh, ψh, S, Val(nlayers))
 
-  # This will ensure that no other operations occur until the kernel has finished
+  # Ensure that no other operations occur until the kernel has finished
   KernelAbstractions.synchronize(backend)
 
   return nothing
@@ -623,31 +612,11 @@ function pvfromstreamfunction!(qh, ψh, params::TwoLayerParams, grid)
   return nothing
 end
 
-"""
-    @kernel function streamfunctionfrompv_kernel!(ψh, qh, S⁻¹, nlayers)
-
-Kernel for GPU acceleration of streamfunction from PV calculation, i.e., invert the PV to obtain
-the Fourier transform of the streamfunction `ψh` in each layer from `qh` using `ψh = params.S⁻¹ qh`.
-"""
-@kernel function streamfunctionfrompv_kernel!(ψh, qh, S⁻¹, nlayers)
-    i, j = @index(Global, NTuple)
-
-    @unroll for k = 1:nlayers
-
-        @inbounds ψh[i, j, k] = 0
-
-        @unroll for m = 1:nlayers
-            @inbounds ψh[i, j, k] += S⁻¹[i, j][k, m] * qh[i, j, m]
-        end
-    
-    end
-end
-
 """
     streamfunctionfrompv!(ψh, qh, params, grid)
 
 Invert the PV to obtain the Fourier transform of the streamfunction `ψh` in each layer from
-`qh` using `ψh = params.S⁻¹ qh`. We use a work layout over which the above-defined kernel is launched.
+`qh` using `ψh = params.S⁻¹ qh`. We use a work layout over which the PV inversion kernel is launched.
 """
 function streamfunctionfrompv!(ψh, qh, params, grid)
 # Larger workgroups are generally more efficient. For more generality, we could put an 
@@ -659,13 +628,13 @@ function streamfunctionfrompv!(ψh, qh, params, grid)
 
   # Instantiates the kernel for relevant backend device
   backend = KernelAbstractions.get_backend(ψh)
-  kernel! = streamfunctionfrompv_kernel!(backend, workgroup, worksize)
+  kernel! = PVinversion_kernel!(backend, workgroup, worksize)
 
   # Launch the kernel
   S⁻¹, nlayers = params.S⁻¹, params.nlayers
-  kernel!(ψh, qh, S⁻¹, nlayers)
+  kernel!(ψh, qh, S⁻¹, Val(nlayers))
 
-  # This will ensure that no other operations occur until the kernel has finished
+  # Ensure that no other operations occur until the kernel has finished
   KernelAbstractions.synchronize(backend)
 
   return nothing

From 102000e86364e025bbe9380fc1213c1db1976bd7 Mon Sep 17 00:00:00 2001
From: mpudig <mattpudig1998@gmail.com>
Date: Tue, 29 Oct 2024 13:48:26 -0400
Subject: [PATCH 11/19] Fix typo.

---
 src/multilayerqg.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl
index d05fc9ef..b1e9b53a 100644
--- a/src/multilayerqg.jl
+++ b/src/multilayerqg.jl
@@ -530,7 +530,7 @@ invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, v
 Kernel for the PV/stream function inversion step,
 i.e., `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`.
 """
-@kernel function PVinversion_kernel!(a, b, M, ::Val{nlayers})
+@kernel function PVinversion_kernel!(a, b, M, ::Val{nlayers}) where nlayers
   i, j = @index(Global, NTuple)
 
   @unroll for k = 1:nlayers

From 151e16cefc9c16a2913d605248acd2f81f85c0f2 Mon Sep 17 00:00:00 2001
From: mpudig <mattpudig1998@gmail.com>
Date: Fri, 20 Dec 2024 11:09:24 +1100
Subject: [PATCH 12/19] First modification of kernel to utilise StaticVectors.

---
 src/multilayerqg.jl | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl
index b1e9b53a..d6ae6c8f 100644
--- a/src/multilayerqg.jl
+++ b/src/multilayerqg.jl
@@ -525,22 +525,24 @@ Compute the inverse Fourier transform of `varh` and store it in `var`.
 invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, varh)
 
 """
-    @kernel function PVinversion_kernel!(a, b, M, nlayers)
+    @kernel function PVinversion_kernel!(a, b, M, ::Val{N}) where N
 
-Kernel for the PV/stream function inversion step,
-i.e., `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`.
+Kernel for the PV/stream function inversion step, i.e., `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`.
+StaticVectors are used to efficiently perform the matrix-vector multiplication.
 """
-@kernel function PVinversion_kernel!(a, b, M, ::Val{nlayers}) where nlayers
+@kernel function PVinversion_kernel!(a, b, M, ::Val{N}) where N
   i, j = @index(Global, NTuple)
 
-  @unroll for k = 1:nlayers
+  b_tuple = ntuple(Val(N)) do n
+      @inbounds b[i, j, n]
+  end
 
-      @inbounds a[i, j, k] = 0
+  T = eltype(a) 
+  b_sv = SVector{N, T}(b_tuple)
+  a_sv = @inbounds M[i, j] * b_sv
 
-      @unroll for m = 1:nlayers
-          @inbounds a[i, j, k] += M[i, j][k, m] * b[i, j, m]
-      end
-  
+  ntuple(Val(N)) do n
+      @inbounds a[i, j, n] = a_sv[n]
   end
 end
 

From 52f8ca3938f06aa89a4740e434f168c7de7fcfae Mon Sep 17 00:00:00 2001
From: mpudig <mattpudig1998@gmail.com>
Date: Fri, 17 Jan 2025 10:13:17 +1100
Subject: [PATCH 13/19] Add PVinversion_kernel to docs.

---
 docs/src/lib/functions.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
index af0fe0d5..5afe9484 100644
--- a/docs/src/lib/functions.md
+++ b/docs/src/lib/functions.md
@@ -84,6 +84,7 @@ GeophysicalFlows.MultiLayerQG.calcNlinear!
 GeophysicalFlows.MultiLayerQG.calcN_advection!
 GeophysicalFlows.MultiLayerQG.calcN_linearadvection!
 GeophysicalFlows.MultiLayerQG.addforcing!
+GeophysicalFlows.MultiLayerQG.PVinversion_kernel!
 ```
 
 

From 3f0247c874e4fdcb63ca8523e33daa9f5ba722ba Mon Sep 17 00:00:00 2001
From: "Navid C. Constantinou" <navidcy@users.noreply.github.com>
Date: Fri, 24 Jan 2025 12:28:38 +1100
Subject: [PATCH 14/19] bump patch release

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index f65531ec..d6c5a00d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -2,7 +2,7 @@ name = "GeophysicalFlows"
 uuid = "44ee3b1c-bc02-53fa-8355-8e347616e15e"
 license = "MIT"
 authors = ["Navid C. Constantinou <navidcy@gmail.com>", "Gregory L. Wagner <wagner.greg@gmail.com>", "and contributors"]
-version = "0.16.2"
+version = "0.16.3"
 
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

From 11e4f1e5c0a14f53d10c269adba5e8b6d4c199ff Mon Sep 17 00:00:00 2001
From: "Navid C. Constantinou" <navidcy@users.noreply.github.com>
Date: Sat, 25 Jan 2025 06:51:49 +1100
Subject: [PATCH 15/19] change notation/kernel name

---
 src/multilayerqg.jl | 59 ++++++++++++++++++++++++++++-----------------
 1 file changed, 37 insertions(+), 22 deletions(-)

diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl
index 30f2c681..96eb701b 100644
--- a/src/multilayerqg.jl
+++ b/src/multilayerqg.jl
@@ -379,7 +379,7 @@ numberoflayers(::TwoLayerParams) = 2
 Return the linear operator `L` that corresponds to (hyper)-viscosity of order ``n_ν`` with
 coefficient ``ν`` for ``n`` fluid layers.
 ```math
-L_j = - ν |𝐤|^{2 n_ν}, \\ j = 1, ...,n .
+L_j = - ν |𝐤|^{2 n_ν}, \\ j = 1, ..., n .
 ```
 """
 function hyperviscosity(params, grid)
@@ -440,9 +440,9 @@ struct Vars{Aphys, Atrans, F, P} <: AbstractVars
         q :: Aphys
     "streamfunction"
         ψ :: Aphys
-    "x-component of velocity"
+    "``x``-component of velocity"
         u :: Aphys
-    "y-component of velocity"
+    "``y``-component of velocity"
         v :: Aphys
     "Fourier transform of relative vorticity + vortex stretching"
        qh :: Atrans
@@ -525,24 +525,33 @@ Compute the inverse Fourier transform of `varh` and store it in `var`.
 invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, varh)
 
 """
-    @kernel function PVinversion_kernel!(a, b, M, ::Val{N}) where N
+    pv_streamfunction_kernel!(y, M, x, ::Val{N}) where N
+
+Kernel for the PV to streamfunction conversion steps. The kernel performs the
+matrix multiplication
+
+```math
+y = M x
+```
+
+for every wavenumber, where ``y`` and ``x`` are column-vectors of length `nlayers`.
+This is equivalent to `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`.
 
-Kernel for the PV/stream function inversion step, i.e., `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`.
 StaticVectors are used to efficiently perform the matrix-vector multiplication.
 """
-@kernel function PVinversion_kernel!(a, b, M, ::Val{N}) where N
+@kernel function pv_streamfunction_kernel!(y, M, x, ::Val{N}) where N
   i, j = @index(Global, NTuple)
 
-  b_tuple = ntuple(Val(N)) do n
-      @inbounds b[i, j, n]
+  x_tuple = ntuple(Val(N)) do n
+    @inbounds x[i, j, n]
   end
 
-  T = eltype(a) 
-  b_sv = SVector{N, T}(b_tuple)
-  a_sv = @inbounds M[i, j] * b_sv
+  T = eltype(x)
+  x_sv = SVector{N, T}(x_tuple)
+  y_sv = @inbounds M[i, j] * x_sv
 
   ntuple(Val(N)) do n
-      @inbounds a[i, j, n] = a_sv[n]
+    @inbounds y[i, j, n] = y_sv[n]
   end
 end
 
@@ -550,11 +559,14 @@ end
     pvfromstreamfunction!(qh, ψh, params, grid)
 
 Obtain the Fourier transform of the PV from the streamfunction `ψh` in each layer using
-`qh = params.S * ψh`. We use a work layout over which the PV inversion kernel is launched.
+`qh = params.S * ψh`.
+
+The matrix multiplications are done via launching a kernel. We use a work layout over
+which the PV inversion kernel is launched.
 """
 function pvfromstreamfunction!(qh, ψh, params, grid)
-  # Larger workgroups are generally more efficient. For more generality, we could put an 
-  # if statement that incurs different behavior when either nkl or nl are less than 8
+  # Larger workgroups are generally more efficient. For more generality, we could put an
+  # if statement that incurs different behavior when either nkl or nl are less than 8.
   workgroup = 8, 8
 
   # The worksize determines how many times the kernel is run
@@ -562,11 +574,11 @@ function pvfromstreamfunction!(qh, ψh, params, grid)
 
   # Instantiates the kernel for relevant backend device
   backend = KernelAbstractions.get_backend(qh)
-  kernel! = PVinversion_kernel!(backend, workgroup, worksize)
+  kernel! = pv_streamfunction_kernel!(backend, workgroup, worksize)
 
   # Launch the kernel
   S, nlayers = params.S, params.nlayers
-  kernel!(qh, ψh, S, Val(nlayers))
+  kernel!(qh, S, ψh, Val(nlayers))
 
   # Ensure that no other operations occur until the kernel has finished
   KernelAbstractions.synchronize(backend)
@@ -618,11 +630,14 @@ end
     streamfunctionfrompv!(ψh, qh, params, grid)
 
 Invert the PV to obtain the Fourier transform of the streamfunction `ψh` in each layer from
-`qh` using `ψh = params.S⁻¹ qh`. We use a work layout over which the PV inversion kernel is launched.
+`qh` using `ψh = params.S⁻¹ qh`.
+
+The matrix multiplications are done via launching a kernel. We use a work layout over
+which the PV inversion kernel is launched.
 """
 function streamfunctionfrompv!(ψh, qh, params, grid)
-# Larger workgroups are generally more efficient. For more generality, we could put an 
-  # if statement that incurs different behavior when either nkl or nl are less than 8
+  # Larger workgroups are generally more efficient. For more generality, we could put an
+  # if statement that incurs different behavior when either nkl or nl are less than 8.
   workgroup = 8, 8
 
   # The worksize determines how many times the kernel is run
@@ -630,11 +645,11 @@ function streamfunctionfrompv!(ψh, qh, params, grid)
 
   # Instantiates the kernel for relevant backend device
   backend = KernelAbstractions.get_backend(ψh)
-  kernel! = PVinversion_kernel!(backend, workgroup, worksize)
+  kernel! = pv_streamfunction_kernel!(backend, workgroup, worksize)
 
   # Launch the kernel
   S⁻¹, nlayers = params.S⁻¹, params.nlayers
-  kernel!(ψh, qh, S⁻¹, Val(nlayers))
+  kernel!(ψh, S⁻¹, qh, Val(nlayers))
 
   # Ensure that no other operations occur until the kernel has finished
   KernelAbstractions.synchronize(backend)

From fd7fcfa1c333a408de3aa5f2c0f0f8e8b62a5a2d Mon Sep 17 00:00:00 2001
From: "Navid C. Constantinou" <navidcy@users.noreply.github.com>
Date: Sat, 25 Jan 2025 06:54:10 +1100
Subject: [PATCH 16/19] pv_streamfunction_kernel! in docs

---
 docs/src/lib/functions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
index 5afe9484..635fefa6 100644
--- a/docs/src/lib/functions.md
+++ b/docs/src/lib/functions.md
@@ -84,7 +84,7 @@ GeophysicalFlows.MultiLayerQG.calcNlinear!
 GeophysicalFlows.MultiLayerQG.calcN_advection!
 GeophysicalFlows.MultiLayerQG.calcN_linearadvection!
 GeophysicalFlows.MultiLayerQG.addforcing!
-GeophysicalFlows.MultiLayerQG.PVinversion_kernel!
+GeophysicalFlows.MultiLayerQG.pv_streamfunction_kernel!
 ```
 
 

From dde52296492a8dd9c12adbb15e46495cc8b5b1aa Mon Sep 17 00:00:00 2001
From: "Navid C. Constantinou" <navidcy@users.noreply.github.com>
Date: Sat, 25 Jan 2025 06:55:28 +1100
Subject: [PATCH 17/19] slightly different phrasing

---
 src/multilayerqg.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl
index 96eb701b..58683f66 100644
--- a/src/multilayerqg.jl
+++ b/src/multilayerqg.jl
@@ -535,7 +535,7 @@ y = M x
 ```
 
 for every wavenumber, where ``y`` and ``x`` are column-vectors of length `nlayers`.
-This is equivalent to `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`.
+This can be used to perform `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`.
 
 StaticVectors are used to efficiently perform the matrix-vector multiplication.
 """

From 83ab940c2edc04ba009b275eed2f06c609e4b609 Mon Sep 17 00:00:00 2001
From: "Navid C. Constantinou" <navidcy@users.noreply.github.com>
Date: Sat, 25 Jan 2025 07:11:46 +1100
Subject: [PATCH 18/19] reorder deps

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index d6c5a00d..e57c96a1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -24,6 +24,7 @@ DocStringExtensions = "0.8, 0.9"
 FFTW = "1"
 FourierFlows = "0.10.5"
 JLD2 = "0.1, 0.2, 0.3, 0.4"
+KernelAbstractions = "0.9"
 LinearAlgebra = "1.6"
 Random = "1.6"
 Reexport = "0.2, 1"
@@ -31,7 +32,6 @@ SpecialFunctions = "0.10, 1, 2"
 StaticArrays = "0.12, 1"
 Statistics = "1.6"
 julia = "1.6"
-KernelAbstractions = "0.9"
 
 [extras]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"

From a1eed4dc3b09dd20baab07ef88630b43ff41f57c Mon Sep 17 00:00:00 2001
From: "Navid C. Constantinou" <navidcy@users.noreply.github.com>
Date: Sat, 25 Jan 2025 07:12:02 +1100
Subject: [PATCH 19/19] reorder deps

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index e57c96a1..8e09bd44 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,13 +10,13 @@ DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 FourierFlows = "2aec4490-903f-5c70-9b11-9bed06a700e1"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
-KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 
 [compat]
 CUDA = "1, 2.4.2, 3.0.0 - 3.6.4, 3.7.1, 4, 5"