Accelerating MultiLayerQG on GPUs #373

Open · wants to merge 22 commits into base: main
4 changes: 3 additions & 1 deletion Project.toml
@@ -2,14 +2,15 @@ name = "GeophysicalFlows"
uuid = "44ee3b1c-bc02-53fa-8355-8e347616e15e"
license = "MIT"
authors = ["Navid C. Constantinou <navidcy@gmail.com>", "Gregory L. Wagner <wagner.greg@gmail.com>", "and contributors"]
version = "0.16.2"
version = "0.16.3"

[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
FourierFlows = "2aec4490-903f-5c70-9b11-9bed06a700e1"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
@@ -23,6 +24,7 @@ DocStringExtensions = "0.8, 0.9"
FFTW = "1"
FourierFlows = "0.10.5"
JLD2 = "0.1, 0.2, 0.3, 0.4"
KernelAbstractions = "0.9"
LinearAlgebra = "1.6"
Random = "1.6"
Reexport = "0.2, 1"
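Note on the new dependency: the Project.toml change above brings in KernelAbstractions. As a minimal, hedged sketch (not part of this diff; the array and kernel name are made up for illustration), the `@kernel`/`@index` pattern the PR relies on looks like this:

```julia
using KernelAbstractions

# Double every element of an array; runs on CPU arrays and, with the
# corresponding backend package loaded, on GPU arrays too.
@kernel function double_kernel!(a)
  i = @index(Global, Linear)
  @inbounds a[i] *= 2
end

a = rand(Float64, 16)
backend = KernelAbstractions.get_backend(a)      # CPU() for a plain Array
kernel! = double_kernel!(backend, 8, length(a))  # workgroup size 8, worksize 16
kernel!(a)
KernelAbstractions.synchronize(backend)
```

The backend is inferred from the array type, which is the same device-agnostic dispatch the functions below use.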
1 change: 1 addition & 0 deletions docs/src/lib/functions.md
@@ -84,6 +84,7 @@ GeophysicalFlows.MultiLayerQG.calcNlinear!
GeophysicalFlows.MultiLayerQG.calcN_advection!
GeophysicalFlows.MultiLayerQG.calcN_linearadvection!
GeophysicalFlows.MultiLayerQG.addforcing!
+GeophysicalFlows.MultiLayerQG.pv_streamfunction_kernel!
```


100 changes: 78 additions & 22 deletions src/multilayerqg.jl
@@ -18,11 +18,13 @@ using
LinearAlgebra,
StaticArrays,
Reexport,
-  DocStringExtensions
+  DocStringExtensions,
+  KernelAbstractions

@reexport using FourierFlows

-using FourierFlows: parsevalsum, parsevalsum2, superzeros, plan_flows_rfft
+using FourierFlows: parsevalsum, parsevalsum2, superzeros, plan_flows_rfft, CPU, GPU
+using KernelAbstractions.Extras.LoopInfo: @unroll

nothingfunction(args...) = nothing

@@ -111,17 +113,6 @@ function Problem(nlayers::Int, # number of f
aliased_fraction = 1/3,
T = Float64)

-  if dev == GPU() && nlayers > 2
-    @warn """MultiLayerQG module is not optimized on the GPU yet for configurations with
-    3 fluid layers or more!
-
-    See issues on Github at https://github.com/FourierFlows/GeophysicalFlows.jl/issues/112
-    and https://github.com/FourierFlows/GeophysicalFlows.jl/issues/267.
-
-    To use MultiLayerQG with 3 fluid layers or more we suggest, for now, to restrict running
-    on CPU."""
-  end

if nlayers == 1
@warn """MultiLayerQG module does work for single-layer configuration but may not be as
optimized. We suggest using SingleLayerQG module for single-layer QG simulation unless
@@ -388,7 +379,7 @@ numberoflayers(::TwoLayerParams) = 2
Return the linear operator `L` that corresponds to (hyper)-viscosity of order ``n_ν`` with
coefficient ``ν`` for ``n`` fluid layers.
```math
-L_j = - ν |𝐤|^{2 n_ν}, \\ j = 1, ...,n .
+L_j = - ν |𝐤|^{2 n_ν}, \\ j = 1, ..., n .
```
"""
function hyperviscosity(params, grid)
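As a hedged illustration of the formula above (not part of the diff; `ν`, `nν`, and the stand-in for the grid's squared wavenumbers are made up), each layer gets the same operator, `-ν` times the squared total wavenumber raised to `nν`:

```julia
# Sketch of the (hyper)viscous linear operator L_j = -ν |𝐤|^(2nν), identical
# in every layer j. `Krsq` stands in for the grid's kr² + l² array.
ν, nν = 1e-6, 2
Krsq = rand(8, 8)        # placeholder for grid.Krsq
L = @. -ν * Krsq^nν      # one layer's operator; repeated for each of the n layers
```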
@@ -449,9 +440,9 @@ struct Vars{Aphys, Atrans, F, P} <: AbstractVars
q :: Aphys
"streamfunction"
ψ :: Aphys
"x-component of velocity"
"``x``-component of velocity"
u :: Aphys
"y-component of velocity"
"``y``-component of velocity"
v :: Aphys
"Fourier transform of relative vorticity + vortex stretching"
qh :: Atrans
@@ -533,16 +524,64 @@ Compute the inverse Fourier transform of `varh` and store it in `var`.
"""
invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, varh)

"""
pv_streamfunction_kernel!(y, M, x, ::Val{N}) where N

Kernel for the PV to streamfunction conversion steps. The kernel performs the
matrix multiplication

```math
y = M x
```

for every wavenumber, where ``y`` and ``x`` are column-vectors of length `nlayers`.
This can be used to perform `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`.

StaticVectors are used to efficiently perform the matrix-vector multiplication.
"""
@kernel function pv_streamfunction_kernel!(y, M, x, ::Val{N}) where N
i, j = @index(Global, NTuple)

x_tuple = ntuple(Val(N)) do n
@inbounds x[i, j, n]
end

T = eltype(x)
x_sv = SVector{N, T}(x_tuple)
y_sv = @inbounds M[i, j] * x_sv

ntuple(Val(N)) do n
@inbounds y[i, j, n] = y_sv[n]
end
end

"""
pvfromstreamfunction!(qh, ψh, params, grid)

Obtain the Fourier transform of the PV from the streamfunction `ψh` in each layer using
`qh = params.S * ψh`.

+The matrix multiplications are done by launching `pv_streamfunction_kernel!` over
+a 2D work layout that spans the wavenumber grid.
"""
function pvfromstreamfunction!(qh, ψh, params, grid)
-  for j=1:grid.nl, i=1:grid.nkr
-    CUDA.@allowscalar @views qh[i, j, :] .= params.S[i, j] * ψh[i, j, :]
-  end
+  # Larger workgroups are generally more efficient. For more generality, an if
+  # statement could choose a smaller workgroup when either nkr or nl is less than 8.
+  workgroup = 8, 8
+
+  # The worksize determines the total number of work items, one per wavenumber.
+  worksize = grid.nkr, grid.nl
+
+  # Instantiate the kernel for the relevant backend
+  backend = KernelAbstractions.get_backend(qh)
+  kernel! = pv_streamfunction_kernel!(backend, workgroup, worksize)
+
+  # Launch the kernel
+  S, nlayers = params.S, params.nlayers
+  kernel!(qh, S, ψh, Val(nlayers))
+
+  # Ensure that no other operations occur until the kernel has finished
+  KernelAbstractions.synchronize(backend)

return nothing
end
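A quick way to sanity-check the new kernel against the scalar loop it replaces is a CPU comparison. This sketch is not part of the diff; it assumes `pv_streamfunction_kernel!` from above is in scope (it is not exported), and all sizes and data are made up:

```julia
using KernelAbstractions, StaticArrays

nkr, nl, nlayers = 12, 8, 3

# S[i, j] is an nlayers × nlayers matrix at each wavenumber, mimicking params.S.
S  = [rand(SMatrix{nlayers, nlayers, ComplexF64}) for i in 1:nkr, j in 1:nl]
ψh = rand(ComplexF64, nkr, nl, nlayers)
qh = similar(ψh)

backend = KernelAbstractions.get_backend(qh)   # CPU() here
kernel! = pv_streamfunction_kernel!(backend, (8, 8), (nkr, nl))
kernel!(qh, S, ψh, Val(nlayers))
KernelAbstractions.synchronize(backend)

# Reference: the per-wavenumber loop this PR removes.
qh_ref = similar(qh)
for j in 1:nl, i in 1:nkr
  @views qh_ref[i, j, :] .= S[i, j] * ψh[i, j, :]
end

@assert qh ≈ qh_ref
```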
@@ -592,11 +631,28 @@ end

Invert the PV to obtain the Fourier transform of the streamfunction `ψh` in each layer from
`qh` using `ψh = params.S⁻¹ qh`.

+The matrix multiplications are done by launching `pv_streamfunction_kernel!` over
+a 2D work layout that spans the wavenumber grid.
"""
function streamfunctionfrompv!(ψh, qh, params, grid)
-  for j=1:grid.nl, i=1:grid.nkr
-    CUDA.@allowscalar @views ψh[i, j, :] .= params.S⁻¹[i, j] * qh[i, j, :]
-  end
+  # Larger workgroups are generally more efficient. For more generality, an if
+  # statement could choose a smaller workgroup when either nkr or nl is less than 8.
+  workgroup = 8, 8
+
+  # The worksize determines the total number of work items, one per wavenumber.
+  worksize = grid.nkr, grid.nl
+
+  # Instantiate the kernel for the relevant backend
+  backend = KernelAbstractions.get_backend(ψh)
+  kernel! = pv_streamfunction_kernel!(backend, workgroup, worksize)
+
+  # Launch the kernel
+  S⁻¹, nlayers = params.S⁻¹, params.nlayers
+  kernel!(ψh, S⁻¹, qh, Val(nlayers))
+
+  # Ensure that no other operations occur until the kernel has finished
+  KernelAbstractions.synchronize(backend)

return nothing
end
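Since the same kernel handles both `S` and `S⁻¹`, a round trip is a natural consistency check: applying `S` and then `S⁻¹` should recover the original `ψh`. Again a hedged CPU sketch, not part of the diff, with made-up data and the kernel above assumed in scope:

```julia
using KernelAbstractions, StaticArrays, LinearAlgebra

nkr, nl, nlayers = 12, 8, 2

S   = [rand(SMatrix{nlayers, nlayers, ComplexF64}) for i in 1:nkr, j in 1:nl]
S⁻¹ = [inv(S[i, j]) for i in 1:nkr, j in 1:nl]   # mimics params.S⁻¹

ψh  = rand(ComplexF64, nkr, nl, nlayers)
qh  = similar(ψh)
ψh′ = similar(ψh)

backend = KernelAbstractions.get_backend(qh)
kernel! = pv_streamfunction_kernel!(backend, (8, 8), (nkr, nl))

kernel!(qh, S, ψh, Val(nlayers))     # qh = S ψh
KernelAbstractions.synchronize(backend)

kernel!(ψh′, S⁻¹, qh, Val(nlayers))  # ψh′ = S⁻¹ qh
KernelAbstractions.synchronize(backend)

@assert ψh′ ≈ ψh
```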