From 4f7d797f5ec14f5c81b23c1cc7510237a9de1f81 Mon Sep 17 00:00:00 2001 From: mpudig Date: Fri, 26 Jul 2024 13:33:48 -0400 Subject: [PATCH 01/19] First attempt at writing kernel and workspace following Greg suggestions on issue #112. --- src/multilayerqg.jl | 92 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 83 insertions(+), 9 deletions(-) diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl index 4e35d679..5a10f890 100644 --- a/src/multilayerqg.jl +++ b/src/multilayerqg.jl @@ -18,11 +18,13 @@ using LinearAlgebra, StaticArrays, Reexport, - DocStringExtensions + DocStringExtensions, + KernelAbstractions @reexport using FourierFlows using FourierFlows: parsevalsum, parsevalsum2, superzeros, plan_flows_rfft +using KernelAbstractions.Extras.LoopInfo: @unroll nothingfunction(args...) = nothing @@ -533,16 +535,52 @@ Compute the inverse Fourier transform of `varh` and store it in `var`. """ invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, varh) +""" + @kernel function pvfromstreamfunction_kernel!(qh, ψh, params) + +Kernel for GPU acceleration of PV from streamfunction calculation, i.e., obtaining the Fourier +transform of the PV from the streamfunction `ψh` in each layer using `qh = params.S * ψh`. +""" +@kernel function pvfromstreamfunction_kernel!(qh, ψh, params) + i, j = @index(Global, NTuple) + nlayers, S = params.nlayers, params.S + + @unroll for k = 1:nlayers + + @inbounds qh[i, j, k] = 0 + + @unroll for m = 1:nlayers + @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m] + end + + end +end + """ pvfromstreamfunction!(qh, ψh, params, grid) Obtain the Fourier transform of the PV from the streamfunction `ψh` in each layer using -`qh = params.S * ψh`. +`qh = params.S * ψh`. We use a work layout over which the above-defined kernel is launched. """ function pvfromstreamfunction!(qh, ψh, params, grid) - for j=1:grid.nl, i=1:grid.nkr - CUDA.@allowscalar @views qh[i, j, :] .= params.S[i, j] * ψh[i, j, :] - end + # Larger workgroups are generally more efficient. For more generality, we could put an if statement that incurs + # different behavior when either nkl or nl are less than 16 + workgroup = 16, 16 + + # The worksize determines how many times the kernel is run + worksize = grid.nkr, grid.nl + + # This (and its usage below) will ensure the kernel is not run _before_ the data in ψh is available + barrier = Event(grid.device) + + # Creates a loop over the specified worksize, using workgroup to organize the computation + loop_kernel! = pvfromstreamfunction_kernel!(grid.device, workgroup, worksize) + + # Launch the kernel + event = loop_kernel!(qh, ψh, params, dependencies = barrier) + + # This will ensure that no other operations occur until the kernel has finished + wait(event) return nothing end @@ -587,16 +625,52 @@ function pvfromstreamfunction!(qh, ψh, params::TwoLayerParams, grid) return nothing end +""" + @kernel function streamfunctionfrompv_kernel!(ψh, qh, params) + +Kernel for GPU acceleration of streamfunction from PV calculation, i.e., invert the PV to obtain +the Fourier transform of the streamfunction `ψh` in each layer from `qh` using `ψh = params.S⁻¹ qh`. +""" +@kernel function streamfunctionfrompv_kernel!(ψh, qh, params) + i, j = @index(Global, NTuple) + nlayers, S = params.nlayers, params.S + + @unroll for k = 1:nlayers + + @inbounds ψh[i, j, k] = 0 + + @unroll for m = 1:nlayers + @inbounds ψh[i, j, k] += S⁻¹[i, j][k, m] * qh[i, j, m] + end + + end +end + """ streamfunctionfrompv!(ψh, qh, params, grid) Invert the PV to obtain the Fourier transform of the streamfunction `ψh` in each layer from -`qh` using `ψh = params.S⁻¹ qh`. +`qh` using `ψh = params.S⁻¹ qh`. We use a work layout over which the above-defined kernel is launched. """ function streamfunctionfrompv!(ψh, qh, params, grid) - for j=1:grid.nl, i=1:grid.nkr - CUDA.@allowscalar @views ψh[i, j, :] .= params.S⁻¹[i, j] * qh[i, j, :] - end + # Larger workgroups are generally more efficient. For more generality, we could put an if statement that incurs + # different behavior when either nkl or nl are less than 16 + workgroup = 16, 16 + + # The worksize determines how many times the kernel is run + worksize = grid.nkr, grid.nl + + # This (and its usage below) will ensure the kernel is not run _before_ the data in qh is available + barrier = Event(grid.device) + + # Creates a loop over the specified worksize, using workgroup to organize the computation + loop_kernel! = streamfunctionfrompv_kernel!(grid.device, workgroup, worksize) + + # Launch the kernel + event = loop_kernel!(ψh, qh, params, dependencies = barrier) + + # This will ensure that no other operations occur until the kernel has finished + wait(event) return nothing end From 114977cd28e34ea8cf5f4d98025081eb5cba4b54 Mon Sep 17 00:00:00 2001 From: mpudig Date: Sun, 28 Jul 2024 14:08:50 -0400 Subject: [PATCH 02/19] Change wait to depend on GPU device as per example video on youtube. --- src/multilayerqg.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl index 5a10f890..080a9de6 100644 --- a/src/multilayerqg.jl +++ b/src/multilayerqg.jl @@ -580,7 +580,7 @@ function pvfromstreamfunction!(qh, ψh, params, grid) event = loop_kernel!(qh, ψh, params, dependencies = barrier) # This will ensure that no other operations occur until the kernel has finished - wait(event) + wait(grid.device, event) return nothing end @@ -670,7 +670,7 @@ function streamfunctionfrompv!(ψh, qh, params, grid) event = loop_kernel!(ψh, qh, params, dependencies = barrier) # This will ensure that no other operations occur until the kernel has finished - wait(event) + wait(grid.device, event) return nothing end From 76c772882bbdd4cfda03467c857d72fe7f7cf038 Mon Sep 17 00:00:00 2001 From: mpudig Date: Sun, 28 Jul 2024 16:42:09 -0400 Subject: [PATCH 03/19] Rewrote kernel for more recent version of KernelAbstraction syntax. Also explicitly import CPU, GPU from FourierFlows as there were conflicts with KernelAbstractions. --- src/multilayerqg.jl | 62 +++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl index 080a9de6..06ac4283 100644 --- a/src/multilayerqg.jl +++ b/src/multilayerqg.jl @@ -23,7 +23,7 @@ using @reexport using FourierFlows -using FourierFlows: parsevalsum, parsevalsum2, superzeros, plan_flows_rfft +using FourierFlows: parsevalsum, parsevalsum2, superzeros, plan_flows_rfft, CPU, GPU using KernelAbstractions.Extras.LoopInfo: @unroll nothingfunction(args...) = nothing @@ -536,24 +536,23 @@ Compute the inverse Fourier transform of `varh` and store it in `var`. invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, varh) """ - @kernel function pvfromstreamfunction_kernel!(qh, ψh, params) + @kernel function pvfromstreamfunction_kernel!(qh, ψh, S, nlayers) Kernel for GPU acceleration of PV from streamfunction calculation, i.e., obtaining the Fourier transform of the PV from the streamfunction `ψh` in each layer using `qh = params.S * ψh`. """ -@kernel function pvfromstreamfunction_kernel!(qh, ψh, params) - i, j = @index(Global, NTuple) - nlayers, S = params.nlayers, params.S +@kernel function pvfromstreamfunction_kernel!(qh, ψh, S, nlayers) + i, j = @index(Global, NTuple) - @unroll for k = 1:nlayers + @unroll for k = 1:nlayers - @inbounds qh[i, j, k] = 0 + @inbounds qh[i, j, k] = 0 - @unroll for m = 1:nlayers - @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m] - end - - end + @unroll for m = 1:nlayers + @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m] + end + + end end """ @@ -563,24 +562,23 @@ Obtain the Fourier transform of the PV from the streamfunction `ψh` in each lay `qh = params.S * ψh`. We use a work layout over which the above-defined kernel is launched. """ function pvfromstreamfunction!(qh, ψh, params, grid) - # Larger workgroups are generally more efficient. For more generality, we could put an if statement that incurs - # different behavior when either nkl or nl are less than 16 + # Larger workgroups are generally more efficient. For more generality, we could put an + # if statement that incurs different behavior when either nkl or nl are less than 16 workgroup = 16, 16 # The worksize determines how many times the kernel is run worksize = grid.nkr, grid.nl - # This (and its usage below) will ensure the kernel is not run _before_ the data in ψh is available - barrier = Event(grid.device) - - # Creates a loop over the specified worksize, using workgroup to organize the computation - loop_kernel! = pvfromstreamfunction_kernel!(grid.device, workgroup, worksize) + # Instantiates the kernel for relevant backend device + backend = KernelAbstractions.get_backend(qh) + kernel! = test_pvfromstreamfunction_kernel!(backend, workgroup, worksize) # Launch the kernel - event = loop_kernel!(qh, ψh, params, dependencies = barrier) + S, nlayers = params.S, params.nlayers + kernel!(qh, ψh, S, nlayers) # This will ensure that no other operations occur until the kernel has finished - wait(grid.device, event) + KernelAbstractions.synchronize(backend) return nothing end @@ -626,14 +624,13 @@ function pvfromstreamfunction!(qh, ψh, params::TwoLayerParams, grid) end """ - @kernel function streamfunctionfrompv_kernel!(ψh, qh, params) + @kernel function streamfunctionfrompv_kernel!(ψh, qh, S⁻¹, nlayers) Kernel for GPU acceleration of streamfunction from PV calculation, i.e., invert the PV to obtain the Fourier transform of the streamfunction `ψh` in each layer from `qh` using `ψh = params.S⁻¹ qh`. """ -@kernel function streamfunctionfrompv_kernel!(ψh, qh, params) +@kernel function streamfunctionfrompv_kernel!(ψh, qh, S⁻¹, nlayers) i, j = @index(Global, NTuple) - nlayers, S = params.nlayers, params.S @unroll for k = 1:nlayers @@ -653,24 +650,23 @@ Invert the PV to obtain the Fourier transform of the streamfunction `ψh` in eac `qh` using `ψh = params.S⁻¹ qh`. We use a work layout over which the above-defined kernel is launched. """ function streamfunctionfrompv!(ψh, qh, params, grid) - # Larger workgroups are generally more efficient. For more generality, we could put an if statement that incurs - # different behavior when either nkl or nl are less than 16 +# Larger workgroups are generally more efficient. For more generality, we could put an + # if statement that incurs different behavior when either nkl or nl are less than 16 workgroup = 16, 16 # The worksize determines how many times the kernel is run worksize = grid.nkr, grid.nl - # This (and its usage below) will ensure the kernel is not run _before_ the data in qh is available - barrier = Event(grid.device) - - # Creates a loop over the specified worksize, using workgroup to organize the computation - loop_kernel! = streamfunctionfrompv_kernel!(grid.device, workgroup, worksize) + # Instantiates the kernel for relevant backend device + backend = KernelAbstractions.get_backend(ψh) + kernel! = streamfunctionfrompv_kernel!(backend, workgroup, worksize) # Launch the kernel - event = loop_kernel!(ψh, qh, params, dependencies = barrier) + S⁻¹, nlayers = params.S⁻¹, params.nlayers + kernel!(ψh, qh, S⁻¹, nlayers) # This will ensure that no other operations occur until the kernel has finished - wait(grid.device, event) + KernelAbstractions.synchronize(backend) return nothing end From e8cd9d7c3ad1df63fce449cc4d6fe94e958484c1 Mon Sep 17 00:00:00 2001 From: mpudig Date: Sun, 28 Jul 2024 16:56:33 -0400 Subject: [PATCH 04/19] Remove GPU warning for more than two layers. --- src/multilayerqg.jl | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl index 06ac4283..949270f1 100644 --- a/src/multilayerqg.jl +++ b/src/multilayerqg.jl @@ -113,16 +113,16 @@ function Problem(nlayers::Int, # number of fluid lay aliased_fraction = 1/3, T = Float64) - if dev == GPU() && nlayers > 2 - @warn """MultiLayerQG module is not optimized on the GPU yet for configurations with - 3 fluid layers or more! + # if dev == GPU() && nlayers > 2 + # @warn """MultiLayerQG module is not optimized on the GPU yet for configurations with + # 3 fluid layers or more! - See issues on Github at https://github.com/FourierFlows/GeophysicalFlows.jl/issues/112 - and https://github.com/FourierFlows/GeophysicalFlows.jl/issues/267. + # See issues on Github at https://github.com/FourierFlows/GeophysicalFlows.jl/issues/112 + # and https://github.com/FourierFlows/GeophysicalFlows.jl/issues/267. - To use MultiLayerQG with 3 fluid layers or more we suggest, for now, to restrict running - on CPU.""" - end + # To use MultiLayerQG with 3 fluid layers or more we suggest, for now, to restrict running + # on CPU.""" + # end if nlayers == 1 @warn """MultiLayerQG module does work for single-layer configuration but may not be as @@ -541,7 +541,7 @@ invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, v Kernel for GPU acceleration of PV from streamfunction calculation, i.e., obtaining the Fourier transform of the PV from the streamfunction `ψh` in each layer using `qh = params.S * ψh`. """ -@kernel function pvfromstreamfunction_kernel!(qh, ψh, S, nlayers) +@kernel function test_pvfromstreamfunction_kernel!(qh, ψh, S, nlayers) i, j = @index(Global, NTuple) @unroll for k = 1:nlayers @@ -549,7 +549,7 @@ transform of the PV from the streamfunction `ψh` in each layer using `qh = para @inbounds qh[i, j, k] = 0 @unroll for m = 1:nlayers - @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m] + @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m] + 1 + 2*im end end @@ -561,7 +561,7 @@ end Obtain the Fourier transform of the PV from the streamfunction `ψh` in each layer using `qh = params.S * ψh`. We use a work layout over which the above-defined kernel is launched. """ -function pvfromstreamfunction!(qh, ψh, params, grid) +function test_pvfromstreamfunction!(qh, ψh, params, grid) # Larger workgroups are generally more efficient. For more generality, we could put an # if statement that incurs different behavior when either nkl or nl are less than 16 workgroup = 16, 16 @@ -575,7 +575,8 @@ function pvfromstreamfunction!(qh, ψh, params, grid) # Launch the kernel S, nlayers = params.S, params.nlayers - kernel!(qh, ψh, S, nlayers) + #kernel!(qh, ψh, S, nlayers) + kernel!(qh, psih, S, nlayers) # This will ensure that no other operations occur until the kernel has finished KernelAbstractions.synchronize(backend) From 633797bdea5ff4ab6d38334cff76d4e0c4cac02d Mon Sep 17 00:00:00 2001 From: mpudig Date: Sun, 28 Jul 2024 17:05:29 -0400 Subject: [PATCH 05/19] Added KernelAbstractions to dependency list. --- Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Project.toml b/Project.toml index a49c334c..1aa677dd 100644 --- a/Project.toml +++ b/Project.toml @@ -16,6 +16,7 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" [compat] CUDA = "1, 2.4.2, 3.0.0 - 3.6.4, 3.7.1, 4, 5" @@ -30,6 +31,7 @@ SpecialFunctions = "0.10, 1, 2" StaticArrays = "0.12, 1" Statistics = "1.6" julia = "1.6" +KernelAbstractions = "0.9" [extras] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" From 916653990ea358d2969e3870bd3e217704da9ec1 Mon Sep 17 00:00:00 2001 From: mpudig Date: Sun, 28 Jul 2024 17:14:28 -0400 Subject: [PATCH 06/19] Fix typos. --- src/multilayerqg.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl index 949270f1..9225d516 100644 --- a/src/multilayerqg.jl +++ b/src/multilayerqg.jl @@ -541,7 +541,7 @@ invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, v Kernel for GPU acceleration of PV from streamfunction calculation, i.e., obtaining the Fourier transform of the PV from the streamfunction `ψh` in each layer using `qh = params.S * ψh`. """ -@kernel function test_pvfromstreamfunction_kernel!(qh, ψh, S, nlayers) +@kernel function pvfromstreamfunction_kernel!(qh, ψh, S, nlayers) i, j = @index(Global, NTuple) @unroll for k = 1:nlayers @@ -561,7 +561,7 @@ end Obtain the Fourier transform of the PV from the streamfunction `ψh` in each layer using `qh = params.S * ψh`. We use a work layout over which the above-defined kernel is launched. """ -function test_pvfromstreamfunction!(qh, ψh, params, grid) +function pvfromstreamfunction!(qh, ψh, params, grid) # Larger workgroups are generally more efficient. For more generality, we could put an # if statement that incurs different behavior when either nkl or nl are less than 16 workgroup = 16, 16 @@ -575,8 +575,7 @@ function test_pvfromstreamfunction!(qh, ψh, params, grid) # Launch the kernel S, nlayers = params.S, params.nlayers - #kernel!(qh, ψh, S, nlayers) - kernel!(qh, psih, S, nlayers) + kernel!(qh, ψh, S, nlayers) # This will ensure that no other operations occur until the kernel has finished KernelAbstractions.synchronize(backend) From 04298c4a5db978291bb5fa0eac06e64a7ad06bae Mon Sep 17 00:00:00 2001 From: mpudig Date: Sun, 28 Jul 2024 17:19:00 -0400 Subject: [PATCH 07/19] Fix more typos. --- src/multilayerqg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl index 9225d516..8511db6d 100644 --- a/src/multilayerqg.jl +++ b/src/multilayerqg.jl @@ -571,7 +571,7 @@ function pvfromstreamfunction!(qh, ψh, params, grid) # Instantiates the kernel for relevant backend device backend = KernelAbstractions.get_backend(qh) - kernel! = test_pvfromstreamfunction_kernel!(backend, workgroup, worksize) + kernel! = pvfromstreamfunction_kernel!(backend, workgroup, worksize) # Launch the kernel S, nlayers = params.S, params.nlayers From b3161d8945239a4a7ffdd0c9ce4aae064c0e0658 Mon Sep 17 00:00:00 2001 From: mpudig Date: Mon, 29 Jul 2024 11:27:11 -0400 Subject: [PATCH 08/19] Some simple tests changing the workgroup size showed 8 was often better than 16. --- src/multilayerqg.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl index 8511db6d..91ef062d 100644 --- a/src/multilayerqg.jl +++ b/src/multilayerqg.jl @@ -563,8 +563,8 @@ Obtain the Fourier transform of the PV from the streamfunction `ψh` in each lay """ function pvfromstreamfunction!(qh, ψh, params, grid) # Larger workgroups are generally more efficient. For more generality, we could put an - # if statement that incurs different behavior when either nkl or nl are less than 16 - workgroup = 16, 16 + # if statement that incurs different behavior when either nkl or nl are less than 8 + workgroup = 8, 8 # The worksize determines how many times the kernel is run worksize = grid.nkr, grid.nl @@ -651,8 +651,8 @@ Invert the PV to obtain the Fourier transform of the streamfunction `ψh` in eac """ function streamfunctionfrompv!(ψh, qh, params, grid) # Larger workgroups are generally more efficient. For more generality, we could put an - # if statement that incurs different behavior when either nkl or nl are less than 16 - workgroup = 16, 16 + # if statement that incurs different behavior when either nkl or nl are less than 8 + workgroup = 8, 8 # The worksize determines how many times the kernel is run worksize = grid.nkr, grid.nl From 880f4f623845d27c423c0e929db5828a7658365c Mon Sep 17 00:00:00 2001 From: mpudig Date: Mon, 29 Jul 2024 12:11:57 -0400 Subject: [PATCH 09/19] Fix typos. --- src/multilayerqg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl index 91ef062d..fb7f9781 100644 --- a/src/multilayerqg.jl +++ b/src/multilayerqg.jl @@ -549,7 +549,7 @@ transform of the PV from the streamfunction `ψh` in each layer using `qh = para @inbounds qh[i, j, k] = 0 @unroll for m = 1:nlayers - @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m] + 1 + 2*im + @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m] end end From a04c5b0da6a551c0cda6b3e0f634ffc38723afd4 Mon Sep 17 00:00:00 2001 From: mpudig Date: Tue, 29 Oct 2024 13:28:29 -0400 Subject: [PATCH 10/19] Changes following suggestions of Navid and Greg. --- src/multilayerqg.jl | 59 +++++++++++---------------------------------- 1 file changed, 14 insertions(+), 45 deletions(-) diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl index fb7f9781..345d931c 100644 --- a/src/multilayerqg.jl +++ b/src/multilayerqg.jl @@ -113,17 +113,6 @@ function Problem(nlayers::Int, # number of fluid lay aliased_fraction = 1/3, T = Float64) - # if dev == GPU() && nlayers > 2 - # @warn """MultiLayerQG module is not optimized on the GPU yet for configurations with - # 3 fluid layers or more! - - # See issues on Github at https://github.com/FourierFlows/GeophysicalFlows.jl/issues/112 - # and https://github.com/FourierFlows/GeophysicalFlows.jl/issues/267. - - # To use MultiLayerQG with 3 fluid layers or more we suggest, for now, to restrict running - # on CPU.""" - # end - if nlayers == 1 @warn """MultiLayerQG module does work for single-layer configuration but may not be as optimized. We suggest using SingleLayerQG module for single-layer QG simulation unless @@ -536,20 +525,20 @@ Compute the inverse Fourier transform of `varh` and store it in `var`. invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, varh) """ - @kernel function pvfromstreamfunction_kernel!(qh, ψh, S, nlayers) + @kernel function PVinversion_kernel!(a, b, M, nlayers) -Kernel for GPU acceleration of PV from streamfunction calculation, i.e., obtaining the Fourier -transform of the PV from the streamfunction `ψh` in each layer using `qh = params.S * ψh`. +Kernel for the PV/stream function inversion step, +i.e., `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`. """ -@kernel function pvfromstreamfunction_kernel!(qh, ψh, S, nlayers) +@kernel function PVinversion_kernel!(a, b, M, ::Val{nlayers}) i, j = @index(Global, NTuple) @unroll for k = 1:nlayers - @inbounds qh[i, j, k] = 0 + @inbounds a[i, j, k] = 0 @unroll for m = 1:nlayers - @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m] + @inbounds a[i, j, k] += M[i, j][k, m] * b[i, j, m] end end @@ -559,7 +548,7 @@ end pvfromstreamfunction!(qh, ψh, params, grid) Obtain the Fourier transform of the PV from the streamfunction `ψh` in each layer using -`qh = params.S * ψh`. We use a work layout over which the above-defined kernel is launched. +`qh = params.S * ψh`. We use a work layout over which the PV inversion kernel is launched. """ function pvfromstreamfunction!(qh, ψh, params, grid) # Larger workgroups are generally more efficient. For more generality, we could put an @@ -571,13 +560,13 @@ function pvfromstreamfunction!(qh, ψh, params, grid) # Instantiates the kernel for relevant backend device backend = KernelAbstractions.get_backend(qh) - kernel! = pvfromstreamfunction_kernel!(backend, workgroup, worksize) + kernel! = PVinversion_kernel!(backend, workgroup, worksize) # Launch the kernel S, nlayers = params.S, params.nlayers - kernel!(qh, ψh, S, nlayers) + kernel!(qh, ψh, S, Val(nlayers)) - # This will ensure that no other operations occur until the kernel has finished + # Ensure that no other operations occur until the kernel has finished KernelAbstractions.synchronize(backend) return nothing @@ -623,31 +612,11 @@ function pvfromstreamfunction!(qh, ψh, params::TwoLayerParams, grid) return nothing end -""" - @kernel function streamfunctionfrompv_kernel!(ψh, qh, S⁻¹, nlayers) - -Kernel for GPU acceleration of streamfunction from PV calculation, i.e., invert the PV to obtain -the Fourier transform of the streamfunction `ψh` in each layer from `qh` using `ψh = params.S⁻¹ qh`. -""" -@kernel function streamfunctionfrompv_kernel!(ψh, qh, S⁻¹, nlayers) - i, j = @index(Global, NTuple) - - @unroll for k = 1:nlayers - - @inbounds ψh[i, j, k] = 0 - - @unroll for m = 1:nlayers - @inbounds ψh[i, j, k] += S⁻¹[i, j][k, m] * qh[i, j, m] - end - - end -end - """ streamfunctionfrompv!(ψh, qh, params, grid) Invert the PV to obtain the Fourier transform of the streamfunction `ψh` in each layer from -`qh` using `ψh = params.S⁻¹ qh`. We use a work layout over which the above-defined kernel is launched. +`qh` using `ψh = params.S⁻¹ qh`. We use a work layout over which the PV inversion kernel is launched. """ function streamfunctionfrompv!(ψh, qh, params, grid) # Larger workgroups are generally more efficient. For more generality, we could put an @@ -659,13 +628,13 @@ function streamfunctionfrompv!(ψh, qh, params, grid) # Instantiates the kernel for relevant backend device backend = KernelAbstractions.get_backend(ψh) - kernel! = streamfunctionfrompv_kernel!(backend, workgroup, worksize) + kernel! = PVinversion_kernel!(backend, workgroup, worksize) # Launch the kernel S⁻¹, nlayers = params.S⁻¹, params.nlayers - kernel!(ψh, qh, S⁻¹, nlayers) + kernel!(ψh, qh, S⁻¹, Val(nlayers)) - # This will ensure that no other operations occur until the kernel has finished + # Ensure that no other operations occur until the kernel has finished KernelAbstractions.synchronize(backend) return nothing From 102000e86364e025bbe9380fc1213c1db1976bd7 Mon Sep 17 00:00:00 2001 From: mpudig Date: Tue, 29 Oct 2024 13:48:26 -0400 Subject: [PATCH 11/19] Fix typo. --- src/multilayerqg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl index d05fc9ef..b1e9b53a 100644 --- a/src/multilayerqg.jl +++ b/src/multilayerqg.jl @@ -530,7 +530,7 @@ invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, v Kernel for the PV/stream function inversion step, i.e., `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`. """ -@kernel function PVinversion_kernel!(a, b, M, ::Val{nlayers}) +@kernel function PVinversion_kernel!(a, b, M, ::Val{nlayers}) where nlayers i, j = @index(Global, NTuple) @unroll for k = 1:nlayers From 151e16cefc9c16a2913d605248acd2f81f85c0f2 Mon Sep 17 00:00:00 2001 From: mpudig Date: Fri, 20 Dec 2024 11:09:24 +1100 Subject: [PATCH 12/19] First modification of kernel to utilise StaticVectors. --- src/multilayerqg.jl | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl index b1e9b53a..d6ae6c8f 100644 --- a/src/multilayerqg.jl +++ b/src/multilayerqg.jl @@ -525,22 +525,24 @@ Compute the inverse Fourier transform of `varh` and store it in `var`. invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, varh) """ - @kernel function PVinversion_kernel!(a, b, M, nlayers) + @kernel function PVinversion_kernel!(a, b, M, ::Val{N}) where N -Kernel for the PV/stream function inversion step, -i.e., `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`. +Kernel for the PV/stream function inversion step, i.e., `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`. +StaticVectors are used to efficiently perform the matrix-vector multiplication. """ -@kernel function PVinversion_kernel!(a, b, M, ::Val{nlayers}) where nlayers +@kernel function PVinversion_kernel!(a, b, M, ::Val{N}) where N i, j = @index(Global, NTuple) - @unroll for k = 1:nlayers + b_tuple = ntuple(Val(N)) do n + @inbounds b[i, j, n] + end - @inbounds a[i, j, k] = 0 + T = eltype(a) + b_sv = SVector{N, T}(b_tuple) + a_sv = @inbounds M[i, j] * b_sv - @unroll for m = 1:nlayers - @inbounds a[i, j, k] += M[i, j][k, m] * b[i, j, m] - end - + ntuple(Val(N)) do n + @inbounds a[i, j, n] = a_sv[n] end end From 52f8ca3938f06aa89a4740e434f168c7de7fcfae Mon Sep 17 00:00:00 2001 From: mpudig Date: Fri, 17 Jan 2025 10:13:17 +1100 Subject: [PATCH 13/19] Add PVinversion_kernel to docs. --- docs/src/lib/functions.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index af0fe0d5..5afe9484 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -84,6 +84,7 @@ GeophysicalFlows.MultiLayerQG.calcNlinear! GeophysicalFlows.MultiLayerQG.calcN_advection! GeophysicalFlows.MultiLayerQG.calcN_linearadvection! GeophysicalFlows.MultiLayerQG.addforcing! +GeophysicalFlows.MultiLayerQG.PVinversion_kernel! ``` From 3f0247c874e4fdcb63ca8523e33daa9f5ba722ba Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Fri, 24 Jan 2025 12:28:38 +1100 Subject: [PATCH 14/19] bump patch release --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f65531ec..d6c5a00d 100644 --- a/Project.toml +++ b/Project.toml @@ -2,7 +2,7 @@ name = "GeophysicalFlows" uuid = "44ee3b1c-bc02-53fa-8355-8e347616e15e" license = "MIT" authors = ["Navid C. Constantinou ", "Gregory L. Wagner ", "and contributors"] -version = "0.16.2" +version = "0.16.3" [deps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" From 11e4f1e5c0a14f53d10c269adba5e8b6d4c199ff Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Sat, 25 Jan 2025 06:51:49 +1100 Subject: [PATCH 15/19] change notation/kernel name --- src/multilayerqg.jl | 59 ++++++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl index 30f2c681..96eb701b 100644 --- a/src/multilayerqg.jl +++ b/src/multilayerqg.jl @@ -379,7 +379,7 @@ numberoflayers(::TwoLayerParams) = 2 Return the linear operator `L` that corresponds to (hyper)-viscosity of order ``n_ν`` with coefficient ``ν`` for ``n`` fluid layers. ```math -L_j = - ν |𝐤|^{2 n_ν}, \\ j = 1, ...,n . +L_j = - ν |𝐤|^{2 n_ν}, \\ j = 1, ..., n . ``` """ function hyperviscosity(params, grid) @@ -440,9 +440,9 @@ struct Vars{Aphys, Atrans, F, P} <: AbstractVars q :: Aphys "streamfunction" ψ :: Aphys - "x-component of velocity" + "``x``-component of velocity" u :: Aphys - "y-component of velocity" + "``y``-component of velocity" v :: Aphys "Fourier transform of relative vorticity + vortex stretching" qh :: Atrans @@ -525,24 +525,33 @@ Compute the inverse Fourier transform of `varh` and store it in `var`. invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, varh) """ - @kernel function PVinversion_kernel!(a, b, M, ::Val{N}) where N + pv_streamfunction_kernel!(y, M, x, ::Val{N}) where N + +Kernel for the PV to streamfunction conversion steps. The kernel performs the +matrix multiplication + +```math +y = M x +``` + +for every wavenumber, where ``y`` and ``x`` are column-vectors of length `nlayers`. +This is equivalent to `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`. -Kernel for the PV/stream function inversion step, i.e., `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`. StaticVectors are used to efficiently perform the matrix-vector multiplication. """ -@kernel function PVinversion_kernel!(a, b, M, ::Val{N}) where N +@kernel function pv_streamfunction_kernel!(y, M, x, ::Val{N}) where N i, j = @index(Global, NTuple) - b_tuple = ntuple(Val(N)) do n - @inbounds b[i, j, n] + x_tuple = ntuple(Val(N)) do n + @inbounds x[i, j, n] end - T = eltype(a) - b_sv = SVector{N, T}(b_tuple) - a_sv = @inbounds M[i, j] * b_sv + T = eltype(x) + x_sv = SVector{N, T}(x_tuple) + y_sv = @inbounds M[i, j] * x_sv ntuple(Val(N)) do n - @inbounds a[i, j, n] = a_sv[n] + @inbounds y[i, j, n] = y_sv[n] end end @@ -550,11 +559,14 @@ end pvfromstreamfunction!(qh, ψh, params, grid) Obtain the Fourier transform of the PV from the streamfunction `ψh` in each layer using -`qh = params.S * ψh`. We use a work layout over which the PV inversion kernel is launched. +`qh = params.S * ψh`. + +The matrix multiplications are done via launching a kernel. We use a work layout over +which the PV inversion kernel is launched. """ function pvfromstreamfunction!(qh, ψh, params, grid) - # Larger workgroups are generally more efficient. For more generality, we could put an - # if statement that incurs different behavior when either nkl or nl are less than 8 + # Larger workgroups are generally more efficient. For more generality, we could put an + # if statement that incurs different behavior when either nkl or nl are less than 8. workgroup = 8, 8 # The worksize determines how many times the kernel is run @@ -562,11 +574,11 @@ function pvfromstreamfunction!(qh, ψh, params, grid) # Instantiates the kernel for relevant backend device backend = KernelAbstractions.get_backend(qh) - kernel! = PVinversion_kernel!(backend, workgroup, worksize) + kernel! = pv_streamfunction_kernel!(backend, workgroup, worksize) # Launch the kernel S, nlayers = params.S, params.nlayers - kernel!(qh, ψh, S, Val(nlayers)) + kernel!(qh, S, ψh, Val(nlayers)) # Ensure that no other operations occur until the kernel has finished KernelAbstractions.synchronize(backend) @@ -618,11 +630,14 @@ end streamfunctionfrompv!(ψh, qh, params, grid) Invert the PV to obtain the Fourier transform of the streamfunction `ψh` in each layer from -`qh` using `ψh = params.S⁻¹ qh`. We use a work layout over which the PV inversion kernel is launched. +`qh` using `ψh = params.S⁻¹ qh`. + +The matrix multiplications are done via launching a kernel. We use a work layout over +which the PV inversion kernel is launched. """ function streamfunctionfrompv!(ψh, qh, params, grid) -# Larger workgroups are generally more efficient. For more generality, we could put an - # if statement that incurs different behavior when either nkl or nl are less than 8 + # Larger workgroups are generally more efficient. For more generality, we could put an + # if statement that incurs different behavior when either nkl or nl are less than 8. workgroup = 8, 8 # The worksize determines how many times the kernel is run @@ -630,11 +645,11 @@ function streamfunctionfrompv!(ψh, qh, params, grid) # Instantiates the kernel for relevant backend device backend = KernelAbstractions.get_backend(ψh) - kernel! = PVinversion_kernel!(backend, workgroup, worksize) + kernel! = pv_streamfunction_kernel!(backend, workgroup, worksize) # Launch the kernel S⁻¹, nlayers = params.S⁻¹, params.nlayers - kernel!(ψh, qh, S⁻¹, Val(nlayers)) + kernel!(ψh, S⁻¹, qh, Val(nlayers)) # Ensure that no other operations occur until the kernel has finished KernelAbstractions.synchronize(backend) From fd7fcfa1c333a408de3aa5f2c0f0f8e8b62a5a2d Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Sat, 25 Jan 2025 06:54:10 +1100 Subject: [PATCH 16/19] pv_streamfunction_kernel! in docs --- docs/src/lib/functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index 5afe9484..635fefa6 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -84,7 +84,7 @@ GeophysicalFlows.MultiLayerQG.calcNlinear! GeophysicalFlows.MultiLayerQG.calcN_advection! GeophysicalFlows.MultiLayerQG.calcN_linearadvection! GeophysicalFlows.MultiLayerQG.addforcing! -GeophysicalFlows.MultiLayerQG.PVinversion_kernel! +GeophysicalFlows.MultiLayerQG.pv_streamfunction_kernel! ``` From dde52296492a8dd9c12adbb15e46495cc8b5b1aa Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Sat, 25 Jan 2025 06:55:28 +1100 Subject: [PATCH 17/19] slightly different phrasing --- src/multilayerqg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl index 96eb701b..58683f66 100644 --- a/src/multilayerqg.jl +++ b/src/multilayerqg.jl @@ -535,7 +535,7 @@ y = M x ``` for every wavenumber, where ``y`` and ``x`` are column-vectors of length `nlayers`. -This is equivalent to `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`. +This can be used to perform `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`. StaticVectors are used to efficiently perform the matrix-vector multiplication. """ From 83ab940c2edc04ba009b275eed2f06c609e4b609 Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Sat, 25 Jan 2025 07:11:46 +1100 Subject: [PATCH 18/19] reorder deps --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index d6c5a00d..e57c96a1 100644 --- a/Project.toml +++ b/Project.toml @@ -24,6 +24,7 @@ DocStringExtensions = "0.8, 0.9" FFTW = "1" FourierFlows = "0.10.5" JLD2 = "0.1, 0.2, 0.3, 0.4" +KernelAbstractions = "0.9" LinearAlgebra = "1.6" Random = "1.6" Reexport = "0.2, 1" @@ -31,7 +32,6 @@ SpecialFunctions = "0.10, 1, 2" StaticArrays = "0.12, 1" Statistics = "1.6" julia = "1.6" -KernelAbstractions = "0.9" [extras] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" From a1eed4dc3b09dd20baab07ef88630b43ff41f57c Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Sat, 25 Jan 2025 07:12:02 +1100 Subject: [PATCH 19/19] reorder deps --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index e57c96a1..8e09bd44 100644 --- a/Project.toml +++ b/Project.toml @@ -10,13 +10,13 @@ DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" FourierFlows = "2aec4490-903f-5c70-9b11-9bed06a700e1" JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" [compat] CUDA = "1, 2.4.2, 3.0.0 - 3.6.4, 3.7.1, 4, 5"