From 58ec351a5118ad6a51586d915d28316710726072 Mon Sep 17 00:00:00 2001 From: Anton Reinhard Date: Wed, 4 Sep 2024 14:52:05 +0200 Subject: [PATCH] Add cuda, rocm and oneapi functionality to their own extensions (#24) --- Project.toml | 17 ++++++++++---- docs/src/lib/internals/devices.md | 18 +++------------ ext/AMDGPUExt.jl | 8 +++++++ ext/CUDAExt.jl | 11 +++++++++ ext/devices/cuda/function.jl | 33 ++++++++++++++++++++++++++ {src => ext}/devices/cuda/impl.jl | 20 ++++++++-------- {src => ext}/devices/oneapi/impl.jl | 16 ++++++------- {src => ext}/devices/rocm/impl.jl | 14 +++++------ ext/oneAPIExt.jl | 8 +++++++ src/GraphComputing.jl | 12 ++++++---- src/code_gen/function.jl | 36 ----------------------------- src/devices/interface.jl | 15 ++++++++++++ testgpu/runtests.jl | 7 ++++++ testgpu/unit_tests_utility.jl | 10 ++++++++ 14 files changed, 138 insertions(+), 87 deletions(-) create mode 100644 ext/AMDGPUExt.jl create mode 100644 ext/CUDAExt.jl create mode 100644 ext/devices/cuda/function.jl rename {src => ext}/devices/cuda/impl.jl (62%) rename {src => ext}/devices/oneapi/impl.jl (68%) rename {src => ext}/devices/rocm/impl.jl (69%) create mode 100644 ext/oneAPIExt.jl create mode 100644 testgpu/runtests.jl create mode 100644 testgpu/unit_tests_utility.jl diff --git a/Project.toml b/Project.toml index 19b77ea..3feb19a 100644 --- a/Project.toml +++ b/Project.toml @@ -4,24 +4,33 @@ authors = ["AntonReinhard "] version = "0.1.0" [deps] -AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" NumaAllocators = "21436f30-1b4a-4f08-87af-e26101bb5379" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" RuntimeGeneratedFunctions = "7e49a35a-f44a-4d26-94aa-eba1b4ca6b47" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" +[weakdeps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" 
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" + +[extensions] +AMDGPUExt = "AMDGPU" +CUDAExt = "CUDA" +oneAPIExt = "oneAPI" + [compat] +AMDGPU = "1" +CUDA = "5" DataStructures = "0.18" NumaAllocators = "0.2" +oneAPI = "1" RuntimeGeneratedFunctions = "0.5" StaticArrays = "1" [extras] -CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/docs/src/lib/internals/devices.md b/docs/src/lib/internals/devices.md index 089d769..a5eba3e 100644 --- a/docs/src/lib/internals/devices.md +++ b/docs/src/lib/internals/devices.md @@ -38,22 +38,10 @@ Order = [:type, :function] ``` ### CUDA -```@autodocs -Modules = [GraphComputing] -Pages = ["devices/cuda/impl.jl"] -Order = [:type, :function] -``` +For CUDA functionality to be available, the `CUDA.jl` package must be installed separately, as it is only a weak dependency. ### ROCm -```@autodocs -Modules = [GraphComputing] -Pages = ["devices/rocm/impl.jl"] -Order = [:type, :function] -``` +For ROCm functionality to be available, the `AMDGPU.jl` package must be installed separately, as it is only a weak dependency. ### oneAPI -```@autodocs -Modules = [GraphComputing] -Pages = ["devices/oneapi/impl.jl"] -Order = [:type, :function] -``` +For oneAPI functionality to be available, the `oneAPI.jl` package must be installed separately, as it is only a weak dependency. 
diff --git a/ext/AMDGPUExt.jl b/ext/AMDGPUExt.jl new file mode 100644 index 0000000..25be189 --- /dev/null +++ b/ext/AMDGPUExt.jl @@ -0,0 +1,8 @@ +module AMDGPUExt + +using GraphComputing, AMDGPU + +# include specialized AMDGPU functions here +include("devices/rocm/impl.jl") + +end diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl new file mode 100644 index 0000000..3620eaf --- /dev/null +++ b/ext/CUDAExt.jl @@ -0,0 +1,11 @@ +module CUDAExt + +using GraphComputing +using CUDA +using RuntimeGeneratedFunctions + +# include specialized CUDA functions here +include("devices/cuda/impl.jl") +include("devices/cuda/function.jl") + +end diff --git a/ext/devices/cuda/function.jl b/ext/devices/cuda/function.jl new file mode 100644 index 0000000..3c3bb4d --- /dev/null +++ b/ext/devices/cuda/function.jl @@ -0,0 +1,33 @@ + +function GraphComputing.cuda_kernel( + graph::DAG, instance, machine::Machine, context_module::Module +) + tape = GraphComputing.gen_tape(graph, instance, machine, context_module) + + init_caches = Expr(:block, tape.initCachesCode...) + assign_inputs = Expr(:block, GraphComputing.expr_from_fc.(tape.inputAssignCode)...) + code = Expr(:block, GraphComputing.expr_from_fc.(tape.computeCode)...) 
+ + function_id = GraphComputing.to_var_name(GraphComputing.UUIDs.uuid1(GraphComputing.rng[1])) + res_sym = eval( + GraphComputing.gen_access_expr( + GraphComputing.entry_device(tape.machine), tape.outputSymbol + ), + ) + expr = Meta.parse( + "function compute_$(function_id)(input_vector, output_vector, n::Int64) + id = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if (id > n) + return + end + @inline data_input = input_vector[id] + $(init_caches) + $(assign_inputs) + $code + @inline output_vector[id] = $res_sym + return nothing + end" + ) + + return RuntimeGeneratedFunction(GraphComputing, context_module, expr) +end diff --git a/src/devices/cuda/impl.jl b/ext/devices/cuda/impl.jl similarity index 62% rename from src/devices/cuda/impl.jl rename to ext/devices/cuda/impl.jl index 66383d0..5485df1 100644 --- a/src/devices/cuda/impl.jl +++ b/ext/devices/cuda/impl.jl @@ -1,23 +1,21 @@ -using CUDA - """ CUDAGPU <: AbstractGPU Representation of a specific CUDA GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface. """ -mutable struct CUDAGPU <: AbstractGPU +mutable struct CUDAGPU <: GraphComputing.AbstractGPU device::Any # TODO: what's the cuda device type? 
cacheStrategy::CacheStrategy FLOPS::Float64 end -push!(DEVICE_TYPES, CUDAGPU) +push!(GraphComputing.DEVICE_TYPES, CUDAGPU) -CACHE_STRATEGIES[CUDAGPU] = [LocalVariables()] +GraphComputing.CACHE_STRATEGIES[CUDAGPU] = [LocalVariables()] -default_strategy(::Type{T}) where {T<:CUDAGPU} = LocalVariables() +GraphComputing.default_strategy(::Type{CUDAGPU}) = LocalVariables() -function measure_device!(device::CUDAGPU; verbose::Bool) +function GraphComputing.measure_device!(device::CUDAGPU; verbose::Bool) if verbose println("Measuring CUDA GPU $(device.device)") end @@ -27,16 +25,16 @@ function measure_device!(device::CUDAGPU; verbose::Bool) end """ - get_devices(deviceType::Type{T}; verbose::Bool) where {T <: CUDAGPU} + get_devices(::Type{CUDAGPU}; verbose::Bool) Return a Vector of [`CUDAGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information. """ -function get_devices(deviceType::Type{T}; verbose::Bool=false) where {T<:CUDAGPU} - devices = Vector{AbstractDevice}() +function GraphComputing.get_devices(::Type{CUDAGPU}; verbose::Bool=false) + devices = Vector{GraphComputing.AbstractDevice}() if !CUDA.functional() if verbose - println("CUDA is non-functional") + println("CUDA.jl is non-functional") end return devices end diff --git a/src/devices/oneapi/impl.jl b/ext/devices/oneapi/impl.jl similarity index 68% rename from src/devices/oneapi/impl.jl rename to ext/devices/oneapi/impl.jl index da2f9e8..705aa80 100644 --- a/src/devices/oneapi/impl.jl +++ b/ext/devices/oneapi/impl.jl @@ -1,23 +1,21 @@ -using oneAPI - """ oneAPIGPU <: AbstractGPU Representation of a specific Intel GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface. 
""" -mutable struct oneAPIGPU <: AbstractGPU +mutable struct oneAPIGPU <: GraphComputing.AbstractGPU device::Any cacheStrategy::CacheStrategy FLOPS::Float64 end -push!(DEVICE_TYPES, oneAPIGPU) +push!(GraphComputing.DEVICE_TYPES, oneAPIGPU) -CACHE_STRATEGIES[oneAPIGPU] = [LocalVariables()] +GraphComputing.CACHE_STRATEGIES[oneAPIGPU] = [LocalVariables()] -default_strategy(::Type{T}) where {T<:oneAPIGPU} = LocalVariables() +GraphComputing.default_strategy(::Type{oneAPIGPU}) = LocalVariables() -function measure_device!(device::oneAPIGPU; verbose::Bool) +function GraphComputing.measure_device!(device::oneAPIGPU; verbose::Bool) if verbose println("Measuring oneAPI GPU $(device.device)") end @@ -27,11 +25,11 @@ function measure_device!(device::oneAPIGPU; verbose::Bool) end """ - get_devices(deviceType::Type{T}; verbose::Bool = false) where {T <: oneAPIGPU} + get_devices(::Type{oneAPIGPU}; verbose::Bool = false) Return a Vector of [`oneAPIGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information. """ -function get_devices(deviceType::Type{T}; verbose::Bool=false) where {T<:oneAPIGPU} +function GraphComputing.get_devices(::Type{oneAPIGPU}; verbose::Bool=false) devices = Vector{AbstractDevice}() if !oneAPI.functional() diff --git a/src/devices/rocm/impl.jl b/ext/devices/rocm/impl.jl similarity index 69% rename from src/devices/rocm/impl.jl rename to ext/devices/rocm/impl.jl index ba189d9..836d9c7 100644 --- a/src/devices/rocm/impl.jl +++ b/ext/devices/rocm/impl.jl @@ -5,19 +5,19 @@ using AMDGPU Representation of a specific AMD GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface. 
""" -mutable struct ROCmGPU <: AbstractGPU +mutable struct ROCmGPU <: GraphComputing.AbstractGPU device::Any cacheStrategy::CacheStrategy FLOPS::Float64 end -push!(DEVICE_TYPES, ROCmGPU) +push!(GraphComputing.DEVICE_TYPES, ROCmGPU) -CACHE_STRATEGIES[ROCmGPU] = [LocalVariables()] +GraphComputing.CACHE_STRATEGIES[ROCmGPU] = [LocalVariables()] -default_strategy(::Type{T}) where {T<:ROCmGPU} = LocalVariables() +GraphComputing.default_strategy(::Type{ROCmGPU}) = LocalVariables() -function measure_device!(device::ROCmGPU; verbose::Bool) +function GraphComputing.measure_device!(device::ROCmGPU; verbose::Bool) if verbose println("Measuring ROCm GPU $(device.device)") end @@ -27,11 +27,11 @@ function measure_device!(device::ROCmGPU; verbose::Bool) end """ - get_devices(deviceType::Type{T}; verbose::Bool = false) where {T <: ROCmGPU} + get_devices(::Type{ROCmGPU}; verbose::Bool = false) Return a Vector of [`ROCmGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information. """ -function get_devices(deviceType::Type{T}; verbose::Bool=false) where {T<:ROCmGPU} +function GraphComputing.get_devices(::Type{ROCmGPU}; verbose::Bool=false) devices = Vector{AbstractDevice}() if !AMDGPU.functional() diff --git a/ext/oneAPIExt.jl b/ext/oneAPIExt.jl new file mode 100644 index 0000000..1f6ebae --- /dev/null +++ b/ext/oneAPIExt.jl @@ -0,0 +1,8 @@ +module oneAPIExt + +using GraphComputing, oneAPI + +# include specialized oneAPI functions here +include("devices/oneapi/impl.jl") + +end diff --git a/src/GraphComputing.jl b/src/GraphComputing.jl index ad7d884..2d43df0 100644 --- a/src/GraphComputing.jl +++ b/src/GraphComputing.jl @@ -31,11 +31,11 @@ export Operation, AppliedOperation export NodeReduction, NodeSplit export push_operation!, pop_operation!, can_pop export reset_graph! 
-export get_operations +export get_operations # code generation related export execute -export get_compute_function, get_cuda_kernel +export get_compute_function export gen_tape, execute_tape export unpack_identity @@ -57,6 +57,11 @@ export problem_instance, input_type, graph, input_expr export Machine export NumaNode export get_machine_info, cpu_st +export CacheStrategy, default_strategy +export LocalVariables, Dictionary + +# CUDAExt +export cuda_kernel include("devices/interface.jl") include("task/type.jl") @@ -119,9 +124,6 @@ include("devices/detect.jl") include("devices/impl.jl") include("devices/numa/impl.jl") -include("devices/cuda/impl.jl") -include("devices/rocm/impl.jl") -#include("devices/oneapi/impl.jl") include("scheduler/interface.jl") include("scheduler/greedy.jl") diff --git a/src/code_gen/function.jl b/src/code_gen/function.jl index 8e45f05..3ff1273 100644 --- a/src/code_gen/function.jl +++ b/src/code_gen/function.jl @@ -40,42 +40,6 @@ function get_compute_function( return RuntimeGeneratedFunction(@__MODULE__, context_module, expr) end -""" - get_cuda_kernel( - graph::DAG, - instance, - machine::Machine, - ) - -Return a function of signature `compute_(input::CuVector, output::CuVector, n::Int64)`, which will return the result of the DAG computation of the input on the given output variable. -""" -function get_cuda_kernel(graph::DAG, instance, machine::Machine, context_module::Module) - tape = gen_tape(graph, instance, machine, context_module) - - initCaches = Expr(:block, tape.initCachesCode...) - assignInputs = Expr(:block, expr_from_fc.(tape.inputAssignCode)...) - code = Expr(:block, expr_from_fc.(tape.computeCode)...) 
- - functionId = to_var_name(UUIDs.uuid1(rng[1])) - resSym = eval(gen_access_expr(entry_device(tape.machine), tape.outputSymbol)) - expr = Meta.parse( - "function compute_$(functionId)(input_vector, output_vector, n::Int64) - id = (blockIdx().x - 1) * blockDim().x + threadIdx().x - if (id > n) - return - end - @inline data_input = input_vector[id] - $(initCaches) - $(assignInputs) - $code - @inline output_vector[id] = $resSym - return nothing - end" - ) - - return RuntimeGeneratedFunction(@__MODULE__, context_module, expr) -end - """ execute( graph::DAG, diff --git a/src/devices/interface.jl b/src/devices/interface.jl index 6c5e06f..1c7c37a 100644 --- a/src/devices/interface.jl +++ b/src/devices/interface.jl @@ -106,3 +106,18 @@ Interface function that must be implemented for every subtype of [`AbstractDevic Return an `Expr` or `QuoteNode` accessing the variable identified by [`symbol`]. """ function gen_access_expr end + +""" + cuda_kernel( + graph::DAG, + instance, + machine::Machine, + context_module::Module + ) + +Return a function of signature `compute_(input::CuVector, output::CuVector, n::Int64)`, which will return the result of the DAG computation of the input on the given output variable. + +!!! 
note + This function is only available when the CUDA Extension is loaded by `using CUDA` before `using GraphComputing` +""" +function cuda_kernel end diff --git a/testgpu/runtests.jl b/testgpu/runtests.jl new file mode 100644 index 0000000..4b8ba6c --- /dev/null +++ b/testgpu/runtests.jl @@ -0,0 +1,7 @@ +using SafeTestsets +using CUDA + +@safetestset "Utility Unit Tests " begin + include("unit_tests_utility.jl") +end +# TODO: Make a new simple test model and rewrite tests here diff --git a/testgpu/unit_tests_utility.jl b/testgpu/unit_tests_utility.jl new file mode 100644 index 0000000..2599804 --- /dev/null +++ b/testgpu/unit_tests_utility.jl @@ -0,0 +1,10 @@ +using GraphComputing +import GraphComputing.bytes_to_human_readable + +@test bytes_to_human_readable(0) == "0.0 B" +@test bytes_to_human_readable(1020) == "1020.0 B" +@test bytes_to_human_readable(1025) == "1.001 KiB" +@test bytes_to_human_readable(684235) == "668.2 KiB" +@test bytes_to_human_readable(86214576) == "82.22 MiB" +@test bytes_to_human_readable(9241457698) == "8.607 GiB" +@test bytes_to_human_readable(3218598654367) == "2.927 TiB"