diff --git a/docs/make.jl b/docs/make.jl
index 374c187..a91ad88 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -32,7 +32,7 @@ makedocs(;
     sitename="ComputableDAGs.jl",
     format=Documenter.HTML(;
         prettyurls=get(ENV, "CI", "false") == "true",
-        canonical="https://ComputableDAGs.gitlab.io/ComputableDAGs.jl",
+        canonical="https://ComputableDAGs.github.io/ComputableDAGs.jl",
         assets=String[],
     ),
     pages=pages,
diff --git a/docs/src/lib/internals/devices.md b/docs/src/lib/internals/devices.md
index 7030c25..64d4360 100644
--- a/docs/src/lib/internals/devices.md
+++ b/docs/src/lib/internals/devices.md
@@ -37,11 +37,10 @@ Pages = ["devices/numa/impl.jl"]
 Order = [:type, :function]
 ```

-### CUDA
-For CUDA functionality to be available, the `CUDA.jl` package must be installed separately, as it is only a weak dependency.
+### GPUs

-### ROCm
-For ROCm functionality to be available, the `AMDGPU.jl` package must be installed separately, as it is only a weak dependency.
-
-### oneAPI
-For oneAPI functionality to be available, the `oneAPI.jl` package must be installed separately, as it is only a weak dependency.
+```@autodocs
+Modules = [ComputableDAGs]
+Pages = ["devices/ext.jl"]
+Order = [:type]
+```
diff --git a/ext/AMDGPUExt.jl b/ext/AMDGPUExt.jl
index d798f39..6572658 100644
--- a/ext/AMDGPUExt.jl
+++ b/ext/AMDGPUExt.jl
@@ -1,8 +1,20 @@
 module AMDGPUExt

-using ComputableDAGs, AMDGPU
+using ComputableDAGs
+using UUIDs
+using AMDGPU
+
+function __init__()
+    @debug "Loading AMDGPUExt"
+
+    push!(ComputableDAGs.DEVICE_TYPES, ROCmGPU)
+    ComputableDAGs.CACHE_STRATEGIES[ROCmGPU] = [LocalVariables()]
+
+    return nothing
+end

 # include specialized AMDGPU functions here
 include("devices/rocm/impl.jl")
+include("devices/rocm/function.jl")

 end
diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl
index 772c714..0797b10 100644
--- a/ext/CUDAExt.jl
+++ b/ext/CUDAExt.jl
@@ -1,8 +1,17 @@
 module CUDAExt

 using ComputableDAGs
+using UUIDs
 using CUDA
-using RuntimeGeneratedFunctions
+
+function __init__()
+    @debug "Loading CUDAExt"
+
+    push!(ComputableDAGs.DEVICE_TYPES, CUDAGPU)
+    ComputableDAGs.CACHE_STRATEGIES[CUDAGPU] = [LocalVariables()]
+
+    return nothing
+end

 # include specialized CUDA functions here
 include("devices/cuda/impl.jl")
diff --git a/ext/devices/cuda/function.jl b/ext/devices/cuda/function.jl
index b407b77..ef7920a 100644
--- a/ext/devices/cuda/function.jl
+++ b/ext/devices/cuda/function.jl
@@ -1,14 +1,12 @@
-
-function ComputableDAGs.cuda_kernel(
-    graph::DAG, instance, machine::Machine, context_module::Module
-)
+function ComputableDAGs.kernel(::Type{CUDAGPU}, graph::DAG, instance)
+    machine = cpu_st()
     tape = ComputableDAGs.gen_tape(graph, instance, machine, context_module)

     init_caches = Expr(:block, tape.initCachesCode...)
     assign_inputs = Expr(:block, ComputableDAGs.expr_from_fc.(tape.inputAssignCode)...)
     code = Expr(:block, ComputableDAGs.expr_from_fc.(tape.computeCode)...)

-    function_id = ComputableDAGs.to_var_name(UUIDs.uuid1(rng[1]))
+    function_id = ComputableDAGs.to_var_name(UUIDs.uuid1(ComputableDAGs.rng[1]))
     res_sym = eval(
         ComputableDAGs.gen_access_expr(
             ComputableDAGs.entry_device(tape.machine), tape.outputSymbol
@@ -29,5 +27,5 @@ function ComputableDAGs.cuda_kernel(
         end"
     )

-    return RuntimeGeneratedFunction(@__MODULE__, context_module, expr)
+    return expr
 end
diff --git a/ext/devices/cuda/impl.jl b/ext/devices/cuda/impl.jl
index 44cb17f..77b1e3f 100644
--- a/ext/devices/cuda/impl.jl
+++ b/ext/devices/cuda/impl.jl
@@ -1,24 +1,7 @@
-"""
-    CUDAGPU <: AbstractGPU
-
-Representation of a specific CUDA GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface.
-"""
-mutable struct CUDAGPU <: ComputableDAGs.AbstractGPU
-    device::Any # TODO: what's the cuda device type?
-    cacheStrategy::CacheStrategy
-    FLOPS::Float64
-end
-
-push!(ComputableDAGs.DEVICE_TYPES, CUDAGPU)
-
-ComputableDAGs.CACHE_STRATEGIES[CUDAGPU] = [LocalVariables()]
-
 ComputableDAGs.default_strategy(::Type{CUDAGPU}) = LocalVariables()

 function ComputableDAGs.measure_device!(device::CUDAGPU; verbose::Bool)
-    if verbose
-        println("Measuring CUDA GPU $(device.device)")
-    end
+    verbose && @info "Measuring CUDA GPU $(device.device)"

     # TODO implement
     return nothing
@@ -29,20 +12,16 @@ end

 Return a Vector of [`CUDAGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information.
 """
-function get_devices(::Type{CUDAGPU}; verbose::Bool=false)
+function ComputableDAGs.get_devices(::Type{CUDAGPU}; verbose::Bool=false)
     devices = Vector{ComputableDAGs.AbstractDevice}()

     if !CUDA.functional()
-        if verbose
-            println("CUDA.jl is non-functional")
-        end
+        @warn "The CUDA extension is loaded but CUDA.jl is non-functional"
         return devices
     end

     CUDADevices = CUDA.devices()
-    if verbose
-        println("Found $(length(CUDADevices)) CUDA devices")
-    end
+    verbose && @info "Found $(length(CUDADevices)) CUDA devices"
     for device in CUDADevices
         push!(devices, CUDAGPU(device, default_strategy(CUDAGPU), -1))
     end
diff --git a/ext/devices/oneapi/impl.jl b/ext/devices/oneapi/impl.jl
index abf35cc..61c44dd 100644
--- a/ext/devices/oneapi/impl.jl
+++ b/ext/devices/oneapi/impl.jl
@@ -1,24 +1,7 @@
-"""
-    oneAPIGPU <: AbstractGPU
-
-Representation of a specific Intel GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface.
-"""
-mutable struct oneAPIGPU <: ComputableDAGs.AbstractGPU
-    device::Any
-    cacheStrategy::CacheStrategy
-    FLOPS::Float64
-end
-
-push!(ComputableDAGs.DEVICE_TYPES, oneAPIGPU)
-
-ComputableDAGs.CACHE_STRATEGIES[oneAPIGPU] = [LocalVariables()]
-
 ComputableDAGs.default_strategy(::Type{oneAPIGPU}) = LocalVariables()

 function ComputableDAGs.measure_device!(device::oneAPIGPU; verbose::Bool)
-    if verbose
-        println("Measuring oneAPI GPU $(device.device)")
-    end
+    verbose && @info "Measuring oneAPI GPU $(device.device)"

     # TODO implement
     return nothing
@@ -29,20 +12,16 @@ end

 Return a Vector of [`oneAPIGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information.
""" -function get_devices(::Type{oneAPIGPU}; verbose::Bool=false) - devices = Vector{AbstractDevice}() +function ComputableDAGs.get_devices(::Type{oneAPIGPU}; verbose::Bool=false) + devices = Vector{ComputableDAGs.AbstractDevice}() if !oneAPI.functional() - if verbose - println("oneAPI is non-functional") - end + @warn "the oneAPI extension is loaded but oneAPI.jl is non-functional" return devices end oneAPIDevices = oneAPI.devices() - if verbose - println("Found $(length(oneAPIDevices)) oneAPI devices") - end + verbose && @info "Found $(length(oneAPIDevices)) oneAPI devices" for device in oneAPIDevices push!(devices, oneAPIGPU(device, default_strategy(oneAPIGPU), -1)) end diff --git a/ext/devices/rocm/function.jl b/ext/devices/rocm/function.jl new file mode 100644 index 0000000..6990ba1 --- /dev/null +++ b/ext/devices/rocm/function.jl @@ -0,0 +1,31 @@ +function ComputableDAGs.kernel(::Type{ROCmGPU}, graph::DAG, instance) + machine = cpu_st() + tape = ComputableDAGs.gen_tape(graph, instance, machine, context_module) + + init_caches = Expr(:block, tape.initCachesCode...) + assign_inputs = Expr(:block, ComputableDAGs.expr_from_fc.(tape.inputAssignCode)...) + code = Expr(:block, ComputableDAGs.expr_from_fc.(tape.computeCode)...) + + function_id = ComputableDAGs.to_var_name(UUIDs.uuid1(ComputableDAGs.rng[1])) + res_sym = eval( + ComputableDAGs.gen_access_expr( + ComputableDAGs.entry_device(tape.machine), tape.outputSymbol + ), + ) + expr = Meta.parse( + "function compute_$(function_id)(input_vector, output_vector, n::Int64) + id = (workgroupIdx().x - 1) * workgroupDim().x + workgroupIdx().x + if (id > n) + return + end + @inline data_input = input_vector[id] + $(init_caches) + $(assign_inputs) + $code + @inline output_vector[id] = $res_sym + return nothing + end" + ) + + return expr +end diff --git a/ext/devices/rocm/impl.jl b/ext/devices/rocm/impl.jl index 021d28c..18ea34f 100644 --- a/ext/devices/rocm/impl.jl +++ b/ext/devices/rocm/impl.jl @@ -1,26 +1,7 @@ -using AMDGPU - -""" - ROCmGPU <: AbstractGPU - -Representation of a specific AMD GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface. -""" -mutable struct ROCmGPU <: ComputableDAGs.AbstractGPU - device::Any - cacheStrategy::CacheStrategy - FLOPS::Float64 -end - -push!(ComputableDAGs.DEVICE_TYPES, ROCmGPU) - -ComputableDAGs.CACHE_STRATEGIES[ROCmGPU] = [LocalVariables()] - ComputableDAGs.default_strategy(::Type{ROCmGPU}) = LocalVariables() function ComputableDAGs.measure_device!(device::ROCmGPU; verbose::Bool) - if verbose - println("Measuring ROCm GPU $(device.device)") - end + verbose && @info "Measuring ROCm GPU $(device.device)" # TODO implement return nothing @@ -31,20 +12,16 @@ end Return a Vector of [`ROCmGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information. 
""" -function get_devices(::Type{ROCmGPU}; verbose::Bool=false) - devices = Vector{AbstractDevice}() +function ComputableDAGs.get_devices(::Type{ROCmGPU}; verbose::Bool=false) + devices = Vector{ComputableDAGs.AbstractDevice}() if !AMDGPU.functional() - if verbose - println("AMDGPU is non-functional") - end + @warn "The AMDGPU extension is loaded but AMDGPU.jl is non-functional" return devices end AMDDevices = AMDGPU.devices() - if verbose - println("Found $(length(AMDDevices)) AMD devices") - end + verbose && @info "Found $(length(AMDDevices)) AMD devices" for device in AMDDevices push!(devices, ROCmGPU(device, default_strategy(ROCmGPU), -1)) end diff --git a/ext/oneAPIExt.jl b/ext/oneAPIExt.jl index a75ce75..c638942 100644 --- a/ext/oneAPIExt.jl +++ b/ext/oneAPIExt.jl @@ -1,6 +1,17 @@ module oneAPIExt -using ComputableDAGs, oneAPI +using ComputableDAGs +using UUIDs +using oneAPI + +function __init__() + @debug "Loading oneAPIExt" + + push!(ComputableDAGs.DEVICE_TYPES, oneAPIGPU) + ComputableDAGs.CACHE_STRATEGIES[oneAPIGPU] = [LocalVariables()] + + return nothing +end # include specialized oneAPI functions here include("devices/oneapi/impl.jl") diff --git a/src/ComputableDAGs.jl b/src/ComputableDAGs.jl index d1da18e..e9597ba 100644 --- a/src/ComputableDAGs.jl +++ b/src/ComputableDAGs.jl @@ -60,8 +60,8 @@ export get_machine_info, cpu_st export CacheStrategy, default_strategy export LocalVariables, Dictionary -# CUDAExt -export cuda_kernel +# GPU Extensions +export kernel, CUDAGPU, ROCmGPU, oneAPIGPU include("devices/interface.jl") include("task/type.jl") @@ -124,6 +124,7 @@ include("devices/detect.jl") include("devices/impl.jl") include("devices/numa/impl.jl") +include("devices/ext.jl") include("scheduler/interface.jl") include("scheduler/greedy.jl") diff --git a/src/devices/ext.jl b/src/devices/ext.jl new file mode 100644 index 0000000..b346465 --- /dev/null +++ b/src/devices/ext.jl @@ -0,0 +1,44 @@ +# file for struct definitions used by the extensions +# since extensions can't export names themselves + +""" + CUDAGPU <: AbstractGPU + +Representation of a specific CUDA GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface. + +!!! note + This requires CUDA to be loaded to be useful. +""" +mutable struct CUDAGPU <: AbstractGPU + device::Any # CuDevice + cacheStrategy::CacheStrategy + FLOPS::Float64 +end + +""" + oneAPIGPU <: AbstractGPU + +Representation of a specific Intel GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface. + +!!! note + This requires oneAPI to be loaded to be useful. +""" +mutable struct oneAPIGPU <: AbstractGPU + device::Any # oneAPI.oneL0.ZeDevice + cacheStrategy::CacheStrategy + FLOPS::Float64 +end + +""" + ROCmGPU <: AbstractGPU + +Representation of a specific AMD GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface. + +!!! note + This requires AMDGPU to be loaded to be useful. 
+""" +mutable struct ROCmGPU <: AbstractGPU + device::Any # HIPDevice + cacheStrategy::CacheStrategy + FLOPS::Float64 +end diff --git a/src/devices/impl.jl b/src/devices/impl.jl index e58d839..8e28b46 100644 --- a/src/devices/impl.jl +++ b/src/devices/impl.jl @@ -59,15 +59,6 @@ It is the simplest machine definition possible and produces a simple function wh """ function cpu_st() return Machine( - [ - ComputableDAGs.NumaNode( - 0, - 1, - ComputableDAGs.default_strategy(ComputableDAGs.NumaNode), - -1.0, - UUIDs.uuid1(), - ), - ], - [-1.0;;], + [NumaNode(0, 1, default_strategy(NumaNode), -1.0, UUIDs.uuid1())], [-1.0;;] ) end diff --git a/src/devices/interface.jl b/src/devices/interface.jl index 96c4fba..1c39111 100644 --- a/src/devices/interface.jl +++ b/src/devices/interface.jl @@ -108,16 +108,32 @@ Return an `Expr` or `QuoteNode` accessing the variable identified by [`symbol`]. function gen_access_expr end """ - cuda_kernel( - graph::DAG, - instance, - machine::Machine, - context_module::Module - ) + kernel(gpu_type::Type{<:AbstractGPU}, graph::DAG, instance) -Return a function of signature `compute_(input::CuVector, output::CuVector, n::Int64)`, which will return the result of the DAG computation of the input on the given output variable. +For a GPU type, a [`DAG`](@ref), and a problem instance, return an `Expr` containing a function of signature `compute_(input::Vector, output::Vector, n::Int64)`, which will return the result of the DAG computation of the input on the given output vector, intended for computation on GPUs. Currently, `CUDAGPU` and `ROCmGPU` are available if their respective package extensions are loaded. + +The generated kernel function accepts its thread ID in only the x-dimension, and only as thread ID, not as block ID. The input and output should therefore be 1-dimensional vectors. For detailed information on GPU programming and the Julia packages, please refer to their respective documentations. + +A simple example call for a CUDA kernel might look like the following: +```Julia +@cuda threads = (32,) always_inline = true cuda_kernel!(cu_inputs, outputs, length(cu_inputs)) +``` !!! note - This function is only available when the CUDA Extension is loaded by `using CUDA` before `using ComputableDAGs` + Unlike the standard [`get_compute_function`](@ref) to generate a callable function which returns a `RuntimeGeneratedFunction`, this returns an `Expr` that needs to be `eval`'d. This is a current limitation of `RuntimeGeneratedFunctions.jl` which currently cannot wrap GPU kernels. This might change in the future. + +### Size limitation + +The generated kernel does not use any internal parallelization, i.e., the DAG is compiled into a serialized function, processing each input in a single thread of the GPU. This means it can be heavily parallelized and use the GPU at 100% for sufficiently large input vectors (and assuming the function does not become IO limited etc.). However, it also means that there is a limit to how large the compiled function can be. If it gets too large, the compilation might fail, take too long to complete, the kernel might fail during execution if too much stack memory is required, or other similar problems. If this happens, your problem is likely too large to be compiled to a GPU kernel like this. + +### Compute Requirements + +A GPU function has more restrictions on what can be computed than general functions running on the CPU. In Julia, there are mainly two important restrictions to consider: + +1. 
+2. Function calls must not be dynamic. This means that type stability is required and the compiler must be able to determine in advance which method of a generic function to call. What exactly this entails may change over time and also differs between the target GPU libraries. From experience, passing `always_inline = true` to `@cuda` calls can help with this.
+
+!!! warning
+    This feature is currently experimental. There are still some unresolved issues with the generated kernels.
 """
-function cuda_kernel end
+function kernel end
diff --git a/src/devices/numa/impl.jl b/src/devices/numa/impl.jl
index b12ce97..a418824 100644
--- a/src/devices/numa/impl.jl
+++ b/src/devices/numa/impl.jl
@@ -20,9 +20,7 @@ CACHE_STRATEGIES[NumaNode] = [LocalVariables()]
 default_strategy(::Type{T}) where {T<:NumaNode} = LocalVariables()

 function measure_device!(device::NumaNode; verbose::Bool)
-    if verbose
-        println("Measuring Numa Node $(device.numaId)")
-    end
+    verbose && @info "Measuring Numa Node $(device.numaId)"

     # TODO implement
     return nothing
@@ -37,9 +35,8 @@ function get_devices(deviceType::Type{T}; verbose::Bool=false) where {T<:NumaNod
     devices = Vector{AbstractDevice}()
     noNumaNodes = highest_numa_node()

-    if (verbose)
-        println("Found $(noNumaNodes + 1) NUMA nodes")
-    end
+    verbose && @info "Found $(noNumaNodes + 1) NUMA nodes"
+
     for i in 0:noNumaNodes
         push!(devices, NumaNode(i, 1, default_strategy(NumaNode), -1, UUIDs.uuid1(rng[1])))
     end
diff --git a/src/graph/type.jl b/src/graph/type.jl
index 751b5dc..b5ceeb4 100644
--- a/src/graph/type.jl
+++ b/src/graph/type.jl
@@ -16,8 +16,6 @@ end

 The representation of the graph as a set of [`Node`](@ref)s.

-A DAG can be loaded using the appropriate parse_dag function, e.g. [`parse_dag`](@ref).
-
 [`Operation`](@ref)s can be applied on it using [`push_operation!`](@ref) and reverted using [`pop_operation!`](@ref) like a stack. To get the set of possible operations, use [`get_operations`](@ref).
 The members of the object should not be manually accessed, instead always use the provided interface functions.
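
For context, a rough end-to-end sketch of how the new `kernel` interface introduced in this diff might be used on the CUDA side. This is only an illustration under assumptions: `graph`, `instance`, and `inputs` are placeholders that must already exist, the output element type is a guess, and the launch configuration simply follows the 32-thread example from the docstring above rather than a tuned configuration.

```Julia
using CUDA
using ComputableDAGs

# placeholders: a DAG `graph`, a matching problem `instance`, and a Vector
# `inputs` of isbits elements (assumed here to have length 32) must exist already
kernel_expr = ComputableDAGs.kernel(CUDAGPU, graph, instance)

# the returned Expr has to be eval'd manually, see the note in the kernel docstring
compute_kernel! = eval(kernel_expr)

cu_inputs = CuArray(inputs)
# output element type is an assumption for this sketch
cu_outputs = CuArray{eltype(inputs)}(undef, length(inputs))

# one GPU thread per input element, x-dimension only
@cuda threads = (32,) always_inline = true compute_kernel!(
    cu_inputs, cu_outputs, length(cu_inputs)
)

outputs = Array(cu_outputs)  # copy the results back to the host
```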