Add cuda, rocm and oneapi functionality to their own extensions (#24)
AntonReinhard authored Sep 4, 2024
1 parent 0d604b1 commit 58ec351
Showing 14 changed files with 138 additions and 87 deletions.
17 changes: 13 additions & 4 deletions Project.toml
@@ -4,24 +4,33 @@ authors = ["AntonReinhard <anton.reinhard@proton.me>"]
version = "0.1.0"

[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
NumaAllocators = "21436f30-1b4a-4f08-87af-e26101bb5379"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
RuntimeGeneratedFunctions = "7e49a35a-f44a-4d26-94aa-eba1b4ca6b47"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

+[weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"

+[extensions]
+AMDGPUExt = "AMDGPU"
+CUDAExt = "CUDA"
+oneAPIExt = "oneAPI"

[compat]
AMDGPU = "1"
CUDA = "5"
DataStructures = "0.18"
NumaAllocators = "0.2"
oneAPI = "1"
RuntimeGeneratedFunctions = "0.5"
StaticArrays = "1"

[extras]
CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

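With `AMDGPU`, `CUDA`, and `oneAPI` moved from `[deps]` to `[weakdeps]`, the GPU packages are no longer installed alongside GraphComputing; each entry in the `[extensions]` table only activates once the user installs and loads the matching trigger package. A minimal sketch of a user session (assuming GraphComputing itself is already in the environment):

```julia
using Pkg
Pkg.add("CUDA")        # weak dependencies must be installed by the user

using CUDA             # loading the trigger package ...
using GraphComputing   # ... lets Julia activate the CUDAExt extension
```
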
18 changes: 3 additions & 15 deletions docs/src/lib/internals/devices.md
@@ -38,22 +38,10 @@ Order = [:type, :function]
```

### CUDA
-```@autodocs
-Modules = [GraphComputing]
-Pages = ["devices/cuda/impl.jl"]
-Order = [:type, :function]
-```
+For CUDA functionality to be available, the `CUDA.jl` package must be installed separately, as it is only a weak dependency.

### ROCm
-```@autodocs
-Modules = [GraphComputing]
-Pages = ["devices/rocm/impl.jl"]
-Order = [:type, :function]
-```
+For ROCm functionality to be available, the `AMDGPU.jl` package must be installed separately, as it is only a weak dependency.

### oneAPI
-```@autodocs
-Modules = [GraphComputing]
-Pages = ["devices/oneapi/impl.jl"]
-Order = [:type, :function]
-```
+For oneAPI functionality to be available, the `oneAPI.jl` package must be installed separately, as it is only a weak dependency.
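
Whether one of these extensions actually activated can be checked from a session with `Base.get_extension`, which returns the extension module or `nothing`. A small sketch (not part of this commit):

```julia
using CUDA, GraphComputing

# `nothing` here means CUDA.jl was not loaded or is not installed
ext = Base.get_extension(GraphComputing, :CUDAExt)
ext === nothing && @warn "CUDAExt did not load"
```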
8 changes: 8 additions & 0 deletions ext/AMDGPUExt.jl
@@ -0,0 +1,8 @@
module AMDGPUExt

using GraphComputing, AMDGPU

# include specialized AMDGPU functions here
include("devices/rocm/impl.jl")

end
11 changes: 11 additions & 0 deletions ext/CUDAExt.jl
@@ -0,0 +1,11 @@
module CUDAExt

using GraphComputing
using CUDA
using RuntimeGeneratedFunctions

# include specialized CUDA functions here
include("devices/cuda/impl.jl")
include("devices/cuda/function.jl")

end
33 changes: 33 additions & 0 deletions ext/devices/cuda/function.jl
@@ -0,0 +1,33 @@

function GraphComputing.cuda_kernel(
graph::DAG, instance, machine::Machine, context_module::Module
)
tape = GraphComputing.gen_tape(graph, instance, machine, context_module)

init_caches = Expr(:block, tape.initCachesCode...)
assign_inputs = Expr(:block, GraphComputing.expr_from_fc.(tape.inputAssignCode)...)
code = Expr(:block, GraphComputing.expr_from_fc.(tape.computeCode)...)

function_id = GraphComputing.to_var_name(UUIDs.uuid1(rng[1]))
res_sym = eval(
GraphComputing.gen_access_expr(
GraphComputing.entry_device(tape.machine), tape.outputSymbol
),
)
expr = Meta.parse(
"function compute_$(function_id)(input_vector, output_vector, n::Int64)
id = (blockIdx().x - 1) * blockDim().x + threadIdx().x
if (id > n)
return
end
@inline data_input = input_vector[id]
$(init_caches)
$(assign_inputs)
$code
@inline output_vector[id] = $res_sym
return nothing
end"
)

return RuntimeGeneratedFunction(@__MODULE__, context_module, expr)
end
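
The generated function is an ordinary CUDA kernel body, so it would be launched through `CUDA.@cuda` like any other kernel. A hedged usage sketch — `graph`, `instance`, `machine`, `inputs`, and the `Float64` output type are placeholders, not names from this commit:

```julia
using CUDA, GraphComputing

kernel! = GraphComputing.cuda_kernel(graph, instance, machine, @__MODULE__)

n = length(inputs)
gpu_in = CuVector(inputs)                # copy hypothetical host inputs to the device
gpu_out = CuVector{Float64}(undef, n)    # assuming a Float64 result per element

threads = 256                            # one thread per input element
blocks = cld(n, threads)                 # round up to whole blocks
@cuda threads = threads blocks = blocks kernel!(gpu_in, gpu_out, n)
```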
20 changes: 9 additions & 11 deletions src/devices/cuda/impl.jl → ext/devices/cuda/impl.jl
@@ -1,23 +1,21 @@
-using CUDA

"""
CUDAGPU <: AbstractGPU
Representation of a specific CUDA GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface.
"""
-mutable struct CUDAGPU <: AbstractGPU
+mutable struct CUDAGPU <: GraphComputing.AbstractGPU
device::Any # TODO: what's the cuda device type?
cacheStrategy::CacheStrategy
FLOPS::Float64
end

-push!(DEVICE_TYPES, CUDAGPU)
+push!(GraphComputing.DEVICE_TYPES, CUDAGPU)

-CACHE_STRATEGIES[CUDAGPU] = [LocalVariables()]
+GraphComputing.CACHE_STRATEGIES[CUDAGPU] = [LocalVariables()]

-default_strategy(::Type{T}) where {T<:CUDAGPU} = LocalVariables()
+GraphComputing.default_strategy(::Type{CUDAGPU}) = LocalVariables()

-function measure_device!(device::CUDAGPU; verbose::Bool)
+function GraphComputing.measure_device!(device::CUDAGPU; verbose::Bool)
if verbose
println("Measuring CUDA GPU $(device.device)")
end
@@ -27,16 +25,16 @@ function measure_device!(device::CUDAGPU; verbose::Bool)
end

"""
-    get_devices(deviceType::Type{T}; verbose::Bool) where {T <: CUDAGPU}
+    get_devices(::Type{CUDAGPU}; verbose::Bool)
Return a Vector of [`CUDAGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information.
"""
-function get_devices(deviceType::Type{T}; verbose::Bool=false) where {T<:CUDAGPU}
-devices = Vector{AbstractDevice}()
+function get_devices(::Type{CUDAGPU}; verbose::Bool=false)
+devices = Vector{GraphComputing.AbstractDevice}()

if !CUDA.functional()
if verbose
println("CUDA is non-functional")
println("CUDA.jl is non-functional")
end
return devices
end
16 changes: 7 additions & 9 deletions src/devices/oneapi/impl.jl → ext/devices/oneapi/impl.jl
@@ -1,23 +1,21 @@
-using oneAPI

"""
oneAPIGPU <: AbstractGPU
Representation of a specific Intel GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface.
"""
-mutable struct oneAPIGPU <: AbstractGPU
+mutable struct oneAPIGPU <: GraphComputing.AbstractGPU
device::Any
cacheStrategy::CacheStrategy
FLOPS::Float64
end

-push!(DEVICE_TYPES, oneAPIGPU)
+push!(GraphComputing.DEVICE_TYPES, oneAPIGPU)

-CACHE_STRATEGIES[oneAPIGPU] = [LocalVariables()]
+GraphComputing.CACHE_STRATEGIES[oneAPIGPU] = [LocalVariables()]

-default_strategy(::Type{T}) where {T<:oneAPIGPU} = LocalVariables()
+GraphComputing.default_strategy(::Type{oneAPIGPU}) = LocalVariables()

-function measure_device!(device::oneAPIGPU; verbose::Bool)
+function GraphComputing.measure_device!(device::oneAPIGPU; verbose::Bool)
if verbose
println("Measuring oneAPI GPU $(device.device)")
end
@@ -27,11 +25,11 @@ function measure_device!(device::oneAPIGPU; verbose::Bool)
end

"""
-    get_devices(deviceType::Type{T}; verbose::Bool = false) where {T <: oneAPIGPU}
+    get_devices(::Type{oneAPIGPU}; verbose::Bool = false)
Return a Vector of [`oneAPIGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information.
"""
-function get_devices(deviceType::Type{T}; verbose::Bool=false) where {T<:oneAPIGPU}
+function get_devices(::Type{oneAPIGPU}; verbose::Bool=false)
devices = Vector{AbstractDevice}()

if !oneAPI.functional()
14 changes: 7 additions & 7 deletions src/devices/rocm/impl.jl → ext/devices/rocm/impl.jl
@@ -5,19 +5,19 @@ using AMDGPU
Representation of a specific AMD GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface.
"""
-mutable struct ROCmGPU <: AbstractGPU
+mutable struct ROCmGPU <: GraphComputing.AbstractGPU
device::Any
cacheStrategy::CacheStrategy
FLOPS::Float64
end

-push!(DEVICE_TYPES, ROCmGPU)
+push!(GraphComputing.DEVICE_TYPES, ROCmGPU)

-CACHE_STRATEGIES[ROCmGPU] = [LocalVariables()]
+GraphComputing.CACHE_STRATEGIES[ROCmGPU] = [LocalVariables()]

-default_strategy(::Type{T}) where {T<:ROCmGPU} = LocalVariables()
+GraphComputing.default_strategy(::Type{ROCmGPU}) = LocalVariables()

-function measure_device!(device::ROCmGPU; verbose::Bool)
+function GraphComputing.measure_device!(device::ROCmGPU; verbose::Bool)
if verbose
println("Measuring ROCm GPU $(device.device)")
end
@@ -27,11 +27,11 @@ function measure_device!(device::ROCmGPU; verbose::Bool)
end

"""
-    get_devices(deviceType::Type{T}; verbose::Bool = false) where {T <: ROCmGPU}
+    get_devices(::Type{ROCmGPU}; verbose::Bool = false)
Return a Vector of [`ROCmGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information.
"""
-function get_devices(deviceType::Type{T}; verbose::Bool=false) where {T<:ROCmGPU}
+function get_devices(::Type{ROCmGPU}; verbose::Bool=false)
devices = Vector{AbstractDevice}()

if !AMDGPU.functional()
8 changes: 8 additions & 0 deletions ext/oneAPIExt.jl
@@ -0,0 +1,8 @@
module oneAPIExt

using GraphComputing, oneAPI

# include specialized oneAPI functions here
include("devices/oneapi/impl.jl")

end
12 changes: 7 additions & 5 deletions src/GraphComputing.jl
@@ -31,11 +31,11 @@ export Operation, AppliedOperation
export NodeReduction, NodeSplit
export push_operation!, pop_operation!, can_pop
export reset_graph!
-export get_operations
+export get_operationsr

# code generation related
export execute
-export get_compute_function, get_cuda_kernel
+export get_compute_function
export gen_tape, execute_tape
export unpack_identity

@@ -57,6 +57,11 @@ export problem_instance, input_type, graph, input_expr
export Machine
export NumaNode
export get_machine_info, cpu_st
+export CacheStrategy, default_strategy
+export LocalVariables, Dictionary

+# CUDAExt
+export cuda_kernel

include("devices/interface.jl")
include("task/type.jl")
@@ -119,9 +124,6 @@ include("devices/detect.jl")
include("devices/impl.jl")

include("devices/numa/impl.jl")
include("devices/cuda/impl.jl")
include("devices/rocm/impl.jl")
#include("devices/oneapi/impl.jl")

include("scheduler/interface.jl")
include("scheduler/greedy.jl")
36 changes: 0 additions & 36 deletions src/code_gen/function.jl
@@ -40,42 +40,6 @@ function get_compute_function(
return RuntimeGeneratedFunction(@__MODULE__, context_module, expr)
end

"""
get_cuda_kernel(
graph::DAG,
instance,
machine::Machine,
)
Return a function of signature `compute_<id>(input::CuVector, output::CuVector, n::Int64)`, which will return the result of the DAG computation of the input on the given output variable.
"""
function get_cuda_kernel(graph::DAG, instance, machine::Machine, context_module::Module)
tape = gen_tape(graph, instance, machine, context_module)

initCaches = Expr(:block, tape.initCachesCode...)
assignInputs = Expr(:block, expr_from_fc.(tape.inputAssignCode)...)
code = Expr(:block, expr_from_fc.(tape.computeCode)...)

functionId = to_var_name(UUIDs.uuid1(rng[1]))
resSym = eval(gen_access_expr(entry_device(tape.machine), tape.outputSymbol))
expr = Meta.parse(
"function compute_$(functionId)(input_vector, output_vector, n::Int64)
id = (blockIdx().x - 1) * blockDim().x + threadIdx().x
if (id > n)
return
end
@inline data_input = input_vector[id]
$(initCaches)
$(assignInputs)
$code
@inline output_vector[id] = $resSym
return nothing
end"
)

return RuntimeGeneratedFunction(@__MODULE__, context_module, expr)
end

"""
execute(
graph::DAG,
15 changes: 15 additions & 0 deletions src/devices/interface.jl
@@ -106,3 +106,18 @@ Interface function that must be implemented for every subtype of [`AbstractDevice`](@ref).
Return an `Expr` or `QuoteNode` accessing the variable identified by [`symbol`].
"""
function gen_access_expr end

"""
cuda_kernel(
graph::DAG,
instance,
machine::Machine,
context_module::Module
)
Return a function of signature `compute_<id>(input::CuVector, output::CuVector, n::Int64)`, which will return the result of the DAG computation of the input on the given output variable.
!!! note
This function is only available when the CUDA Extension is loaded by `using CUDA` before `using GraphComputing`
"""
function cuda_kernel end
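
Because `function cuda_kernel end` only declares a stub, the function exists without the extension but has no methods until `CUDAExt` attaches one. A quick way to observe this from a session (a sketch, not from this commit):

```julia
using GraphComputing   # without `using CUDA`

# The stub is defined but empty; CUDAExt adds the actual method
isempty(methods(GraphComputing.cuda_kernel))  # true until CUDA.jl is loaded
```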
7 changes: 7 additions & 0 deletions testgpu/runtests.jl
@@ -0,0 +1,7 @@
using SafeTestsets
using CUDA

@safetestset "Utility Unit Tests " begin
include("unit_tests_utility.jl")
end
# TODO: Make a new simple test model and rewrite tests here
10 changes: 10 additions & 0 deletions testgpu/unit_tests_utility.jl
@@ -0,0 +1,10 @@
using GraphComputing
import GraphComputing.bytes_to_human_readable

@test bytes_to_human_readable(0) == "0.0 B"
@test bytes_to_human_readable(1020) == "1020.0 B"
@test bytes_to_human_readable(1025) == "1.001 KiB"
@test bytes_to_human_readable(684235) == "668.2 KiB"
@test bytes_to_human_readable(86214576) == "82.22 MiB"
@test bytes_to_human_readable(9241457698) == "8.607 GiB"
@test bytes_to_human_readable(3218598654367) == "2.927 TiB"
