diff --git a/docs/make.jl b/docs/make.jl
index 374c187..a91ad88 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -32,7 +32,7 @@ makedocs(;
     sitename="ComputableDAGs.jl",
     format=Documenter.HTML(;
         prettyurls=get(ENV, "CI", "false") == "true",
-        canonical="https://ComputableDAGs.gitlab.io/ComputableDAGs.jl",
+        canonical="https://ComputableDAGs.github.io/ComputableDAGs.jl",
         assets=String[],
     ),
     pages=pages,
diff --git a/docs/src/lib/internals/devices.md b/docs/src/lib/internals/devices.md
index 7030c25..64d4360 100644
--- a/docs/src/lib/internals/devices.md
+++ b/docs/src/lib/internals/devices.md
@@ -37,11 +37,10 @@ Pages = ["devices/numa/impl.jl"]
 Order = [:type, :function]
 ```

-### CUDA
-For CUDA functionality to be available, the `CUDA.jl` package must be installed separately, as it is only a weak dependency.
+### GPUs

-### ROCm
-For ROCm functionality to be available, the `AMDGPU.jl` package must be installed separately, as it is only a weak dependency.
-
-### oneAPI
-For oneAPI functionality to be available, the `oneAPI.jl` package must be installed separately, as it is only a weak dependency.
+```@autodocs
+Modules = [ComputableDAGs]
+Pages = ["devices/ext.jl"]
+Order = [:type]
+```
diff --git a/ext/AMDGPUExt.jl b/ext/AMDGPUExt.jl
index d798f39..6572658 100644
--- a/ext/AMDGPUExt.jl
+++ b/ext/AMDGPUExt.jl
@@ -1,8 +1,20 @@
 module AMDGPUExt

-using ComputableDAGs, AMDGPU
+using ComputableDAGs
+using UUIDs
+using AMDGPU
+
+function __init__()
+    @debug "Loading AMDGPUExt"
+
+    push!(ComputableDAGs.DEVICE_TYPES, ROCmGPU)
+    ComputableDAGs.CACHE_STRATEGIES[ROCmGPU] = [LocalVariables()]
+
+    return nothing
+end

 # include specialized AMDGPU functions here
 include("devices/rocm/impl.jl")
+include("devices/rocm/function.jl")

 end
diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl
index 772c714..0797b10 100644
--- a/ext/CUDAExt.jl
+++ b/ext/CUDAExt.jl
@@ -1,8 +1,17 @@
 module CUDAExt

 using ComputableDAGs
+using UUIDs
 using CUDA
-using RuntimeGeneratedFunctions
+
+function __init__()
+    @debug "Loading CUDAExt"
+
+    push!(ComputableDAGs.DEVICE_TYPES, CUDAGPU)
+    ComputableDAGs.CACHE_STRATEGIES[CUDAGPU] = [LocalVariables()]
+
+    return nothing
+end

 # include specialized CUDA functions here
 include("devices/cuda/impl.jl")
diff --git a/ext/devices/cuda/function.jl b/ext/devices/cuda/function.jl
index b407b77..ef7920a 100644
--- a/ext/devices/cuda/function.jl
+++ b/ext/devices/cuda/function.jl
@@ -1,14 +1,12 @@
-
-function ComputableDAGs.cuda_kernel(
-    graph::DAG, instance, machine::Machine, context_module::Module
-)
+function ComputableDAGs.kernel(::Type{CUDAGPU}, graph::DAG, instance)
+    machine = cpu_st()
     tape = ComputableDAGs.gen_tape(graph, instance, machine, context_module)

     init_caches = Expr(:block, tape.initCachesCode...)
     assign_inputs = Expr(:block, ComputableDAGs.expr_from_fc.(tape.inputAssignCode)...)
     code = Expr(:block, ComputableDAGs.expr_from_fc.(tape.computeCode)...)

-    function_id = ComputableDAGs.to_var_name(UUIDs.uuid1(rng[1]))
+    function_id = ComputableDAGs.to_var_name(UUIDs.uuid1(ComputableDAGs.rng[1]))
     res_sym = eval(
         ComputableDAGs.gen_access_expr(
             ComputableDAGs.entry_device(tape.machine), tape.outputSymbol
@@ -29,5 +27,5 @@ function ComputableDAGs.cuda_kernel(
         end"
     )

-    return RuntimeGeneratedFunction(@__MODULE__, context_module, expr)
+    return expr
 end
diff --git a/ext/devices/cuda/impl.jl b/ext/devices/cuda/impl.jl
index 44cb17f..77b1e3f 100644
--- a/ext/devices/cuda/impl.jl
+++ b/ext/devices/cuda/impl.jl
@@ -1,24 +1,7 @@
-"""
-    CUDAGPU <: AbstractGPU
-
-Representation of a specific CUDA GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface.
-"""
-mutable struct CUDAGPU <: ComputableDAGs.AbstractGPU
-    device::Any # TODO: what's the cuda device type?
-    cacheStrategy::CacheStrategy
-    FLOPS::Float64
-end
-
-push!(ComputableDAGs.DEVICE_TYPES, CUDAGPU)
-
-ComputableDAGs.CACHE_STRATEGIES[CUDAGPU] = [LocalVariables()]
-
 ComputableDAGs.default_strategy(::Type{CUDAGPU}) = LocalVariables()

 function ComputableDAGs.measure_device!(device::CUDAGPU; verbose::Bool)
-    if verbose
-        println("Measuring CUDA GPU $(device.device)")
-    end
+    verbose && @info "Measuring CUDA GPU $(device.device)"

     # TODO implement
     return nothing
@@ -29,20 +12,16 @@ end

 Return a Vector of [`CUDAGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information.
 """
-function get_devices(::Type{CUDAGPU}; verbose::Bool=false)
+function ComputableDAGs.get_devices(::Type{CUDAGPU}; verbose::Bool=false)
     devices = Vector{ComputableDAGs.AbstractDevice}()

     if !CUDA.functional()
-        if verbose
-            println("CUDA.jl is non-functional")
-        end
+        @warn "The CUDA extension is loaded but CUDA.jl is non-functional"
         return devices
     end

     CUDADevices = CUDA.devices()
-    if verbose
-        println("Found $(length(CUDADevices)) CUDA devices")
-    end
+    verbose && @info "Found $(length(CUDADevices)) CUDA devices"
     for device in CUDADevices
         push!(devices, CUDAGPU(device, default_strategy(CUDAGPU), -1))
     end
diff --git a/ext/devices/oneapi/impl.jl b/ext/devices/oneapi/impl.jl
index abf35cc..61c44dd 100644
--- a/ext/devices/oneapi/impl.jl
+++ b/ext/devices/oneapi/impl.jl
@@ -1,24 +1,7 @@
-"""
-    oneAPIGPU <: AbstractGPU
-
-Representation of a specific Intel GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface.
-"""
-mutable struct oneAPIGPU <: ComputableDAGs.AbstractGPU
-    device::Any
-    cacheStrategy::CacheStrategy
-    FLOPS::Float64
-end
-
-push!(ComputableDAGs.DEVICE_TYPES, oneAPIGPU)
-
-ComputableDAGs.CACHE_STRATEGIES[oneAPIGPU] = [LocalVariables()]
-
 ComputableDAGs.default_strategy(::Type{oneAPIGPU}) = LocalVariables()

 function ComputableDAGs.measure_device!(device::oneAPIGPU; verbose::Bool)
-    if verbose
-        println("Measuring oneAPI GPU $(device.device)")
-    end
+    verbose && @info "Measuring oneAPI GPU $(device.device)"

     # TODO implement
     return nothing
@@ -29,20 +12,16 @@ end

 Return a Vector of [`oneAPIGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information.
""" -function get_devices(::Type{oneAPIGPU}; verbose::Bool=false) - devices = Vector{AbstractDevice}() +function ComputableDAGs.get_devices(::Type{oneAPIGPU}; verbose::Bool=false) + devices = Vector{ComputableDAGs.AbstractDevice}() if !oneAPI.functional() - if verbose - println("oneAPI is non-functional") - end + @warn "the oneAPI extension is loaded but oneAPI.jl is non-functional" return devices end oneAPIDevices = oneAPI.devices() - if verbose - println("Found $(length(oneAPIDevices)) oneAPI devices") - end + verbose && @info "Found $(length(oneAPIDevices)) oneAPI devices" for device in oneAPIDevices push!(devices, oneAPIGPU(device, default_strategy(oneAPIGPU), -1)) end diff --git a/ext/devices/rocm/function.jl b/ext/devices/rocm/function.jl new file mode 100644 index 0000000..6990ba1 --- /dev/null +++ b/ext/devices/rocm/function.jl @@ -0,0 +1,31 @@ +function ComputableDAGs.kernel(::Type{ROCmGPU}, graph::DAG, instance) + machine = cpu_st() + tape = ComputableDAGs.gen_tape(graph, instance, machine, context_module) + + init_caches = Expr(:block, tape.initCachesCode...) + assign_inputs = Expr(:block, ComputableDAGs.expr_from_fc.(tape.inputAssignCode)...) + code = Expr(:block, ComputableDAGs.expr_from_fc.(tape.computeCode)...) + + function_id = ComputableDAGs.to_var_name(UUIDs.uuid1(ComputableDAGs.rng[1])) + res_sym = eval( + ComputableDAGs.gen_access_expr( + ComputableDAGs.entry_device(tape.machine), tape.outputSymbol + ), + ) + expr = Meta.parse( + "function compute_$(function_id)(input_vector, output_vector, n::Int64) + id = (workgroupIdx().x - 1) * workgroupDim().x + workgroupIdx().x + if (id > n) + return + end + @inline data_input = input_vector[id] + $(init_caches) + $(assign_inputs) + $code + @inline output_vector[id] = $res_sym + return nothing + end" + ) + + return expr +end diff --git a/ext/devices/rocm/impl.jl b/ext/devices/rocm/impl.jl index 021d28c..18ea34f 100644 --- a/ext/devices/rocm/impl.jl +++ b/ext/devices/rocm/impl.jl @@ -1,26 +1,7 @@ -using AMDGPU - -""" - ROCmGPU <: AbstractGPU - -Representation of a specific AMD GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface. -""" -mutable struct ROCmGPU <: ComputableDAGs.AbstractGPU - device::Any - cacheStrategy::CacheStrategy - FLOPS::Float64 -end - -push!(ComputableDAGs.DEVICE_TYPES, ROCmGPU) - -ComputableDAGs.CACHE_STRATEGIES[ROCmGPU] = [LocalVariables()] - ComputableDAGs.default_strategy(::Type{ROCmGPU}) = LocalVariables() function ComputableDAGs.measure_device!(device::ROCmGPU; verbose::Bool) - if verbose - println("Measuring ROCm GPU $(device.device)") - end + verbose && @info "Measuring ROCm GPU $(device.device)" # TODO implement return nothing @@ -31,20 +12,16 @@ end Return a Vector of [`ROCmGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information. 
""" -function get_devices(::Type{ROCmGPU}; verbose::Bool=false) - devices = Vector{AbstractDevice}() +function ComputableDAGs.get_devices(::Type{ROCmGPU}; verbose::Bool=false) + devices = Vector{ComputableDAGs.AbstractDevice}() if !AMDGPU.functional() - if verbose - println("AMDGPU is non-functional") - end + @warn "The AMDGPU extension is loaded but AMDGPU.jl is non-functional" return devices end AMDDevices = AMDGPU.devices() - if verbose - println("Found $(length(AMDDevices)) AMD devices") - end + verbose && @info "Found $(length(AMDDevices)) AMD devices" for device in AMDDevices push!(devices, ROCmGPU(device, default_strategy(ROCmGPU), -1)) end diff --git a/ext/oneAPIExt.jl b/ext/oneAPIExt.jl index a75ce75..c638942 100644 --- a/ext/oneAPIExt.jl +++ b/ext/oneAPIExt.jl @@ -1,6 +1,17 @@ module oneAPIExt -using ComputableDAGs, oneAPI +using ComputableDAGs +using UUIDs +using oneAPI + +function __init__() + @debug "Loading oneAPIExt" + + push!(ComputableDAGs.DEVICE_TYPES, oneAPIGPU) + ComputableDAGs.CACHE_STRATEGIES[oneAPIGPU] = [LocalVariables()] + + return nothing +end # include specialized oneAPI functions here include("devices/oneapi/impl.jl") diff --git a/src/ComputableDAGs.jl b/src/ComputableDAGs.jl index d1da18e..e9597ba 100644 --- a/src/ComputableDAGs.jl +++ b/src/ComputableDAGs.jl @@ -60,8 +60,8 @@ export get_machine_info, cpu_st export CacheStrategy, default_strategy export LocalVariables, Dictionary -# CUDAExt -export cuda_kernel +# GPU Extensions +export kernel, CUDAGPU, ROCmGPU, oneAPIGPU include("devices/interface.jl") include("task/type.jl") @@ -124,6 +124,7 @@ include("devices/detect.jl") include("devices/impl.jl") include("devices/numa/impl.jl") +include("devices/ext.jl") include("scheduler/interface.jl") include("scheduler/greedy.jl") diff --git a/src/devices/ext.jl b/src/devices/ext.jl new file mode 100644 index 0000000..b346465 --- /dev/null +++ b/src/devices/ext.jl @@ -0,0 +1,44 @@ +# file for struct definitions used by the extensions +# since extensions can't export names themselves + +""" + CUDAGPU <: AbstractGPU + +Representation of a specific CUDA GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface. + +!!! note + This requires CUDA to be loaded to be useful. +""" +mutable struct CUDAGPU <: AbstractGPU + device::Any # CuDevice + cacheStrategy::CacheStrategy + FLOPS::Float64 +end + +""" + oneAPIGPU <: AbstractGPU + +Representation of a specific Intel GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface. + +!!! note + This requires oneAPI to be loaded to be useful. +""" +mutable struct oneAPIGPU <: AbstractGPU + device::Any # oneAPI.oneL0.ZeDevice + cacheStrategy::CacheStrategy + FLOPS::Float64 +end + +""" + ROCmGPU <: AbstractGPU + +Representation of a specific AMD GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface. + +!!! note + This requires AMDGPU to be loaded to be useful. 
+""" +mutable struct ROCmGPU <: AbstractGPU + device::Any # HIPDevice + cacheStrategy::CacheStrategy + FLOPS::Float64 +end diff --git a/src/devices/impl.jl b/src/devices/impl.jl index e58d839..8e28b46 100644 --- a/src/devices/impl.jl +++ b/src/devices/impl.jl @@ -59,15 +59,6 @@ It is the simplest machine definition possible and produces a simple function wh """ function cpu_st() return Machine( - [ - ComputableDAGs.NumaNode( - 0, - 1, - ComputableDAGs.default_strategy(ComputableDAGs.NumaNode), - -1.0, - UUIDs.uuid1(), - ), - ], - [-1.0;;], + [NumaNode(0, 1, default_strategy(NumaNode), -1.0, UUIDs.uuid1())], [-1.0;;] ) end diff --git a/src/devices/interface.jl b/src/devices/interface.jl index 96c4fba..1c39111 100644 --- a/src/devices/interface.jl +++ b/src/devices/interface.jl @@ -108,16 +108,32 @@ Return an `Expr` or `QuoteNode` accessing the variable identified by [`symbol`]. function gen_access_expr end """ - cuda_kernel( - graph::DAG, - instance, - machine::Machine, - context_module::Module - ) + kernel(gpu_type::Type{<:AbstractGPU}, graph::DAG, instance) -Return a function of signature `compute_(input::CuVector, output::CuVector, n::Int64)`, which will return the result of the DAG computation of the input on the given output variable. +For a GPU type, a [`DAG`](@ref), and a problem instance, return an `Expr` containing a function of signature `compute_(input::Vector, output::Vector, n::Int64)`, which will return the result of the DAG computation of the input on the given output vector, intended for computation on GPUs. Currently, `CUDAGPU` and `ROCmGPU` are available if their respective package extensions are loaded. + +The generated kernel function accepts its thread ID in only the x-dimension, and only as thread ID, not as block ID. The input and output should therefore be 1-dimensional vectors. For detailed information on GPU programming and the Julia packages, please refer to their respective documentations. + +A simple example call for a CUDA kernel might look like the following: +```Julia +@cuda threads = (32,) always_inline = true cuda_kernel!(cu_inputs, outputs, length(cu_inputs)) +``` !!! note - This function is only available when the CUDA Extension is loaded by `using CUDA` before `using ComputableDAGs` + Unlike the standard [`get_compute_function`](@ref) to generate a callable function which returns a `RuntimeGeneratedFunction`, this returns an `Expr` that needs to be `eval`'d. This is a current limitation of `RuntimeGeneratedFunctions.jl` which currently cannot wrap GPU kernels. This might change in the future. + +### Size limitation + +The generated kernel does not use any internal parallelization, i.e., the DAG is compiled into a serialized function, processing each input in a single thread of the GPU. This means it can be heavily parallelized and use the GPU at 100% for sufficiently large input vectors (and assuming the function does not become IO limited etc.). However, it also means that there is a limit to how large the compiled function can be. If it gets too large, the compilation might fail, take too long to complete, the kernel might fail during execution if too much stack memory is required, or other similar problems. If this happens, your problem is likely too large to be compiled to a GPU kernel like this. + +### Compute Requirements + +A GPU function has more restrictions on what can be computed than general functions running on the CPU. In Julia, there are mainly two important restrictions to consider: + +1. 
+2. Function calls must not be dynamic. This means that type stability is required and the compiler must be able to determine in advance which method of a generic function to call. What exactly this entails may change over time and also differs between the target GPU libraries. From experience, passing `always_inline = true` to `@cuda` calls can help with this.
+
+!!! warning
+    This feature is currently experimental. There are still some unresolved issues with the generated kernels.
 """
-function cuda_kernel end
+function kernel end
diff --git a/src/devices/numa/impl.jl b/src/devices/numa/impl.jl
index b12ce97..a418824 100644
--- a/src/devices/numa/impl.jl
+++ b/src/devices/numa/impl.jl
@@ -20,9 +20,7 @@ CACHE_STRATEGIES[NumaNode] = [LocalVariables()]
 default_strategy(::Type{T}) where {T<:NumaNode} = LocalVariables()

 function measure_device!(device::NumaNode; verbose::Bool)
-    if verbose
-        println("Measuring Numa Node $(device.numaId)")
-    end
+    verbose && @info "Measuring Numa Node $(device.numaId)"

     # TODO implement
     return nothing
@@ -37,9 +35,8 @@ function get_devices(deviceType::Type{T}; verbose::Bool=false) where {T<:NumaNod
     devices = Vector{AbstractDevice}()
     noNumaNodes = highest_numa_node()

-    if (verbose)
-        println("Found $(noNumaNodes + 1) NUMA nodes")
-    end
+    verbose && @info "Found $(noNumaNodes + 1) NUMA nodes"
+
     for i in 0:noNumaNodes
         push!(devices, NumaNode(i, 1, default_strategy(NumaNode), -1, UUIDs.uuid1(rng[1])))
     end
diff --git a/src/graph/type.jl b/src/graph/type.jl
index 751b5dc..b5ceeb4 100644
--- a/src/graph/type.jl
+++ b/src/graph/type.jl
@@ -16,8 +16,6 @@ end

 The representation of the graph as a set of [`Node`](@ref)s.

-A DAG can be loaded using the appropriate parse_dag function, e.g. [`parse_dag`](@ref).
-
 [`Operation`](@ref)s can be applied on it using [`push_operation!`](@ref) and reverted using [`pop_operation!`](@ref) like a stack. To get the set of possible operations, use [`get_operations`](@ref).
 The members of the object should not be manually accessed, instead always use the provided interface functions.
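
For context, a rough end-to-end sketch of how the new `kernel` interface introduced in this diff might be used on the CUDA side. This is only an illustration under assumptions: `graph`, `instance`, and `inputs` are placeholders that must already exist, the output element type is a guess, and the launch configuration simply follows the 32-thread example from the docstring above rather than a tuned configuration.

```Julia
using CUDA
using ComputableDAGs

# placeholders: a DAG `graph`, a matching problem `instance`, and a Vector
# `inputs` of isbits elements (assumed here to have length 32) must exist already
kernel_expr = ComputableDAGs.kernel(CUDAGPU, graph, instance)

# the returned Expr has to be eval'd manually, see the note in the kernel docstring
compute_kernel! = eval(kernel_expr)

cu_inputs = CuArray(inputs)
# output element type is an assumption for this sketch
cu_outputs = CuArray{eltype(inputs)}(undef, length(inputs))

# one GPU thread per input element, x-dimension only
@cuda threads = (32,) always_inline = true compute_kernel!(
    cu_inputs, cu_outputs, length(cu_inputs)
)

outputs = Array(cu_outputs)  # copy the results back to the host
```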