diff --git a/Project.toml b/Project.toml
index 19b77ea..3feb19a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,24 +4,33 @@ authors = ["AntonReinhard "]
 version = "0.1.0"
 
 [deps]
-AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
-KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 NumaAllocators = "21436f30-1b4a-4f08-87af-e26101bb5379"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 RuntimeGeneratedFunctions = "7e49a35a-f44a-4d26-94aa-eba1b4ca6b47"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 
+[weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
+
+[extensions]
+AMDGPUExt = "AMDGPU"
+CUDAExt = "CUDA"
+oneAPIExt = "oneAPI"
+
 [compat]
+AMDGPU = "1"
+CUDA = "5"
 DataStructures = "0.18"
 NumaAllocators = "0.2"
+oneAPI = "1"
 RuntimeGeneratedFunctions = "0.5"
 StaticArrays = "1"
 
 [extras]
-CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/docs/src/lib/internals/devices.md b/docs/src/lib/internals/devices.md
index 089d769..a5eba3e 100644
--- a/docs/src/lib/internals/devices.md
+++ b/docs/src/lib/internals/devices.md
@@ -38,22 +38,10 @@ Order = [:type, :function]
 ```
 
 ### CUDA
-```@autodocs
-Modules = [GraphComputing]
-Pages = ["devices/cuda/impl.jl"]
-Order = [:type, :function]
-```
+For CUDA functionality to be available, the `CUDA.jl` package must be installed and loaded (e.g. `using CUDA`), as it is only a weak dependency.
 
 ### ROCm
-```@autodocs
-Modules = [GraphComputing]
-Pages = ["devices/rocm/impl.jl"]
-Order = [:type, :function]
-```
+For ROCm functionality to be available, the `AMDGPU.jl` package must be installed and loaded (e.g. `using AMDGPU`), as it is only a weak dependency.
 
 ### oneAPI
-```@autodocs
-Modules = [GraphComputing]
-Pages = ["devices/oneapi/impl.jl"]
-Order = [:type, :function]
-```
+For oneAPI functionality to be available, the `oneAPI.jl` package must be installed and loaded (e.g. `using oneAPI`), as it is only a weak dependency.
diff --git a/ext/AMDGPUExt.jl b/ext/AMDGPUExt.jl
new file mode 100644
index 0000000..25be189
--- /dev/null
+++ b/ext/AMDGPUExt.jl
@@ -0,0 +1,8 @@
+module AMDGPUExt
+
+using GraphComputing, AMDGPU
+
+# include specialized AMDGPU functions here
+include("devices/rocm/impl.jl")
+
+end
diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl
new file mode 100644
index 0000000..3620eaf
--- /dev/null
+++ b/ext/CUDAExt.jl
@@ -0,0 +1,12 @@
+module CUDAExt
+
+using GraphComputing
+using CUDA
+using RuntimeGeneratedFunctions
+RuntimeGeneratedFunctions.init(@__MODULE__) # required before this module can be used as an RGF cache module
+
+# include specialized CUDA functions here
+include("devices/cuda/impl.jl")
+include("devices/cuda/function.jl")
+
+end
diff --git a/ext/devices/cuda/function.jl b/ext/devices/cuda/function.jl
new file mode 100644
index 0000000..3c3bb4d
--- /dev/null
+++ b/ext/devices/cuda/function.jl
@@ -0,0 +1,33 @@
+
+function GraphComputing.cuda_kernel(
+    graph::DAG, instance, machine::Machine, context_module::Module
+)
+    tape = GraphComputing.gen_tape(graph, instance, machine, context_module)
+
+    init_caches = Expr(:block, tape.initCachesCode...)
+    assign_inputs = Expr(:block, GraphComputing.expr_from_fc.(tape.inputAssignCode)...)
+    code = Expr(:block, GraphComputing.expr_from_fc.(tape.computeCode)...)
+
+    function_id = GraphComputing.to_var_name(GraphComputing.UUIDs.uuid1(GraphComputing.rng[1]))
+    res_sym = eval(
+        GraphComputing.gen_access_expr(
+            GraphComputing.entry_device(tape.machine), tape.outputSymbol
+        ),
+    )
+    expr = Meta.parse(
+        "function compute_$(function_id)(input_vector, output_vector, n::Int64)
+            id = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+            if (id > n)
+                return
+            end
+            @inline data_input = input_vector[id]
+            $(init_caches)
+            $(assign_inputs)
+            $code
+            @inline output_vector[id] = $res_sym
+            return nothing
+        end"
+    )
+
+    return RuntimeGeneratedFunction(@__MODULE__, context_module, expr)
+end
diff --git a/src/devices/cuda/impl.jl b/ext/devices/cuda/impl.jl
similarity index 62%
rename from src/devices/cuda/impl.jl
rename to ext/devices/cuda/impl.jl
index 66383d0..5485df1 100644
--- a/src/devices/cuda/impl.jl
+++ b/ext/devices/cuda/impl.jl
@@ -1,23 +1,21 @@
-using CUDA
-
 """
     CUDAGPU <: AbstractGPU
 
 Representation of a specific CUDA GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface.
 """
-mutable struct CUDAGPU <: AbstractGPU
+mutable struct CUDAGPU <: GraphComputing.AbstractGPU
     device::Any # TODO: what's the cuda device type?
     cacheStrategy::CacheStrategy
     FLOPS::Float64
 end
 
-push!(DEVICE_TYPES, CUDAGPU)
+push!(GraphComputing.DEVICE_TYPES, CUDAGPU)
 
-CACHE_STRATEGIES[CUDAGPU] = [LocalVariables()]
+GraphComputing.CACHE_STRATEGIES[CUDAGPU] = [LocalVariables()]
 
-default_strategy(::Type{T}) where {T<:CUDAGPU} = LocalVariables()
+GraphComputing.default_strategy(::Type{CUDAGPU}) = LocalVariables()
 
-function measure_device!(device::CUDAGPU; verbose::Bool)
+function GraphComputing.measure_device!(device::CUDAGPU; verbose::Bool)
     if verbose
         println("Measuring CUDA GPU $(device.device)")
     end
@@ -27,16 +25,16 @@ function measure_device!(device::CUDAGPU; verbose::Bool)
 end
 
 """
-    get_devices(deviceType::Type{T}; verbose::Bool) where {T <: CUDAGPU}
+    get_devices(::Type{CUDAGPU}; verbose::Bool)
 
 Return a Vector of [`CUDAGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information.
 """
-function get_devices(deviceType::Type{T}; verbose::Bool=false) where {T<:CUDAGPU}
-    devices = Vector{AbstractDevice}()
+function GraphComputing.get_devices(::Type{CUDAGPU}; verbose::Bool=false)
+    devices = Vector{GraphComputing.AbstractDevice}()
 
     if !CUDA.functional()
         if verbose
-            println("CUDA is non-functional")
+            println("CUDA.jl is non-functional")
         end
         return devices
     end
diff --git a/src/devices/oneapi/impl.jl b/ext/devices/oneapi/impl.jl
similarity index 68%
rename from src/devices/oneapi/impl.jl
rename to ext/devices/oneapi/impl.jl
index da2f9e8..705aa80 100644
--- a/src/devices/oneapi/impl.jl
+++ b/ext/devices/oneapi/impl.jl
@@ -1,23 +1,21 @@
-using oneAPI
-
 """
     oneAPIGPU <: AbstractGPU
 
 Representation of a specific Intel GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface.
""" -mutable struct oneAPIGPU <: AbstractGPU +mutable struct oneAPIGPU <: GraphComputing.AbstractGPU device::Any cacheStrategy::CacheStrategy FLOPS::Float64 end -push!(DEVICE_TYPES, oneAPIGPU) +push!(GraphComputing.DEVICE_TYPES, oneAPIGPU) -CACHE_STRATEGIES[oneAPIGPU] = [LocalVariables()] +GraphComputing.CACHE_STRATEGIES[oneAPIGPU] = [LocalVariables()] -default_strategy(::Type{T}) where {T<:oneAPIGPU} = LocalVariables() +GraphComputing.default_strategy(::Type{oneAPIGPU}) = LocalVariables() -function measure_device!(device::oneAPIGPU; verbose::Bool) +function GraphComputing.measure_device!(device::oneAPIGPU; verbose::Bool) if verbose println("Measuring oneAPI GPU $(device.device)") end @@ -27,11 +25,11 @@ function measure_device!(device::oneAPIGPU; verbose::Bool) end """ - get_devices(deviceType::Type{T}; verbose::Bool = false) where {T <: oneAPIGPU} + get_devices(::Type{oneAPIGPU}; verbose::Bool = false) Return a Vector of [`oneAPIGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information. """ -function get_devices(deviceType::Type{T}; verbose::Bool=false) where {T<:oneAPIGPU} +function get_devices(::Type{oneAPIGPU}; verbose::Bool=false) devices = Vector{AbstractDevice}() if !oneAPI.functional() diff --git a/src/devices/rocm/impl.jl b/ext/devices/rocm/impl.jl similarity index 69% rename from src/devices/rocm/impl.jl rename to ext/devices/rocm/impl.jl index ba189d9..836d9c7 100644 --- a/src/devices/rocm/impl.jl +++ b/ext/devices/rocm/impl.jl @@ -5,19 +5,19 @@ using AMDGPU Representation of a specific AMD GPU that code can run on. Implements the [`AbstractDevice`](@ref) interface. """ -mutable struct ROCmGPU <: AbstractGPU +mutable struct ROCmGPU <: GraphComputing.AbstractGPU device::Any cacheStrategy::CacheStrategy FLOPS::Float64 end -push!(DEVICE_TYPES, ROCmGPU) +push!(GraphComputing.DEVICE_TYPES, ROCmGPU) -CACHE_STRATEGIES[ROCmGPU] = [LocalVariables()] +GraphComputing.CACHE_STRATEGIES[ROCmGPU] = [LocalVariables()] -default_strategy(::Type{T}) where {T<:ROCmGPU} = LocalVariables() +GraphComputing.default_strategy(::Type{ROCmGPU}) = LocalVariables() -function measure_device!(device::ROCmGPU; verbose::Bool) +function GraphComputing.measure_device!(device::ROCmGPU; verbose::Bool) if verbose println("Measuring ROCm GPU $(device.device)") end @@ -27,11 +27,11 @@ function measure_device!(device::ROCmGPU; verbose::Bool) end """ - get_devices(deviceType::Type{T}; verbose::Bool = false) where {T <: ROCmGPU} + get_devices(::Type{ROCmGPU}; verbose::Bool = false) Return a Vector of [`ROCmGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information. """ -function get_devices(deviceType::Type{T}; verbose::Bool=false) where {T<:ROCmGPU} +function get_devices(::Type{ROCmGPU}; verbose::Bool=false) devices = Vector{AbstractDevice}() if !AMDGPU.functional() diff --git a/ext/oneAPIExt.jl b/ext/oneAPIExt.jl new file mode 100644 index 0000000..1f6ebae --- /dev/null +++ b/ext/oneAPIExt.jl @@ -0,0 +1,8 @@ +module oneAPIExt + +using GraphComputing, oneAPI + +# include specialized oneAPI functions here +include("devices/oneapi/impl.jl") + +end diff --git a/src/GraphComputing.jl b/src/GraphComputing.jl index ad7d884..2d43df0 100644 --- a/src/GraphComputing.jl +++ b/src/GraphComputing.jl @@ -31,11 +31,11 @@ export Operation, AppliedOperation export NodeReduction, NodeSplit export push_operation!, pop_operation!, can_pop export reset_graph! 
 export get_operations
 
 # code generation related
 export execute
-export get_compute_function, get_cuda_kernel
+export get_compute_function
 export gen_tape, execute_tape
 export unpack_identity
 
@@ -57,6 +57,11 @@ export problem_instance, input_type, graph, input_expr
 export Machine
 export NumaNode
 export get_machine_info, cpu_st
+export CacheStrategy, default_strategy
+export LocalVariables, Dictionary
+
+# CUDAExt
+export cuda_kernel
 
 include("devices/interface.jl")
 include("task/type.jl")
@@ -119,9 +124,6 @@ include("devices/detect.jl")
 include("devices/impl.jl")
 
 include("devices/numa/impl.jl")
-include("devices/cuda/impl.jl")
-include("devices/rocm/impl.jl")
-#include("devices/oneapi/impl.jl")
 
 include("scheduler/interface.jl")
 include("scheduler/greedy.jl")
diff --git a/src/code_gen/function.jl b/src/code_gen/function.jl
index 8e45f05..3ff1273 100644
--- a/src/code_gen/function.jl
+++ b/src/code_gen/function.jl
@@ -40,42 +40,6 @@ function get_compute_function(
     return RuntimeGeneratedFunction(@__MODULE__, context_module, expr)
 end
 
-"""
-    get_cuda_kernel(
-        graph::DAG,
-        instance,
-        machine::Machine,
-    )
-
-Return a function of signature `compute_(input::CuVector, output::CuVector, n::Int64)`, which will return the result of the DAG computation of the input on the given output variable.
-"""
-function get_cuda_kernel(graph::DAG, instance, machine::Machine, context_module::Module)
-    tape = gen_tape(graph, instance, machine, context_module)
-
-    initCaches = Expr(:block, tape.initCachesCode...)
-    assignInputs = Expr(:block, expr_from_fc.(tape.inputAssignCode)...)
-    code = Expr(:block, expr_from_fc.(tape.computeCode)...)
-
-    functionId = to_var_name(UUIDs.uuid1(rng[1]))
-    resSym = eval(gen_access_expr(entry_device(tape.machine), tape.outputSymbol))
-    expr = Meta.parse(
-        "function compute_$(functionId)(input_vector, output_vector, n::Int64)
-            id = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-            if (id > n)
-                return
-            end
-            @inline data_input = input_vector[id]
-            $(initCaches)
-            $(assignInputs)
-            $code
-            @inline output_vector[id] = $resSym
-            return nothing
-        end"
-    )
-
-    return RuntimeGeneratedFunction(@__MODULE__, context_module, expr)
-end
-
 """
     execute(
         graph::DAG,
diff --git a/src/devices/interface.jl b/src/devices/interface.jl
index 6c5e06f..1c7c37a 100644
--- a/src/devices/interface.jl
+++ b/src/devices/interface.jl
@@ -106,3 +106,18 @@ Interface function that must be implemented for every subtype of [`AbstractDevic
 
 Return an `Expr` or `QuoteNode` accessing the variable identified by [`symbol`].
 """
 function gen_access_expr end
+
+"""
+    cuda_kernel(
+        graph::DAG,
+        instance,
+        machine::Machine,
+        context_module::Module
+    )
+
+Return a kernel function with signature `compute_<id>(input_vector::CuVector, output_vector::CuVector, n::Int64)` that computes the DAG for each of the `n` elements of `input_vector` and writes the results to the corresponding entries of `output_vector`.
+
+!!! note
+    This function is only available when the CUDA extension is loaded, i.e., when `CUDA.jl` is loaded together with `GraphComputing` (for example via `using CUDA`).
+"""
+function cuda_kernel end
diff --git a/testgpu/runtests.jl b/testgpu/runtests.jl
new file mode 100644
index 0000000..4b8ba6c
--- /dev/null
+++ b/testgpu/runtests.jl
@@ -0,0 +1,7 @@
+using SafeTestsets
+using CUDA
+
+@safetestset "Utility Unit Tests " begin
+    include("unit_tests_utility.jl")
+end
+# TODO: Make a new simple test model and rewrite tests here
diff --git a/testgpu/unit_tests_utility.jl b/testgpu/unit_tests_utility.jl
new file mode 100644
index 0000000..2599804
--- /dev/null
+++ b/testgpu/unit_tests_utility.jl
@@ -0,0 +1,10 @@
+using GraphComputing
+import GraphComputing.bytes_to_human_readable
+
+@test bytes_to_human_readable(0) == "0.0 B"
+@test bytes_to_human_readable(1020) == "1020.0 B"
+@test bytes_to_human_readable(1025) == "1.001 KiB"
+@test bytes_to_human_readable(684235) == "668.2 KiB"
+@test bytes_to_human_readable(86214576) == "82.22 MiB"
+@test bytes_to_human_readable(9241457698) == "8.607 GiB"
+@test bytes_to_human_readable(3218598654367) == "2.927 TiB"
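A minimal usage sketch of the extension mechanism introduced in this diff, assuming the exported `cuda_kernel` and `get_machine_info` entry points from the changes above. The names `graph`, `instance`, `inputs`, and `OutputType` are hypothetical placeholders for the usual model-specific setup and are not part of this diff; the `@cuda` launch follows the same pattern the removed `get_cuda_kernel` was written for.

```julia
using CUDA              # loading the weak dependency activates CUDAExt
using GraphComputing

# Hypothetical placeholders: `graph`, `instance`, `inputs`, and `OutputType`
# come from the model-specific setup and are not defined in this diff.
machine = get_machine_info()
kernel! = cuda_kernel(graph, instance, machine, @__MODULE__)

n = length(inputs)
input_vector = CuVector(inputs)
output_vector = CUDA.zeros(OutputType, n)

# One GPU thread per input element; the generated kernel guards against id > n itself.
@cuda threads = 256 blocks = cld(n, 256) kernel!(input_vector, output_vector, n)
results = Vector(output_vector)
```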