diff --git a/examples/asymmetric_quadratic.jl b/examples/asymmetric_quadratic.jl
deleted file mode 100644
index d5abc6c0..00000000
--- a/examples/asymmetric_quadratic.jl
+++ /dev/null
@@ -1,55 +0,0 @@
-using Nabla
-
-# Generate the values required to compute a matrix quadratic form.
-N = 5
-B = randn(5, 5)
-A = B.'B + UniformScaling(1e-6)
-
-# Low-level API computation of derivatives.
-
-# Construct some trackable data.
-x_, y_ = randn(N), randn(N)
-x, y = Leaf.(Tape(), (x_, y_))
-
-# Compute the forward pass.
-z = x.' * (A * y) # Temporary bracketting because we don't support RowVectors yet.
-
-println("Output of the forward pass is:")
-println(z)
-println()
-println("y is $(Nabla.unbox(z)).")
-println()
-
-# Get the reverse tape.
-z̄ = ∇(z)
-println("Output of reverse-pass is")
-println(z̄)
-println()
-
-# Index into the reverse tape using x to get the gradient of `y` w.r.t. `x`.
-x̄ = z̄[x]
-println("Gradient of z w.r.t. x at $x_ is $x̄.")
-println()
-
-ȳ = z̄[y]
-println("Gradient of z w.r.t. y at $y_ is $ȳ")
-
-
-# (Current) High-Level API computation of derivatives. I will probably maintain this
-# interface and extend is significantly as it is currently rather limited. It just returns
-# a function which returns the gradient, which isn't really what you want.
-
-# Define the function to be differentiated. Parameters w.r.t. which we want gradients must
-# be arguments. Parameters that we don't want gradients w.r.t. should be passed in via a
-# closure.
-@unionise f(x::AbstractVector, y::AbstractVector) = x.'A * y
-
-# Compute a function `∇f` which computes the derivative of `f` w.r.t. the inputs.
-∇f = ∇(f)
-
-# Compute the derivative of `f` w.r.t. `x` at `x_`. Result is currently a 1-Tuple. Might
-# introduce a special case for unary functions where it just returns the result.
-(x̄, ȳ) = ∇f(x_, y_)
-
-@assert x̄ == z̄[x]
-@assert ȳ == z̄[y]
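For reference, the quantities this deleted example computes have a simple closed form: for z = xᵀ A y, the gradients are ∂z/∂x = A y and ∂z/∂y = Aᵀ x. Below is a minimal sketch in plain Julia (standard library only, Julia 1.x syntax, no Nabla required) that reproduces those values; the `_analytic` names are illustrative, not part of the example.

# Analytic check of the gradients of z = x'A*y (sketch; assumes Julia >= 1.0).
using LinearAlgebra

N = 5
B = randn(N, N)
A = B'B + 1e-6I              # same construction as above, written in 1.x syntax
x_, y_ = randn(N), randn(N)

z = x_' * (A * y_)           # forward value
x̄_analytic = A * y_          # ∂z/∂x, should match z̄[x] from the reverse pass
ȳ_analytic = A' * x_         # ∂z/∂y, should match z̄[y] from the reverse pass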
diff --git a/examples/mlp.jl b/examples/mlp.jl
deleted file mode 100644
index 06cad992..00000000
--- a/examples/mlp.jl
+++ /dev/null
@@ -1,138 +0,0 @@
-using MNIST, Nabla
-
-""" Implementation of the Adam optimiser. """
-mutable struct Adam{T<:AbstractArray}
-    α::Float64
-    β1::Float64
-    β2::Float64
-    m::T
-    v::T
-    β1_acc::Float64
-    β2_acc::Float64
-    ϵ::Float64
-end
-function Adam(θ0::AbstractArray, α::Float64, β1::Float64, β2::Float64, ϵ::Float64)
-    return Adam(α, β1, β2, zeros(θ0), zeros(θ0), β1, β2, ϵ)
-end
-
-function iterate!(θ::AbstractArray{Float64}, ∇θ::AbstractArray{Float64}, opt::Adam)
-    m, v, α, β1, β2, ϵ, β1_acc, β2_acc =
-        opt.m, opt.v, opt.α, opt.β1, opt.β2, opt.ϵ, opt.β1_acc, opt.β2_acc
-    @inbounds for n in eachindex(θ)
-        m[n] = β1 * m[n] + (1.0 - β1) * ∇θ[n]
-        v[n] = β2 * v[n] + (1.0 - β2) * ∇θ[n]^2
-        m̂ = m[n] / (1 - β1_acc)
-        v̂ = v[n] / (1 - β2_acc)
-        θ[n] = θ[n] + α * m̂ / (sqrt(v̂) + ϵ)
-    end
-    opt.β1_acc *= β1
-    opt.β2_acc *= β2
-end
-
-function to1hot(y_::Vector)
-    y = Matrix{Int}(undef, 10, length(y_))
-    for n in eachindex(y_)
-        for j in 1:10
-            y[j, n] = y_[n] == j - 1 ? 1 : 0
-        end
-    end
-    return y
-end
-accuracy(Y, Ypr) = mean(all(Y .== Ypr, dims=1))
-@inline logistic(x) = 1 ./ (1 .+ exp.(-x))
-
-"""
-    mlp_log_joint(
-        X::AbstractMatrix{T} where T<:Real,
-        Y::AbstractMatrix{T} where T<:Real,
-        W::NTuple{N, T} where T<:AbstractMatrix{V} where V<:Real,
-        b::NTuple{N, T} where T<:AbstractVector{V} where V<:Real,
-        λ::AbstractFloat)
-
-Compute the log joint probability of the data `(X, Y)` given weights `W`, biases `b` and
-precision for the isotropic Gaussian prior on the weights and biases `λ` (equivalent to
-quadratic weight decay) and a Bernoulli likelihood (equivalent to the cross-entropy).
-"""
-function mlp_log_joint(X, Y, W, b, λ)
-
-    # Compute the log prior probability of the weights. We have assumed an isotropic
-    # Gaussian prior over all of the parameters. Note that since we are not trying to learn
-    # λ (it is fixed) we can neglect the normalising constant Z = 0.5 * log(2πλ).
-    log_prior = 0.0
-    for n in 1:length(W)
-        log_prior += sum(abs2, W[n]) + sum(abs2, b[n])
-    end
-    log_prior *= -0.5 * λ
-
-    # Compute the output of the MLP.
-    f = logistic.(apply_transforms(X, W, b, tanh))
-    f = f ./ sum(f, dims=1)
-
-    # Compute the log likelihood of the observations given the outputs of the MLP. We assume
-    ϵ = 1e-15
-    log_lik = sum(Y .* log.(f .+ ϵ) .+ (1 - Y) .* log.((1 + ϵ) .- f))
-
-    # Return the joint and the predictive distribution over the labels.
-    return log_lik + log_prior, f
-end
-
-apply_transforms(X, W::Tuple{Vararg{Any, 1}}, b::Tuple{Vararg{Any, 1}}, f) =
-    f.(W[1] * X .+ b[1])
-apply_transforms(X, W::Tuple{Vararg{Any, N}}, b::Tuple{Vararg{Any, N}}, f) where N =
-    f.(W[N] * apply_transforms(X, W[1:N-1], b[1:N-1], f) .+ b[N])
-
-# A simple Multilayer Feedforward Neural Network (Multi-layer Perceptron (MLP)) example for
-# classifying the MNIST data set.
-function demo_mlp(itrs::Int, sz::Int)
-
-    # Load the data.
-    println("Loading data.")
-    xtr, ytr_ = traindata()
-    xte, yte_ = testdata()
-
-    # Convert to 1-hot label encoding.
-    ytr, yte = to1hot(ytr_), to1hot(yte_)
-
-    # Initialise parameters.
-    println("Initialising parameters.")
-    d0, d1, d2, d3, λ = size(xtr, 1), 500, 300, size(ytr, 1), 1e-3
-    W_ = (0.1 * randn(d1, d0), 0.1 * randn(d2, d1), 0.1 * randn(d3, d2))
-    b_ = (0.1 * randn(d1), 0.1 * randn(d2), 0.1 * randn(d3))
-
-    # Initialise the Adam optimiser.
-    α, β1, β2, ϵ = 1e-3, 0.9, 0.999, 1e-8
-    optW, optb = Adam.(W_, α, β1, β2, ϵ), Adam.(b_, α, β1, β2, ϵ)
-
-    # Iterate to learn the parameters.
-    println("Starting learning.")
-    scal = size(xtr, 2) / sz
-    for itr in 1:itrs
-
-        # Pick the mini batch.
-        idx = rand(eachindex(ytr_), sz)
-        xtr_batch = view(xtr, :, idx)
-        ytr_batch = view(ytr, :, idx)
-
-        # Initialise computational graph.
-        tape = Tape()
-        W, b = Leaf.(tape, W_), Leaf.(tape, b_)
-
-        # Compute the log marginal probability.
-        logp, f = mlp_log_joint(xtr_batch, ytr_batch, W, b, λ)
-        ∇logp = ∇(logp)
-
-        # Compute most probably classification for each observation in the batch.
-        ypr = zeros(d3, sz)
-        for n in 1:sz
-            ypr[argmax(Nabla.unbox(f)[:, n]), n] = 1.
-        end
-        acc = accuracy(ytr_batch, ypr)
-
-        # Update the parameters using AdaGrad by indexing into the ∇logp tape.
-        iterate!.(W_, getindex.(∇logp, W), optW)
-        iterate!.(b_, getindex.(∇logp, b), optb)
-        println("logp is $(Nabla.unbox(logp)) at iteration $itr. Mean loglik is ",
-                "$(Nabla.unbox(logp) / size(xtr, 2)). Accuracy is $acc")
-    end
-end
-
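Note that `iterate!` above performs gradient ascent (the `+` in the parameter update) because the example maximises a log joint rather than minimising a loss. A self-contained scalar sketch of the same bias-corrected Adam step follows; `adam_step` is a hypothetical helper name used only for illustration, not part of the example or of Nabla.

# Scalar version of the ascent-direction Adam update used in iterate! (sketch only).
function adam_step(θ, g, m, v, β1_acc, β2_acc; α=1e-3, β1=0.9, β2=0.999, ϵ=1e-8)
    m = β1 * m + (1 - β1) * g          # first-moment (mean) estimate
    v = β2 * v + (1 - β2) * g^2        # second-moment estimate
    m̂ = m / (1 - β1_acc)               # bias correction, β1_acc = β1^t
    v̂ = v / (1 - β2_acc)               # bias correction, β2_acc = β2^t
    θ += α * m̂ / (sqrt(v̂) + ϵ)         # `+`: ascend the log joint
    return θ, m, v, β1_acc * β1, β2_acc * β2
end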
Accuracy is $acc") - end -end - diff --git a/examples/symmetric_quadratic.jl b/examples/symmetric_quadratic.jl deleted file mode 100644 index cc7b88d8..00000000 --- a/examples/symmetric_quadratic.jl +++ /dev/null @@ -1,49 +0,0 @@ -using Nabla - -# Generate the values required to compute a matrix quadratic form. -N = 5 -B = randn(5, 5) -A = B.'B + UniformScaling(1e-6) - -# Low-level API computation of derivatives. - -# Construct some trackable data. -x_ = randn(N) -x = Leaf(Tape(), x_) - -# Compute the forward pass. -y = x.' * (A * x) # Temporary bracketting because we don't support RowVectors yet. -println("Output of the forward pass is:") -println(y) -println() -println("y is $(Nabla.unbox(y)).") -println() - -# Get the reverse tape. -ȳ = ∇(y) -println("Output of reverse-pass is") -println(ȳ) -println() - -# Index into the reverse tape using x to get the gradient of `y` w.r.t. `x`. -x̄ = ȳ[x] -println("Gradient of y w.r.t. x at $(Nabla.unbox(x)) is $x̄.") -println() - -# (Current) High-Level API computation of derivatives. I will probably maintain this -# interface and extend is significantly as it is currently rather limited. It just returns -# a function which returns the gradient, which isn't really what you want. - -# Define the function to be differentiated. Parameters w.r.t. which we want gradients must -# be arguments. Parameters that we don't want gradients w.r.t. should be passed in via a -# closure. -@unionise f(x::AbstractVector) = x.' * (A * x) - -# Compute a function `∇f` which computes the derivative of `f` w.r.t. the input. -∇f = ∇(f) - -# Compute the derivative of `f` w.r.t. `x` at `x_`. Result is currently a 1-Tuple. Might -# introduce a special case for unary functions where it just returns the result. -x̄ = ∇f(x_) - -@assert x̄[1] == ȳ[x] diff --git a/examples/vae.jl b/examples/vae.jl deleted file mode 100644 index 508e401e..00000000 --- a/examples/vae.jl +++ /dev/null @@ -1,109 +0,0 @@ -using MNIST, Nabla - -""" Implementation of the Adam optimiser. """ -mutable struct Adam{T<:AbstractArray} - α::Float64 - β1::Float64 - β2::Float64 - m::T - v::T - β1_acc::Float64 - β2_acc::Float64 - ϵ::Float64 -end -function Adam(θ0::AbstractArray, α::Float64, β1::Float64, β2::Float64, ϵ::Float64) - return Adam(α, β1, β2, zeros(θ0), zeros(θ0), β1, β2, ϵ) -end - -function iterate!(θ::AbstractArray{Float64}, ∇θ::AbstractArray{Float64}, opt::Adam) - m, v, α, β1, β2, ϵ, β1_acc, β2_acc = - opt.m, opt.v, opt.α, opt.β1, opt.β2, opt.ϵ, opt.β1_acc, opt.β2_acc - @inbounds for n in eachindex(θ) - m[n] = β1 * m[n] + (1.0 - β1) * ∇θ[n] - v[n] = β2 * v[n] + (1.0 - β2) * ∇θ[n]^2 - m̂ = m[n] / (1 - β1_acc) - v̂ = v[n] / (1 - β2_acc) - θ[n] = θ[n] + α * m̂ / (sqrt(v̂) + ϵ) - end - opt.β1_acc *= β1 - opt.β2_acc *= β2 -end - -@inline logistic(x) = 1 / (1 + exp(-x)) - -function encode(Wh, Wμ, Wσ, X) - h_enc = tanh.(Wh * X) - return Wμ * h_enc, exp.(Wσ * h_enc) -end -decode(Vh, Vf, Z) = logistic.(Vf * tanh.(Vh * Z)) -klvae(μ, σ², D) = 0.5 * (sum(σ²) + sum(abs2, μ) - convert(Float64, D) - sum(log.(σ²))) - -# Generate and learn in a vanilla vae. -function demo_vae(itrs::Int, sz::Int, L::Int) - - # Load the data and normalise. - xtr, ytr_ = traindata() - xtr ./= 256 - - # Initialise parameters. - dx, dz, dh_enc, dh_dec = size(xtr, 1), 10, 500, 500 - Wh, Wμ, Wσ = 0.1 * randn(dh_enc, dx), 0.1 * randn(dz, dh_enc), 0.1 * randn(dz, dh_enc) - Vh, Vf = 0.1 * randn(dh_dec, dz), 0.1 * randn(dx, dh_dec) - λ = 1e-3 - - # Initialise the AdaGrad optimiser. 
diff --git a/examples/vae.jl b/examples/vae.jl
deleted file mode 100644
index 508e401e..00000000
--- a/examples/vae.jl
+++ /dev/null
@@ -1,109 +0,0 @@
-using MNIST, Nabla
-
-""" Implementation of the Adam optimiser. """
-mutable struct Adam{T<:AbstractArray}
-    α::Float64
-    β1::Float64
-    β2::Float64
-    m::T
-    v::T
-    β1_acc::Float64
-    β2_acc::Float64
-    ϵ::Float64
-end
-function Adam(θ0::AbstractArray, α::Float64, β1::Float64, β2::Float64, ϵ::Float64)
-    return Adam(α, β1, β2, zeros(θ0), zeros(θ0), β1, β2, ϵ)
-end
-
-function iterate!(θ::AbstractArray{Float64}, ∇θ::AbstractArray{Float64}, opt::Adam)
-    m, v, α, β1, β2, ϵ, β1_acc, β2_acc =
-        opt.m, opt.v, opt.α, opt.β1, opt.β2, opt.ϵ, opt.β1_acc, opt.β2_acc
-    @inbounds for n in eachindex(θ)
-        m[n] = β1 * m[n] + (1.0 - β1) * ∇θ[n]
-        v[n] = β2 * v[n] + (1.0 - β2) * ∇θ[n]^2
-        m̂ = m[n] / (1 - β1_acc)
-        v̂ = v[n] / (1 - β2_acc)
-        θ[n] = θ[n] + α * m̂ / (sqrt(v̂) + ϵ)
-    end
-    opt.β1_acc *= β1
-    opt.β2_acc *= β2
-end
-
-@inline logistic(x) = 1 / (1 + exp(-x))
-
-function encode(Wh, Wμ, Wσ, X)
-    h_enc = tanh.(Wh * X)
-    return Wμ * h_enc, exp.(Wσ * h_enc)
-end
-decode(Vh, Vf, Z) = logistic.(Vf * tanh.(Vh * Z))
-klvae(μ, σ², D) = 0.5 * (sum(σ²) + sum(abs2, μ) - convert(Float64, D) - sum(log.(σ²)))
-
-# Generate and learn in a vanilla vae.
-function demo_vae(itrs::Int, sz::Int, L::Int)
-
-    # Load the data and normalise.
-    xtr, ytr_ = traindata()
-    xtr ./= 256
-
-    # Initialise parameters.
-    dx, dz, dh_enc, dh_dec = size(xtr, 1), 10, 500, 500
-    Wh, Wμ, Wσ = 0.1 * randn(dh_enc, dx), 0.1 * randn(dz, dh_enc), 0.1 * randn(dz, dh_enc)
-    Vh, Vf = 0.1 * randn(dh_dec, dz), 0.1 * randn(dx, dh_dec)
-    λ = 1e-3
-
-    # Initialise the AdaGrad optimiser.
-    α, β1, β2, ϵ = 1e-3, 0.9, 0.999, 1e-8
-    optWh, optWμ, optWσ, optVh, optVf = Adam.((Wh, Wμ, Wσ, Vh, Vf), α, β1, β2, ϵ)
-
-    # Iterate to learn the parameters.
-    scal, ϵ, elbos = size(xtr, 2) / sz, 1e-12, Vector{Float64}(undef, itrs)
-    for itr in 1:itrs
-
-        # Pick the mini batch.
-        idx = rand(eachindex(ytr_), sz)
-        xtr_batch = view(xtr, :, idx)
-
-        # Box paramters that we wish to differentiate w.r.t.
-        Wh_, Wμ_, Wσ_, Vh_, Vf_ = Leaf.(Tape(), (Wh, Wμ, Wσ, Vh, Vf))
-
-        # Compute log prior of paramter values.
-        logprior = -0.5 * λ * (sum(abs2, Wh_) + sum(abs2, Wμ_) + sum(abs2, Wσ_) +
-                               sum(abs2, Vh_) + sum(abs2, Vf_))
-
-        # Encode, sample, decode, estimate expection of log likelihood under q(z).
-        loglik = 0.0
-        μz, σz = encode(Wh_, Wμ_, Wσ_, xtr_batch)
-        for l in 1:L
-            z = μz .+ σz .* randn(dz, sz)
-            f = decode(Vh_, Vf_, z)
-            loglik += sum(xtr_batch .* log.(f .+ ϵ) .+
-                          (1 .- xtr_batch) .* log.((1 + ϵ) .- f))
-        end
-        loglik /= convert(Float64, L)
-
-        # Compute the kl divergence between local posteriors and prior.
-        klqp = klvae(μz, σz .* σz, dz)
-
-        # Compute gradient of log-joint w.r.t. each of the parameters.
-        elbo = loglik - klqp
-        elbos[itr] = Nabla.unbox(elbo) / sz
-        logp = scal * elbo + logprior
-        ∇logp = ∇(logp)
-
-        # Update the parameters using AdaGrad by indexing into the ∇logp tape.
-        iterate!(Wh, ∇logp[Wh_], optWh)
-        iterate!(Wμ, ∇logp[Wμ_], optWμ)
-        iterate!(Wσ, ∇logp[Wσ_], optWσ)
-        iterate!(Vh, ∇logp[Vh_], optVh)
-        iterate!(Vf, ∇logp[Vf_], optVf)
-        println("logp is $(Nabla.unbox(logp)) at iterate $itr. Mean elbo is $(elbos[itr])")
-    end
-
-    function reconstruct(x)
-        μz, σz = encode(Wh, Wμ, Wσ, x)
-        return logistic.(Vf * tanh.(Vh * (μz .+ σz .* randn(dz, size(x, 2)))))
-    end
-    sample_vae() = logistic.(Vf * tanh.(Vh * randn(dz)))
-
-    return sample_vae, reconstruct, elbos
-end
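The `klvae` term above is the closed-form KL divergence between the encoder's Gaussian q = N(μ, diag(σ²)) and the standard-normal prior, KL(q ‖ p) = 0.5 Σᵢ (σᵢ² + μᵢ² − 1 − log σᵢ²). A self-contained check of that identity follows (the definition is repeated so the snippet runs on its own; the vector sizes are illustrative).

# Sanity check of the closed-form Gaussian KL used in the ELBO above (sketch only).
klvae(μ, σ², D) = 0.5 * (sum(σ²) + sum(abs2, μ) - convert(Float64, D) - sum(log.(σ²)))

μ, σ² = randn(10), exp.(randn(10))                        # one 10-dimensional latent code
kl_elementwise = 0.5 * sum(σ² .+ μ .^ 2 .- 1 .- log.(σ²))
@assert isapprox(klvae(μ, σ², 10), kl_elementwise)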