diff --git a/examples/asymmetric_quadratic.jl b/examples/asymmetric_quadratic.jl
deleted file mode 100644
index d5abc6c0..00000000
--- a/examples/asymmetric_quadratic.jl
+++ /dev/null
@@ -1,55 +0,0 @@
-using Nabla
-
-# Generate the values required to compute a matrix quadratic form.
-N = 5
-B = randn(5, 5)
-A = B.'B + UniformScaling(1e-6)
-
-# Low-level API computation of derivatives.
-
-# Construct some trackable data.
-x_, y_ = randn(N), randn(N)
-x, y = Leaf.(Tape(), (x_, y_))
-
-# Compute the forward pass.
-z = x.' * (A * y) # Temporary bracketting because we don't support RowVectors yet.
-
-println("Output of the forward pass is:")
-println(z)
-println()
-println("y is $(Nabla.unbox(z)).")
-println()
-
-# Get the reverse tape.
-z̄ = ∇(z)
-println("Output of reverse-pass is")
-println(z̄)
-println()
-
-# Index into the reverse tape using x to get the gradient of `y` w.r.t. `x`.
-x̄ = z̄[x]
-println("Gradient of z w.r.t. x at $x_ is $x̄.")
-println()
-
-ȳ = z̄[y]
-println("Gradient of z w.r.t. y at $y_ is $ȳ")
-
-
-# (Current) High-Level API computation of derivatives. I will probably maintain this
-# interface and extend is significantly as it is currently rather limited. It just returns
-# a function which returns the gradient, which isn't really what you want.
-
-# Define the function to be differentiated. Parameters w.r.t. which we want gradients must
-# be arguments. Parameters that we don't want gradients w.r.t. should be passed in via a
-# closure.
-@unionise f(x::AbstractVector, y::AbstractVector) = x.'A * y
-
-# Compute a function `∇f` which computes the derivative of `f` w.r.t. the inputs.
-∇f = ∇(f)
-
-# Compute the derivative of `f` w.r.t. `x` at `x_`. Result is currently a 1-Tuple. Might
-# introduce a special case for unary functions where it just returns the result.
-(x̄, ȳ) = ∇f(x_, y_)
-
-@assert x̄ == z̄[x]
-@assert ȳ == z̄[y]
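For reference, the quantities this deleted example computes have a simple closed form: for z = xᵀ A y, the gradients are ∂z/∂x = A y and ∂z/∂y = Aᵀ x. Below is a minimal sketch in plain Julia (standard library only, Julia 1.x syntax, no Nabla required) that reproduces those values; the `_analytic` names are illustrative, not part of the example.

# Analytic check of the gradients of z = x'A*y (sketch; assumes Julia >= 1.0).
using LinearAlgebra

N = 5
B = randn(N, N)
A = B'B + 1e-6I              # same construction as above, written in 1.x syntax
x_, y_ = randn(N), randn(N)

z = x_' * (A * y_)           # forward value
x̄_analytic = A * y_          # ∂z/∂x, should match z̄[x] from the reverse pass
ȳ_analytic = A' * x_         # ∂z/∂y, should match z̄[y] from the reverse pass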
diff --git a/examples/mlp.jl b/examples/mlp.jl
deleted file mode 100644
index 06cad992..00000000
--- a/examples/mlp.jl
+++ /dev/null
@@ -1,138 +0,0 @@
-using MNIST, Nabla
-
-""" Implementation of the Adam optimiser. """
-mutable struct Adam{T<:AbstractArray}
-    α::Float64
-    β1::Float64
-    β2::Float64
-    m::T
-    v::T
-    β1_acc::Float64
-    β2_acc::Float64
-    ϵ::Float64
-end
-function Adam(θ0::AbstractArray, α::Float64, β1::Float64, β2::Float64, ϵ::Float64)
-    return Adam(α, β1, β2, zeros(θ0), zeros(θ0), β1, β2, ϵ)
-end
-
-function iterate!(θ::AbstractArray{Float64}, ∇θ::AbstractArray{Float64}, opt::Adam)
-    m, v, α, β1, β2, ϵ, β1_acc, β2_acc =
-        opt.m, opt.v, opt.α, opt.β1, opt.β2, opt.ϵ, opt.β1_acc, opt.β2_acc
-    @inbounds for n in eachindex(θ)
-        m[n] = β1 * m[n] + (1.0 - β1) * ∇θ[n]
-        v[n] = β2 * v[n] + (1.0 - β2) * ∇θ[n]^2
-        m̂ = m[n] / (1 - β1_acc)
-        v̂ = v[n] / (1 - β2_acc)
-        θ[n] = θ[n] + α * m̂ / (sqrt(v̂) + ϵ)
-    end
-    opt.β1_acc *= β1
-    opt.β2_acc *= β2
-end
-
-function to1hot(y_::Vector)
-    y = Matrix{Int}(undef, 10, length(y_))
-    for n in eachindex(y_)
-        for j in 1:10
-            y[j, n] = y_[n] == j - 1 ? 1 : 0
-        end
-    end
-    return y
-end
-accuracy(Y, Ypr) = mean(all(Y .== Ypr, dims=1))
-@inline logistic(x) = 1 ./ (1 .+ exp.(-x))
-
-"""
-    mlp_log_joint(
-        X::AbstractMatrix{T} where T<:Real,
-        Y::AbstractMatrix{T} where T<:Real,
-        W::NTuple{N, T} where T<:AbstractMatrix{V} where V<:Real,
-        b::NTuple{N, T} where T<:AbstractVector{V} where V<:Real,
-        λ::AbstractFloat)
-
-Compute the log joint probability of the data `(X, Y)` given weights `W`, biases `b` and
-precision for the isotropic Gaussian prior on the weights and biases `λ` (equivalent to
-quadratic weight decay) and a Bernoulli likelihood (equivalent to the cross-entropy).
-"""
-function mlp_log_joint(X, Y, W, b, λ)
-
-    # Compute the log prior probability of the weights. We have assumed an isotropic
-    # Gaussian prior over all of the parameters. Note that since we are not trying to learn
-    # λ (it is fixed) we can neglect the normalising constant Z = 0.5 * log(2πλ).
-    log_prior = 0.0
-    for n in 1:length(W)
-        log_prior += sum(abs2, W[n]) + sum(abs2, b[n])
-    end
-    log_prior *= -0.5 * λ
-
-    # Compute the output of the MLP.
-    f = logistic.(apply_transforms(X, W, b, tanh))
-    f = f ./ sum(f, dims=1)
-
-    # Compute the log likelihood of the observations given the outputs of the MLP. We assume
-    ϵ = 1e-15
-    log_lik = sum(Y .* log.(f .+ ϵ) .+ (1 - Y) .* log.((1 + ϵ) .- f))
-
-    # Return the joint and the predictive distribution over the labels.
-    return log_lik + log_prior, f
-end
-
-apply_transforms(X, W::Tuple{Vararg{Any, 1}}, b::Tuple{Vararg{Any, 1}}, f) =
-    f.(W[1] * X .+ b[1])
-apply_transforms(X, W::Tuple{Vararg{Any, N}}, b::Tuple{Vararg{Any, N}}, f) where N =
-    f.(W[N] * apply_transforms(X, W[1:N-1], b[1:N-1], f) .+ b[N])
-
-# A simple Multilayer Feedforward Neural Network (Multi-layer Perceptron (MLP)) example for
-# classifying the MNIST data set.
-function demo_mlp(itrs::Int, sz::Int)
-
-    # Load the data.
-    println("Loading data.")
-    xtr, ytr_ = traindata()
-    xte, yte_ = testdata()
-
-    # Convert to 1-hot label encoding.
-    ytr, yte = to1hot(ytr_), to1hot(yte_)
-
-    # Initialise parameters.
-    println("Initialising parameters.")
-    d0, d1, d2, d3, λ = size(xtr, 1), 500, 300, size(ytr, 1), 1e-3
-    W_ = (0.1 * randn(d1, d0), 0.1 * randn(d2, d1), 0.1 * randn(d3, d2))
-    b_ = (0.1 * randn(d1), 0.1 * randn(d2), 0.1 * randn(d3))
-
-    # Initialise the Adam optimiser.
-    α, β1, β2, ϵ = 1e-3, 0.9, 0.999, 1e-8
-    optW, optb = Adam.(W_, α, β1, β2, ϵ), Adam.(b_, α, β1, β2, ϵ)
-
-    # Iterate to learn the parameters.
-    println("Starting learning.")
-    scal = size(xtr, 2) / sz
-    for itr in 1:itrs
-
-        # Pick the mini batch.
-        idx = rand(eachindex(ytr_), sz)
-        xtr_batch = view(xtr, :, idx)
-        ytr_batch = view(ytr, :, idx)
-
-        # Initialise computational graph.
-        tape = Tape()
-        W, b = Leaf.(tape, W_), Leaf.(tape, b_)
-
-        # Compute the log marginal probability.
-        logp, f = mlp_log_joint(xtr_batch, ytr_batch, W, b, λ)
-        ∇logp = ∇(logp)
-
-        # Compute most probably classification for each observation in the batch.
-        ypr = zeros(d3, sz)
-        for n in 1:sz
-            ypr[argmax(Nabla.unbox(f)[:, n]), n] = 1.
-        end
-        acc = accuracy(ytr_batch, ypr)
-
-        # Update the parameters using AdaGrad by indexing into the ∇logp tape.
-        iterate!.(W_, getindex.(∇logp, W), optW)
-        iterate!.(b_, getindex.(∇logp, b), optb)
-        println("logp is $(Nabla.unbox(logp)) at iteration $itr. Mean loglik is ",
-                "$(Nabla.unbox(logp) / size(xtr, 2)). Accuracy is $acc")
-    end
-end
-
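Note that `iterate!` above performs gradient ascent (the `+` in the parameter update) because the example maximises a log joint rather than minimising a loss. A self-contained scalar sketch of the same bias-corrected Adam step follows; `adam_step` is a hypothetical helper name used only for illustration, not part of the example or of Nabla.

# Scalar version of the ascent-direction Adam update used in iterate! (sketch only).
function adam_step(θ, g, m, v, β1_acc, β2_acc; α=1e-3, β1=0.9, β2=0.999, ϵ=1e-8)
    m = β1 * m + (1 - β1) * g          # first-moment (mean) estimate
    v = β2 * v + (1 - β2) * g^2        # second-moment estimate
    m̂ = m / (1 - β1_acc)               # bias correction, β1_acc = β1^t
    v̂ = v / (1 - β2_acc)               # bias correction, β2_acc = β2^t
    θ += α * m̂ / (sqrt(v̂) + ϵ)         # `+`: ascend the log joint
    return θ, m, v, β1_acc * β1, β2_acc * β2
end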
Accuracy is $acc") - end -end - diff --git a/examples/symmetric_quadratic.jl b/examples/symmetric_quadratic.jl deleted file mode 100644 index cc7b88d8..00000000 --- a/examples/symmetric_quadratic.jl +++ /dev/null @@ -1,49 +0,0 @@ -using Nabla - -# Generate the values required to compute a matrix quadratic form. -N = 5 -B = randn(5, 5) -A = B.'B + UniformScaling(1e-6) - -# Low-level API computation of derivatives. - -# Construct some trackable data. -x_ = randn(N) -x = Leaf(Tape(), x_) - -# Compute the forward pass. -y = x.' * (A * x) # Temporary bracketting because we don't support RowVectors yet. -println("Output of the forward pass is:") -println(y) -println() -println("y is $(Nabla.unbox(y)).") -println() - -# Get the reverse tape. -ȳ = ∇(y) -println("Output of reverse-pass is") -println(ȳ) -println() - -# Index into the reverse tape using x to get the gradient of `y` w.r.t. `x`. -x̄ = ȳ[x] -println("Gradient of y w.r.t. x at $(Nabla.unbox(x)) is $x̄.") -println() - -# (Current) High-Level API computation of derivatives. I will probably maintain this -# interface and extend is significantly as it is currently rather limited. It just returns -# a function which returns the gradient, which isn't really what you want. - -# Define the function to be differentiated. Parameters w.r.t. which we want gradients must -# be arguments. Parameters that we don't want gradients w.r.t. should be passed in via a -# closure. -@unionise f(x::AbstractVector) = x.' * (A * x) - -# Compute a function `∇f` which computes the derivative of `f` w.r.t. the input. -∇f = ∇(f) - -# Compute the derivative of `f` w.r.t. `x` at `x_`. Result is currently a 1-Tuple. Might -# introduce a special case for unary functions where it just returns the result. -x̄ = ∇f(x_) - -@assert x̄[1] == ȳ[x] diff --git a/examples/vae.jl b/examples/vae.jl deleted file mode 100644 index 508e401e..00000000 --- a/examples/vae.jl +++ /dev/null @@ -1,109 +0,0 @@ -using MNIST, Nabla - -""" Implementation of the Adam optimiser. """ -mutable struct Adam{T<:AbstractArray} - α::Float64 - β1::Float64 - β2::Float64 - m::T - v::T - β1_acc::Float64 - β2_acc::Float64 - ϵ::Float64 -end -function Adam(θ0::AbstractArray, α::Float64, β1::Float64, β2::Float64, ϵ::Float64) - return Adam(α, β1, β2, zeros(θ0), zeros(θ0), β1, β2, ϵ) -end - -function iterate!(θ::AbstractArray{Float64}, ∇θ::AbstractArray{Float64}, opt::Adam) - m, v, α, β1, β2, ϵ, β1_acc, β2_acc = - opt.m, opt.v, opt.α, opt.β1, opt.β2, opt.ϵ, opt.β1_acc, opt.β2_acc - @inbounds for n in eachindex(θ) - m[n] = β1 * m[n] + (1.0 - β1) * ∇θ[n] - v[n] = β2 * v[n] + (1.0 - β2) * ∇θ[n]^2 - m̂ = m[n] / (1 - β1_acc) - v̂ = v[n] / (1 - β2_acc) - θ[n] = θ[n] + α * m̂ / (sqrt(v̂) + ϵ) - end - opt.β1_acc *= β1 - opt.β2_acc *= β2 -end - -@inline logistic(x) = 1 / (1 + exp(-x)) - -function encode(Wh, Wμ, Wσ, X) - h_enc = tanh.(Wh * X) - return Wμ * h_enc, exp.(Wσ * h_enc) -end -decode(Vh, Vf, Z) = logistic.(Vf * tanh.(Vh * Z)) -klvae(μ, σ², D) = 0.5 * (sum(σ²) + sum(abs2, μ) - convert(Float64, D) - sum(log.(σ²))) - -# Generate and learn in a vanilla vae. -function demo_vae(itrs::Int, sz::Int, L::Int) - - # Load the data and normalise. - xtr, ytr_ = traindata() - xtr ./= 256 - - # Initialise parameters. - dx, dz, dh_enc, dh_dec = size(xtr, 1), 10, 500, 500 - Wh, Wμ, Wσ = 0.1 * randn(dh_enc, dx), 0.1 * randn(dz, dh_enc), 0.1 * randn(dz, dh_enc) - Vh, Vf = 0.1 * randn(dh_dec, dz), 0.1 * randn(dx, dh_dec) - λ = 1e-3 - - # Initialise the AdaGrad optimiser. 
diff --git a/examples/vae.jl b/examples/vae.jl
deleted file mode 100644
index 508e401e..00000000
--- a/examples/vae.jl
+++ /dev/null
@@ -1,109 +0,0 @@
-using MNIST, Nabla
-
-""" Implementation of the Adam optimiser. """
-mutable struct Adam{T<:AbstractArray}
-    α::Float64
-    β1::Float64
-    β2::Float64
-    m::T
-    v::T
-    β1_acc::Float64
-    β2_acc::Float64
-    ϵ::Float64
-end
-function Adam(θ0::AbstractArray, α::Float64, β1::Float64, β2::Float64, ϵ::Float64)
-    return Adam(α, β1, β2, zeros(θ0), zeros(θ0), β1, β2, ϵ)
-end
-
-function iterate!(θ::AbstractArray{Float64}, ∇θ::AbstractArray{Float64}, opt::Adam)
-    m, v, α, β1, β2, ϵ, β1_acc, β2_acc =
-        opt.m, opt.v, opt.α, opt.β1, opt.β2, opt.ϵ, opt.β1_acc, opt.β2_acc
-    @inbounds for n in eachindex(θ)
-        m[n] = β1 * m[n] + (1.0 - β1) * ∇θ[n]
-        v[n] = β2 * v[n] + (1.0 - β2) * ∇θ[n]^2
-        m̂ = m[n] / (1 - β1_acc)
-        v̂ = v[n] / (1 - β2_acc)
-        θ[n] = θ[n] + α * m̂ / (sqrt(v̂) + ϵ)
-    end
-    opt.β1_acc *= β1
-    opt.β2_acc *= β2
-end
-
-@inline logistic(x) = 1 / (1 + exp(-x))
-
-function encode(Wh, Wμ, Wσ, X)
-    h_enc = tanh.(Wh * X)
-    return Wμ * h_enc, exp.(Wσ * h_enc)
-end
-decode(Vh, Vf, Z) = logistic.(Vf * tanh.(Vh * Z))
-klvae(μ, σ², D) = 0.5 * (sum(σ²) + sum(abs2, μ) - convert(Float64, D) - sum(log.(σ²)))
-
-# Generate and learn in a vanilla vae.
-function demo_vae(itrs::Int, sz::Int, L::Int)
-
-    # Load the data and normalise.
-    xtr, ytr_ = traindata()
-    xtr ./= 256
-
-    # Initialise parameters.
-    dx, dz, dh_enc, dh_dec = size(xtr, 1), 10, 500, 500
-    Wh, Wμ, Wσ = 0.1 * randn(dh_enc, dx), 0.1 * randn(dz, dh_enc), 0.1 * randn(dz, dh_enc)
-    Vh, Vf = 0.1 * randn(dh_dec, dz), 0.1 * randn(dx, dh_dec)
-    λ = 1e-3
-
-    # Initialise the AdaGrad optimiser.
-    α, β1, β2, ϵ = 1e-3, 0.9, 0.999, 1e-8
-    optWh, optWμ, optWσ, optVh, optVf = Adam.((Wh, Wμ, Wσ, Vh, Vf), α, β1, β2, ϵ)
-
-    # Iterate to learn the parameters.
-    scal, ϵ, elbos = size(xtr, 2) / sz, 1e-12, Vector{Float64}(undef, itrs)
-    for itr in 1:itrs
-
-        # Pick the mini batch.
-        idx = rand(eachindex(ytr_), sz)
-        xtr_batch = view(xtr, :, idx)
-
-        # Box paramters that we wish to differentiate w.r.t.
-        Wh_, Wμ_, Wσ_, Vh_, Vf_ = Leaf.(Tape(), (Wh, Wμ, Wσ, Vh, Vf))
-
-        # Compute log prior of paramter values.
-        logprior = -0.5 * λ * (sum(abs2, Wh_) + sum(abs2, Wμ_) + sum(abs2, Wσ_) +
-                               sum(abs2, Vh_) + sum(abs2, Vf_))
-
-        # Encode, sample, decode, estimate expection of log likelihood under q(z).
-        loglik = 0.0
-        μz, σz = encode(Wh_, Wμ_, Wσ_, xtr_batch)
-        for l in 1:L
-            z = μz .+ σz .* randn(dz, sz)
-            f = decode(Vh_, Vf_, z)
-            loglik += sum(xtr_batch .* log.(f .+ ϵ) .+
-                          (1 .- xtr_batch) .* log.((1 + ϵ) .- f))
-        end
-        loglik /= convert(Float64, L)
-
-        # Compute the kl divergence between local posteriors and prior.
-        klqp = klvae(μz, σz .* σz, dz)
-
-        # Compute gradient of log-joint w.r.t. each of the parameters.
-        elbo = loglik - klqp
-        elbos[itr] = Nabla.unbox(elbo) / sz
-        logp = scal * elbo + logprior
-        ∇logp = ∇(logp)
-
-        # Update the parameters using AdaGrad by indexing into the ∇logp tape.
-        iterate!(Wh, ∇logp[Wh_], optWh)
-        iterate!(Wμ, ∇logp[Wμ_], optWμ)
-        iterate!(Wσ, ∇logp[Wσ_], optWσ)
-        iterate!(Vh, ∇logp[Vh_], optVh)
-        iterate!(Vf, ∇logp[Vf_], optVf)
-        println("logp is $(Nabla.unbox(logp)) at iterate $itr. Mean elbo is $(elbos[itr])")
-    end
-
-    function reconstruct(x)
-        μz, σz = encode(Wh, Wμ, Wσ, x)
-        return logistic.(Vf * tanh.(Vh * (μz .+ σz .* randn(dz, size(x, 2)))))
-    end
-    sample_vae() = logistic.(Vf * tanh.(Vh * randn(dz)))
-
-    return sample_vae, reconstruct, elbos
-end
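The `klvae` term above is the closed-form KL divergence between the encoder's Gaussian q = N(μ, diag(σ²)) and the standard-normal prior, KL(q ‖ p) = 0.5 Σᵢ (σᵢ² + μᵢ² − 1 − log σᵢ²). A self-contained check of that identity follows (the definition is repeated so the snippet runs on its own; the vector sizes are illustrative).

# Sanity check of the closed-form Gaussian KL used in the ELBO above (sketch only).
klvae(μ, σ², D) = 0.5 * (sum(σ²) + sum(abs2, μ) - convert(Float64, D) - sum(log.(σ²)))

μ, σ² = randn(10), exp.(randn(10))                        # one 10-dimensional latent code
kl_elementwise = 0.5 * sum(σ² .+ μ .^ 2 .- 1 .- log.(σ²))
@assert isapprox(klvae(μ, σ², 10), kl_elementwise)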