Add the Adagrad optimizer (Duchi et al., 2011) and bump the version to 0.15.0.

Summary of the changes carried by this patch:
  README.md                 lists Adagrad among the supported optimizers.
  example/quadratic.f90     grows the network array to 11 and trains net(10)
                            and net(11) with Adagrad, the latter with L2
                            weight decay and learning-rate decay.
  fpm.toml                  version 0.14.0 -> 0.15.0.
  src/nf.f90                imports adagrad from nf_optimizers.
  src/nf/nf_optimizers.f90  adds the adagrad type with init_adagrad and
                            minimize_adagrad, and multiplies Adam's decoupled
                            weight-decay term by the learning rate.
  test/test_optimizers.f90  adds an Adagrad convergence test that trains and
                            evaluates net(6).

Review notes:
  NOTE(review): line breaks and indentation were reconstructed from a
  whitespace-collapsed copy of this patch; verify the context lines against
  the target tree, or apply with whitespace-insensitive matching.
  NOTE(review): the final test/test_optimizers.f90 hunk declares 6 context
  lines but only 5 are present here; the remainder presumably follows this
  excerpt -- confirm before applying.
  NOTE(review): the example prints sum((ypred - ytest)**2) / size(ytest)
  under the label "RMSE"; no square root is taken, so the value is a mean
  squared error. Left unchanged here.

diff --git a/README.md b/README.md
index 0f3fc46..c33f793 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
 
 * Training and inference of dense (fully connected) and convolutional neural networks
 * Stochastic gradient descent optimizers: Classic, momentum, Nesterov momentum,
-  RMSProp, Adam, AdamW
+  RMSProp, Adagrad, Adam, AdamW
 * More than a dozen activation functions and their derivatives
 * Loading dense and convolutional models from Keras HDF5 (.h5) files
 * Data-based parallelism
diff --git a/example/quadratic.f90 b/example/quadratic.f90
index 90394f0..9bd4cdc 100644
--- a/example/quadratic.f90
+++ b/example/quadratic.f90
@@ -4,10 +4,10 @@ program quadratic_fit
   ! descent.
   use nf, only: dense, input, network
   use nf_dense_layer, only: dense_layer
-  use nf_optimizers, only: sgd, rmsprop, adam
+  use nf_optimizers, only: sgd, rmsprop, adam, adagrad
 
   implicit none
-  type(network) :: net(9)
+  type(network) :: net(11)
 
   ! Training parameters
   integer, parameter :: num_epochs = 1000
@@ -95,6 +95,17 @@ program quadratic_fit
     beta1, beta2, epsilon, weight_decay_decoupled=1e-5 &
   )
 
+  ! Adagrad optimizer
+  call adagrad_optimizer( &
+    net(10), x, y, xtest, ytest, learning_rate, num_epochs, epsilon &
+  )
+
+  ! Adagrad optimizer with L2 regularization and learning rate decay
+  call adagrad_optimizer( &
+    net(11), x, y, xtest, ytest, learning_rate, num_epochs, epsilon, &
+    weight_decay_l2=1e-4, learning_rate_decay=0.99 &
+  )
+
 contains
 
   real elemental function quadratic(x) result(y)
@@ -358,6 +369,68 @@ subroutine adam_optimizer( &
 
   end subroutine adam_optimizer
 
+  subroutine adagrad_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs, epsilon, &
+    weight_decay_l2, learning_rate_decay &
+  )
+    ! Adagrad optimizer for updating weights using adaptive gradient algorithm
+    type(network), intent(inout) :: net
+    real, intent(in) :: x(:), y(:)
+    real, intent(in) :: xtest(:), ytest(:)
+    real, intent(in) :: learning_rate, epsilon
+    real, intent(in), optional :: weight_decay_l2
+    real, intent(in), optional :: learning_rate_decay
+    integer, intent(in) :: num_epochs
+    integer :: i, n
+    real, allocatable :: ypred(:)
+    real :: weight_decay_l2_val
+    real :: learning_rate_decay_val
+
+    ! Set default values for weight_decay_l2
+    if (.not. present(weight_decay_l2)) then
+      weight_decay_l2_val = 0.0
+    else
+      weight_decay_l2_val = weight_decay_l2
+    end if
+
+    ! Set default values for learning_rate_decay
+    if (.not. present(learning_rate_decay)) then
+      learning_rate_decay_val = 0.0
+    else
+      learning_rate_decay_val = learning_rate_decay
+    end if
+
+    print '(a)', 'Adagrad optimizer'
+    print '(34("-"))'
+
+    do n = 1, num_epochs
+
+      do i = 1, size(x)
+        call net % forward([x(i)])
+        call net % backward([y(i)])
+      end do
+
+      call net % update( &
+        adagrad( &
+          learning_rate=learning_rate, &
+          epsilon=epsilon, &
+          weight_decay_l2=weight_decay_l2_val, &
+          learning_rate_decay=learning_rate_decay_val &
+        ) &
+      )
+
+      if (mod(n, num_epochs / 10) == 0) then
+        ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+      end if
+
+    end do
+
+    print *, ''
+
+  end subroutine adagrad_optimizer
+
   subroutine shuffle(arr)
     ! Shuffle an array using the Fisher-Yates algorithm.
     integer, intent(inout) :: arr(:)
diff --git a/fpm.toml b/fpm.toml
index 36242ef..89f067b 100644
--- a/fpm.toml
+++ b/fpm.toml
@@ -1,5 +1,5 @@
 name = "neural-fortran"
-version = "0.14.0"
+version = "0.15.0"
 license = "MIT"
 author = "Milan Curcic"
 maintainer = "milancurcic@hey.com"
diff --git a/src/nf.f90 b/src/nf.f90
index f26e99b..eb2a903 100644
--- a/src/nf.f90
+++ b/src/nf.f90
@@ -5,7 +5,7 @@ module nf
   use nf_layer_constructors, only: &
     conv2d, dense, flatten, input, maxpool2d, reshape
   use nf_network, only: network
-  use nf_optimizers, only: sgd, rmsprop, adam
+  use nf_optimizers, only: sgd, rmsprop, adam, adagrad
   use nf_activation, only: activation_function, elu, exponential, &
                            gaussian, linear, relu, leaky_relu, &
                            sigmoid, softmax, softplus, step, tanhf, &
diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90
index 43f9d3b..c64cefe 100644
--- a/src/nf/nf_optimizers.f90
+++ b/src/nf/nf_optimizers.f90
@@ -13,7 +13,7 @@ module nf_optimizers
   implicit none
 
   private
-  public :: optimizer_base_type, sgd, rmsprop, adam
+  public :: optimizer_base_type, sgd, rmsprop, adam, adagrad
 
   type, abstract :: optimizer_base_type
     real :: learning_rate = 0.01
@@ -87,6 +87,23 @@ end subroutine minimize
     procedure :: minimize => minimize_adam
   end type adam
 
+  type, extends(optimizer_base_type) :: adagrad
+    !! Adagrad optimizer by Duchi et al. (2011)
+    !!
+    !! Duchi, J., Hazan, E. and Singer, Y., 2011. Adaptive subgradient
+    !! methods for online learning and stochastic optimization. Journal
+    !! of Machine Learning Research, 12(Jul), pp.2121-2159.
+    !! http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
+    real :: epsilon = 1e-8
+    real :: weight_decay_l2 = 0
+    real :: learning_rate_decay = 0
+    real, allocatable, private :: sum_squared_gradient(:)
+    integer, private :: t = 0
+  contains
+    procedure :: init => init_adagrad
+    procedure :: minimize => minimize_adagrad
+  end type adagrad
+
 contains
 
   impure elemental subroutine init_sgd(self, num_params)
@@ -186,11 +203,51 @@ pure subroutine minimize_adam(self, param, gradient)
 
       ! Update parameters.
       param = param &
-        - self % learning_rate * m_hat / (sqrt(v_hat) + self % epsilon) &
-        - self % weight_decay_decoupled * param
+        - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon) &
+        + self % weight_decay_decoupled * param)
 
     end associate
 
   end subroutine minimize_adam
 
+
+  impure elemental subroutine init_adagrad(self, num_params)
+    !! Allocate and zero the running sum of squared gradients, sized to
+    !! num_params. Does nothing if the accumulator is already allocated.
+    class(adagrad), intent(inout) :: self
+    integer, intent(in) :: num_params
+    if (.not. allocated(self % sum_squared_gradient)) then
+      allocate(self % sum_squared_gradient(num_params))
+      self % sum_squared_gradient = 0
+    end if
+  end subroutine init_adagrad
+
+
+  pure subroutine minimize_adagrad(self, param, gradient)
+    !! Concrete implementation of an Adagrad optimizer update rule.
+    class(adagrad), intent(inout) :: self
+    real, intent(inout) :: param(:)
+    real, intent(in) :: gradient(:)
+
+    ! Update the current time step
+    self % t = self % t + 1
+
+    associate( &
+      ! If weight_decay_l2 > 0, use L2 regularization;
+      ! otherwise, default to regular Adagrad.
+      g => gradient + self % weight_decay_l2 * param, &
+      ! Amortize the learning rate as function of the current time step.
+      learning_rate => self % learning_rate &
+        / (1 + (self % t - 1) * self % learning_rate_decay) &
+    )
+
+      self % sum_squared_gradient = self % sum_squared_gradient + g**2
+
+      param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) &
+        + self % epsilon)
+
+    end associate
+
+  end subroutine minimize_adagrad
+
 end module nf_optimizers
diff --git a/test/test_optimizers.f90 b/test/test_optimizers.f90
index 1c3e5c3..dc2cc03 100644
--- a/test/test_optimizers.f90
+++ b/test/test_optimizers.f90
@@ -1,10 +1,10 @@
 program test_optimizers
 
-  use nf, only: dense, input, network, rmsprop, sgd, adam
+  use nf, only: dense, input, network, rmsprop, sgd, adam, adagrad
   use iso_fortran_env, only: stderr => error_unit
 
   implicit none
-  type(network) :: net(5)
+  type(network) :: net(6)
   real, allocatable :: x(:), y(:)
   real, allocatable :: ypred(:)
   integer, parameter :: num_iterations = 1000
@@ -116,6 +116,26 @@ program test_optimizers
     ok = .false.
   end if
 
+  ! Test Adagrad optimizer
+  converged = .false.
+
+  do n = 0, num_iterations
+
+    call net(6) % forward(x)
+    call net(6) % backward(y)
+    call net(6) % update(optimizer=adagrad(learning_rate=0.01, weight_decay_l2=1e-4, learning_rate_decay=0.99))
+
+    ypred = net(6) % predict(x)
+    converged = check_convergence(y, ypred)
+    if (converged) exit
+
+  end do
+
+  if (.not. converged) then
+    write(stderr, '(a)') 'adagrad should converge in simple training.. failed'
+    ok = .false.
+  end if
+
   if (ok) then
     print '(a)', 'test_optimizers: All tests passed.'