# Source code for ztlearn.optimizers

# -*- coding: utf-8 -*-

import numpy as np

from .decayers import DecayFunction as decayer


class Optimizer(object):

    """
    Base class for all optimizers.

    Stores arbitrary keyword settings on the instance and tracks the epoch
    counter used for learning-rate decay.
    """

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)
        self.epoch = 0

    def get_learning_rate(self):
        """Return the learning rate for the current step, decayed if enabled."""
        # Lazily fill in any decay settings the caller did not provide.
        defaults = {
            'min_lrate'  : 0,
            'max_lrate'  : np.inf,
            'decay_rate' : 1e-6,
            'decay_func' : 'inverse_time_decay',
            'decay_lrate': True,
        }
        for attr, value in defaults.items():
            if not hasattr(self, attr):
                setattr(self, attr, value)

        if self.decay_lrate is False:
            return self.learning_rate

        self.epoch += 1
        if self.epoch == 1:
            # the very first step is never decayed
            return self.learning_rate

        # an integer step_size selects the stepped variant of the decayer
        if hasattr(self, 'step_size') and isinstance(self.step_size, (int, np.integer)):
            return decayer(self.learning_rate,
                           self.decay_func,
                           self.decay_rate,
                           self.epoch,
                           self.min_lrate,
                           self.max_lrate,
                           self.step_size).decompose

        return decayer(self.learning_rate,
                       self.decay_func,
                       self.decay_rate,
                       self.epoch,
                       self.min_lrate,
                       self.max_lrate).decompose
class GD:

    """
    **Gradient Descent (GD)**

    GD optimizes the parameters theta of an objective function J(theta) by
    using all of the training samples in the dataset for every update. The
    update is performed in the opposite direction of the gradient of the
    objective function d/d_theta J(theta) with respect to the parameters
    (theta). The learning rate eta helps determine the size of the steps we
    take towards the minima.

    References:
        [1] An overview of gradient descent optimization algorithms
            * [Sebastien Ruder, 2016] https://arxiv.org/abs/1609.04747
            * [PDF] https://arxiv.org/pdf/1609.04747.pdf
    """

    def __init__(self):
        pass
class SGD(Optimizer):

    """
    **Stochastic Gradient Descent (SGD)**

    SGD optimizes parameters theta of an objective function J(theta) by
    updating each of the training samples inputs(i) and targets(i) for all
    samples in the dataset. The update is performed in the opposite direction
    of the gradient of the objective function d/d_theta J(theta) - with
    respect to the parameters (theta). The learning rate eta helps determine
    the size of the steps we take to the minima.

    References:
        [1] An overview of gradient descent optimization algorithms
            * [Sebastien Ruder, 2016] https://arxiv.org/abs/1609.04747
            * [PDF] https://arxiv.org/pdf/1609.04747.pdf

        [2] Large-Scale Machine Learning with Stochastic Gradient Descent
            * [Leon Botou, 2011][PDF] http://leon.bottou.org/publications/pdf/compstat-2010.pdf

    Args:
        kwargs: Arbitrary keyword arguments.
    """

    def __init__(self, **kwargs):
        super(SGD, self).__init__(**kwargs)

    def update(self, weights, grads):
        """Apply one SGD step in place: weights <- weights - lr * grads."""
        self.weights = weights
        self.grads   = grads

        # BUG FIX: the previous code forced dtype=np.float128 here, which does
        # not exist on Windows/ARM NumPy builds and crashed with AttributeError;
        # the extra precision was discarded anyway when the in-place subtraction
        # cast the product back to the weights' own dtype.
        self.weights -= np.multiply(super(SGD, self).get_learning_rate(), self.grads)

        return self.weights

    @property
    def optimization_name(self):
        return self.__class__.__name__
class SGDMomentum(Optimizer):

    """
    **Stochastic Gradient Descent with Momentum (SGDMomentum)**

    The objective function regularly forms places on the contour map in which
    the surface curves more steeply than others (ravines). Standard SGD will
    tend to oscillate across the narrow ravine since the negative gradient
    will point down one of the steep sides rather than along the ravine
    towards the optimum. Momentum helps to push the objective more quickly
    along the shallow ravine towards the global minima.

    References:
        [1] An overview of gradient descent optimization algorithms
            * [Sebastien Ruder, 2016] https://arxiv.org/abs/1609.04747
            * [PDF] https://arxiv.org/pdf/1609.04747.pdf

        [2] On the Momentum Term in Gradient Descent Learning Algorithms
            * [Ning Qian, 199] https://goo.gl/7fhr14
            * [PDF] https://goo.gl/91HtDt

        [3] Two problems with backpropagation and other steepest-descent
            learning procedures for networks.
            * [Sutton, R. S., 1986][PDF] https://goo.gl/M3VFM1

    Args:
        kwargs: Arbitrary keyword arguments.
    """

    def __init__(self, **kwargs):
        super(SGDMomentum, self).__init__(**kwargs)
        # BUG FIX: the membership test previously misspelled 'momentum' as
        # 'momemtum', so a user-supplied momentum kwarg was always discarded
        # (and the value set by Optimizer.__init__ clobbered with 0.1).
        self.momentum = kwargs.get('momentum', 0.1)
        self.velocity = None

    def update(self, weights, grads):
        """Apply one momentum step: v <- mu*v - lr*g; weights <- weights + v."""
        self.weights = weights
        self.grads   = grads

        if self.velocity is None:
            self.velocity = np.zeros_like(self.weights)

        self.velocity = np.multiply(self.momentum, self.velocity) - \
                        np.multiply(super(SGDMomentum, self).get_learning_rate(), self.grads)

        self.weights += self.velocity

        return self.weights

    @property
    def optimization_name(self):
        return self.__class__.__name__
class Adam(Optimizer):

    """
    **Adaptive Moment Estimation (Adam)**

    Adam computes adaptive learning rates for by updating each of the
    training samples while storing an exponentially decaying average of past
    squared gradients. Adam also keeps an exponentially decaying average of
    past gradients.

    References:
        [1] An overview of gradient descent optimization algorithms
            * [Sebastien Ruder, 2016] https://arxiv.org/abs/1609.04747
            * [PDF] https://arxiv.org/pdf/1609.04747.pdf

        [2] Adam: A Method for Stochastic Optimization
            * [Diederik P. Kingma et. al., 2014] https://arxiv.org/abs/1412.6980
            * [PDF] https://arxiv.org/pdf/1412.6980.pdf

    Args:
        kwargs: Arbitrary keyword arguments.
    """

    def __init__(self, **kwargs):
        super(Adam, self).__init__(**kwargs)
        self.epsilon = kwargs.get('epsilon', 1e-8)
        self.beta1   = kwargs.get('beta1', 0.9)
        self.beta2   = kwargs.get('beta2', 0.999)
        self.m       = None
        self.v       = None
        self.t       = 0

    def update(self, weights, grads):
        """Apply one bias-corrected Adam step and return the updated weights."""
        self.weights = weights
        self.grads   = grads

        if self.m is None:
            self.m = np.zeros_like(self.weights)
        if self.v is None:
            self.v = np.zeros_like(self.weights)

        self.t += 1

        # first moment (mean of gradients) with bias correction
        self.m = self.beta1 * self.m + (1 - self.beta1) * self.grads
        m_hat  = self.m / (1 - self.beta1 ** self.t)

        # second moment (uncentered variance) with bias correction
        self.v = self.beta2 * self.v + (1 - self.beta2) * np.square(self.grads)
        v_hat  = self.v / (1 - self.beta2 ** self.t)

        self.weights -= (super(Adam, self).get_learning_rate() * m_hat) / (np.sqrt(v_hat) + self.epsilon)

        return self.weights

    @property
    def optimization_name(self):
        return self.__class__.__name__
class Adamax(Optimizer):

    """
    **Adamax**

    AdaMax is a variant of Adam based on the infinity norm. The Adam update
    rule for individual weights is to scale their gradients inversely
    proportional to a (scaled) L2 norm of their individual current and past
    gradients. For Adamax we generalize the L2 norm based update rule to a
    Lp norm based update rule. These variants are numerically unstable for
    large p, but have special cases where, as p tends to infinity, a simple
    and stable algorithm emerges.

    References:
        [1] An overview of gradient descent optimization algorithms
            * [Sebastien Ruder, 2016] https://arxiv.org/abs/1609.04747
            * [PDF] https://arxiv.org/pdf/1609.04747.pdf

        [2] Adam: A Method for Stochastic Optimization
            * [Diederik P. Kingma et. al., 2014] https://arxiv.org/abs/1412.6980
            * [PDF] https://arxiv.org/pdf/1412.6980.pdf

    Args:
        kwargs: Arbitrary keyword arguments.
    """

    def __init__(self, **kwargs):
        super(Adamax, self).__init__(**kwargs)
        self.epsilon = kwargs.get('epsilon', 1e-8)
        self.beta1   = kwargs.get('beta1', 0.9)
        self.beta2   = kwargs.get('beta2', 0.999)
        self.m       = None
        self.u       = None
        self.t       = 0

    def update(self, weights, grads):
        """Apply one AdaMax (infinity-norm Adam) step and return the weights."""
        self.weights = weights
        self.grads   = grads

        if self.m is None:
            self.m = np.zeros_like(self.weights)
        if self.u is None:
            self.u = np.zeros_like(self.weights)

        self.t += 1

        # bias-corrected step size (Kingma & Ba, Algorithm 2)
        learning_rate_t = np.true_divide(super(Adamax, self).get_learning_rate(),
                                         1. - np.power(self.beta1, self.t))

        # BUG FIX: the moment estimates were previously computed into locals
        # and never stored back, so self.m / self.u stayed at zero forever and
        # the exponential averages degenerated to the current gradient alone.
        self.m = np.multiply(self.beta1, self.m) + np.multiply((1. - self.beta1), self.grads)
        self.u = np.maximum(np.multiply(self.beta2, self.u), np.abs(self.grads))

        self.weights -= np.true_divide(np.multiply(learning_rate_t, self.m), (self.u + self.epsilon))

        return self.weights

    @property
    def optimization_name(self):
        return self.__class__.__name__
class AdaGrad(Optimizer):

    """
    **Adaptive Gradient Algorithm (AdaGrad)**

    AdaGrad is an optimization method that allows different step sizes for
    different features. It increases the influence of rare but informative
    features.

    References:
        [1] An overview of gradient descent optimization algorithms
            * [Sebastien Ruder, 2016] https://arxiv.org/abs/1609.04747
            * [PDF] https://arxiv.org/pdf/1609.04747.pdf

        [2] Adaptive Subgradient Methods for Online Learning and Stochastic Optimization
            * [John Duchi et. al., 2011] http://jmlr.org/papers/v12/duchi11a.html
            * [PDF] http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf

    Args:
        kwargs: Arbitrary keyword arguments.
    """

    def __init__(self, **kwargs):
        super(AdaGrad, self).__init__(**kwargs)
        self.epsilon = kwargs.get('epsilon', 1e-8)
        self.cache   = None

    def update(self, weights, grads):
        """Apply one AdaGrad step, accumulating squared gradients in the cache."""
        self.weights = weights
        self.grads   = grads

        if self.cache is None:
            self.cache = np.zeros_like(self.grads)

        # accumulated sum of squared gradients scales each step inversely
        self.cache += np.square(self.grads)
        self.weights -= super(AdaGrad, self).get_learning_rate() * \
                        (self.grads / (np.sqrt(self.cache) + self.epsilon))

        return self.weights

    @property
    def optimization_name(self):
        return self.__class__.__name__
class Adadelta(Optimizer):

    """
    **An Adaptive Learning Rate Method (Adadelta)**

    Adadelta is an extension of Adagrad that seeks to avoid setting the
    learning rate to an aggressively monotonically decreasing rate. This is
    achieved via a dynamic learning rate, i.e. a different learning rate is
    computed for each training sample.

    References:
        [1] An overview of gradient descent optimization algorithms
            * [Sebastien Ruder, 2016] https://arxiv.org/abs/1609.04747
            * [PDF] https://arxiv.org/pdf/1609.04747.pdf

        [2] ADADELTA: An Adaptive Learning Rate Method
            * [Matthew D. Zeiler, 2012] https://arxiv.org/abs/1212.5701
            * [PDF] https://arxiv.org/pdf/1212.5701.pdf

    Args:
        kwargs: Arbitrary keyword arguments.
    """

    def __init__(self, **kwargs):
        super(Adadelta, self).__init__(**kwargs)
        self.epsilon = kwargs.get('epsilon', 1e-6)
        self.rho     = kwargs.get('rho', 0.9)
        self.cache   = None
        self.delta   = None

    def update(self, weights, grads):
        """Apply one Adadelta step using RMS-scaled gradients and deltas."""
        self.weights = weights
        self.grads   = grads

        if self.cache is None:
            self.cache = np.zeros_like(self.weights)
        if self.delta is None:
            self.delta = np.zeros_like(self.weights)

        # running average of squared gradients
        self.cache = self.rho * self.cache + (1 - self.rho) * np.square(self.grads)

        rms_grad  = np.sqrt(self.cache + self.epsilon)
        rms_delta = np.sqrt(self.delta + self.epsilon)

        # scale the gradient by the ratio of past-update RMS to gradient RMS
        step = self.grads * (rms_delta / rms_grad)
        self.weights -= super(Adadelta, self).get_learning_rate() * step

        # running average of squared updates (after the weights move)
        self.delta = self.rho * self.delta + (1 - self.rho) * np.square(step)

        return self.weights

    @property
    def optimization_name(self):
        return self.__class__.__name__
class RMSprop(Optimizer):

    """
    **Root Mean Squared Propagation (RMSprop)**

    RMSprop utilizes the magnitude of recent gradients to normalize gradients.
    A moving average over the root mean squared (RMS) gradients is kept and
    then divided by the current gradient. Parameters are recommended to be
    set as follows: rho = 0.9 and eta (learning rate) = 0.001.

    References:
        [1] An overview of gradient descent optimization algorithms
            * [Sebastien Ruder, 2016] https://arxiv.org/abs/1609.04747
            * [PDF] https://arxiv.org/pdf/1609.04747.pdf

        [2] Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning
            * [Tieleman, T. and Hinton, G. 2012][PDF] https://goo.gl/Dhkvpk

    Args:
        kwargs: Arbitrary keyword arguments.
    """

    def __init__(self, **kwargs):
        super(RMSprop, self).__init__(**kwargs)
        self.epsilon = kwargs.get('epsilon', 1e-6)
        self.rho     = kwargs.get('rho', 0.9)
        self.learning_rate = 0.001 # preset and not decayed
        self.cache   = None

    def update(self, weights, grads):
        """Apply one RMSprop step using a moving average of squared gradients."""
        self.weights = weights
        self.grads   = grads

        if self.cache is None:
            self.cache = np.zeros_like(self.weights)

        # BUG FIX: misplaced parentheses previously computed
        # rho * ((cache + (1 - rho)) * grads^2) instead of the intended
        # moving average rho * cache + (1 - rho) * grads^2.
        self.cache = np.multiply(self.rho, self.cache) + \
                     np.multiply(1 - self.rho, np.square(self.grads))

        self.weights -= np.multiply(self.learning_rate,
                                    np.true_divide(self.grads, (np.sqrt(self.cache) + self.epsilon)))

        return self.weights

    @property
    def optimization_name(self):
        return self.__class__.__name__
class NesterovAcceleratedGradient(Optimizer):

    """
    **Nesterov Accelerated Gradient (NAG)**

    NAG is an improvement on SGDMomentum where the previous parameter values
    are smoothed and a gradient descent step is taken from this smoothed
    value. This enables a more intelligent way of arriving at the minima.

    References:
        [1] An overview of gradient descent optimization algorithms
            * [Sebastien Ruder, 2016] https://arxiv.org/abs/1609.04747
            * [PDF] https://arxiv.org/pdf/1609.04747.pdf

        [2] A method for unconstrained convex minimization problem with the rate of convergence
            * [Nesterov, Y. 1983][PDF] https://goo.gl/X8313t

        [3] Nesterov's Accelerated Gradient and Momentum as approximations to
            Regularised Update Descent
            * [Aleksandar Botev, 2016] https://arxiv.org/abs/1607.01981
            * [PDF] https://arxiv.org/pdf/1607.01981.pdf

    Args:
        kwargs: Arbitrary keyword arguments.
    """

    def __init__(self, **kwargs):
        super(NesterovAcceleratedGradient, self).__init__(**kwargs)
        # BUG FIX: the membership test previously misspelled 'momentum' as
        # 'momemtum', so a user-supplied momentum kwarg was always discarded
        # (and the value set by Optimizer.__init__ clobbered with 0.9).
        self.momentum      = kwargs.get('momentum', 0.9)
        self.velocity_prev = None
        self.velocity      = None

    def update(self, weights, grads):
        """Apply one NAG step (Sutskever-style momentum look-ahead)."""
        self.weights = weights
        self.grads   = grads

        if self.velocity_prev is None:
            self.velocity_prev = np.zeros_like(self.weights)
        if self.velocity is None:
            self.velocity = np.zeros_like(self.weights)

        self.velocity_prev = self.velocity
        self.velocity = np.multiply(self.momentum, self.velocity) - \
                        np.multiply(super(NesterovAcceleratedGradient, self).get_learning_rate(), self.grads)

        # look-ahead correction combining the previous and current velocities
        self.weights += np.multiply(-self.momentum, self.velocity_prev) + \
                        np.multiply(1 + self.momentum, self.velocity)

        return self.weights

    @property
    def optimization_name(self):
        return self.__class__.__name__
class OptimizationFunction:

    """
    Factory/dispatcher that maps an optimizer name to its implementation and
    forwards weight updates to the selected optimizer instance.
    """

    # registry of the supported optimizer names
    _optimizers = {
        'sgd'          : SGD,
        'adam'         : Adam,
        'adamax'       : Adamax,
        'adagrad'      : AdaGrad,
        'rmsprop'      : RMSprop,
        'adadelta'     : Adadelta,
        'sgd_momentum' : SGDMomentum,
        'nestrov'      : NesterovAcceleratedGradient
    }

    def __init__(self, optimizer_kwargs):
        selected = optimizer_kwargs['optimizer_name']
        if selected not in self._optimizers:
            raise Exception('Optimization function must be either one of the following: {}.'.format(', '.join(self._optimizers.keys())))
        self.optimization_func = self._optimizers[selected](**optimizer_kwargs)

    @property
    def name(self):
        """Class name of the underlying optimizer."""
        return self.optimization_func.optimization_name

    def update(self, weights, grads):
        """Delegate one update step to the selected optimizer."""
        return self.optimization_func.update(weights, grads)
def register_opt(**kwargs):
    """
    Validate optimizer keyword arguments.

    Args:
        kwargs: Arbitrary keyword arguments for an optimizer.

    Returns:
        dict: the keyword arguments, unchanged, if all are recognized.

    Raises:
        TypeError: if any keyword is not a recognized optimizer setting.
    """
    # FIX: removed a duplicate 'epsilon' entry; added 'min_lrate'/'max_lrate',
    # which Optimizer.get_learning_rate reads but could not previously be
    # registered here.
    allowed_kwargs = {
        'rho',
        'beta2',
        'beta1',
        'epsilon',
        'momentum',
        'velocity',
        'step_size',
        'min_lrate',
        'max_lrate',
        'decay_rate',
        'decay_func',
        'decay_lrate',
        'learning_rate',
        'optimizer_name'
    }

    for kwrd in kwargs:
        if kwrd not in allowed_kwargs:
            raise TypeError('Unexpected keyword argument passed to optimizer: ' + str(kwrd))

    return kwargs