Source code for cil.optimisation.functions.SVRGFunction

#   Copyright 2024 United Kingdom Research and Innovation
#   Copyright 2024 The University of Manchester
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
#
#  Authors:
#  - CIL Developers, listed at: https://github.com/TomographicImaging/CIL/blob/master/NOTICE.txt
#  - Daniel Deidda (National Physical Laboratory, UK)
#  - Claire Delplancke (Electricite de France, Research and Development)
#  - Ashley Gillman (Australian e-Health Res. Ctr., CSIRO, Brisbane, Queensland, Australia)
#  - Zeljko Kereta (Department of Computer Science, University College London, UK)
#  - Evgueni Ovtchinnikov (STFC - UKRI)
#  - Georg Schramm (Department of Imaging and Pathology, Division of Nuclear Medicine, KU Leuven, Leuven, Belgium)


from .ApproximateGradientSumFunction import ApproximateGradientSumFunction
import numpy as np
import numbers


class SVRGFunction(ApproximateGradientSumFunction):

    r"""
    The Stochastic Variance Reduced Gradient (SVRG) function calculates the approximate gradient of :math:`\sum_{i=0}^{n-1}f_i`. For this approximation, every `snapshot_update_interval` iterations a full gradient calculation is made at a "snapshot" point. Intermediate gradient calculations update this snapshot by taking an index :math:`i_k` and calculating the gradient of :math:`f_{i_k}` at the current iterate and at the snapshot, updating the approximate gradient to be:

    .. math ::
        n*\nabla f_{i_k}(x_k) - n*\nabla f_{i_k}(\tilde{x}) + \nabla \sum_{i=0}^{n-1}f_i(\tilde{x}),

    where :math:`\tilde{x}` is the latest "snapshot" point and :math:`x_k` is the value at the current iteration.

    Note
    -----
    Compared with the literature, we multiply by :math:`n`, the number of functions, so that we return an approximate gradient of the whole sum function and not an average gradient.

    Note
    ----
    In the case where `store_gradients=False` the memory requirement is 4 times the image size (1 stored full gradient at the "snapshot", one stored "snapshot" point and two lots of intermediary calculations). Alternatively, if `store_gradients=True` the memory requirement is `n+4` times the image size (`n` gradients at the snapshot, one for each function in the sum, one stored full gradient at the "snapshot", one stored "snapshot" point and two lots of intermediary calculations).

    Reference
    ---------
    Johnson, R. and Zhang, T., 2013. Accelerating stochastic gradient descent using predictive variance reduction. Advances in Neural Information Processing Systems, 26. https://proceedings.neurips.cc/paper_files/paper/2013/file/ac1dd209cbcc5e5d1c6e28598e8cbbe8-Paper.pdf

    Parameters
    ----------
    functions : `list` of functions
        A list of functions: :code:`[f_{0}, f_{1}, ..., f_{n-1}]`. Each function is assumed to be smooth with an implemented :func:`~Function.gradient` method. All functions must have the same domain. The number of functions must be strictly greater than 1.
    sampler : An instance of a CIL Sampler class ( :meth:`~optimisation.utilities.sampler`) or of another class which has a `next` function implemented to output integers in {0, 1, ..., n-1}.
        This sampler is called each time `gradient` is called and sets the internal `function_num` passed to the `approximate_gradient` function. Default is `Sampler.random_with_replacement(len(functions))`.
    snapshot_update_interval : positive int or None, optional
        The interval for updating the full gradient (taking a snapshot). The default is `2*len(functions)`, so a "snapshot" is taken every `2*len(functions)` iterations. If the user passes `0` then no full gradient snapshots will be taken.
    store_gradients : bool, default: `False`
        Flag indicating whether to store and update a list of gradients for each function :math:`f_i`, or just to store the snapshot point :math:`\tilde{x}` and its gradient :math:`\nabla \sum_{i=0}^{n-1}f_i(\tilde{x})`.

    """

    def __init__(self, functions, sampler=None, snapshot_update_interval=None, store_gradients=False):

        super(SVRGFunction, self).__init__(functions, sampler)

        # snapshot_update_interval for SVRG; defaults to 2*n
        self.snapshot_update_interval = snapshot_update_interval
        if snapshot_update_interval is None:
            self.snapshot_update_interval = 2*self.num_functions
        self.store_gradients = store_gradients

        self._svrg_iter_number = 0

        self._full_gradient_at_snapshot = None
        self._list_stored_gradients = None

        self.stoch_grad_at_iterate = None
        self._stochastic_grad_difference = None

        self.snapshot = None
    def gradient(self, x, out=None):
        """ Selects a random function using the `sampler` and then calls the approximate gradient at :code:`x`, or calculates a full gradient depending on the update frequency.

        Parameters
        ----------
        x : DataContainer (e.g. ImageData object)
        out : return DataContainer, if `None` a new DataContainer is returned, default `None`.

        Returns
        -------
        DataContainer (e.g. ImageData object)
            the value of the approximate gradient of the sum function at :code:`x`
        """
        # For SVRG, every `snapshot_update_interval` iterations a full gradient is calculated, otherwise an approximate gradient is taken.
        if (self.snapshot_update_interval != 0) and (self._svrg_iter_number % self.snapshot_update_interval) == 0:
            return self._update_full_gradient_and_return(x, out=out)
        else:
            self.function_num = self.sampler.next()
            if not isinstance(self.function_num, numbers.Number):
                raise ValueError("Batch gradient is not yet implemented")
            if self.function_num >= self.num_functions or self.function_num < 0:
                raise IndexError(
                    f"The sampler has produced the index {self.function_num}, which is outside the expected range of available functions to sample from. Please ensure your sampler only selects from [0, 1, ..., len(functions)-1].")
            return self.approximate_gradient(x, self.function_num, out=out)
    def approximate_gradient(self, x, function_num, out=None):
        r""" Calculates the stochastic gradient at the point :math:`x` by using the gradient of the selected function, indexed by :math:`i_k`, the `function_num` in {0, ..., len(functions)-1}, and the full gradient at the snapshot :math:`\tilde{x}`:

        .. math ::
            n*\nabla f_{i_k}(x_k) - n*\nabla f_{i_k}(\tilde{x}) + \nabla \sum_{i=0}^{n-1}f_i(\tilde{x})

        Note
        -----
        Compared with the literature, we multiply by :math:`n`, the number of functions, so that we return an approximate gradient of the whole sum function and not an average gradient.

        Parameters
        ----------
        x : DataContainer (e.g. ImageData object)
        out : return DataContainer, if `None` a new DataContainer is returned, default `None`.
        function_num : `int`
            Between 0 and n-1, where n is the number of functions in the list.

        Returns
        -------
        DataContainer (e.g. ImageData object)
            the value of the approximate gradient of the sum function at :code:`x`, given a `function_num` in {0, ..., len(functions)-1}
        """
        self._svrg_iter_number += 1

        self.stoch_grad_at_iterate = self.functions[function_num].gradient(
            x, out=self.stoch_grad_at_iterate)

        if self.store_gradients is True:
            self._stochastic_grad_difference = self.stoch_grad_at_iterate.sapyb(
                1., self._list_stored_gradients[function_num], -1., out=self._stochastic_grad_difference)
        else:
            self._stochastic_grad_difference = self.stoch_grad_at_iterate.sapyb(
                1., self.functions[function_num].gradient(self.snapshot), -1., out=self._stochastic_grad_difference)

        self._update_data_passes_indices([function_num])

        out = self._stochastic_grad_difference.sapyb(
            self.num_functions, self._full_gradient_at_snapshot, 1., out=out)

        return out
    def _update_full_gradient_and_return(self, x, out=None):
        r""" Takes a "snapshot" at the point :math:`x`, saving both the point :math:`\tilde{x}=x` and its gradient :math:`\nabla \sum_{i=0}^{n-1}f_i(\tilde{x})`. The function returns :math:`\nabla \sum_{i=0}^{n-1}f_i(\tilde{x})` as the gradient calculation. If :code:`store_gradients==True`, the gradient of each :math:`f_i` is also computed and stored at the "snapshot"; otherwise only the sum of the gradients and the snapshot point :math:`\tilde{x}=x` are stored.

        Parameters
        ----------
        x : DataContainer (e.g. ImageData object)
            The point at which the "snapshot" is taken.
        out : return DataContainer, if `None` a new DataContainer is returned, default `None`.

        Returns
        -------
        DataContainer (e.g. ImageData object)
            the value of the gradient of the sum function at the snapshot point :code:`x`
        """
        self._svrg_iter_number += 1

        if self.store_gradients is True:
            if self._list_stored_gradients is None:
                # Save the gradient of each individual f_i and the gradient of the full sum at the point x.
                self._list_stored_gradients = [
                    fi.gradient(x) for fi in self.functions]
                self._full_gradient_at_snapshot = sum(
                    self._list_stored_gradients, start=0*x)
            else:
                for i, fi in enumerate(self.functions):
                    fi.gradient(x, out=self._list_stored_gradients[i])
                # Re-accumulate the full gradient in place from the updated stored gradients.
                self._full_gradient_at_snapshot *= 0
                for el in self._list_stored_gradients:
                    self._full_gradient_at_snapshot += el
        else:
            # Save the snapshot point and the gradient of the full sum at the point x.
            self._full_gradient_at_snapshot = self.full_gradient(
                x, out=self._full_gradient_at_snapshot)
            if self.snapshot is None:
                self.snapshot = x.copy()
            else:
                self.snapshot.fill(x)

        # In this iteration all functions in the sum were used to update the gradient.
        self._update_data_passes_indices(list(range(self.num_functions)))

        # Return the gradient of the full sum at the snapshot.
        if out is None:
            out = self._full_gradient_at_snapshot
        else:
            out.fill(self._full_gradient_at_snapshot)
        return out
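
# ---------------------------------------------------------------------------
# Illustrative usage (not part of the library source): a minimal sketch of how
# SVRGFunction might be combined with a CIL algorithm. It assumes the CIL
# objects `LeastSquares`, `GD` and `Sampler` with their documented interfaces;
# `operators`, `data_subsets` and `x0` are hypothetical placeholders for a
# partitioned problem and an initial image.
#
#     from cil.optimisation.functions import LeastSquares
#     from cil.optimisation.algorithms import GD
#     from cil.optimisation.utilities import Sampler
#
#     fi_list = [LeastSquares(Ai, bi) for Ai, bi in zip(operators, data_subsets)]
#     F = SVRGFunction(fi_list,
#                      sampler=Sampler.random_with_replacement(len(fi_list)),
#                      snapshot_update_interval=2 * len(fi_list),
#                      store_gradients=False)
#     algorithm = GD(initial=x0, objective_function=F, step_size=1e-3)
#     algorithm.run(200)
# ---------------------------------------------------------------------------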
class LSVRGFunction(SVRGFunction):
    r"""
    A class representing a function for the Loopless Stochastic Variance Reduced Gradient (LSVRG) approximation. This is similar to SVRG, except the full gradient at a "snapshot" is calculated at random intervals rather than at fixed numbers of iterations.

    Reference
    ----------
    Kovalev, D., Horváth, S. and Richtárik, P., 2020. Don't Jump Through Hoops and Remove Those Loops: SVRG and Katyusha are Better Without the Outer Loop. Proceedings of the 31st International Conference on Algorithmic Learning Theory, in Proceedings of Machine Learning Research 117:451-467. Available from https://proceedings.mlr.press/v117/kovalev20a.html.

    Parameters
    ----------
    functions : `list` of functions
        A list of functions: :code:`[f_{0}, f_{1}, ..., f_{n-1}]`. Each function is assumed to be smooth with an implemented :func:`~Function.gradient` method. All functions must have the same domain. The number of functions `n` must be strictly greater than 1.
    sampler : An instance of a CIL Sampler class ( :meth:`~optimisation.utilities.sampler`) or of another class which has a `next` function implemented to output integers in {0, ..., n-1}.
        This sampler is called each time `gradient` is called and sets the internal `function_num` passed to the `approximate_gradient` function. Default is `Sampler.random_with_replacement(len(functions))`.
    snapshot_update_probability : positive float, default: 1/n
        The probability of updating the full gradient (taking a snapshot) at each iteration. The default is :math:`1./n` so, in expectation, a snapshot will be taken every :math:`n` iterations.
    store_gradients : bool, default: `False`
        Flag indicating whether to store and update a list of gradients for each function :math:`f_i`, or just to store the snapshot point :math:`\tilde{x}` and its gradient :math:`\nabla \sum_{i=0}^{n-1}f_i(\tilde{x})`.
    seed : int, optional
        Seed for the internal random number generator used to decide, at each call to `gradient`, whether to take a full gradient snapshot.

    Note
    ----
    In the case where `store_gradients=False` the memory requirement is 4 times the image size (1 stored full gradient at the "snapshot", one stored "snapshot" point and two lots of intermediary calculations). Alternatively, if `store_gradients=True` the memory requirement is `n+4` times the image size (`n` gradients at the snapshot, one for each function in the sum, one stored full gradient at the "snapshot", one stored "snapshot" point and two lots of intermediary calculations).

    """

    def __init__(self, functions, sampler=None, snapshot_update_probability=None, store_gradients=False, seed=None):

        super(LSVRGFunction, self).__init__(
            functions, sampler=sampler, store_gradients=store_gradients)

        # Update frequency based on probability.
        self.snapshot_update_probability = snapshot_update_probability
        # Default snapshot_update_probability for Loopless SVRG
        if self.snapshot_update_probability is None:
            self.snapshot_update_probability = 1./self.num_functions

        # The random generator used to decide if the gradient calculation is a full gradient or an approximate gradient.
        self.generator = np.random.default_rng(seed=seed)
    def gradient(self, x, out=None):
        """ Selects a random function using the `sampler` and then calls the approximate gradient at :code:`x`, or calculates a full gradient depending on the update probability.

        Parameters
        ----------
        x : DataContainer (e.g. ImageData object)
        out : return DataContainer, if `None` a new DataContainer is returned, default `None`.

        Returns
        -------
        DataContainer (e.g. ImageData object)
            the value of the approximate gradient of the sum function at :code:`x`
        """
        if self._svrg_iter_number == 0 or self.generator.uniform() < self.snapshot_update_probability:
            return self._update_full_gradient_and_return(x, out=out)
        else:
            self.function_num = self.sampler.next()
            if not isinstance(self.function_num, numbers.Number):
                raise ValueError("Batch gradient is not yet implemented")
            if self.function_num >= self.num_functions or self.function_num < 0:
                raise IndexError(
                    f"The sampler has produced the index {self.function_num}, which is outside the expected range of available functions to sample from. Please ensure your sampler only selects from [0, 1, ..., len(functions)-1].")
            return self.approximate_gradient(x, self.function_num, out=out)
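
# ---------------------------------------------------------------------------
# Illustrative usage (not part of the library source): the loopless variant is
# constructed in the same way as SVRGFunction, swapping the fixed snapshot
# interval for a per-iteration snapshot probability. `fi_list` and `x0` are the
# hypothetical placeholders from the sketch above, and `GD` is assumed as in
# that sketch; the seed only fixes the random snapshot decisions.
#
#     F_loopless = LSVRGFunction(fi_list,
#                                snapshot_update_probability=1.0 / len(fi_list),
#                                store_gradients=False,
#                                seed=42)
#     algorithm = GD(initial=x0, objective_function=F_loopless, step_size=1e-3)
#     algorithm.run(200)
# ---------------------------------------------------------------------------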