"""Hessian update strategies for quasi-Newton optimization methods.""" import numpy as np from numpy.linalg import norm from scipy.linalg import get_blas_funcs from warnings import warn __all__ = ['HessianUpdateStrategy', 'BFGS', 'SR1'] class HessianUpdateStrategy(object): """Interface for implementing Hessian update strategies. Many optimization methods make use of Hessian (or inverse Hessian) approximations, such as the quasi-Newton methods BFGS, SR1, L-BFGS. Some of these approximations, however, do not actually need to store the entire matrix or can compute the internal matrix product with a given vector in a very efficiently manner. This class serves as an abstract interface between the optimization algorithm and the quasi-Newton update strategies, giving freedom of implementation to store and update the internal matrix as efficiently as possible. Different choices of initialization and update procedure will result in different quasi-Newton strategies. Four methods should be implemented in derived classes: ``initialize``, ``update``, ``dot`` and ``get_matrix``. Notes ----- Any instance of a class that implements this interface, can be accepted by the method ``minimize`` and used by the compatible solvers to approximate the Hessian (or inverse Hessian) used by the optimization algorithms. """ def initialize(self, n, approx_type): """Initialize internal matrix. Allocate internal memory for storing and updating the Hessian or its inverse. Parameters ---------- n : int Problem dimension. approx_type : {'hess', 'inv_hess'} Selects either the Hessian or the inverse Hessian. When set to 'hess' the Hessian will be stored and updated. When set to 'inv_hess' its inverse will be used instead. """ raise NotImplementedError("The method ``initialize(n, approx_type)``" " is not implemented.") def update(self, delta_x, delta_grad): """Update internal matrix. Update Hessian matrix or its inverse (depending on how 'approx_type' is defined) using information about the last evaluated points. Parameters ---------- delta_x : ndarray The difference between two points the gradient function have been evaluated at: ``delta_x = x2 - x1``. delta_grad : ndarray The difference between the gradients: ``delta_grad = grad(x2) - grad(x1)``. """ raise NotImplementedError("The method ``update(delta_x, delta_grad)``" " is not implemented.") def dot(self, p): """Compute the product of the internal matrix with the given vector. Parameters ---------- p : array_like 1-D array representing a vector. Returns ------- Hp : array 1-D represents the result of multiplying the approximation matrix by vector p. """ raise NotImplementedError("The method ``dot(p)``" " is not implemented.") def get_matrix(self): """Return current internal matrix. Returns ------- H : ndarray, shape (n, n) Dense matrix containing either the Hessian or its inverse (depending on how 'approx_type' is defined). """ raise NotImplementedError("The method ``get_matrix(p)``" " is not implemented.") class FullHessianUpdateStrategy(HessianUpdateStrategy): """Hessian update strategy with full dimensional internal representation. """ _syr = get_blas_funcs('syr', dtype='d') # Symmetric rank 1 update _syr2 = get_blas_funcs('syr2', dtype='d') # Symmetric rank 2 update # Symmetric matrix-vector product _symv = get_blas_funcs('symv', dtype='d') def __init__(self, init_scale='auto'): self.init_scale = init_scale # Until initialize is called we can't really use the class, # so it makes sense to set everything to None. 
class FullHessianUpdateStrategy(HessianUpdateStrategy):
    """Hessian update strategy with full dimensional internal representation.
    """
    _syr = get_blas_funcs('syr', dtype='d')  # Symmetric rank 1 update
    _syr2 = get_blas_funcs('syr2', dtype='d')  # Symmetric rank 2 update
    # Symmetric matrix-vector product
    _symv = get_blas_funcs('symv', dtype='d')

    def __init__(self, init_scale='auto'):
        self.init_scale = init_scale
        # Until initialize is called we can't really use the class,
        # so it makes sense to set everything to None.
        self.first_iteration = None
        self.approx_type = None
        self.B = None
        self.H = None

    def initialize(self, n, approx_type):
        """Initialize internal matrix.

        Allocate internal memory for storing and updating
        the Hessian or its inverse.

        Parameters
        ----------
        n : int
            Problem dimension.
        approx_type : {'hess', 'inv_hess'}
            Selects either the Hessian or the inverse Hessian.
            When set to 'hess' the Hessian will be stored and updated.
            When set to 'inv_hess' its inverse will be used instead.
        """
        self.first_iteration = True
        self.n = n
        self.approx_type = approx_type
        if approx_type not in ('hess', 'inv_hess'):
            raise ValueError("`approx_type` must be 'hess' or 'inv_hess'.")
        # Create matrix
        if self.approx_type == 'hess':
            self.B = np.eye(n, dtype=float)
        else:
            self.H = np.eye(n, dtype=float)

    def _auto_scale(self, delta_x, delta_grad):
        # Heuristic to scale matrix at first iteration.
        # Described in Nocedal and Wright "Numerical Optimization"
        # p.143 formula (6.20).
        s_norm2 = np.dot(delta_x, delta_x)
        y_norm2 = np.dot(delta_grad, delta_grad)
        ys = np.abs(np.dot(delta_grad, delta_x))
        if ys == 0.0 or y_norm2 == 0 or s_norm2 == 0:
            return 1
        if self.approx_type == 'hess':
            return y_norm2 / ys
        else:
            return ys / y_norm2

    def _update_implementation(self, delta_x, delta_grad):
        raise NotImplementedError("The method ``_update_implementation``"
                                  " is not implemented.")

    def update(self, delta_x, delta_grad):
        """Update internal matrix.

        Update Hessian matrix or its inverse (depending on how 'approx_type'
        is defined) using information about the last evaluated points.

        Parameters
        ----------
        delta_x : ndarray
            The difference between two points the gradient
            function has been evaluated at: ``delta_x = x2 - x1``.
        delta_grad : ndarray
            The difference between the gradients:
            ``delta_grad = grad(x2) - grad(x1)``.
        """
        if np.all(delta_x == 0.0):
            return
        if np.all(delta_grad == 0.0):
            warn('delta_grad == 0.0. Check if the approximated '
                 'function is linear. If the function is linear '
                 'better results can be obtained by defining the '
                 'Hessian as zero instead of using quasi-Newton '
                 'approximations.', UserWarning)
            return
        if self.first_iteration:
            # Get user-specified scale
            if self.init_scale == "auto":
                scale = self._auto_scale(delta_x, delta_grad)
            else:
                scale = float(self.init_scale)
            # Scale initial matrix with ``scale * np.eye(n)``
            if self.approx_type == 'hess':
                self.B *= scale
            else:
                self.H *= scale
            self.first_iteration = False
        self._update_implementation(delta_x, delta_grad)

    def dot(self, p):
        """Compute the product of the internal matrix with the given vector.

        Parameters
        ----------
        p : array_like
            1-D array representing a vector.

        Returns
        -------
        Hp : array
            1-D array representing the result of multiplying
            the approximation matrix by vector p.
        """
        if self.approx_type == 'hess':
            return self._symv(1, self.B, p)
        else:
            return self._symv(1, self.H, p)

    def get_matrix(self):
        """Return the current internal matrix.

        Returns
        -------
        M : ndarray, shape (n, n)
            Dense matrix containing either the Hessian
            or its inverse (depending on how `approx_type`
            was defined).
        """
        if self.approx_type == 'hess':
            M = np.copy(self.B)
        else:
            M = np.copy(self.H)
        li = np.tril_indices_from(M, k=-1)
        M[li] = M.T[li]
        return M
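# Usage sketch for the concrete strategies defined below (illustrative
# numbers). After every accepted update the secant equation
# ``B @ delta_x == delta_grad`` holds for the updated matrix:
#
#     >>> hess = BFGS()
#     >>> hess.initialize(n=2, approx_type='hess')
#     >>> s = np.array([1.0, 0.0])
#     >>> y = np.array([2.0, 0.3])   # e.g. grad(x + s) - grad(x)
#     >>> hess.update(s, y)
#     >>> np.allclose(hess.dot(s), y)
#     True
#
# The first call to ``update`` also applies the scaling heuristic above:
# here ``scale = dot(y, y) / dot(y, s) = 4.09 / 2 = 2.045``, so the
# update starts from ``B = 2.045 * np.eye(2)``.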
class BFGS(FullHessianUpdateStrategy):
    """Broyden-Fletcher-Goldfarb-Shanno (BFGS) Hessian update strategy.

    Parameters
    ----------
    exception_strategy : {'skip_update', 'damp_update'}, optional
        Define how to proceed when the curvature condition is violated.
        Set it to 'skip_update' to just skip the update. Or, alternatively,
        set it to 'damp_update' to interpolate between the actual BFGS
        result and the unmodified matrix. Both exception strategies
        are explained in [1]_, p.536-537.
    min_curvature : float
        This number, scaled by a normalization factor, defines the
        minimum curvature ``dot(delta_grad, delta_x)`` allowed to go
        unaffected by the exception strategy. By default it is 1e-8
        when ``exception_strategy = 'skip_update'`` and 0.2 when
        ``exception_strategy = 'damp_update'``.
    init_scale : {float, 'auto'}, optional
        Matrix scale at first iteration. At the first
        iteration the Hessian matrix or its inverse will be initialized
        with ``init_scale*np.eye(n)``, where ``n`` is the problem dimension.
        Set it to 'auto' in order to use an automatic heuristic for choosing
        the initial scale. The heuristic is described in [1]_, p.143.
        By default uses 'auto'.

    Notes
    -----
    The update is based on the description in [1]_, p.140.

    References
    ----------
    .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
           Second Edition (2006).
    """

    def __init__(self, exception_strategy='skip_update', min_curvature=None,
                 init_scale='auto'):
        if exception_strategy == 'skip_update':
            if min_curvature is not None:
                self.min_curvature = min_curvature
            else:
                self.min_curvature = 1e-8
        elif exception_strategy == 'damp_update':
            if min_curvature is not None:
                self.min_curvature = min_curvature
            else:
                self.min_curvature = 0.2
        else:
            raise ValueError("`exception_strategy` must be 'skip_update' "
                             "or 'damp_update'.")
        super(BFGS, self).__init__(init_scale)
        self.exception_strategy = exception_strategy

    def _update_inverse_hessian(self, ys, Hy, yHy, s):
        """Update the inverse Hessian matrix.

        BFGS update using the formula:

            ``H <- H + ((H*y).T*y + s.T*y)/(s.T*y)**2 * (s*s.T)
                     - 1/(s.T*y) * ((H*y)*s.T + s*(H*y).T)``

        where ``s = delta_x`` and ``y = delta_grad``. This formula is
        equivalent to (6.17) in [1]_ written in a more efficient way
        for implementation.

        References
        ----------
        .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
               Second Edition (2006).
        """
        self.H = self._syr2(-1.0 / ys, s, Hy, a=self.H)
        self.H = self._syr((ys + yHy) / ys**2, s, a=self.H)

    def _update_hessian(self, ys, Bs, sBs, y):
        """Update the Hessian matrix.

        BFGS update using the formula:

            ``B <- B - (B*s)*(B*s).T / (s.T*(B*s)) + (y*y.T) / (s.T*y)``

        where ``s`` is short for ``delta_x`` and ``y`` is short
        for ``delta_grad``. Formula (6.19) in [1]_.

        References
        ----------
        .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
               Second Edition (2006).
        """
        self.B = self._syr(1.0 / ys, y, a=self.B)
        self.B = self._syr(-1.0 / sBs, Bs, a=self.B)
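    # Illustrative arithmetic for the damped update performed below (a
    # sketch with made-up numbers): with min_curvature = 0.2, wz = 0.1
    # and wMw = 1.0 the curvature test fails (0.1 <= 0.2 * 1.0), so
    # 'damp_update' computes
    #     update_factor = (1 - 0.2) / (1 - 0.1/1.0) = 0.8/0.9 ~ 0.889
    # and replaces z by 0.889*z + 0.111*Mw, after which
    # dot(w, z) == 0.2 * wMw, i.e. exactly the minimum allowed curvature.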
    def _update_implementation(self, delta_x, delta_grad):
        # Auxiliary variables w and z
        if self.approx_type == 'hess':
            w = delta_x
            z = delta_grad
        else:
            w = delta_grad
            z = delta_x
        # Do some common operations
        wz = np.dot(w, z)
        Mw = self.dot(w)
        wMw = Mw.dot(w)
        # Guarantee that wMw > 0 by reinitializing matrix.
        # While this is always true in exact arithmetic,
        # an indefinite matrix may appear due to roundoff errors.
        if wMw <= 0.0:
            scale = self._auto_scale(delta_x, delta_grad)
            # Reinitialize matrix
            if self.approx_type == 'hess':
                self.B = scale * np.eye(self.n, dtype=float)
            else:
                self.H = scale * np.eye(self.n, dtype=float)
            # Do common operations for new matrix
            Mw = self.dot(w)
            wMw = Mw.dot(w)
        # Check if curvature condition is violated
        if wz <= self.min_curvature * wMw:
            # If the option 'skip_update' is set
            # we just skip the update when the condition
            # is violated.
            if self.exception_strategy == 'skip_update':
                return
            # If the option 'damp_update' is set we
            # interpolate between the actual BFGS
            # result and the unmodified matrix.
            elif self.exception_strategy == 'damp_update':
                update_factor = (1 - self.min_curvature) / (1 - wz / wMw)
                z = update_factor * z + (1 - update_factor) * Mw
                wz = np.dot(w, z)
        # Update matrix
        if self.approx_type == 'hess':
            self._update_hessian(wz, Mw, wMw, z)
        else:
            self._update_inverse_hessian(wz, Mw, wMw, z)


class SR1(FullHessianUpdateStrategy):
    """Symmetric-rank-1 Hessian update strategy.

    Parameters
    ----------
    min_denominator : float
        This number, scaled by a normalization factor, defines the
        minimum denominator magnitude allowed in the update.
        When the condition is violated we skip the update.
        By default uses ``1e-8``.
    init_scale : {float, 'auto'}, optional
        Matrix scale at first iteration. At the first
        iteration the Hessian matrix or its inverse will be initialized
        with ``init_scale*np.eye(n)``, where ``n`` is the problem dimension.
        Set it to 'auto' in order to use an automatic heuristic for choosing
        the initial scale. The heuristic is described in [1]_, p.143.
        By default uses 'auto'.

    Notes
    -----
    The update is based on the description in [1]_, p.144-146.

    References
    ----------
    .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
           Second Edition (2006).
    """

    def __init__(self, min_denominator=1e-8, init_scale='auto'):
        self.min_denominator = min_denominator
        super(SR1, self).__init__(init_scale)

    def _update_implementation(self, delta_x, delta_grad):
        # Auxiliary variables w and z
        if self.approx_type == 'hess':
            w = delta_x
            z = delta_grad
        else:
            w = delta_grad
            z = delta_x
        # Do some common operations
        Mw = self.dot(w)
        z_minus_Mw = z - Mw
        denominator = np.dot(w, z_minus_Mw)
        # If the denominator is too small
        # we just skip the update.
        if np.abs(denominator) <= self.min_denominator*norm(w)*norm(z_minus_Mw):
            return
        # Update matrix
        if self.approx_type == 'hess':
            self.B = self._syr(1/denominator, z_minus_Mw, a=self.B)
        else:
            self.H = self._syr(1/denominator, z_minus_Mw, a=self.H)
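# An illustrative property of SR1 (a sketch; numbers chosen for clarity):
# for a quadratic f(x) = 0.5 * x.T @ A @ x, gradient differences satisfy
# delta_grad = A @ delta_x exactly, and after n updates along linearly
# independent steps the SR1 matrix recovers A itself, provided no update
# is skipped (Nocedal & Wright, Theorem 6.2):
#
#     >>> A = np.array([[2.0, 0.3],
#     ...               [0.3, 1.0]])
#     >>> sr1 = SR1()
#     >>> sr1.initialize(n=2, approx_type='hess')
#     >>> for s in (np.array([1.0, 0.0]), np.array([0.0, 1.0])):
#     ...     sr1.update(s, A.dot(s))
#     >>> np.allclose(sr1.get_matrix(), A)
#     True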