Source code for bayespy.inference.vmp.transformations

################################################################################
# Copyright (C) 2013-2014 Jaakko Luttinen
#
# This file is licensed under the MIT License.
################################################################################


import numpy as np
import warnings
import scipy

from bayespy.utils import optimize
from bayespy.utils import random
from bayespy.utils import linalg
from bayespy.utils import misc

from bayespy.utils.linalg import dot, tracedot

from .nodes import gaussian

from .nodes.categorical import CategoricalMoments

[docs]class RotationOptimizer(): r""" Optimizer for rotation parameter expansion in state-space models Rotates one model block with :math:`\mathbf{R}` and one model block with :math:`\mathbf{R}^{-1}`. Parameters ---------- block1 : rotator object The first rotation parameter expansion object block2 : rotator object The second rotation parameter expansion object D : int Dimensionality of the latent space References ---------- :cite:`Luttinen:2010`, :cite:`Luttinen:2013` """
[docs] def __init__(self, block1, block2, D): self.block1 = block1 self.block2 = block2 self.D = D
[docs] def rotate(self, maxiter=10, check_gradient=False, verbose=False, check_bound=False): """ Optimize the rotation of two separate model blocks jointly. If some variable is the dot product of two Gaussians, rotating the two Gaussians optimally can make the inference algorithm orders of magnitude faster. First block is rotated with :math:`\mathbf{R}` and the second with :math:`\mathbf{R}^{-T}`. Blocks must have methods: `bound(U,s,V)` and `rotate(R)`. """ I = np.identity(self.D) piv = np.arange(self.D) def cost(r): # Make vector-r into matrix-R R = np.reshape(r, (self.D,self.D)) # Compute SVD invR = np.linalg.inv(R) logdetR = np.linalg.slogdet(R)[1] # Compute lower bound terms (b1,db1) = self.block1.bound(R, logdet=logdetR, inv=invR) (b2,db2) = self.block2.bound(invR.T, logdet=-logdetR, inv=R.T) # Apply chain rule for the second gradient: # d b(invR.T) # = tr(db.T * d(invR.T)) # = tr(db * d(invR)) # = -tr(db * invR * (dR) * invR) # = -tr(invR * db * invR * dR) db2 = -dot(invR.T, db2.T, invR.T) # Compute the cost function c = -(b1+b2) dc = -(db1+db2) return (c, np.ravel(dc)) def get_bound_terms(r, gradient=False): """ Returns a dictionary of bound terms for the nodes. """ # Gradient not yet implemented.. if gradient: raise NotImplementedError() # Make vector-r into matrix-R R = np.reshape(r, (self.D,self.D)) # Compute SVD invR = np.linalg.inv(R) logdetR = np.linalg.slogdet(R)[1] # Compute lower bound terms dict1 = self.block1.get_bound_terms(R, logdet=logdetR, inv=invR) dict2 = self.block2.get_bound_terms(invR.T, logdet=-logdetR, inv=R.T) if not gradient: dict1.update(dict2) return dict1 else: terms = dict1[0].copy() terms = terms.update(dict2[0]) grad = dict1[1].copy() grad = grad.update(dict2[1]) return (terms, grad) def get_true_bound_terms(): nodes = set(self.block1.nodes()) | set(self.block2.nodes()) D = {} # TODO/FIXME: Also compute bound for child nodes as they could be # affected in practice although they shouldn't. Just checking that. for node in nodes: L = node.lower_bound_contribution() D[node] = L return D self.block1.setup() self.block2.setup() if check_gradient: R = np.random.randn(self.D, self.D) err = optimize.check_gradient(cost, np.ravel(R), verbose=verbose)[1] if err > 1e-5: warnings.warn("Rotation gradient has relative error %g" % err) # Initial rotation is identity matrix r0 = np.ravel(np.identity(self.D)) (cost_begin, _) = cost(r0) if check_bound: bound_terms_begin = get_bound_terms(r0) true_bound_terms_begin = get_true_bound_terms() # Run optimization r = optimize.minimize(cost, r0, maxiter=maxiter, verbose=verbose) (cost_end, _) = cost(r) if check_bound: bound_terms_end = get_bound_terms(r) # Apply the optimal rotation R = np.reshape(r, (self.D,self.D)) invR = np.linalg.inv(R) logdetR = np.linalg.slogdet(R)[1] self.block1.rotate(R, inv=invR, logdet=logdetR) self.block2.rotate(invR.T, inv=R.T, logdet=-logdetR) # Check that the cost function and the true lower bound changed equally cost_change = cost_end - cost_begin # Check that we really have improved the bound. if cost_change > 0: warnings.warn("Rotation optimization made the cost function worse " "by %g. Probably a bug in the gradient of the " "rotation functions." % (cost_change,)) if check_bound: true_bound_terms_end = get_true_bound_terms() bound_change = 0 for node in bound_terms_begin.keys(): node_bound_change = (bound_terms_end[node] - bound_terms_begin[node]) bound_change += node_bound_change true_node_bound_change = 0 try: true_node_bound_change += (true_bound_terms_end[node] - true_bound_terms_begin[node]) except KeyError: raise Exception("The node %s is part of the " "transformation but not part of the " "model. Check your VB construction." % node.name) if not np.allclose(node_bound_change, true_node_bound_change): warnings.warn("Rotation cost function is not consistent " "with the true lower bound for node %s. " "Bound changed %g but optimized function " "changed %g." % (node.name, true_node_bound_change, node_bound_change)) # Check that we really have improved the bound. # TODO/FIXME: Also compute bound for child nodes as they could be # affected in practice although they shouldn't. Just checking that. if bound_change < 0: warnings.warn("Rotation made the true lower bound worse by %g. " "Probably a bug in the rotation functions." % (bound_change,))
[docs]class RotateGaussian(): r""" Rotation parameter expansion for :class:`bayespy.nodes.Gaussian` """
[docs] def __init__(self, X): self.X = X
[docs] def rotate(self, R, inv=None, logdet=None): self.X.rotate(R, inv=inv, logdet=logdet)
[docs] def setup(self): """ This method should be called just before optimization. """ mask = self.X.mask[...,np.newaxis,np.newaxis] # Number of plates self.N = self.X.plates[0] #np.sum(mask) # Compute the sum <XX> over plates self.XX = misc.sum_multiply(self.X.get_moments()[1], mask, axis=(-1,-2), sumaxis=False, keepdims=False) # Parent's moments self.Lambda = self.X.parents[1].get_moments()[0]
def _compute_bound(self, R, logdet=None, inv=None, gradient=False): """ Rotate q(X) as X->RX: q(X)=N(R*mu, R*Cov*R') Assume: :math:`p(\mathbf{X}) = \prod^M_{m=1} N(\mathbf{x}_m|0, \mathbf{\Lambda})` """ # TODO/FIXME: X and alpha should NOT contain observed values!! Check # that. # TODO/FIXME: Allow non-zero prior mean! # Assume constant mean and precision matrix over plates.. # Compute rotated moments XX_R = dot(R, self.XX, R.T) inv_R = inv logdet_R = logdet # Compute entropy H(X) logH_X = random.gaussian_entropy(-2*self.N*logdet_R, 0) # Compute <log p(X)> logp_X = random.gaussian_logpdf(np.vdot(XX_R, self.Lambda), 0, 0, 0, 0) # Compute the bound if terms: bound = {self.X: bound} else: bound = logp_X + logH_X if not gradient: return bound # Compute dH(X) dlogH_X = random.gaussian_entropy(-2*self.N*inv_R.T, 0) # Compute d<log p(X)> dXX = 2*dot(self.Lambda, R, self.XX) dlogp_X = random.gaussian_logpdf(dXX, 0, 0, 0, 0) if terms: d_bound = {self.X: dlogp_X + dlogH_X} else: d_bound = dlogp_X + dlogH_X return (bound, d_bound)
[docs] def bound(self, R, logdet=None, inv=None): return self._compute_bound(R, logdet=logdet, inv=inv, gradient=True)
[docs] def get_bound_terms(self, R, logdet=None, inv=None): return self._compute_bound(R, logdet=logdet, inv=inv, gradient=False, terms=True)
[docs] def nodes(self): return [self.X]
def covariance_to_variance(C, ndim=1, covariance_axis=None): # Force None to empty list if covariance_axis is None: covariance_axis = [] # Force a list from integer if isinstance(covariance_axis, int): covariance_axis = [covariance_axis] # Force positive axis indices covariance_axis = [axis + ndim if axis < 0 else axis for axis in covariance_axis] # Make a set of the axes covariance_axis = set(covariance_axis) keys = [i+ndim if i in covariance_axis else i for i in range(ndim)] keys += [i+2*ndim if i in covariance_axis else i for i in range(ndim)] out_keys = sorted(list(set(keys))) return np.einsum(C, [Ellipsis]+keys, [Ellipsis]+out_keys) def sum_to_plates(V, plates_to, plates_from=None, ndim=0): if ndim == 0: if plates_from is not None: r = gaussian.Gaussian.broadcasting_multiplier(plates_from, np.shape(V)) else: r = 1 return r * misc.sum_to_shape(V, plates_to) else: dims_V = np.shape(V)[-ndim:] plates_V = np.shape(V)[:-ndim] shape_to = tuple(plates_to) + dims_V if plates_from is not None: r = gaussian.Gaussian.broadcasting_multiplier(plates_from, plates_V) else: r = 1 return r * misc.sum_to_shape(V, shape_to)
[docs]class RotateGaussianARD(): """ Rotation parameter expansion for :class:`bayespy.nodes.GaussianARD` The model: alpha ~ N(a, b) X ~ N(mu, alpha) X can be an array (e.g., GaussianARD). Transform q(X) and q(alpha) by rotating X. Requirements: * X and alpha do not contain any observed values """
[docs] def __init__(self, X, *alpha, axis=-1, precompute=False, subset=None): """ Precompute tells whether to compute some moments once in the setup function instead of every time in the bound function. However, they are computed a bit differently in the bound function so it can be useful too. Precomputation is probably beneficial only when there are large axes that are not rotated (by R nor Q) and they are not contained in the plates of alpha, and the dimensions for R and Q are quite small. """ self.precompute = precompute self.node_parent = X.parents[0] if len(alpha) == 0: self.update_alpha = False elif len(alpha) == 1: self.node_alpha = alpha[0] self.update_alpha = True else: raise ValueError("Too many arguments") self.node_X = X #self.node_mu = X.parents[0] self.ndim = len(X.dims[0]) # Force negative rotation axis indexing if not isinstance(axis, int): raise ValueError("Axis must be integer") if axis >= 0: axis -= self.ndim if axis < -self.ndim or axis >= 0: raise ValueError("Axis out of bounds") self.axis = axis # Allow rotation of only subset of elements/slices self.D = X.dims[0][axis] if subset is None: #self.subset = np.ones(self.D, dtype=bool) self.subset = None #tuple(range(self.D)) else: #self.subset = tuple(range(self.D)) self.subset = subset #self.subset[subset] if axis != -1: raise NotImplementedError("Subset indexing for non-last " "axis not yet implemented")
## self.subset = np.zeros(self.D, dtype=bool) ## self.subset[list(subset)] = True
[docs] def nodes(self): if self.update_alpha: return [self.node_X, self.node_alpha] else: return [self.node_X]
def _full_rotation_matrix(self, R): if self.subset is not None: R_full = np.identity(self.D) indices = np.ix_(self.subset, self.subset) R_full[indices] = R return R_full else: return R
[docs] def rotate(self, R, inv=None, logdet=None, Q=None): ## R = self._full_rotation_matrix(R) ## if inv is not None: ## inv = self._full_rotation_matrix(inv) self.node_X.rotate(R, inv=inv, logdet=logdet, subset=self.subset, axis=self.axis) if self.plate_axis is not None: self.node_X.rotate_plates(Q, plate_axis=self.plate_axis) if self.update_alpha: self.node_alpha.update()
[docs] def setup(self, plate_axis=None): """ This method should be called just before optimization. For efficiency, sum over axes that are not in mu, alpha nor rotation. If using Q, set rotate_plates to True. """ # Store the original plate_axis parameter for later use in other methods self.plate_axis = plate_axis # Manipulate the plate_axis parameter to suit the needs of this method if plate_axis is not None: if not isinstance(plate_axis, int): raise ValueError("Plate axis must be integer") if plate_axis >= 0: plate_axis -= len(self.node_X.plates) if plate_axis < -len(self.node_X.plates) or plate_axis >= 0: raise ValueError("Axis out of bounds") plate_axis -= self.ndim - 1 # Why -1? Because one axis is preserved! # Get the mean parameter. It will not be rotated. This assumes that mu # and alpha are really independent. (alpha_mu, alpha_mu2, alpha, _) = self.node_parent.get_moments() (X, XX) = self.node_X.get_moments() # mu = alpha_mu / alpha mu2 = alpha_mu2 / alpha # For simplicity, force mu to have the same shape as X mu = mu * np.ones(self.node_X.dims[0]) mu2 = mu2 * np.ones(self.node_X.dims[0]) ## (mu, mumu) = gaussian.reshape_gaussian_array(self.node_mu.dims[0], ## self.node_X.dims[0], ## mu, ## mumu) # Take diagonal of covariances to variances for axes that are not in R # (and move those axes to be the last) XX = covariance_to_variance(XX, ndim=self.ndim, covariance_axis=self.axis) ## mumu = covariance_to_variance(mumu, ## ndim=self.ndim, ## covariance_axis=self.axis) # Move axes of X and mu and compute their outer product X = misc.moveaxis(X, self.axis, -1) mu = misc.moveaxis(mu, self.axis, -1) mu2 = misc.moveaxis(mu2, self.axis, -1) Xmu = linalg.outer(X, mu, ndim=1) D = np.shape(X)[-1] # Move axes of alpha related variables def safe_move_axis(x): if np.ndim(x) >= -self.axis: return misc.moveaxis(x, self.axis, -1) else: return x[...,np.newaxis] if self.update_alpha: a = safe_move_axis(self.node_alpha.phi[1]) a0 = safe_move_axis(self.node_alpha.parents[0].get_moments()[0]) b0 = safe_move_axis(self.node_alpha.parents[1].get_moments()[0]) plates_alpha = list(self.node_alpha.plates) else: alpha = safe_move_axis(self.node_parent.get_moments()[2]) plates_alpha = list(self.node_parent.get_shape(2)) # Move plates of alpha for R if len(plates_alpha) >= -self.axis: plate = plates_alpha.pop(self.axis) plates_alpha.append(plate) else: plates_alpha.append(1) plates_X = list(self.node_X.get_shape(0)) plates_X.pop(self.axis) def sum_to_alpha(V, ndim=2): # TODO/FIXME: This could be improved so that it is not required to # explicitly repeat to alpha plates. Multiplying by ones was just a # simple bug fix. return sum_to_plates(V * np.ones(plates_alpha[:-1]+ndim*[1]), plates_alpha[:-1], ndim=ndim, plates_from=plates_X) if plate_axis is not None: # Move plate axis just before the rotated dimensions (which are # last) def safe_move_plate_axis(x, ndim): if np.ndim(x)-ndim >= -plate_axis: return misc.moveaxis(x, plate_axis-ndim, -ndim-1) else: inds = (Ellipsis,None) + ndim*(slice(None),) return x[inds] X = safe_move_plate_axis(X, 1) mu = safe_move_plate_axis(mu, 1) XX = safe_move_plate_axis(XX, 2) mu2 = safe_move_plate_axis(mu2, 1) if self.update_alpha: a = safe_move_plate_axis(a, 1) a0 = safe_move_plate_axis(a0, 1) b0 = safe_move_plate_axis(b0, 1) else: alpha = safe_move_plate_axis(alpha, 1) # Move plates of X and alpha plate = plates_X.pop(plate_axis) plates_X.append(plate) if len(plates_alpha) >= -plate_axis+1: plate = plates_alpha.pop(plate_axis-1) else: plate = 1 plates_alpha = plates_alpha[:-1] + [plate] + plates_alpha[-1:] CovX = XX - linalg.outer(X, X) self.CovX = sum_to_plates(CovX, plates_alpha[:-2], ndim=3, plates_from=plates_X[:-1]) # Broadcast mumu to ensure shape #mumu = np.ones(np.shape(XX)[-3:]) * mumu mu2 = mu2 * np.ones(np.shape(X)[-2:]) self.mu2 = sum_to_alpha(mu2, ndim=1) if self.precompute: # Precompute some stuff for the gradient of plate rotation # # NOTE: These terms may require a lot of memory if alpha has the # same or almost the same plates as X. self.X_X = sum_to_plates(X[...,:,:,None,None] * X[...,None,None,:,:], plates_alpha[:-2], ndim=4, plates_from=plates_X[:-1]) self.X_mu = sum_to_plates(X[...,:,:,None,None] * mu[...,None,None,:,:], plates_alpha[:-2], ndim=4, plates_from=plates_X[:-1]) else: self.X = X self.mu = mu else: # Sum axes that are not in the plates of alpha self.XX = sum_to_alpha(XX) self.mu2 = sum_to_alpha(mu2, ndim=1) self.Xmu = sum_to_alpha(Xmu) if self.update_alpha: self.a = a self.a0 = a0 self.b0 = b0 else: self.alpha = alpha self.plates_X = plates_X self.plates_alpha = plates_alpha # Take only a subset of the matrix for rotation if self.subset is not None: if self.precompute: raise NotImplementedError("Precomputation not implemented when " "using a subset") # from X self.X = self.X[...,self.subset] self.mu2 = self.mu2[...,self.subset] if plate_axis is not None: # from CovX inds = [] for i in range(np.ndim(self.CovX)-2): inds.append(range(np.shape(self.CovX)[i])) inds.append(self.subset) inds.append(self.subset) indices = np.ix_(*inds) self.CovX = self.CovX[indices] # from mu self.mu = self.mu[...,self.subset] else: # from XX inds = [] for i in range(np.ndim(self.XX)-2): inds.append(range(np.shape(self.XX)[i])) inds.append(self.subset) inds.append(self.subset) indices = np.ix_(*inds) self.XX = self.XX[indices] # from Xmu self.Xmu = self.Xmu[...,self.subset] # from alpha if self.update_alpha: if np.shape(self.a)[-1] > 1: self.a = self.a[...,self.subset] if np.shape(self.a0)[-1] > 1: self.a0 = self.a0[...,self.subset] if np.shape(self.b0)[-1] > 1: self.b0 = self.b0[...,self.subset] else: if np.shape(self.alpha)[-1] > 1: self.alpha = self.alpha[...,self.subset] self.plates_alpha[-1] = min(self.plates_alpha[-1], len(self.subset))
## # from mu ## # from alpha ## alpha_mu = alpha_mu[...,self.subset] ## alpha_mu2 = alpha_mu2[...,self.subset] ## alpha = alpha[...,self.subset] ## dims = list(self.node_X.dims[0]) ## dims[-1] = len(self.subset) ## else: ## dims = list(self.node_X.dims[0]) def _compute_bound(self, R, logdet=None, inv=None, Q=None, gradient=False, terms=False): """ Rotate q(X) and q(alpha). Assume: p(X|alpha) = prod_m N(x_m|0,diag(alpha)) p(alpha) = prod_d G(a_d,b_d) """ ## R = self._full_rotation_matrix(R) ## if inv is not None: ## inv = self._full_rotation_matrix(inv) # # Transform the distributions and moments # plates_alpha = self.plates_alpha plates_X = self.plates_X # Compute rotated second moment if self.plate_axis is not None: # The plate axis has been moved to be the last plate axis if Q is None: raise ValueError("Plates should be rotated but no Q give") # Transform covariance sumQ = np.sum(Q, axis=0) QCovQ = sumQ[:,None,None]**2 * self.CovX # Rotate plates if self.precompute: QX_QX = np.einsum('...kalb,...ik,...il->...iab', self.X_X, Q, Q) XX = QX_QX + QCovQ XX = sum_to_plates(XX, plates_alpha[:-1], ndim=2) Xmu = np.einsum('...kaib,...ik->...iab', self.X_mu, Q) Xmu = sum_to_plates(Xmu, plates_alpha[:-1], ndim=2) else: X = self.X mu = self.mu QX = np.einsum('...ik,...kj->...ij', Q, X) XX = (sum_to_plates(QCovQ, plates_alpha[:-1], ndim=2) + sum_to_plates(linalg.outer(QX, QX), plates_alpha[:-1], ndim=2, plates_from=plates_X)) Xmu = sum_to_plates(linalg.outer(QX, self.mu), plates_alpha[:-1], ndim=2, plates_from=plates_X) mu2 = self.mu2 D = np.shape(XX)[-1] logdet_Q = D * np.log(np.abs(sumQ)) else: XX = self.XX mu2 = self.mu2 Xmu = self.Xmu logdet_Q = 0 # Compute transformed moments #mu2 = np.einsum('...ii->...i', mu2) RXmu = np.einsum('...ik,...ki->...i', R, Xmu) RXX = np.einsum('...ik,...kj->...ij', R, XX) RXXR = np.einsum('...ik,...ik->...i', RXX, R) # <(X-mu) * (X-mu)'>_R XmuXmu = (RXXR - 2*RXmu + mu2) D = np.shape(R)[0] # Compute q(alpha) if self.update_alpha: # Parameters a0 = self.a0 b0 = self.b0 a = self.a b = b0 + 0.5*sum_to_plates(XmuXmu, plates_alpha, plates_from=None, ndim=0) # Some expectations alpha = a / b logb = np.log(b) logalpha = -logb # + const b0_alpha = b0 * alpha a0_logalpha = a0 * logalpha else: alpha = self.alpha logalpha = 0 # # Compute the cost # def sum_plates(V, *plates): full_plates = misc.broadcasted_shape(*plates) r = self.node_X.broadcasting_multiplier(full_plates, np.shape(V)) return r * np.sum(V) XmuXmu_alpha = XmuXmu * alpha if logdet is None: logdet_R = np.linalg.slogdet(R)[1] inv_R = np.linalg.inv(R) else: logdet_R = logdet inv_R = inv # Compute entropy H(X) logH_X = random.gaussian_entropy(-2*sum_plates(logdet_R + logdet_Q, plates_X), 0) # Compute <log p(X|alpha)> logp_X = random.gaussian_logpdf(sum_plates(XmuXmu_alpha, plates_alpha[:-1] + [D]), 0, 0, sum_plates(logalpha, plates_X + [D]), 0) if self.update_alpha: # Compute entropy H(alpha) # This cancels out with the log(alpha) term in log(p(alpha)) logH_alpha = 0 # Compute <log p(alpha)> logp_alpha = random.gamma_logpdf(sum_plates(b0_alpha, plates_alpha), 0, sum_plates(a0_logalpha, plates_alpha), 0, 0) else: logH_alpha = 0 logp_alpha = 0 # Compute the bound if terms: bound = {self.node_X: logp_X + logH_X} if self.update_alpha: bound.update({self.node_alpha: logp_alpha + logH_alpha}) else: bound = (0 + logp_X + logp_alpha + logH_X + logH_alpha ) if not gradient: return bound # # Compute the gradient with respect R # broadcasting_multiplier = self.node_X.broadcasting_multiplier def sum_plates(V, plates): ones = np.ones(np.shape(R)) r = broadcasting_multiplier(plates, np.shape(V)[:-2]) return r * misc.sum_multiply(V, ones, axis=(-1,-2), sumaxis=False, keepdims=False) D_XmuXmu = 2*RXX - 2*gaussian.transpose_covariance(Xmu) DXmuXmu_alpha = np.einsum('...i,...ij->...ij', alpha, D_XmuXmu) if self.update_alpha: D_b = 0.5 * D_XmuXmu XmuXmu_Dalpha = np.einsum('...i,...i,...i,...ij->...ij', sum_to_plates(XmuXmu, plates_alpha, plates_from=None, ndim=0), alpha, -1/b, D_b) D_b0_alpha = np.einsum('...i,...i,...i,...ij->...ij', b0, alpha, -1/b, D_b) D_logb = np.einsum('...i,...ij->...ij', 1/b, D_b) D_logalpha = -D_logb D_a0_logalpha = a0 * D_logalpha else: XmuXmu_Dalpha = 0 D_logalpha = 0 D_XmuXmu_alpha = DXmuXmu_alpha + XmuXmu_Dalpha D_logR = inv_R.T # Compute dH(X) dlogH_X = random.gaussian_entropy(-2*sum_plates(D_logR, plates_X), 0) # Compute d<log p(X|alpha)> dlogp_X = random.gaussian_logpdf(sum_plates(D_XmuXmu_alpha, plates_alpha[:-1]), 0, 0, (sum_plates(D_logalpha, plates_X) * broadcasting_multiplier((D,), plates_alpha[-1:])), 0) if self.update_alpha: # Compute dH(alpha) # This cancels out with the log(alpha) term in log(p(alpha)) dlogH_alpha = 0 # Compute d<log p(alpha)> dlogp_alpha = random.gamma_logpdf(sum_plates(D_b0_alpha, plates_alpha[:-1]), 0, sum_plates(D_a0_logalpha, plates_alpha[:-1]), 0, 0) else: dlogH_alpha = 0 dlogp_alpha = 0 if terms: raise NotImplementedError() dR_bound = {self.node_X: dlogp_X + dlogH_X} if self.update_alpha: dR_bound.update({self.node_alpha: dlogp_alpha + dlogH_alpha}) else: dR_bound = (0*dlogp_X + dlogp_X + dlogp_alpha + dlogH_X + dlogH_alpha ) if self.subset: indices = np.ix_(self.subset, self.subset) dR_bound = dR_bound[indices] if self.plate_axis is None: return (bound, dR_bound) # # Compute the gradient with respect to Q (if Q given) # # Some pre-computations Q_RCovR = np.einsum('...ik,...kl,...il,...->...i', R, self.CovX, R, sumQ) if self.precompute: Xr_rX = np.einsum('...abcd,...jb,...jd->...jac', self.X_X, R, R) QXr_rX = np.einsum('...akj,...ik->...aij', Xr_rX, Q) RX_mu = np.einsum('...jk,...akbj->...jab', R, self.X_mu) else: RX = np.einsum('...ik,...k->...i', R, X) QXR = np.einsum('...ik,...kj->...ij', Q, RX) QXr_rX = np.einsum('...ik,...jk->...kij', QXR, RX) RX_mu = np.einsum('...ik,...jk->...kij', RX, mu) QXr_rX = sum_to_plates(QXr_rX, plates_alpha[:-2], ndim=3, plates_from=plates_X[:-1]) RX_mu = sum_to_plates(RX_mu, plates_alpha[:-2], ndim=3, plates_from=plates_X[:-1]) def psi(v): """ Compute: d/dQ 1/2*trace(diag(v)*<(X-mu)*(X-mu)>) = Q*<X>'*R'*diag(v)*R*<X> + ones * Q diag( tr(R'*diag(v)*R*Cov) ) + mu*diag(v)*R*<X> """ # Precompute all terms to plates_alpha because v has shape # plates_alpha. # Gradient of 0.5*v*<x>*<x> v_QXrrX = np.einsum('...kij,...ik->...ij', QXr_rX, v) # Gradient of 0.5*v*Cov Q_tr_R_v_R_Cov = np.einsum('...k,...k->...', Q_RCovR, v)[...,None,:] # Gradient of mu*v*x mu_v_R_X = np.einsum('...ik,...kji->...ij', v, RX_mu) return v_QXrrX + Q_tr_R_v_R_Cov - mu_v_R_X def sum_plates(V, plates): ones = np.ones(np.shape(Q)) r = self.node_X.broadcasting_multiplier(plates, np.shape(V)[:-2]) return r * misc.sum_multiply(V, ones, axis=(-1,-2), sumaxis=False, keepdims=False) if self.update_alpha: D_logb = psi(1/b) XX_Dalpha = -psi(alpha/b * sum_to_plates(XmuXmu, plates_alpha)) D_logalpha = -D_logb else: XX_Dalpha = 0 D_logalpha = 0 DXX_alpha = 2*psi(alpha) D_XX_alpha = DXX_alpha + XX_Dalpha D_logdetQ = D / sumQ N = np.shape(Q)[-1] # Compute dH(X) dQ_logHX = random.gaussian_entropy(-2*sum_plates(D_logdetQ, plates_X[:-1]), 0) # Compute d<log p(X|alpha)> dQ_logpX = random.gaussian_logpdf(sum_plates(D_XX_alpha, plates_alpha[:-2]), 0, 0, (sum_plates(D_logalpha, plates_X[:-1]) * broadcasting_multiplier((N,D), plates_alpha[-2:])), 0) if self.update_alpha: D_alpha = -psi(alpha/b) D_b0_alpha = b0 * D_alpha D_a0_logalpha = a0 * D_logalpha # Compute dH(alpha) # This cancels out with the log(alpha) term in log(p(alpha)) dQ_logHalpha = 0 # Compute d<log p(alpha)> dQ_logpalpha = random.gamma_logpdf(sum_plates(D_b0_alpha, plates_alpha[:-2]), 0, sum_plates(D_a0_logalpha, plates_alpha[:-2]), 0, 0) else: dQ_logHalpha = 0 dQ_logpalpha = 0 if terms: raise NotImplementedError() dQ_bound = {self.node_X: dQ_logpX + dQ_logHX} if self.update_alpha: dQ_bound.update({self.node_alpha: dQ_logpalpha + dQ_logHalpha}) else: dQ_bound = (0*dQ_logpX + dQ_logpX + dQ_logpalpha + dQ_logHX + dQ_logHalpha ) return (bound, dR_bound, dQ_bound)
[docs] def bound(self, R, logdet=None, inv=None, Q=None): return self._compute_bound(R, logdet=logdet, inv=inv, Q=Q, gradient=True)
[docs] def get_bound_terms(self, R, logdet=None, inv=None, Q=None): return self._compute_bound(R, logdet=logdet, inv=inv, Q=Q, gradient=False, terms=True)
[docs]class RotateGaussianMarkovChain(): r""" Rotation parameter expansion for :class:`bayespy.nodes.GaussianMarkovChain` Assume the following model. Constant, unit isotropic innovation noise. Unit variance only? Maybe: Assume innovation noise with unit variance? Would it help make this function more general with respect to A. TODO: Allow constant A or not rotating A. .. math:: R x_n = R A R^{-1} R x_{n-1} + R B u_{n-1} + noise \\\ R x_n = R [A, B] [R^{-1}, 0; 0, I] [R, 0; 0, I] [x_{n-1}; u_{n-1}] :math:`A` may vary in time. Shape of A: (N,D,D) Shape of AA: (N,D,D,D) No plates for X. """
[docs] def __init__(self, X, *args): self.X_node = X # FIXME: Currently, GaussianMarkovChain wraps initial state mean and # precision into one node and dynamics plus innovation noise into # another node. This transformation doesn't yet support GaussianGamma # dynamics, so we'll do some ugly checking here: # Dynamics node from bayespy.inference.vmp.nodes.gaussian import ( WrapToGaussianGamma, GaussianToGaussianGamma, GaussianMoments, ) dynamics_innovation = X.parents[1] assert(isinstance(dynamics_innovation, WrapToGaussianGamma)) dynamics_gaussiangamma = dynamics_innovation.parents[0] assert(isinstance(dynamics_gaussiangamma, GaussianToGaussianGamma)) dynamics = dynamics_gaussiangamma.parents[0] assert(isinstance(dynamics._moments, GaussianMoments)) self.A_node = dynamics if len(args) == 0: raise NotImplementedError() elif len(args) == 1: self.A_rotator = args[0] else: raise ValueError("Wrong number of arguments") self.N = X.dims[0][0]
[docs] def nodes(self): return [self.X_node] + self.A_rotator.nodes()
[docs] def rotate(self, R, inv=None, logdet=None): if inv is None: inv = np.linalg.inv(R) if logdet is None: logdet = np.linalg.slogdet(R)[1] self.X_node.rotate(R, inv=inv, logdet=logdet) from scipy.linalg import block_diag if len(self.X_node.parents) >= 3: input_shape = self.X_node.parents[2].dims[0] input_len = input_shape[-1] I = np.identity(input_len) else: I = np.identity(0) self.A_rotator.rotate(block_diag(inv.T, I), inv=block_diag(R.T, I), logdet=-logdet, Q=R)
def _computations_for_A_and_X(self, XpXn, XpXp): # Get moments of the state dynamics matrix (A, AA) = self.A_node.get_moments() # Make sure time axis is in the arrays A = misc.atleast_nd(A, 3) AA = misc.atleast_nd(AA, 4) CovA = AA - A[...,:,np.newaxis]*A[...,np.newaxis,:] # # Expectations with respect to A and X # # TODO: In case A does not depend on time, use a bit more efficient # formulas # Compute: \sum_n <A_n> <x_{n-1} x_n^T> A_XpXn = np.einsum('...nik,...nkj->...ij', A, XpXn) A_XpXn = sum_to_plates(A_XpXn, (), ndim=2, plates_from=self.X_node.plates) # Compute: \sum_n <A_n> <x_{n-1} x_{n-1}^T> <A_n>^T A_XpXp = np.einsum('...nik,...nkj->...nij', A, XpXp) A_XpXp_A = np.einsum('...nik,...njk->...ij', A_XpXp, A) A_XpXp_A = sum_to_plates(A_XpXp_A, (), ndim=2, plates_from=self.X_node.plates) # Compute: \sum_n tr(CovA_n <x_{n-1} x_{n-1}^T>) CovA_XpXp = np.einsum('...ndij,...nij->...d', CovA, XpXp) CovA_XpXp = sum_to_plates(CovA_XpXp, (), ndim=1, plates_from=self.X_node.plates) return (A_XpXn, A_XpXp_A, CovA_XpXp)
[docs] def setup(self): """ This method should be called just before optimization. """ # Get moments of X (X, XnXn, XpXn) = self.X_node.get_moments() # TODO/FIXME: Sum to plates of A/CovA XpXp = XnXn[...,:-1,:,:] # Add input signals if len(self.X_node.parents) >= 3: (U, UU) = self.X_node.parents[2].get_moments() UXn = linalg.outer(U, X[...,1:,:]) UXp = linalg.outer(U, X[...,:-1,:]) XpXn = np.concatenate([XpXn, UXn], axis=-2) XpXp = np.concatenate( [ np.concatenate([XpXp, linalg.transpose(UXp)], axis=-1), np.concatenate([UXp, UU], axis=-1) ], axis=-2 ) # # Expectations with respect to X # self.X0 = X[...,0,:] self.X0X0 = XnXn[...,0,:,:] #self.XnXn = np.sum(XnXn[...,1:,:,:], axis=-3) self.XnXn = sum_to_plates(XnXn[...,1:,:,:], (), plates_from=self.X_node.plates + (self.N-1,), ndim=2) # Get moments of the fixed parameter nodes Lambda_mu = self.X_node.parents[0].get_moments()[0] self.Lambda = self.X_node.parents[0].get_moments()[2] #self.Lambda = self.X_node.parents[1].get_moments()[0] self.Lambda_mu_X0 = linalg.outer(Lambda_mu, self.X0) self.Lambda_mu_X0 = sum_to_plates(self.Lambda_mu_X0, (), plates_from=self.X_node.plates, ndim=2) # # Prepare the rotation for A # (self.A_XpXn, self.A_XpXp_A, self.CovA_XpXp) = self._computations_for_A_and_X(XpXn, XpXp) self.A_rotator.setup(plate_axis=-1)
# Innovation noise is assumed to be I #self.v = self.X_node.parents[3].get_moments()[0] def _compute_bound(self, R, logdet=None, inv=None, gradient=False, terms=False): """ Rotate q(X) as X->RX: q(X)=N(R*mu, R*Cov*R') Assume: :math:`p(\mathbf{X}) = \prod^M_{m=1} N(\mathbf{x}_m|0, \mathbf{\Lambda})` Assume unit innovation noise covariance. """ # TODO/FIXME: X and alpha should NOT contain observed values!! Check # that. # Assume constant mean and precision matrix over plates.. if inv is None: invR = np.linalg.inv(R) else: invR = inv if logdet is None: logdetR = np.linalg.slogdet(R)[1] else: logdetR = logdet # Transform moments of X and A: Lambda_R_X0X0 = sum_to_plates(dot(self.Lambda, R, self.X0X0), (), plates_from=self.X_node.plates, ndim=2) R_XnXn = dot(R, self.XnXn) RA_XpXp_A = dot(R, self.A_XpXp_A) sumr = np.sum(R, axis=0) R_CovA_XpXp = sumr * self.CovA_XpXp # Compute entropy H(X) M = self.N*np.prod(self.X_node.plates) # total number of rotated vectors logH_X = random.gaussian_entropy(-2 * M * logdetR, 0) # Compute <log p(X)> yy = tracedot(R_XnXn, R.T) + tracedot(Lambda_R_X0X0, R.T) yz = tracedot(dot(R,self.A_XpXn),R.T) + tracedot(self.Lambda_mu_X0, R.T) zz = tracedot(RA_XpXp_A, R.T) + np.einsum('...k,...k->...', R_CovA_XpXp, sumr) logp_X = random.gaussian_logpdf(yy, yz, zz, 0, 0) # Compute the bound if terms: bound = {self.X_node: logp_X + logH_X} else: bound = logp_X + logH_X if not gradient: return bound # Compute dH(X) dlogH_X = random.gaussian_entropy(-2 * M * invR.T, 0) # Compute d<log p(X)> dyy = 2 * (R_XnXn + Lambda_R_X0X0) dyz = dot(R, self.A_XpXn + self.A_XpXn.T) + self.Lambda_mu_X0 dzz = 2 * (RA_XpXp_A + R_CovA_XpXp[None,:]) dlogp_X = random.gaussian_logpdf(dyy, dyz, dzz, 0, 0) if terms: d_bound = {self.X_node: dlogp_X + dlogH_X} else: d_bound = ( + dlogp_X + dlogH_X ) return (bound, d_bound)
[docs] def bound(self, R, logdet=None, inv=None): if inv is None: inv = np.linalg.inv(R) if logdet is None: logdet = np.linalg.slogdet(R)[1] (bound_X, d_bound_X) = self._compute_bound(R, logdet=logdet, inv=inv, gradient=True) # Compute cost and gradient from A # Handle possible input signals from scipy.linalg import block_diag if len(self.X_node.parents) >= 3: input_shape = self.X_node.parents[2].dims[0] input_len = input_shape[-1] I = np.identity(input_len) else: I = np.identity(0) (bound_A, dR_bound_A, dQ_bound_A) = self.A_rotator.bound(block_diag(inv.T, I), inv=block_diag(R.T, I), logdet=-logdet, Q=R) # Ignore input signals gradients D = self.X_node.dims[0][-1] dR_bound_A = dR_bound_A[...,:D,:D] dR_bound_A = -dot(inv.T, dR_bound_A.T, inv.T) # Compute the bound bound = bound_X + bound_A d_bound = d_bound_X + dR_bound_A + dQ_bound_A return (bound, d_bound)
[docs] def get_bound_terms(self, R, logdet=None, inv=None): if inv is None: inv = np.linalg.inv(R) if logdet is None: logdet = np.linalg.slogdet(R)[1] # Handle possible input signals from scipy.linalg import block_diag if len(self.X_node.parents) >= 3: input_shape = self.X_node.parents[2].dims[0] input_len = input_shape[-1] I = np.identity(input_len) else: I = np.identity(0) terms_A = self.A_rotator.get_bound_terms(block_diag(inv.T, I), inv=block_diag(R.T, I), logdet=-logdet, Q=R) terms_X = self._compute_bound(R, logdet=logdet, inv=inv, gradient=False, terms=True) terms_X.update(terms_A) return terms_X
[docs]class RotateVaryingMarkovChain(RotateGaussianMarkovChain): r""" Rotation for :class:`bayespy.nodes.SwitchingGaussianMarkovChain` Assume the following model. Constant, unit isotropic innovation noise. :math:`A_n = \sum_k B_k s_{kn}` Gaussian B: (1,D) x (D,K) Gaussian S: (N,1) x (K) MC X: () x (N+1,D) No plates for X. """
[docs] def __init__(self, X, B, S, B_rotator): self.X_node = X self.B_node = B self.S_node = S self.B_rotator = B_rotator if len(S.plates) > 0 and S.plates[-1] > 1: raise ValueError("The length of the last plate of S must be 1.") if len(B.plates) > 1 and B.plates[-2] > 1: raise ValueError("The length of the last plate of B must be 1.") if len(S.dims[0]) != 1: raise ValueError("S should have exactly one variable axis") if len(B.dims[0]) != 2: raise ValueError("B should have exactly two variable axes") super().__init__(X, B_rotator)
def _computations_for_A_and_X(self, XpXn, XpXp): # Get moments of B and S (B, BB) = self.B_node.get_moments() CovB = BB - B[...,:,:,None,None]*B[...,None,None,:,:] u_S = self.S_node.get_moments() S = u_S[0] SS = u_S[1] # # Expectations with respect to A and X # # TODO/FIXME: If S and B have overlapping plates, then these will give # wrong results, because those plates of S are summed before multiplying # by the plates of B. There should be some "smart einsum" function which # would compute sum-multiplys intelligently given a number of inputs. # Compute: \sum_n <A_n> <x_{n-1} x_n^T> # Axes: (N, D, D, D, K) S_XpXn = misc.sum_multiply(S[...,None,None,:], XpXn[...,:,None,:,:,None], axis=(-3,-2,-1), sumaxis=False) A_XpXn = misc.sum_multiply(B[...,:,:,None,:], S_XpXn[...,:,:,:], axis=(-4,-2), sumaxis=False) # Compute: \sum_n <A_n> <x_{n-1} x_{n-1}^T> <A_n>^T # Axes: (N, D, D, D, K, D, K) SS_XpXp = misc.sum_multiply(SS[...,None,:,None,:], XpXp[...,None,:,None,:,None], axis=(-4,-3,-2,-1), sumaxis=False) B_SS_XpXp = misc.sum_multiply(B[...,:,:,:,None,None], SS_XpXp[...,:,:,:,:], axis=(-4,-3), sumaxis=True) A_XpXp_A = misc.sum_multiply(B_SS_XpXp[...,:,None,:,:], B[...,None,:,:,:], axis=(-4,-3), sumaxis=False) # Compute: \sum_n tr(CovA_n <x_{n-1} x_{n-1}^T>) # Axes: (D,D,K,D,K) CovA_XpXp = misc.sum_multiply(CovB, SS_XpXp, axis=(-5,), sumaxis=False) return (A_XpXn, A_XpXp_A, CovA_XpXp)
[docs]class RotateSwitchingMarkovChain(RotateGaussianMarkovChain): """ Rotation for :class:`bayespy.nodes.VaryingGaussianMarkovChain` Assume the following model. Constant, unit isotropic innovation noise. :math:`A_n = B_{z_n}` Gaussian B: (..., K, D) x (D) Categorical Z: (..., N-1) x (K) GaussianMarkovChain X: (...) x (N,D) No plates for X. """
[docs] def __init__(self, X, B, Z, B_rotator): self.X_node = X self.B_node = B self.Z_node = Z._moments.get_converter(CategoricalMoments)(Z) self.B_rotator = B_rotator (N,D) = self.X_node.dims[0] K = self.Z_node.dims[0][0] if len(self.Z_node.plates) == 0 and self.Z_node.plates[-1] != N-1: raise ValueError("Incorrect plate length in Z") if self.B_node.plates[-2:] != (K,D): raise ValueError("Incorrect plates in B") if len(self.Z_node.dims[0]) != 1: raise ValueError("Z should have exactly one variable axis") if len(self.B_node.dims[0]) != 1: raise ValueError("B should have exactly one variable axes") super().__init__(X, B_rotator)
def _computations_for_A_and_X(self, XpXn, XpXp): # Get moments of B and Z (B, BB) = self.B_node.get_moments() CovB = BB - B[...,:,None]*B[...,None,:] u_Z = self.Z_node.get_moments() Z = u_Z[0] # # Expectations with respect to A and X # # Compute: \sum_n <A_n> <x_{n-1} x_n^T> Z_XpXn = np.einsum('...nij,...nk->...kij', XpXn, Z) A_XpXn = np.einsum('...kil,...klj->...ij', B, Z_XpXn) A_XpXn = sum_to_plates(A_XpXn, (), ndim=2, plates_from=self.X_node.plates) # Compute: \sum_n <A_n> <x_{n-1} x_{n-1}^T> <A_n>^T Z_XpXp = np.einsum('...nij,...nk->...kij', XpXp, Z) B_Z_XpXp = np.einsum('...kil,...klj->...kij', B, Z_XpXp) A_XpXp_A = np.einsum('...kil,...kjl->...ij', B_Z_XpXp, B) A_XpXp_A = sum_to_plates(A_XpXp_A, (), ndim=2, plates_from=self.X_node.plates) # Compute: \sum_n tr(CovA_n <x_{n-1} x_{n-1}^T>) CovA_XpXp = np.einsum('...kij,...kdij->...d', Z_XpXp, CovB) CovA_XpXp = sum_to_plates(CovA_XpXp, (), ndim=1, plates_from=self.X_node.plates) return (A_XpXn, A_XpXp_A, CovA_XpXp)
[docs]class RotateMultiple(): r""" Identical parameter expansion for several nodes simultaneously Performs the same rotation for multiple nodes and combines the cost effect. """
[docs] def __init__(self, *rotators): self.rotators = rotators
[docs] def nodes(self): nodes = [] for rotator in self.rotators: nodes += rotator.nodes() return nodes
[docs] def rotate(self, R, inv=None, logdet=None): for rotator in self.rotators: rotator.rotate(R, inv=inv, logdet=logdet)
[docs] def setup(self): for rotator in self.rotators: rotator.setup()
[docs] def bound(self, R, logdet=None, inv=None): bound = 0 dbound = 0 for rotator in self.rotators: (b, db) = rotator.bound(R, logdet=logdet, inv=inv) bound = bound + b dbound = dbound + db return (bound, dbound)
[docs] def get_bound_terms(self, *args, **kwargs): d = dict() for rotator in self.rotators: d.update(rotator.get_bound_terms(*args, **kwargs)) return d