Source code for bayespy.inference.vmp.transformations

################################################################################
# Copyright (C) 2013-2014 Jaakko Luttinen
#
# This file is licensed under the MIT License.
################################################################################


import numpy as np
import warnings
import scipy

from bayespy.utils import optimize
from bayespy.utils import random
from bayespy.utils import linalg
from bayespy.utils import misc

from bayespy.utils.linalg import dot, tracedot

from .nodes import gaussian

from .nodes.categorical import CategoricalMoments

[docs]class RotationOptimizer():
    r"""
    Optimizer for rotation parameter expansion in state-space models

    Rotates one model block with :math:`\mathbf{R}` and one model block with
    :math:`\mathbf{R}^{-1}`.

    Parameters
    ----------

    block1 : rotator object

        The first rotation parameter expansion object

    block2 : rotator object

        The second rotation parameter expansion object

    D : int

        Dimensionality of the latent space

    References
    ----------

    :cite:`Luttinen:2010`, :cite:`Luttinen:2013`

    """

[docs]    def __init__(self, block1, block2, D):
        self.block1 = block1
        self.block2 = block2
        self.D = D

[docs]    def rotate(self, 
               maxiter=10, 
               check_gradient=False,
               verbose=False,
               check_bound=False):
        """
        Optimize the rotation of two separate model blocks jointly.

        If some variable is the dot product of two Gaussians, rotating the two
        Gaussians optimally can make the inference algorithm orders of magnitude
        faster.

        First block is rotated with :math:`\mathbf{R}` and the second with
        :math:`\mathbf{R}^{-T}`.

        Blocks must have methods: `bound(U,s,V)` and `rotate(R)`.
        """

        I = np.identity(self.D)
        piv = np.arange(self.D)
        
        def cost(r):

            # Make vector-r into matrix-R
            R = np.reshape(r, (self.D,self.D))

            # Compute SVD
            invR = np.linalg.inv(R)
            logdetR = np.linalg.slogdet(R)[1]

            # Compute lower bound terms
            (b1,db1) = self.block1.bound(R, logdet=logdetR, inv=invR)
            (b2,db2) = self.block2.bound(invR.T, logdet=-logdetR, inv=R.T)

            # Apply chain rule for the second gradient:
            # d b(invR.T) 
            # = tr(db.T * d(invR.T)) 
            # = tr(db * d(invR))
            # = -tr(db * invR * (dR) * invR) 
            # = -tr(invR * db * invR * dR)
            db2 = -dot(invR.T, db2.T, invR.T)

            # Compute the cost function
            c = -(b1+b2)
            dc = -(db1+db2)

            return (c, np.ravel(dc))

        def get_bound_terms(r, gradient=False):
            """
            Returns a dictionary of bound terms for the nodes.
            """
            # Gradient not yet implemented..
            if gradient:
                raise NotImplementedError()
            
            # Make vector-r into matrix-R
            R = np.reshape(r, (self.D,self.D))

            # Compute SVD
            invR = np.linalg.inv(R)
            logdetR = np.linalg.slogdet(R)[1]

            # Compute lower bound terms
            dict1 = self.block1.get_bound_terms(R, 
                                                logdet=logdetR, 
                                                inv=invR)
            dict2 = self.block2.get_bound_terms(invR.T, 
                                                logdet=-logdetR, 
                                                inv=R.T)

            if not gradient:
                dict1.update(dict2)
                return dict1
            else:
                terms = dict1[0].copy()
                terms = terms.update(dict2[0])
                grad = dict1[1].copy()
                grad = grad.update(dict2[1])
                return (terms, grad)

        def get_true_bound_terms():
            nodes = set(self.block1.nodes()) | set(self.block2.nodes())
            D = {}
            # TODO/FIXME: Also compute bound for child nodes as they could be
            # affected in practice although they shouldn't. Just checking that.
            for node in nodes:
                L = node.lower_bound_contribution()
                D[node] = L
            return D


        self.block1.setup()
        self.block2.setup()
        
        if check_gradient:
            R = np.random.randn(self.D, self.D)
            err = optimize.check_gradient(cost, np.ravel(R), 
                                          verbose=verbose)[1]
            if err > 1e-5:
                warnings.warn("Rotation gradient has relative error %g" % err)

        # Initial rotation is identity matrix
        r0 = np.ravel(np.identity(self.D))

        (cost_begin, _) = cost(r0)
        if check_bound:
            bound_terms_begin = get_bound_terms(r0)
            true_bound_terms_begin = get_true_bound_terms()

        # Run optimization
        r = optimize.minimize(cost, r0, maxiter=maxiter, verbose=verbose)

        (cost_end, _) = cost(r)
        if check_bound:
            bound_terms_end = get_bound_terms(r)

        # Apply the optimal rotation
        R = np.reshape(r, (self.D,self.D))
        invR = np.linalg.inv(R)
        logdetR = np.linalg.slogdet(R)[1]
        self.block1.rotate(R, inv=invR, logdet=logdetR)
        self.block2.rotate(invR.T, inv=R.T, logdet=-logdetR)

        # Check that the cost function and the true lower bound changed equally
        cost_change = cost_end - cost_begin
        
        # Check that we really have improved the bound.
        if cost_change > 0:
            warnings.warn("Rotation optimization made the cost function worse "
                          "by %g. Probably a bug in the gradient of the "
                          "rotation functions."
                          % (cost_change,))
                
        if check_bound:
            true_bound_terms_end = get_true_bound_terms()
            bound_change = 0
            for node in bound_terms_begin.keys():
                node_bound_change = (bound_terms_end[node] 
                                    - bound_terms_begin[node])
                bound_change += node_bound_change
                true_node_bound_change = 0
                try:
                    true_node_bound_change += (true_bound_terms_end[node] 
                                               - true_bound_terms_begin[node])
                except KeyError:
                    raise Exception("The node %s is part of the "
                                    "transformation but not part of the "
                                    "model. Check your VB construction." 
                                    % node.name)
                if not np.allclose(node_bound_change, true_node_bound_change):
                    warnings.warn("Rotation cost function is not consistent "
                                  "with the true lower bound for node %s. "
                                  "Bound changed %g but optimized function "
                                  "changed %g."  
                                  % (node.name,
                                     true_node_bound_change,
                                     node_bound_change))

            # Check that we really have improved the bound.
            # TODO/FIXME: Also compute bound for child nodes as they could be
            # affected in practice although they shouldn't. Just checking that.
            if bound_change < 0:
                warnings.warn("Rotation made the true lower bound worse by %g. "
                              "Probably a bug in the rotation functions."
                              % (bound_change,))
                

[docs]class RotateGaussian():
    r"""
    Rotation parameter expansion for :class:`bayespy.nodes.Gaussian`
    """

[docs]    def __init__(self, X):
        self.X = X

[docs]    def rotate(self, R, inv=None, logdet=None):
        self.X.rotate(R, inv=inv, logdet=logdet)

[docs]    def setup(self):
        """
        This method should be called just before optimization.
        """
        
        mask = self.X.mask[...,np.newaxis,np.newaxis]

        # Number of plates
        self.N = self.X.plates[0] #np.sum(mask)

        # Compute the sum <XX> over plates
        self.XX = misc.sum_multiply(self.X.get_moments()[1],
                                    mask,
                                    axis=(-1,-2),
                                    sumaxis=False,
                                    keepdims=False)
        # Parent's moments
        self.Lambda = self.X.parents[1].get_moments()[0]

    def _compute_bound(self, R, logdet=None, inv=None, gradient=False):
        
        """
        Rotate q(X) as X->RX: q(X)=N(R*mu, R*Cov*R')

        Assume:
        :math:`p(\mathbf{X}) = \prod^M_{m=1} 
               N(\mathbf{x}_m|0, \mathbf{\Lambda})`
        """

        # TODO/FIXME: X and alpha should NOT contain observed values!! Check
        # that.

        # TODO/FIXME: Allow non-zero prior mean!

        # Assume constant mean and precision matrix over plates..

        # Compute rotated moments
        XX_R = dot(R, self.XX, R.T)

        inv_R = inv
        logdet_R = logdet

        # Compute entropy H(X)
        logH_X = random.gaussian_entropy(-2*self.N*logdet_R, 
                                         0)

        # Compute <log p(X)>
        logp_X = random.gaussian_logpdf(np.vdot(XX_R, self.Lambda),
                                        0,
                                        0,
                                        0,
                                        0)

        # Compute the bound
        if terms:
            bound = {self.X: bound}
        else:
            bound = logp_X + logH_X

        if not gradient:
            return bound

        # Compute dH(X)
        dlogH_X = random.gaussian_entropy(-2*self.N*inv_R.T,
                                          0)

        # Compute d<log p(X)>
        dXX = 2*dot(self.Lambda, R, self.XX)
        dlogp_X = random.gaussian_logpdf(dXX,
                                         0,
                                         0,
                                         0,
                                         0)

        if terms:
            d_bound = {self.X: dlogp_X + dlogH_X}
        else:
            d_bound = dlogp_X + dlogH_X

        return (bound, d_bound)


[docs]    def bound(self, R, logdet=None, inv=None):
        return self._compute_bound(R, 
                                   logdet=logdet,
                                   inv=inv,
                                   gradient=True)

[docs]    def get_bound_terms(self, R, logdet=None, inv=None):
        return self._compute_bound(R, 
                                   logdet=logdet,
                                   inv=inv,
                                   gradient=False,
                                   terms=True)
        
[docs]    def nodes(self):
        return [self.X]


    
def covariance_to_variance(C, ndim=1, covariance_axis=None):
    # Force None to empty list
    if covariance_axis is None:
        covariance_axis = []

    # Force a list from integer
    if isinstance(covariance_axis, int):
        covariance_axis = [covariance_axis]

    # Force positive axis indices
    covariance_axis = [axis + ndim if axis < 0 else axis 
                       for axis in covariance_axis]
    
    # Make a set of the axes
    covariance_axis = set(covariance_axis)

    keys = [i+ndim if i in covariance_axis else i for i in range(ndim)]
    keys += [i+2*ndim if i in covariance_axis else i for i in range(ndim)]
    out_keys = sorted(list(set(keys)))

    return np.einsum(C, [Ellipsis]+keys, [Ellipsis]+out_keys)

def sum_to_plates(V, plates_to, plates_from=None, ndim=0):
    if ndim == 0:
        if plates_from is not None:
            r = gaussian.Gaussian.broadcasting_multiplier(plates_from,
                                                          np.shape(V))
        else:
            r = 1
        return r * misc.sum_to_shape(V, plates_to)
    else:
        dims_V = np.shape(V)[-ndim:]
        plates_V = np.shape(V)[:-ndim]
        shape_to = tuple(plates_to) + dims_V
        if plates_from is not None:
            r = gaussian.Gaussian.broadcasting_multiplier(plates_from, plates_V)
        else:
            r = 1
        return r * misc.sum_to_shape(V, shape_to)

[docs]class RotateGaussianARD():
    """
    Rotation parameter expansion for :class:`bayespy.nodes.GaussianARD`
    
    The model:

    alpha ~ N(a, b)
    X ~ N(mu, alpha)

    X can be an array (e.g., GaussianARD).

    Transform q(X) and q(alpha) by rotating X.

    Requirements:
    * X and alpha do not contain any observed values
    """
[docs]    def __init__(self, X, *alpha, axis=-1, precompute=False, subset=None):
        """
        Precompute tells whether to compute some moments once in the setup
        function instead of every time in the bound function.  However, they are
        computed a bit differently in the bound function so it can be useful
        too. Precomputation is probably beneficial only when there are large
        axes that are not rotated (by R nor Q) and they are not contained in the
        plates of alpha, and the dimensions for R and Q are quite small.
        """
        
        self.precompute = precompute
        
        self.node_parent = X.parents[0]
        if len(alpha) == 0:
            self.update_alpha = False
        elif len(alpha) == 1:
            self.node_alpha = alpha[0]
            self.update_alpha = True
        else:
            raise ValueError("Too many arguments")
        self.node_X = X
        #self.node_mu = X.parents[0]
        self.ndim = len(X.dims[0])

        # Force negative rotation axis indexing
        if not isinstance(axis, int):
            raise ValueError("Axis must be integer")
        if axis >= 0:
            axis -= self.ndim
        if axis < -self.ndim or axis >= 0:
            raise ValueError("Axis out of bounds")
        self.axis = axis

        # Allow rotation of only subset of elements/slices
        self.D = X.dims[0][axis]
        if subset is None:
            #self.subset = np.ones(self.D, dtype=bool)
            self.subset = None #tuple(range(self.D))
        else:
            #self.subset = tuple(range(self.D))
            self.subset = subset #self.subset[subset]
            if axis != -1:
                raise NotImplementedError("Subset indexing for non-last "
                                          "axis not yet implemented")
            ## self.subset = np.zeros(self.D, dtype=bool)
            ## self.subset[list(subset)] = True


[docs]    def nodes(self):
        if self.update_alpha:
            return [self.node_X, self.node_alpha]
        else:
            return [self.node_X]


    def _full_rotation_matrix(self, R):
        if self.subset is not None:
            R_full = np.identity(self.D)
            indices = np.ix_(self.subset, self.subset)
            R_full[indices] = R
            return R_full
        else:
            return R


[docs]    def rotate(self, R, inv=None, logdet=None, Q=None):

        ## R = self._full_rotation_matrix(R)
        ## if inv is not None:
        ##     inv = self._full_rotation_matrix(inv)

        self.node_X.rotate(R, 
                           inv=inv,
                           logdet=logdet,
                           subset=self.subset,
                           axis=self.axis)

        if self.plate_axis is not None:
            self.node_X.rotate_plates(Q, plate_axis=self.plate_axis)

        if self.update_alpha:
            self.node_alpha.update()

[docs]    def setup(self, plate_axis=None):
        """
        This method should be called just before optimization.

        For efficiency, sum over axes that are not in mu, alpha nor rotation.

        If using Q, set rotate_plates to True.
        """

        # Store the original plate_axis parameter for later use in other methods
        self.plate_axis = plate_axis

        # Manipulate the plate_axis parameter to suit the needs of this method
        if plate_axis is not None:
            if not isinstance(plate_axis, int):
                raise ValueError("Plate axis must be integer")
            if plate_axis >= 0:
                plate_axis -= len(self.node_X.plates)
            if plate_axis < -len(self.node_X.plates) or plate_axis >= 0:
                raise ValueError("Axis out of bounds")
            plate_axis -= self.ndim - 1 # Why -1? Because one axis is preserved!
                
        # Get the mean parameter. It will not be rotated. This assumes that mu
        # and alpha are really independent.
        (alpha_mu, alpha_mu2, alpha, _) = self.node_parent.get_moments()
        (X, XX) = self.node_X.get_moments()

        #
        mu = alpha_mu / alpha
        mu2 = alpha_mu2 / alpha
        # For simplicity, force mu to have the same shape as X
        mu = mu * np.ones(self.node_X.dims[0])
        mu2 = mu2 * np.ones(self.node_X.dims[0])
        ## (mu, mumu) = gaussian.reshape_gaussian_array(self.node_mu.dims[0],
        ##                                              self.node_X.dims[0],
        ##                                              mu,
        ##                                              mumu)

        # Take diagonal of covariances to variances for axes that are not in R
        # (and move those axes to be the last)
        XX = covariance_to_variance(XX,
                                    ndim=self.ndim,
                                    covariance_axis=self.axis)
        ## mumu = covariance_to_variance(mumu,
        ##                               ndim=self.ndim, 
        ##                               covariance_axis=self.axis)
        
        # Move axes of X and mu and compute their outer product
        X = misc.moveaxis(X, self.axis, -1)
        mu = misc.moveaxis(mu, self.axis, -1)
        mu2 = misc.moveaxis(mu2, self.axis, -1)
        Xmu = linalg.outer(X, mu, ndim=1)
        D = np.shape(X)[-1]
        
        # Move axes of alpha related variables
        def safe_move_axis(x):
            if np.ndim(x) >= -self.axis:
                return misc.moveaxis(x, self.axis, -1)
            else:
                return x[...,np.newaxis]
        if self.update_alpha:
            a = safe_move_axis(self.node_alpha.phi[1])
            a0 = safe_move_axis(self.node_alpha.parents[0].get_moments()[0])
            b0 = safe_move_axis(self.node_alpha.parents[1].get_moments()[0])
            plates_alpha = list(self.node_alpha.plates)
        else:
            alpha = safe_move_axis(self.node_parent.get_moments()[2])
            plates_alpha = list(self.node_parent.get_shape(2))

        # Move plates of alpha for R
        if len(plates_alpha) >= -self.axis:
            plate = plates_alpha.pop(self.axis)
            plates_alpha.append(plate)
        else:
            plates_alpha.append(1)
            
        plates_X = list(self.node_X.get_shape(0))
        plates_X.pop(self.axis)

        def sum_to_alpha(V, ndim=2):
            # TODO/FIXME: This could be improved so that it is not required to
            # explicitly repeat to alpha plates. Multiplying by ones was just a
            # simple bug fix.
            return sum_to_plates(V * np.ones(plates_alpha[:-1]+ndim*[1]),
                                 plates_alpha[:-1],
                                 ndim=ndim,
                                 plates_from=plates_X)
        
        if plate_axis is not None:
            # Move plate axis just before the rotated dimensions (which are
            # last)
            def safe_move_plate_axis(x, ndim):
                if np.ndim(x)-ndim >= -plate_axis:
                    return misc.moveaxis(x, 
                                         plate_axis-ndim,
                                         -ndim-1)
                else:
                    inds = (Ellipsis,None) + ndim*(slice(None),)
                    return x[inds]
            X = safe_move_plate_axis(X, 1)
            mu = safe_move_plate_axis(mu, 1)
            XX = safe_move_plate_axis(XX, 2)
            mu2 = safe_move_plate_axis(mu2, 1)
            if self.update_alpha:
                a = safe_move_plate_axis(a, 1)
                a0 = safe_move_plate_axis(a0, 1)
                b0 = safe_move_plate_axis(b0, 1)
            else:
                alpha = safe_move_plate_axis(alpha, 1)
            # Move plates of X and alpha
            plate = plates_X.pop(plate_axis)
            plates_X.append(plate)
            if len(plates_alpha) >= -plate_axis+1:
                plate = plates_alpha.pop(plate_axis-1)
            else:
                plate = 1
            plates_alpha = plates_alpha[:-1] + [plate] + plates_alpha[-1:]

            CovX = XX - linalg.outer(X, X)
            self.CovX = sum_to_plates(CovX,
                                      plates_alpha[:-2],
                                      ndim=3,
                                      plates_from=plates_X[:-1])
            # Broadcast mumu to ensure shape
            #mumu = np.ones(np.shape(XX)[-3:]) * mumu
            mu2 = mu2 * np.ones(np.shape(X)[-2:])
            self.mu2 = sum_to_alpha(mu2, ndim=1)

            if self.precompute:
                # Precompute some stuff for the gradient of plate rotation
                #
                # NOTE: These terms may require a lot of memory if alpha has the
                # same or almost the same plates as X.
                self.X_X = sum_to_plates(X[...,:,:,None,None] *
                                         X[...,None,None,:,:],
                                         plates_alpha[:-2],
                                         ndim=4,
                                         plates_from=plates_X[:-1])
                self.X_mu = sum_to_plates(X[...,:,:,None,None] *
                                          mu[...,None,None,:,:],
                                          plates_alpha[:-2],
                                          ndim=4,
                                          plates_from=plates_X[:-1])
            else:
                self.X = X
                self.mu = mu
                    
        else:
            # Sum axes that are not in the plates of alpha
            self.XX = sum_to_alpha(XX)
            self.mu2 = sum_to_alpha(mu2, ndim=1)
            self.Xmu = sum_to_alpha(Xmu)
            
        
        if self.update_alpha:
            self.a = a
            self.a0 = a0
            self.b0 = b0
        else:
            self.alpha = alpha

        self.plates_X = plates_X
        self.plates_alpha = plates_alpha

        # Take only a subset of the matrix for rotation
        if self.subset is not None:
            if self.precompute:
                raise NotImplementedError("Precomputation not implemented when "
                                          "using a subset")
            # from X
            self.X = self.X[...,self.subset]
            self.mu2 = self.mu2[...,self.subset]
            if plate_axis is not None:
                # from CovX
                inds = []
                for i in range(np.ndim(self.CovX)-2):
                    inds.append(range(np.shape(self.CovX)[i]))
                inds.append(self.subset)
                inds.append(self.subset)
                indices = np.ix_(*inds)
                self.CovX = self.CovX[indices]
                # from mu
                self.mu = self.mu[...,self.subset]
            else:
                # from XX
                inds = []
                for i in range(np.ndim(self.XX)-2):
                    inds.append(range(np.shape(self.XX)[i]))
                inds.append(self.subset)
                inds.append(self.subset)
                indices = np.ix_(*inds)
                self.XX = self.XX[indices]
                # from Xmu
                self.Xmu = self.Xmu[...,self.subset]
            # from alpha
            if self.update_alpha:
                if np.shape(self.a)[-1] > 1:
                    self.a = self.a[...,self.subset]
                if np.shape(self.a0)[-1] > 1:
                    self.a0 = self.a0[...,self.subset]
                if np.shape(self.b0)[-1] > 1:
                    self.b0 = self.b0[...,self.subset]
            else:
                if np.shape(self.alpha)[-1] > 1:
                    self.alpha = self.alpha[...,self.subset]

            self.plates_alpha[-1] = min(self.plates_alpha[-1], len(self.subset))
        ##     # from mu
        ##     # from alpha
        ##     alpha_mu = alpha_mu[...,self.subset]
        ##     alpha_mu2 = alpha_mu2[...,self.subset]
        ##     alpha = alpha[...,self.subset]
        ##     dims = list(self.node_X.dims[0])
        ##     dims[-1] = len(self.subset)
        ## else:
        ##     dims = list(self.node_X.dims[0])


    def _compute_bound(self, R, logdet=None, inv=None, Q=None, gradient=False, terms=False):
        """
        Rotate q(X) and q(alpha).

        Assume:
        p(X|alpha) = prod_m N(x_m|0,diag(alpha))
        p(alpha) = prod_d G(a_d,b_d)
        """

        ## R = self._full_rotation_matrix(R)
        ## if inv is not None:
        ##     inv = self._full_rotation_matrix(inv)

        #
        # Transform the distributions and moments
        #

        plates_alpha = self.plates_alpha
        plates_X = self.plates_X
        
        # Compute rotated second moment
        if self.plate_axis is not None:
            # The plate axis has been moved to be the last plate axis

            if Q is None:
                raise ValueError("Plates should be rotated but no Q give")

            # Transform covariance
            sumQ = np.sum(Q, axis=0)
            QCovQ = sumQ[:,None,None]**2 * self.CovX
            
            # Rotate plates
            if self.precompute:
                QX_QX = np.einsum('...kalb,...ik,...il->...iab', self.X_X, Q, Q)
                XX = QX_QX + QCovQ
                XX = sum_to_plates(XX,
                                   plates_alpha[:-1],
                                   ndim=2)
                Xmu = np.einsum('...kaib,...ik->...iab', self.X_mu, Q)
                Xmu = sum_to_plates(Xmu,
                                   plates_alpha[:-1],
                                   ndim=2)
            else:
                X = self.X
                mu = self.mu
                QX = np.einsum('...ik,...kj->...ij', Q, X)
                XX = (sum_to_plates(QCovQ,
                                    plates_alpha[:-1],
                                    ndim=2) +
                      sum_to_plates(linalg.outer(QX, QX),
                                    plates_alpha[:-1],
                                    ndim=2,
                                    plates_from=plates_X))
                Xmu = sum_to_plates(linalg.outer(QX, self.mu),
                                    plates_alpha[:-1],
                                    ndim=2,
                                    plates_from=plates_X)

            mu2 = self.mu2
            D = np.shape(XX)[-1]
            logdet_Q = D * np.log(np.abs(sumQ))

        else:
            XX = self.XX
            mu2 = self.mu2
            Xmu = self.Xmu
            logdet_Q = 0

        # Compute transformed moments
        #mu2 = np.einsum('...ii->...i', mu2)
        RXmu = np.einsum('...ik,...ki->...i', R, Xmu)
        RXX = np.einsum('...ik,...kj->...ij', R, XX)
        RXXR = np.einsum('...ik,...ik->...i', RXX, R)

        # <(X-mu) * (X-mu)'>_R
        XmuXmu = (RXXR - 2*RXmu + mu2)

        D = np.shape(R)[0]

        # Compute q(alpha)
        if self.update_alpha:
            # Parameters
            a0 = self.a0
            b0 = self.b0
            a = self.a
            b = b0 + 0.5*sum_to_plates(XmuXmu,
                                       plates_alpha,
                                       plates_from=None,
                                       ndim=0)
            # Some expectations
            alpha = a / b
            logb = np.log(b)
            logalpha = -logb # + const
            b0_alpha = b0 * alpha
            a0_logalpha = a0 * logalpha
        else:
            alpha = self.alpha
            logalpha = 0
        
        #
        # Compute the cost
        #

        def sum_plates(V, *plates):
            full_plates = misc.broadcasted_shape(*plates)
            
            r = self.node_X.broadcasting_multiplier(full_plates, np.shape(V))
            return r * np.sum(V)

        XmuXmu_alpha = XmuXmu * alpha

        if logdet is None:
            logdet_R = np.linalg.slogdet(R)[1]
            inv_R = np.linalg.inv(R)
        else:
            logdet_R = logdet
            inv_R = inv

        # Compute entropy H(X)
        logH_X = random.gaussian_entropy(-2*sum_plates(logdet_R + logdet_Q,
                                                             plates_X),
                                               0)

        # Compute <log p(X|alpha)>
        logp_X = random.gaussian_logpdf(sum_plates(XmuXmu_alpha,
                                                         plates_alpha[:-1] + [D]),
                                              0,
                                              0,
                                              sum_plates(logalpha,
                                                         plates_X + [D]),
                                              0)

        if self.update_alpha:

            # Compute entropy H(alpha)
            # This cancels out with the log(alpha) term in log(p(alpha))
            logH_alpha = 0

            # Compute <log p(alpha)>
            logp_alpha = random.gamma_logpdf(sum_plates(b0_alpha,
                                                        plates_alpha),
                                             0,
                                             sum_plates(a0_logalpha,
                                                        plates_alpha),
                                             0,
                                             0)
        else:
            logH_alpha = 0
            logp_alpha = 0

        # Compute the bound
        if terms:
            bound = {self.node_X: logp_X + logH_X}
            if self.update_alpha:
                bound.update({self.node_alpha: logp_alpha + logH_alpha})
        else:
            bound = (0
            + logp_X
            + logp_alpha
            + logH_X
            + logH_alpha
                     )

        if not gradient:
            return bound

        #
        # Compute the gradient with respect R
        #

        broadcasting_multiplier = self.node_X.broadcasting_multiplier
        def sum_plates(V, plates):
            ones = np.ones(np.shape(R))
            r = broadcasting_multiplier(plates, np.shape(V)[:-2])
            return r * misc.sum_multiply(V, ones,
                                         axis=(-1,-2),
                                         sumaxis=False,
                                         keepdims=False)

        D_XmuXmu = 2*RXX - 2*gaussian.transpose_covariance(Xmu)

        DXmuXmu_alpha = np.einsum('...i,...ij->...ij', 
                                  alpha,
                                  D_XmuXmu)
        if self.update_alpha:
            D_b            = 0.5 * D_XmuXmu
            XmuXmu_Dalpha  = np.einsum('...i,...i,...i,...ij->...ij', 
                                       sum_to_plates(XmuXmu,
                                                     plates_alpha,
                                                     plates_from=None,
                                                     ndim=0), 
                                       alpha, 
                                       -1/b, 
                                       D_b)
            D_b0_alpha     = np.einsum('...i,...i,...i,...ij->...ij', 
                                       b0,
                                       alpha,
                                       -1/b,
                                       D_b)
            D_logb         = np.einsum('...i,...ij->...ij', 
                                       1/b,
                                       D_b)
            D_logalpha     = -D_logb
            D_a0_logalpha  = a0 * D_logalpha
        else:
            XmuXmu_Dalpha = 0
            D_logalpha = 0

        D_XmuXmu_alpha = DXmuXmu_alpha + XmuXmu_Dalpha
        D_logR         = inv_R.T
        
        
        # Compute dH(X)
        dlogH_X = random.gaussian_entropy(-2*sum_plates(D_logR,
                                                        plates_X),
                                          0)

        # Compute d<log p(X|alpha)>
        dlogp_X = random.gaussian_logpdf(sum_plates(D_XmuXmu_alpha,
                                                    plates_alpha[:-1]),
                                         0,
                                         0,
                                         (sum_plates(D_logalpha,
                                                     plates_X)
                                          * broadcasting_multiplier((D,),
                                                             plates_alpha[-1:])),
                                         0)

        if self.update_alpha:

            # Compute dH(alpha)
            # This cancels out with the log(alpha) term in log(p(alpha))
            dlogH_alpha = 0

            # Compute d<log p(alpha)>
            dlogp_alpha = random.gamma_logpdf(sum_plates(D_b0_alpha,
                                                         plates_alpha[:-1]),
                                              0,
                                              sum_plates(D_a0_logalpha,
                                                         plates_alpha[:-1]),
                                              0,
                                              0)
        else:
            dlogH_alpha = 0
            dlogp_alpha = 0

        if terms:
            raise NotImplementedError()
            dR_bound = {self.node_X: dlogp_X + dlogH_X}
            if self.update_alpha:
                dR_bound.update({self.node_alpha: dlogp_alpha + dlogH_alpha})
        else:
            dR_bound = (0*dlogp_X
            + dlogp_X
            + dlogp_alpha
            + dlogH_X
            + dlogH_alpha
                        )

        if self.subset:
            indices = np.ix_(self.subset, self.subset)
            dR_bound = dR_bound[indices]

        if self.plate_axis is None:
            return (bound, dR_bound)

        #
        # Compute the gradient with respect to Q (if Q given)
        #

        # Some pre-computations
        Q_RCovR = np.einsum('...ik,...kl,...il,...->...i', 
                            R, 
                            self.CovX,
                            R, 
                            sumQ)
        if self.precompute:
            Xr_rX = np.einsum('...abcd,...jb,...jd->...jac', 
                               self.X_X, 
                               R, 
                               R)
            QXr_rX = np.einsum('...akj,...ik->...aij', 
                               Xr_rX, 
                               Q)
            RX_mu = np.einsum('...jk,...akbj->...jab', 
                              R, 
                              self.X_mu)

        else:
            RX = np.einsum('...ik,...k->...i', R, X)
            QXR = np.einsum('...ik,...kj->...ij', Q, RX)
            QXr_rX = np.einsum('...ik,...jk->...kij', QXR, RX)
            RX_mu = np.einsum('...ik,...jk->...kij', RX, mu)

            QXr_rX = sum_to_plates(QXr_rX,
                                   plates_alpha[:-2],
                                   ndim=3,
                                   plates_from=plates_X[:-1])
            RX_mu = sum_to_plates(RX_mu,
                                  plates_alpha[:-2],
                                  ndim=3,
                                  plates_from=plates_X[:-1])
        
        def psi(v):
            """
            Compute: d/dQ 1/2*trace(diag(v)*<(X-mu)*(X-mu)>)

            = Q*<X>'*R'*diag(v)*R*<X> + ones * Q diag( tr(R'*diag(v)*R*Cov) ) 
              + mu*diag(v)*R*<X>
            """

            # Precompute all terms to plates_alpha because v has shape
            # plates_alpha.

            # Gradient of 0.5*v*<x>*<x>
            v_QXrrX = np.einsum('...kij,...ik->...ij', QXr_rX, v)

            # Gradient of 0.5*v*Cov
            Q_tr_R_v_R_Cov = np.einsum('...k,...k->...', Q_RCovR, v)[...,None,:]

            # Gradient of mu*v*x
            mu_v_R_X = np.einsum('...ik,...kji->...ij', v, RX_mu)

            return v_QXrrX + Q_tr_R_v_R_Cov - mu_v_R_X

        def sum_plates(V, plates):
            ones = np.ones(np.shape(Q))
            r = self.node_X.broadcasting_multiplier(plates,
                                                    np.shape(V)[:-2])

            return r * misc.sum_multiply(V, ones,
                                         axis=(-1,-2),
                                         sumaxis=False,
                                         keepdims=False)

        if self.update_alpha:
            D_logb = psi(1/b)
            XX_Dalpha = -psi(alpha/b * sum_to_plates(XmuXmu, plates_alpha))
            D_logalpha = -D_logb
        else:
            XX_Dalpha = 0
            D_logalpha = 0
        DXX_alpha = 2*psi(alpha)
        D_XX_alpha = DXX_alpha + XX_Dalpha
        D_logdetQ = D / sumQ
        N = np.shape(Q)[-1]

        # Compute dH(X)
        dQ_logHX = random.gaussian_entropy(-2*sum_plates(D_logdetQ,
                                                         plates_X[:-1]),
                                           0)

        # Compute d<log p(X|alpha)>
        dQ_logpX = random.gaussian_logpdf(sum_plates(D_XX_alpha,
                                                     plates_alpha[:-2]),
                                          0,
                                          0,
                                          (sum_plates(D_logalpha,
                                                      plates_X[:-1])
                                           * broadcasting_multiplier((N,D),
                                                              plates_alpha[-2:])),
                                          0)

        if self.update_alpha:

            D_alpha = -psi(alpha/b)
            D_b0_alpha = b0 * D_alpha
            D_a0_logalpha = a0 * D_logalpha

            # Compute dH(alpha)
            # This cancels out with the log(alpha) term in log(p(alpha))
            dQ_logHalpha = 0

            # Compute d<log p(alpha)>
            dQ_logpalpha = random.gamma_logpdf(sum_plates(D_b0_alpha,
                                                          plates_alpha[:-2]),
                                               0,
                                               sum_plates(D_a0_logalpha,
                                                          plates_alpha[:-2]),
                                               0,
                                               0)
        else:

            dQ_logHalpha = 0
            dQ_logpalpha = 0

        if terms:
            raise NotImplementedError()
            dQ_bound = {self.node_X: dQ_logpX + dQ_logHX}
            if self.update_alpha:
                dQ_bound.update({self.node_alpha: dQ_logpalpha + dQ_logHalpha})
        else:
            dQ_bound = (0*dQ_logpX
            + dQ_logpX
            + dQ_logpalpha
            + dQ_logHX
            + dQ_logHalpha
                        )
        return (bound, dR_bound, dQ_bound)



[docs]    def bound(self, R, logdet=None, inv=None, Q=None):
        return self._compute_bound(R, 
                                   logdet=logdet, 
                                   inv=inv, 
                                   Q=Q,
                                   gradient=True)
            
[docs]    def get_bound_terms(self, R, logdet=None, inv=None, Q=None):
        return self._compute_bound(R, 
                                   logdet=logdet, 
                                   inv=inv, 
                                   Q=Q,
                                   gradient=False,
                                   terms=True)


    
[docs]class RotateGaussianMarkovChain():
    r"""
    Rotation parameter expansion for :class:`bayespy.nodes.GaussianMarkovChain`
    
    Assume the following model.

    Constant, unit isotropic innovation noise. Unit variance only?

    Maybe: Assume innovation noise with unit variance? Would it help make this
    function more general with respect to A.

    TODO: Allow constant A or not rotating A.

    .. math::

       R x_n = R A R^{-1} R x_{n-1} + R B u_{n-1} + noise
       \\\
       R x_n = R [A, B] [R^{-1}, 0; 0, I] [R, 0; 0, I] [x_{n-1}; u_{n-1}]

    :math:`A` may vary in time.

    Shape of A: (N,D,D)
    Shape of AA: (N,D,D,D)

    No plates for X.
    """

[docs]    def __init__(self, X, *args):
        self.X_node = X

        # FIXME: Currently, GaussianMarkovChain wraps initial state mean and
        # precision into one node and dynamics plus innovation noise into
        # another node. This transformation doesn't yet support GaussianGamma
        # dynamics, so we'll do some ugly checking here:

        # Dynamics node
        from bayespy.inference.vmp.nodes.gaussian import (
            WrapToGaussianGamma,
            GaussianToGaussianGamma,
            GaussianMoments,
        )
        dynamics_innovation  = X.parents[1]
        assert(isinstance(dynamics_innovation, WrapToGaussianGamma))
        dynamics_gaussiangamma = dynamics_innovation.parents[0]
        assert(isinstance(dynamics_gaussiangamma, GaussianToGaussianGamma))
        dynamics = dynamics_gaussiangamma.parents[0]
        assert(isinstance(dynamics._moments, GaussianMoments))
        self.A_node = dynamics

        if len(args) == 0:
            raise NotImplementedError()
        elif len(args) == 1:
            self.A_rotator = args[0]
        else:
            raise ValueError("Wrong number of arguments")

        self.N = X.dims[0][0]

[docs]    def nodes(self):
        return [self.X_node] + self.A_rotator.nodes()

[docs]    def rotate(self, R, inv=None, logdet=None):
        if inv is None:
            inv = np.linalg.inv(R)
        if logdet is None:
            logdet = np.linalg.slogdet(R)[1]

        self.X_node.rotate(R, inv=inv, logdet=logdet)

        from scipy.linalg import block_diag
        if len(self.X_node.parents) >= 3:
            input_shape = self.X_node.parents[2].dims[0]
            input_len = input_shape[-1]
            I = np.identity(input_len)
        else:
            I = np.identity(0)
        self.A_rotator.rotate(block_diag(inv.T, I), inv=block_diag(R.T, I), logdet=-logdet, Q=R)


    def _computations_for_A_and_X(self, XpXn, XpXp):
        # Get moments of the state dynamics matrix
        (A, AA) = self.A_node.get_moments()
        # Make sure time axis is in the arrays
        A = misc.atleast_nd(A, 3)
        AA = misc.atleast_nd(AA, 4)
        CovA = AA - A[...,:,np.newaxis]*A[...,np.newaxis,:]

        #
        # Expectations with respect to A and X
        #

        # TODO: In case A does not depend on time, use a bit more efficient
        # formulas

        # Compute: \sum_n <A_n> <x_{n-1} x_n^T>
        A_XpXn = np.einsum('...nik,...nkj->...ij',
                           A,
                           XpXn)
        A_XpXn = sum_to_plates(A_XpXn,
                               (),
                               ndim=2,
                               plates_from=self.X_node.plates)

        # Compute: \sum_n <A_n> <x_{n-1} x_{n-1}^T> <A_n>^T
        A_XpXp = np.einsum('...nik,...nkj->...nij', 
                           A,
                           XpXp)
        A_XpXp_A = np.einsum('...nik,...njk->...ij', 
                             A_XpXp,
                             A)
        A_XpXp_A = sum_to_plates(A_XpXp_A,
                                 (),
                                 ndim=2,
                                 plates_from=self.X_node.plates)

        # Compute: \sum_n tr(CovA_n <x_{n-1} x_{n-1}^T>)
        CovA_XpXp = np.einsum('...ndij,...nij->...d',
                              CovA,
                              XpXp)
        CovA_XpXp = sum_to_plates(CovA_XpXp,
                                  (),
                                  ndim=1,
                                  plates_from=self.X_node.plates)

        return (A_XpXn, A_XpXp_A, CovA_XpXp)

[docs]    def setup(self):
        """
        This method should be called just before optimization.
        """

        # Get moments of X
        (X, XnXn, XpXn) = self.X_node.get_moments()

        # TODO/FIXME: Sum to plates of A/CovA
        XpXp = XnXn[...,:-1,:,:]

        # Add input signals
        if len(self.X_node.parents) >= 3:
            (U, UU) = self.X_node.parents[2].get_moments()
            UXn = linalg.outer(U, X[...,1:,:])
            UXp = linalg.outer(U, X[...,:-1,:])
            XpXn = np.concatenate([XpXn, UXn], axis=-2)
            XpXp = np.concatenate(
                [
                    np.concatenate([XpXp, linalg.transpose(UXp)], axis=-1),
                    np.concatenate([UXp, UU], axis=-1)
                ],
                axis=-2
            )

        #
        # Expectations with respect to X
        #

        self.X0 = X[...,0,:]
        self.X0X0 = XnXn[...,0,:,:]
        #self.XnXn = np.sum(XnXn[...,1:,:,:], axis=-3)
        self.XnXn = sum_to_plates(XnXn[...,1:,:,:],
                                  (),
                                  plates_from=self.X_node.plates + (self.N-1,),
                                  ndim=2)

        # Get moments of the fixed parameter nodes
        Lambda_mu = self.X_node.parents[0].get_moments()[0]
        self.Lambda = self.X_node.parents[0].get_moments()[2]
        #self.Lambda = self.X_node.parents[1].get_moments()[0]
        self.Lambda_mu_X0 = linalg.outer(Lambda_mu, self.X0)
        self.Lambda_mu_X0 = sum_to_plates(self.Lambda_mu_X0,
                                          (),
                                          plates_from=self.X_node.plates,
                                          ndim=2)

        #
        # Prepare the rotation for A
        #

        (self.A_XpXn,
         self.A_XpXp_A,
         self.CovA_XpXp) = self._computations_for_A_and_X(XpXn, XpXp)

        self.A_rotator.setup(plate_axis=-1)

        # Innovation noise is assumed to be I
        #self.v = self.X_node.parents[3].get_moments()[0]

    def _compute_bound(self, R, logdet=None, inv=None, gradient=False, terms=False):
        """
        Rotate q(X) as X->RX: q(X)=N(R*mu, R*Cov*R')

        Assume:
        :math:`p(\mathbf{X}) = \prod^M_{m=1} 
               N(\mathbf{x}_m|0, \mathbf{\Lambda})`

        Assume unit innovation noise covariance.
        """

        # TODO/FIXME: X and alpha should NOT contain observed values!! Check
        # that.

        # Assume constant mean and precision matrix over plates..

        if inv is None:
            invR = np.linalg.inv(R)
        else:
            invR = inv

        if logdet is None:
            logdetR = np.linalg.slogdet(R)[1]
        else:
            logdetR = logdet

        # Transform moments of X and A:
        
        Lambda_R_X0X0 = sum_to_plates(dot(self.Lambda, R, self.X0X0),
                                      (),
                                      plates_from=self.X_node.plates,
                                      ndim=2)
        R_XnXn = dot(R, self.XnXn)
        RA_XpXp_A = dot(R, self.A_XpXp_A)
        sumr = np.sum(R, axis=0)
        R_CovA_XpXp = sumr * self.CovA_XpXp

        # Compute entropy H(X)
        M = self.N*np.prod(self.X_node.plates) # total number of rotated vectors
        logH_X = random.gaussian_entropy(-2 * M * logdetR, 
                                         0)

        # Compute <log p(X)>
        yy = tracedot(R_XnXn, R.T) + tracedot(Lambda_R_X0X0, R.T)
        yz = tracedot(dot(R,self.A_XpXn),R.T) + tracedot(self.Lambda_mu_X0, R.T)
        zz = tracedot(RA_XpXp_A, R.T) + np.einsum('...k,...k->...',
                                                  R_CovA_XpXp,
                                                  sumr)
        logp_X = random.gaussian_logpdf(yy,
                                        yz,
                                        zz,
                                        0,
                                        0)

        # Compute the bound
        if terms:
            bound = {self.X_node: logp_X + logH_X}
        else:
            bound = logp_X + logH_X

        if not gradient:
            return bound
        
        # Compute dH(X)
        dlogH_X = random.gaussian_entropy(-2 * M * invR.T,
                                          0)

        # Compute d<log p(X)>
        dyy = 2 * (R_XnXn + Lambda_R_X0X0)
        dyz = dot(R, self.A_XpXn + self.A_XpXn.T) + self.Lambda_mu_X0
        dzz = 2 * (RA_XpXp_A + R_CovA_XpXp[None,:])
        dlogp_X = random.gaussian_logpdf(dyy,
                                         dyz,
                                         dzz,
                                         0,
                                         0)

        if terms:
            d_bound = {self.X_node: dlogp_X + dlogH_X}
        else:
            d_bound = (
                       + dlogp_X
                       + dlogH_X
                       )

        return (bound, d_bound)

    
[docs]    def bound(self, R, logdet=None, inv=None):

        if inv is None:
            inv = np.linalg.inv(R)
        if logdet is None:
            logdet = np.linalg.slogdet(R)[1]
            
        (bound_X, d_bound_X) = self._compute_bound(R,
                                                   logdet=logdet,
                                                   inv=inv,
                                                   gradient=True)
        
        # Compute cost and gradient from A

        # Handle possible input signals
        from scipy.linalg import block_diag
        if len(self.X_node.parents) >= 3:
            input_shape = self.X_node.parents[2].dims[0]
            input_len = input_shape[-1]
            I = np.identity(input_len)
        else:
            I = np.identity(0)
        (bound_A, dR_bound_A, dQ_bound_A) = self.A_rotator.bound(block_diag(inv.T, I),
                                                                 inv=block_diag(R.T, I),
                                                                 logdet=-logdet,
                                                                 Q=R)
        # Ignore input signals gradients
        D = self.X_node.dims[0][-1]
        dR_bound_A = dR_bound_A[...,:D,:D]
        dR_bound_A = -dot(inv.T, dR_bound_A.T, inv.T)

        # Compute the bound
        bound = bound_X + bound_A
        d_bound = d_bound_X + dR_bound_A + dQ_bound_A

        return (bound, d_bound)

[docs]    def get_bound_terms(self, R, logdet=None, inv=None):

        if inv is None:
            inv = np.linalg.inv(R)
        if logdet is None:
            logdet = np.linalg.slogdet(R)[1]

        # Handle possible input signals
        from scipy.linalg import block_diag
        if len(self.X_node.parents) >= 3:
            input_shape = self.X_node.parents[2].dims[0]
            input_len = input_shape[-1]
            I = np.identity(input_len)
        else:
            I = np.identity(0)
        terms_A = self.A_rotator.get_bound_terms(block_diag(inv.T, I),
                                                 inv=block_diag(R.T, I),
                                                 logdet=-logdet,
                                                 Q=R)

        terms_X = self._compute_bound(R,
                                      logdet=logdet,
                                      inv=inv,
                                      gradient=False,
                                      terms=True)

        terms_X.update(terms_A)

        return terms_X

    
[docs]class RotateVaryingMarkovChain(RotateGaussianMarkovChain):
    r"""
    Rotation for :class:`bayespy.nodes.SwitchingGaussianMarkovChain`
    
    Assume the following model.

    Constant, unit isotropic innovation noise.

    :math:`A_n = \sum_k B_k s_{kn}`
    
    Gaussian B: (1,D) x (D,K)
    Gaussian S: (N,1) x (K)
    MC X:          () x (N+1,D)

    No plates for X.
    """

[docs]    def __init__(self, X, B, S, B_rotator):
        self.X_node = X
        self.B_node = B
        self.S_node = S
        self.B_rotator = B_rotator

        if len(S.plates) > 0 and S.plates[-1] > 1:
            raise ValueError("The length of the last plate of S must be 1.")
        if len(B.plates) > 1 and B.plates[-2] > 1:
            raise ValueError("The length of the last plate of B must be 1.")

        if len(S.dims[0]) != 1:
            raise ValueError("S should have exactly one variable axis")
        if len(B.dims[0]) != 2:
            raise ValueError("B should have exactly two variable axes")

        super().__init__(X, B_rotator)

    def _computations_for_A_and_X(self, XpXn, XpXp):

        # Get moments of B and S
        (B, BB) = self.B_node.get_moments()
        CovB = BB - B[...,:,:,None,None]*B[...,None,None,:,:]
        
        u_S = self.S_node.get_moments()
        S = u_S[0]
        SS = u_S[1]

        #
        # Expectations with respect to A and X
        #

        # TODO/FIXME: If S and B have overlapping plates, then these will give
        # wrong results, because those plates of S are summed before multiplying
        # by the plates of B. There should be some "smart einsum" function which
        # would compute sum-multiplys intelligently given a number of inputs.
        
        # Compute: \sum_n <A_n> <x_{n-1} x_n^T>
        # Axes: (N, D, D, D, K)
        S_XpXn = misc.sum_multiply(S[...,None,None,:],
                                   XpXn[...,:,None,:,:,None],
                                   axis=(-3,-2,-1),
                                   sumaxis=False)
        A_XpXn = misc.sum_multiply(B[...,:,:,None,:],
                                   S_XpXn[...,:,:,:],
                                   axis=(-4,-2),
                                   sumaxis=False)

        # Compute: \sum_n <A_n> <x_{n-1} x_{n-1}^T> <A_n>^T
        # Axes: (N, D, D, D, K, D, K)
        SS_XpXp = misc.sum_multiply(SS[...,None,:,None,:],
                                    XpXp[...,None,:,None,:,None],
                                    axis=(-4,-3,-2,-1),
                                    sumaxis=False)
        B_SS_XpXp = misc.sum_multiply(B[...,:,:,:,None,None],
                                      SS_XpXp[...,:,:,:,:],
                                      axis=(-4,-3),
                                      sumaxis=True)
        A_XpXp_A = misc.sum_multiply(B_SS_XpXp[...,:,None,:,:],
                                     B[...,None,:,:,:],
                                     axis=(-4,-3),
                                     sumaxis=False)

        # Compute: \sum_n tr(CovA_n <x_{n-1} x_{n-1}^T>)
        # Axes: (D,D,K,D,K)
        CovA_XpXp = misc.sum_multiply(CovB,
                                      SS_XpXp,
                                      axis=(-5,),
                                      sumaxis=False)

        return (A_XpXn, A_XpXp_A, CovA_XpXp)


[docs]class RotateSwitchingMarkovChain(RotateGaussianMarkovChain):
    """
    Rotation for :class:`bayespy.nodes.VaryingGaussianMarkovChain`

    Assume the following model.

    Constant, unit isotropic innovation noise.

    :math:`A_n = B_{z_n}`
    
    Gaussian B:            (..., K,  D) x   (D)
    Categorical Z:         (...,   N-1) x   (K)
    GaussianMarkovChain X: (...)        x (N,D)

    No plates for X.
    """

[docs]    def __init__(self, X, B, Z, B_rotator):
        self.X_node = X
        self.B_node = B
        self.Z_node = Z._moments.get_converter(CategoricalMoments)(Z)
        self.B_rotator = B_rotator

        (N,D) = self.X_node.dims[0]
        K = self.Z_node.dims[0][0]

        if len(self.Z_node.plates) == 0 and self.Z_node.plates[-1] != N-1:
            raise ValueError("Incorrect plate length in Z")
        if self.B_node.plates[-2:] != (K,D):
            raise ValueError("Incorrect plates in B")

        if len(self.Z_node.dims[0]) != 1:
            raise ValueError("Z should have exactly one variable axis")
        if len(self.B_node.dims[0]) != 1:
            raise ValueError("B should have exactly one variable axes")

        super().__init__(X, B_rotator)

    def _computations_for_A_and_X(self, XpXn, XpXp):

        # Get moments of B and Z
        (B, BB) = self.B_node.get_moments()
        CovB = BB - B[...,:,None]*B[...,None,:]
        
        u_Z = self.Z_node.get_moments()
        Z = u_Z[0]

        #
        # Expectations with respect to A and X
        #

        # Compute: \sum_n <A_n> <x_{n-1} x_n^T>
        Z_XpXn = np.einsum('...nij,...nk->...kij',
                           XpXn,
                           Z)
        A_XpXn = np.einsum('...kil,...klj->...ij',
                           B,
                           Z_XpXn)
        A_XpXn = sum_to_plates(A_XpXn,
                               (),
                               ndim=2,
                               plates_from=self.X_node.plates)
        
        # Compute: \sum_n <A_n> <x_{n-1} x_{n-1}^T> <A_n>^T
        Z_XpXp = np.einsum('...nij,...nk->...kij',
                           XpXp,
                           Z)
        B_Z_XpXp = np.einsum('...kil,...klj->...kij',
                             B,
                             Z_XpXp)
        A_XpXp_A = np.einsum('...kil,...kjl->...ij',
                             B_Z_XpXp,
                             B)
        A_XpXp_A = sum_to_plates(A_XpXp_A,
                                 (),
                                 ndim=2,
                                 plates_from=self.X_node.plates)
        
        # Compute: \sum_n tr(CovA_n <x_{n-1} x_{n-1}^T>)
        CovA_XpXp = np.einsum('...kij,...kdij->...d',
                              Z_XpXp,
                              CovB)
        CovA_XpXp = sum_to_plates(CovA_XpXp,
                                  (),
                                  ndim=1,
                                  plates_from=self.X_node.plates)


        return (A_XpXn, A_XpXp_A, CovA_XpXp)


[docs]class RotateMultiple():
    r"""
    Identical parameter expansion for several nodes simultaneously

    Performs the same rotation for multiple nodes and combines the cost
    effect.
    """

[docs]    def __init__(self, *rotators):
        self.rotators = rotators

[docs]    def nodes(self):
        nodes = []
        for rotator in self.rotators:
            nodes += rotator.nodes()
        return nodes

[docs]    def rotate(self, R, inv=None, logdet=None):
        for rotator in self.rotators:
            rotator.rotate(R, inv=inv, logdet=logdet)

[docs]    def setup(self):
        for rotator in self.rotators:
            rotator.setup()

[docs]    def bound(self, R, logdet=None, inv=None):
        bound = 0
        dbound = 0

        for rotator in self.rotators:
            (b, db) = rotator.bound(R, logdet=logdet, inv=inv)
            bound = bound + b
            dbound = dbound + db

        return (bound, dbound)

[docs]    def get_bound_terms(self, *args, **kwargs):
        d = dict()
        for rotator in self.rotators:
            d.update(rotator.get_bound_terms(*args, **kwargs))
        return d
Source code for bayespy.inference.vmp.transformations

BayesPy

Navigation

Related Topics