Source code for prm.localdistribution

'''
The model parameters in a ProbReM project are the `conditional probability distributions` (CPDs) defined for each probabilistic attribute in the model. They are also referred to as `local distributions`; the two terms are used interchangeably.

.. inheritance-diagram:: prm.localdistribution
'''

#import prm.PRM

#from pylab import * 
import numpy as N

from itertools import izip,count 

from analytics.performance import time_analysis



class CPD(object):
    '''
    A conditional probability distribution (CPD) is defined for an attribute.

    This is an abstract version of a CPD that defines the set of methods all
    CPD implementations must provide. Every method except the constructor
    raises :class:`NotImplementedError` (a subclass of :class:`Exception`)
    until a concrete subclass overrides it.
    '''

    def __init__(self, attr):
        '''
        :arg attr: The :class:`.Attribute` this CPD describes
        '''
        self.attr = attr
        """The :class:`.Attribute` that the CPD is associated with
        """

    def sample(self, paAssignment):
        """
        :arg paAssignment: List of parent values
        :returns: Randomly drawn sample of the CPD given the `paAssignment`
        :raises NotImplementedError: Always, unless overridden by a subclass
        """
        raise NotImplementedError('Sampling not implemented for %s' % (self.__class__.__name__))

    def logLikelihood(self, fullAssignment):
        """
        :arg fullAssignment: List of values ordered such that [attributeValue,ParentValue1,ParentValue2,....]
        :returns: Loglikelihood of `fullAssignment`
        :raises NotImplementedError: Always, unless overridden by a subclass
        """
        raise NotImplementedError('logLikelihood not implemented for %s' % (self.__class__.__name__))

    def save(self):
        """
        Saves the CPD to disk

        :raises NotImplementedError: Always, unless overridden by a subclass
        """
        raise NotImplementedError('saving of CPD not supported for %s' % (self.__class__.__name__))
class CPDTabular(CPD):
    r"""
    The tabular representation of a CPD for discrete variables. A matrix of
    dimensions `m x n`, where

    * `m` is the number of possible parent assignments :math:`\prod_{pa \in Parents} |V(pa)|`
    * `n` is the cardinality of the attribute domain :math:`|V(attr)|`

    This matrix grows exponentially with the number of parents, thus it is not
    suited for large V-Structures.

    .. todo::

        The rows of the `CPDTabular.cpdMatrix` are the possible parent
        assignments. Naturally the ordering of indexing matters, and it depends
        on the order of the attributes in the `attr.parents` list (it is set in
        :meth:`.PRMparser.start_element`). The order in which the dependencies
        in the model specification are defined sets the order of the
        `attr.parents`. The problem is that when the CPDs are loaded from file,
        but the specification of the dependencies changed, it is possible that
        the CPD is incorrect.
    """

    def __init__(self, attr):
        '''
        The CPD will be stored, inefficiently, in a matrix of dimension
        [product of domain size of all parents, size of domain of attribute]

        :arg attr: The attribute this CPD belongs to; its `parents` and
            `cardinality` are read to size the matrices
        '''
        CPD.__init__(self, attr)

        self.parentAssignments = 1
        # One multiplier per parent, used to map a parent assignment to the
        # matching row of the matrix; filled in by initCPD()
        self.indexingMultiplier = [1 for p in self.attr.parents]

        self.initCPD()

        # initialize the probability matrix
        self.cpdMatrixDim = [self.parentAssignments, attr.cardinality]
        """Dimension of `cpdMatrix`
        """
        self.cpdMatrix = N.zeros(self.cpdMatrixDim)
        """The CPD matrix of type `numpy.array`. The rows represent different parent assignments, the columns of a row define the distribution over the attribute.
        """
        self.cpdLogMatrix = None
        """Log values of `cpdMatrix`
        """

        # the cumulative distribution for sampling
        self.cumMatrix = N.zeros(self.cpdMatrixDim)
        """Cumulative `cpdMatrix`. Computed by :meth:`.computeCumulativeDist`
        """
        self.cumLogMatrix = None
        """Log values of `cumMatrix`
        """

    def sample(self, paAssignment):
        '''
        Samples a random value using `cumMatrix`

        :arg paAssignment: List of parent values
        :returns: Randomly drawn sample of the CPD given the `paAssignment`,
            or `None` if the selected row of `cumMatrix` is not a normalised
            cumulative distribution (e.g. it was never computed)
        '''
        ri = self.indexRow(paAssignment)
        u = N.random.uniform()

        cumRow = self.cumMatrix[ri, :]

        for i, cumprob in enumerate(cumRow):
            if u <= cumprob:
                return self.attr.domain[i]

        # Floating point rounding in the cumulative sum can leave the last
        # entry marginally below 1, so `u` may fall past it. In that case the
        # draw belongs to the last value of the domain.
        if len(cumRow) and abs(cumRow[-1] - 1.0) < 1e-9:
            return self.attr.domain[len(cumRow) - 1]

        return None

    def logLikelihood(self, fullAssignment):
        '''
        :arg fullAssignment: List of values ordered such that [`attributeValue`,`parentValue1`,`parentValue2`,....]
        :returns: Loglikelihood of `fullAssignment` using `cpdLogMatrix`
        '''
        # NOTE(review): the lookup goes through `self.attr.CPD` rather than
        # `self`; presumably both are the same object - confirm before changing.
        # compute the matrix index for the attribute values
        indexRow, indexColumn = self.attr.CPD.indexingCPD(fullAssignment)

        # the log prob of the instance that we have seen
        return self.attr.CPD.cpdLogMatrix[indexRow, indexColumn]

    def indexingCPD(self, currentRow):
        '''
        Returns the row and column indices for a full assignment to the
        attribute `attr`.

        `indexRow` is the index of the row of the cpd matrix that corresponds
        to the assignment of the parent attributes. The parent attribute values
        are ordered the same way as in `attr.parents`.

        `indexColumn` is the index of the column that corresponds to the
        assignment of the attribute value itself.

        :arg currentRow: List containing a full assignment, [`attributeValue`,`parentValue1`,`parentValue2`,....]
        :returns: List [`indexRow`,`indexColumn`]
        '''
        return [self.indexRow(currentRow[1:]), self.indexColumn(currentRow[0])]

    #@time_analysis
    def indexRow(self, parentAssignment):
        '''
        See :meth:`.indexingCPD`

        :arg parentAssignment: List of parent values, ordered as `attr.parents`
        :returns: Row index of `cpdMatrix` for `parentAssignment` (0 if the
            attribute has no parents)
        '''
        index = 0
        # zip stops at the shorter of the two sequences
        for i, (mult, value) in enumerate(zip(self.indexingMultiplier, parentAssignment)):
            index += mult * self.attr.parents[i].indexingValue(value)

        return int(index)

    def conditionalDist(self, gbnV):
        '''
        Returns the conditional probability distribution of the `gbnV` given
        its parent values.

        This method was never completed: its previous body was a copy of
        :meth:`.indexRow` that read an undefined variable, so every call
        failed with a `NameError`. How the parent values are obtained from
        `gbnV` is not defined in this module, so the failure is made explicit
        instead.

        :arg gbnV: :class:`.GBN` instance
        :raises NotImplementedError: Always
        '''
        raise NotImplementedError('conditionalDist not implemented for %s' % (self.__class__.__name__))

    def reverseIndexRow(self, index):
        '''
        Computes the parent assignment given a row index of `cpdMatrix`

        :arg index: Row index of `cpdMatrix`
        :returns: Parent assignment associated with `index`
        '''
        parentAssignment = [None for p in self.attr.parents]

        remainder = int(index)
        for i, m in enumerate(self.indexingMultiplier):
            # Integer quotient selects the parent's value; the remainder is
            # carried on to the next parent. divmod keeps this an integer
            # regardless of the division semantics in effect.
            position, remainder = divmod(remainder, m)
            parentAssignment[i] = self.attr.parents[i].domain[position]

        return parentAssignment

    def indexColumn(self, attrValue):
        '''
        See :meth:`.indexingCPD`

        :arg attrValue: Value of the attribute
        :returns: Result of `attr.indexingValue(attrValue)`
        '''
        return self.attr.indexingValue(attrValue)

    def computeLogDists(self):
        '''
        Calculates the log probability distribution `cpdLogMatrix` and
        cumulative log probability distribution `cumLogMatrix`. Also refreshes
        `cumMatrix` via :meth:`.computeCumulativeDist`.
        '''
        self.cpdLogMatrix = N.log(self.cpdMatrix)

        self.computeCumulativeDist()
        self.cumLogMatrix = N.log(self.cumMatrix)

    def computeCumulativeDist(self):
        '''
        Calculates the cumulative distribution of the tabular CPD by
        incrementally summing the columns
        '''
        # atleast_2d guards against `cpdMatrix` having been replaced by a 1-D
        # vector, so that rows can always be indexed
        self.cumMatrix = N.atleast_2d(self.cpdMatrix).cumsum(axis=1)

    def __repr__(self):
        return 'TabularCPD, Dim=%s' % (self.cpdMatrixDim)

    def initCPD(self):
        '''
        Computes the number of possible parent assignments and the index
        multipliers needed to compute the row index of a given parent
        assignment, see :meth:`.indexingCPD`.

        The values are multiplied into `parentAssignments` and
        `indexingMultiplier` in place, so this is meant to run once on the
        freshly initialised values set by the constructor.
        '''
        nparents = len(self.attr.parents)

        for i, pa in enumerate(self.attr.parents):
            # total number of all possible combinations of parent assignments
            self.parentAssignments *= pa.cardinality

            # the multiplier of parent i is the product of the cardinalities
            # of all parents that come after it
            for j in range(i + 1, nparents):
                self.indexingMultiplier[i] *= self.attr.parents[j].cardinality

    def save(self, relPath='./localdistributions'):
        """
        Saves `cpdMatrix` to disk using `numpy.save`, writes an XML file
        referencing it, and prints the XML tag that can be added to the PRM
        specification.

        :arg relPath: Relative path to the local distribution files, starting from the directory where the model is instantiated from.
        """
        fname = self.attr.name
        if len(self.attr.parents) != 0:
            fname = '%s_%s' % (fname, ''.join([pa.name for pa in self.attr.parents]))

        locDistPath = '%s/%s' % (relPath, fname)

        N.save(locDistPath, self.cpdMatrix)

        locDistXML = "<?xml version='1.0' standalone='no' ?><LocalDistribution attribute='%s'><TabularCPD file='%s.npy'/></LocalDistribution>" % (self.attr.fullname, locDistPath)

        # the context manager closes the file even if the write fails
        with open('%s.xml' % (locDistPath), 'w') as xmlFile:
            xmlFile.write(locDistXML)

        # single parenthesised argument: prints identically under the print
        # statement and the print function
        print("<LocalDistribution attribute='%s' file='%s.xml'/>" % (self.attr.fullname, locDistPath))
class CPDTree(CPD):
    """
    Future implementation for a CPD based on a decision tree. No need so far.
    """

    def __init__(self):
        """
        Placeholder constructor that does nothing; the base class constructor
        is not invoked, so no `attr` is set on the instance.
        """