Source code for learners.cpdlearners

'''
The parameters of a PRM model are the conditional probability distributions (CPDs, or `local distributions`). They are learned using the module :mod:`!learners.cpdlearners`.

This module contains classes that are used to learn conditional probability distributions (CPDs) for attributes. 
The basic CPD class is :class:`prm.localdistributions.CPD` which can be implemented for different CPD representations.
:class:`prm.localdistributions.CPDTabular` stores the CPD as a matrix, whereas :class:`prm.localdistributions.CPDTree` 
stores the CPD in a decision tree (not implemented yet).

'''

from analytics.performance import time_analysis

from prm.localdistribution import CPDTabular

#from pylab import *
import numpy as N

class CPDLearner(object):
    '''
    Abstract class that is used to learn the conditional probability distributions
    for a PRM using the data linked in the data interface.

    Subclasses have to override :meth:`learnCPDs`.
    '''

    def __init__(self):
        '''
        Creates CPDLearner.

        The original documentation states that the instance has to be configured
        afterwards using ``self.configure()``; no such method is defined in this
        class, so it is presumably provided elsewhere (TODO confirm).
        '''
        # Imported here (at construction time) rather than at module level.
        import prm.prm as PRM
        import data.datainterface as DI

        # Both attributes are initialised with the imported *modules*; they act
        # as placeholders until the learner is pointed at a concrete PRM and
        # data interface. __repr__ reads `.name` from both of them.
        self.prmToLearn = PRM
        self.di = DI

    def __repr__(self):
        '''
        Returns ``"<class name> - PRM <prm name> - DI <data interface name>"``.
        '''
        return "%s - PRM %s - DI %s" % (
            self.__class__.__name__,
            self.prmToLearn.name,
            self.di.name,
        )

    def learnCPDs(self, saveDistributions=False, forceLearning=False):
        '''
        Abstract method, must be implemented by the concrete learner.

        :arg saveDistributions: Unused here; part of the interface subclasses implement
        :arg forceLearning: Unused here; part of the interface subclasses implement
        :raises NotImplementedError: Always (it is a subclass of `Exception`, so
            callers catching `Exception` keep working)
        '''
        raise NotImplementedError("learnCPDs not implemented in the CPDLearner instance")
class CPDTabularLearner(CPDLearner):
    '''
    The CPDTabularLearner learns the local distributions for all probabilistic
    attributes and stores them in tabular form :class:`prm.localdistributions.CPDTabular`.
    It loads all the necessary data for each attribute in one query using the data interface.
    '''

    def __init__(self):
        CPDLearner.__init__(self)

    def learnCPDs(self, saveDistributions=False, forceLearning=False):
        '''
        Depending on which method is standard for learning CPDs, either
        :meth:`.learnCPDsCount` or :meth:`.learnCPDsFull` is executed.
        Currently :meth:`.learnCPDsFull` is used.

        :arg saveDistributions: Passed on to the learning method
        :arg forceLearning: Passed on to the learning method
        '''
        # Alternative strategy: self.learnCPDsCount(saveDistributions, forceLearning)
        self.learnCPDsFull(saveDistributions, forceLearning)

    #@time_analysis
    def learnCPDsCount(self, saveDistributions=False, forceLearning=False):
        '''
        Learns the conditional probability distributions for all probabilistic
        attributes by counting the occurrences on the data side. If the data interface
        connects to a SQL based database this can easily be done using the `COUNT`
        statement.

        The data is retrieved by calling the data interface method
        :meth:`data.sqliteinterface.loadCountCPDdata`

        :arg saveDistributions: If `True`, saves the learned CPDs to disk and prints the
            XML line that needs to be added to the PRM specification to the standard output
        :arg forceLearning: If `True`, the CPDs are learned even if there are distributions
            that could be loaded from disk
        '''
        # NOTE(review): the nesting of this method was reconstructed from a source
        # whose indentation was lost; normalisation and saving are assumed to
        # happen once per learned attribute, after all data sets were read.
        for attr in self.prmToLearn.attributes.values():
            # only learn probabilistic attributes
            if not attr.probabilistic:
                continue

            # only learn attributes that don't have a CPD yet (it could also be
            # specified in the prm), unless learning is forced
            if attr.CPD is not None and not forceLearning:
                print("... CPD for attribute '%s' already loaded" % (attr.name))
                continue

            print("... Learning CPD for attribute '%s'" % (attr.fullname))

            # create CPD instance for attribute
            attr.CPD = CPDTabular(attr)

            # number of parent assignments = number of rows of the CPD matrix
            npa = attr.CPD.cpdMatrix.shape[0]

            # we count all occurrences of the individual parent assignments
            counter = N.zeros((npa, 1))

            # We learn the distributions over all the training sets (no cross validation)
            for dsi in self.di.DSI:
                # load the aggregated (counted) data for attribute
                dsi.loadCountCPDdata(attr)

                for i, currentRow in enumerate(dsi.resultSet()):
                    # The i-th result row is mapped to (attribute value, parent
                    # assignment); divmod keeps both indices integral on
                    # Python 2 and Python 3 alike.
                    attrIndex, parentIndex = divmod(i, npa)

                    # Accumulate the attribute assignment count. `+=` keeps the
                    # matrix consistent with `counter` when there are several
                    # data sets (assumes CPDTabular starts from an all-zero
                    # matrix, which learnCPDsFull relies on as well).
                    attr.CPD.cpdMatrix[parentIndex, attrIndex] += currentRow[0]
                    # count the parent assignment
                    counter[parentIndex] += currentRow[0]

            # TODO: add fake counts. Without them a parent assignment that never
            # occurs in the data leaves a zero in `counter` and the division
            # below yields NaN for that row.

            # calculate probabilities (each row is divided by its total count)
            attr.CPD.cpdMatrix = attr.CPD.cpdMatrix / counter

            # compute the cumulative distribution and the log distributions
            attr.CPD.computeCumulativeDist()
            attr.CPD.computeLogDists()

            if saveDistributions:
                # Finally we can save the distributions to file if desired
                attr.CPD.save()

    #@time_analysis
    def learnCPDsFull(self, saveDistributions=False, forceLearning=False):
        '''
        Learns the conditional probability distributions for all probabilistic
        attributes by iterating over a big table counting the occurrences on the way.
        If the data interface connects to a SQL based database, the result set is a
        big table in the form [valAttr, valPa1, valPa2, etc.].

        The data is retrieved by calling the data interface method
        :meth:`data.sqliteinterface.loadFullCPDdata`

        :arg saveDistributions: If `True`, saves the learned CPDs to disk and prints the
            XML line that needs to be added to the PRM specification to the standard output
        :arg forceLearning: If `True`, the CPDs are learned even if there are distributions
            that could be loaded from disk
        '''
        # NOTE(review): the nesting of this method was reconstructed from a source
        # whose indentation was lost; fake counts, normalisation and saving are
        # assumed to happen once per learned attribute, after all data sets were read.
        probabilisticAttrs = [a for a in self.prmToLearn.attributes.values() if a.probabilistic]
        print("Learning CPD for attributes '%s'" % (','.join([a.fullname for a in probabilisticAttrs])))

        for attr in probabilisticAttrs:
            # only learn attributes that don't have a CPD yet (it could also be
            # specified in the prm), unless learning is forced
            if attr.CPD is not None and not forceLearning:
                print("... CPD for attribute '%s' already loaded" % (attr.name))
                continue

            # create CPD instance for attribute
            attr.CPD = CPDTabular(attr)

            # we count all occurrences of the individual parent assignments
            counter = N.zeros((attr.CPD.cpdMatrix.shape[0], 1))

            # We learn the distributions over all the training sets (no cross validation)
            for dsi in self.di.DSI:
                # load the full data for attribute
                # (dsi.loadFullAggCPDdata(attr) would load the aggregated data instead)
                dsi.loadFullCPDdata(attr)

                for currentRow in dsi.resultSet():
                    # compute the matrix index for the attribute values
                    indexRow, indexColumn = attr.CPD.indexingCPD(currentRow)
                    # count the attribute assignment
                    attr.CPD.cpdMatrix[indexRow, indexColumn] += 1
                    # count the parent assignment
                    counter[indexRow] += 1

            # add fake counts: nF to every cell, hence (number of columns * nF)
            # to the total of every row, so no row total is zero
            nF = 1
            attr.CPD.cpdMatrix += nF
            counter += attr.CPD.cpdMatrix.shape[1] * nF

            # calculate probabilities (each row is divided by its total count)
            attr.CPD.cpdMatrix = attr.CPD.cpdMatrix / counter

            # compute the cumulative distribution and the log distributions
            attr.CPD.computeCumulativeDist()
            attr.CPD.computeLogDists()

            if saveDistributions:
                # Finally we can save the distributions to file if desired
                attr.CPD.save()

    def loglikelihood(self):
        '''
        Computes the log likelihood for the learned CPDs. As aggregation is possibly
        required, it uses the method :meth:`data.sqliteinterface.loadFullAggCPDdata`
        to retrieve the data.

        :returns: Sum of the `cpdLogMatrix` entries selected by every row of every
            data set, over all probabilistic attributes (`0` if there is no data)
        '''
        loglik = 0

        for attr in self.prmToLearn.attributes.values():
            if not attr.probabilistic:
                continue

            for dsi in self.di.DSI:
                # load the aggregated data for attribute
                # (dsi.loadFullCPDdata(attr) would load the non-aggregated data instead)
                dsi.loadFullAggCPDdata(attr)

                for currentRow in dsi.resultSet():
                    # compute the matrix index for the attribute values
                    indexRow, indexColumn = attr.CPD.indexingCPD(currentRow)
                    # update the loglik with the log prob of the instance that we have seen
                    loglik += attr.CPD.cpdLogMatrix[indexRow, indexColumn]

        return loglik
class CPDTreeLearner(CPDLearner):
    '''
    Placeholder for a learner of tree-structured CPDs; it adds nothing to
    :class:`CPDLearner` yet.
    '''