Source code for inference.engine

'''
The inference engine is responsible for two steps:

* Unrolling the Ground Bayesian Network ('propositionalizing' the first-order representation)
* Running an inference algorithm (e.g. Gibbs sampling)

Several tasks rely on the inference engine:

* Simple query :math:`P(event|evidence)`
* Expectation Maximization (EM) for missing data / latent variables
* Structure learning
* Reference uncertainty
'''



import logging

from network.groundBN import GBNGraph,GBNqueue
from network.vertices import GBNvertex,ReferenceVertex,computeID,computeERID

from analytics.performance import time_analysis

import random



import prm.prm as PRM
''' Alias for the :mod:`prm.prm` module
'''
import data.datainterface as DI
''' Alias for the :mod:`data.datainterface` module
'''

GBNS = {}
'''
At this point, only one ground Bayesian network exists, :attr:`.GBN`. If posterior samples are collected from multiple chains, e.g. in the case of MCMC inference, it would be ideal to run these chains in parallel. However, this would require multiple instantiations of the ground Bayesian network. Duplicating the vertex values is not enough: since the network structure changes under reference uncertainty, the whole network needs to be duplicated.

We could envision a dict() of ground Bayesian networks. Note that copy.deepcopy() doesn't work, as it would also copy all other instances, e.g. :class:`.Attribute` objects.

Also, there would be problems with some data structures that use the ID of a :class:`.GBNvertex` as a key. For example, :attr:`.ReferenceVertex.existParents` would break.

For now, chains are run sequentially and the convergence diagnostics are run on the collected samples of the :mod:`.posterior`.
'''
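# A hedged sketch (not part of the engine) of the dict of per-chain ground
# Bayesian networks envisioned above. Instead of copy.deepcopy(), each chain
# rebuilds its own network via unrollGBN(). `initChains` and `n_chains` are
# hypothetical names introduced only for illustration; it assumes `query`
# has already been set, as in infer().
def initChains(n_chains):
    global GBN, gbnQ
    for chain_id in range(n_chains):
        GBN = GBNGraph()     # fresh network for this chain
        gbnQ = GBNqueue()    # fresh queue for this chain
        unrollGBN()          # rebuild the structure instead of deep-copying it
        GBNS[chain_id] = GBN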

GBN = GBNGraph()
'''Ground Bayesian network of type :class:`.GBNGraph` used for inference.
'''

gbnQ = GBNqueue()
''':class:`.GBNqueue` data structure used to unroll the GBN
'''
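# Note on the queue semantics (as used in unrollGBN() below): vertices are
# pushed individually, but gbnQ.pop() hands back a whole batch per attribute,
#
#     (attr, gbnVertices) = gbnQ.pop()
#
# so that all vertices of one attribute class can be processed with a single
# database round trip in addParents() / addChildren().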


inferenceAlgo = None
'''
The algorithm used for approximate inference (e.g. :mod:`.gibbs`, :mod:`.mh`)
'''

query = None
'''Current Query instance of type :class:`.Query` 
'''

    
def reset():
    """
    Resets the :class:`GBNGraph` and :class:`GBNqueue` instances.
    """
    global GBN, gbnQ

    for gbnV in GBN.values():
        del gbnV
    del GBN
    del gbnQ

    GBN = GBNGraph()
    gbnQ = GBNqueue()
@time_analysis
def infer(queryI):
    '''
    Runs inference for `queryI` by

    * Resetting the GBN
    * Unrolling the GBN
    * Initializing the inference algorithm
    * Running inference and collecting posterior samples

    :arg queryI: :class:`.Query` instance
    '''
    global query

    query = queryI
    logging.info('Inference for: %s' % query)

    reset()

    # unrolling the ground Bayesian network
    unrollGBN()
    logging.info(GBN)

    inferenceAlgo.run()
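# Hypothetical usage sketch of infer(). The sampler module path and the Query
# construction are assumptions, not the confirmed API; typically an inference
# algorithm module (e.g. the Gibbs sampler) is assigned to `inferenceAlgo`
# before infer() is called:
#
#     import inference.engine as engine
#     from inference import gibbs      # assumed location of :mod:`.gibbs`
#
#     engine.inferenceAlgo = gibbs     # select the approximate inference algorithm
#     q = ...                          # a :class:`.Query` with event and evidence variables
#     engine.infer(q)                  # unrolls the GBN and collects posterior samples


# @time_analysis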
def unrollGBN():
    '''
    The unrolled Ground Bayesian Network is the subgraph of the full graph
    that d-separates the event variables `Y` of the query from the rest of
    the graph, given the evidence variables.

    The construction is a breadth-first expansion: all event vertices are
    added to the GBN and pushed onto :data:`gbnQ`. For every attribute batch
    popped off the queue, the parents are always loaded (evidence vertices
    included, because of possible V-structures), while children are only
    loaded for vertices that are not in the evidence.
    '''
    # TODO : FIX THE CROSS VALIDATION. WE DON'T WANT TO QUERY K DIFFERENT FOLDS
    # WHEN UNROLLING. JUST ONE, THE TRAINING SET FOLD
    dsi = DI.DSI[0]

    # add the inference (event) variables to the GBN
    for qvar in query.event:
        # load all objects of the query variable
        dsi.loadObjects(qvar)

        # result set row: [attribute, pk1, pk2, ....]
        # add all inference vertices for that query variable to the GBN
        for row in dsi.resultSet():
            vertexID = computeID(qvar.attr, row[1:])
            # add vertex
            GBN.addVertex(ID=vertexID, attr=qvar.attr, obj=row[1:], fixed=False, event=True)
            # add vertex to the dictionary of event vertices
            GBN.eventVertices[vertexID] = GBN[vertexID]
            # add vertex to the queue of vertices to process
            gbnQ.push(GBN[vertexID])

    # add all the parents of the inference nodes
    while not gbnQ.isEmpty():
        (attr, gbnVertices) = gbnQ.pop()

        # filter out the GBN vertices that are part of the evidence
        gbnVerticesNotInE = [gbnV for gbnV in gbnVertices
                             if not query.gbnVertexInEvidence(gbnV)]

        # The parents are loaded even for vertices that are in the evidence,
        # to cover the V-structure effect (explaining away)
        addParents(attr, gbnVertices)

        # The children are only loaded for vertices that are not in the evidence
        if gbnVerticesNotInE:
            addChildren(attr, gbnVerticesNotInE)
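# The unrolling above is a breadth-first expansion over the queue. Below is a
# minimal, self-contained sketch of the same pattern with generic names; the
# helpers (in_evidence, parents_of, children_of) are illustrative, not the
# engine's classes.
def _expandSketch(event_nodes, in_evidence, parents_of, children_of):
    """Generic sketch: starting from the event nodes, always pull in parents
    (evidence included, to capture V-structures / explaining away), but only
    expand children of nodes that are not themselves evidence."""
    from collections import deque
    graph = set(event_nodes)
    frontier = deque(event_nodes)
    while frontier:
        node = frontier.popleft()
        # parents are always loaded: conditioning on a collider (V-structure)
        # opens the path between its parents
        for pa in parents_of(node):
            if pa not in graph:
                graph.add(pa)
                if not in_evidence(pa):    # an evidence parent d-separates upstream
                    frontier.append(pa)
        # children are only loaded for non-evidence nodes; children in the
        # evidence are still pushed because of possible V-structures
        if not in_evidence(node):
            for ch in children_of(node):
                if ch not in graph:
                    graph.add(ch)
                    frontier.append(ch)
    return graph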
def validateGBN():
    """
    Validating the GBN is necessary as there might be missing data (either
    because there is no datapoint or because the datapoint is missing from the
    dataset). The standard approach would be to use an `EM` algorithm to find
    the `MLE` for the missing data.

    For the inference method to work, we need valid probability distributions.
    In case a GBN vertex has no parent vertex but a probability distribution
    conditional on that parent attribute, one way to avoid an invalid GBN
    structure is to add a sampling vertex, e.g. an artificial vertex with ID
    ``paAttr.fullname(gbnV.ID)``. This is only possible (and reasonable) if
    the sampling vertex does not itself depend on parent values.
    """
    for attr, gbns in GBN.allByAttribute.items():
        if attr.hasParents:
            # in this case all GBN vertices of this attribute should have at least one parent
            for gbnV in gbns:
                for paAttr in gbnV.parents.keys():
                    if not gbnV.hasParents(paAttr):
                        # if there aren't any parents, a sampling vertex is added
                        artificial_ID = '%s(%s)' % (paAttr.fullname, gbnV.ID)
                        GBN.addSamplingVertex(artificial_ID, paAttr, None)
                        GBN[gbnV.ID].addParent(GBN[artificial_ID])
                        logging.debug('adding artificial %s' % artificial_ID)


# @time_analysis
def addParents(attr, gbnVertices):
    '''
    Loads the set of parent attribute objects for all dependencies for the
    given GBN vertices of the same attribute class. The GBN is updated and
    potential new latent variables are added to the queue.

    :arg attr: :class:`.Attribute` instance
    :arg gbnVertices: List of :class:`.GBNvertex` instances of type `attr`
    '''
    # NOTE ON EVIDENCE LOOKUP:
    # When looking up evidence based on attribute (= all objects of an attribute
    # are either in the evidence or not), we check attrEvidenceLookUp() only once:
    #     dep.parent in evidence?
    #         yes -> for row in resultset do this
    #         no  -> for row in resultset do this instead
    #
    # In case the evidence is object based (= one object of a certain attribute
    # can be in the evidence while another object of the same attribute is not),
    # this is not possible and we have to do the objEvidenceLookUp() for every
    # object over which we iterate:
    #     for row in resultset:
    #         obj in evidence?
    #             yes -> do this
    #             no  -> do that

    # TODO : FIX THE CROSS VALIDATION. WE DON'T WANT TO QUERY K DIFFERENT FOLDS
    # WHEN UNROLLING. JUST ONE, THE TRAINING SET FOLD. AT THIS POINT WE ARE
    # ONLY WORKING WITH ONE FOLD.
    dsi = DI.DSI[0]

    # parents for each dependency that the attribute is a child of
    for dep in attr.dependenciesChild:
        if dep.uncertain:
            # Reference Uncertainty
            # If the exist attribute is not in the GBN yet, all required
            # attribute objects need to be added to the GBN first
            if dep.uncertainRelationship.existAttribute not in GBN.allByAttribute:
                initReferenceUncertainty(dep)

            # Note that `gbnVertices` is a SET of GBN vertices
            for gbnV in gbnVertices:
                logging.debug('Adding reference vertex for %s' % gbnV.ID)
                refVID = GBN.addReferenceVertex(gbnV, dep)

                # Initialize the edges for all k connections
                for i in range(GBN[refVID].k):
                    # choose one k-entity object
                    # TODO sample it from a proposal distribution;
                    # choose uniformly for now (i.e. = Pasula paper)
                    kGBNv = random.choice(GBN.allByAttribute[dep.kAttribute])
                    GBN[refVID].addReference(kGBNv)
        else:
            # No Reference Uncertainty
            # variables used to access the result sets, redefined for code readability
            child_pk = dep.child.erClass.pk
            n_child_pk = len(child_pk)
            parent_pk = dep.parent.erClass.pk
            n_parent_pk = len(parent_pk)
            end_parent_pk = n_child_pk + n_parent_pk

            # load all parent objects
            dsi.loadDependencyParentObjects(dep, gbnVertices)
            for row in dsi.resultSet():
                # extract child id
                child_ID = computeID(dep.child, row[0:n_child_pk])
                # extract parent information
                parent_obj = row[n_child_pk:end_parent_pk]
                parent_val = row[-1]
                parent_ID = computeID(dep.parent, parent_obj)

                # only add if not already in GBN
                if parent_ID not in GBN:
                    if not query.objInEvidence(dep.parent, parent_ID):
                        # parent obj not in evidence -> push onto the queue
                        GBN.addSamplingVertex(parent_ID, dep.parent, parent_obj)
                        gbnQ.push(GBN[parent_ID])
                        # Having just added the vertex, we can add the edge
                        # directly, as we know it wasn't there before.
                        # add parent information to child node
                        GBN[child_ID].parents[GBN[parent_ID].attr][parent_ID] = GBN[parent_ID]
                        # add child information to parent node
                        GBN[parent_ID].children[GBN[child_ID].attr][child_ID] = GBN[child_ID]
                    elif parent_val is not None:
                        # parent obj is in evidence -> d-separates -> not pushed onto queue
                        GBN.addEvidenceVertex(parent_ID, dep.parent, parent_obj, parent_val)
                        # add the edges directly (see above)
                        GBN[child_ID].parents[GBN[parent_ID].attr][parent_ID] = GBN[parent_ID]
                        GBN[parent_ID].children[GBN[child_ID].attr][child_ID] = GBN[child_ID]
                else:
                    # parent vertex already in GBN
                    GBN[child_ID].addParent(GBN[parent_ID])
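# The evidence-lookup note in addParents() contrasts two strategies. A small
# self-contained sketch of the cost difference (all names are illustrative,
# not the engine's API): attribute-level evidence allows the membership test
# to be hoisted out of the loop, object-level evidence does not.
def _splitByAttributeEvidence(rows, attr_in_evidence):
    """Attribute-level evidence: one membership test for the whole result set."""
    evidence, sampling = [], []
    bucket = evidence if attr_in_evidence else sampling
    for row in rows:
        bucket.append(row)
    return evidence, sampling

def _splitByObjectEvidence(rows, evidence_ids):
    """Object-level evidence: one membership test per iterated object."""
    evidence, sampling = [], []
    for row in rows:
        (evidence if row[0] in evidence_ids else sampling).append(row)
    return evidence, sampling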
# @time_analysis
def addChildren(attr, gbnVertices):
    '''
    Loads the set of child attribute objects for all dependencies for the
    given GBN vertices of the same attribute class. The GBN is updated and
    potential new latent variables are added to the queue.

    :arg attr: :class:`.Attribute` instance
    :arg gbnVertices: List of :class:`.GBNvertex` instances of type `attr`
    '''
    # TODO : FIX THE CROSS VALIDATION. WE DON'T WANT TO QUERY K DIFFERENT FOLDS
    # WHEN UNROLLING. JUST ONE, THE TRAINING SET FOLD
    dsi = DI.DSI[0]

    # children for each dependency that the attribute is a parent of
    for dep in attr.dependenciesParent:
        if dep.uncertain:
            # Reference Uncertainty
            raise Exception('We are adding children of an uncertain dependency. Not implemented yet.')
        else:
            # No Reference Uncertainty
            # variables for code readability
            parent_pk = dep.parent.erClass.pk
            n_parent_pk = len(parent_pk)
            child_pk = dep.child.erClass.pk
            n_child_pk = len(child_pk)
            end_child_pk = n_child_pk + n_parent_pk

            # load all child objects
            dsi.loadDependencyChildrenObjects(dep, gbnVertices)
            for row in dsi.resultSet():
                # extract parent id
                parent_ID = computeID(dep.parent, row[0:n_parent_pk])
                # extract child information
                child_obj = row[n_parent_pk:end_child_pk]
                child_val = row[-1]
                child_ID = computeID(dep.child, child_obj)

                # only add if not already in GBN
                if child_ID not in GBN:
                    if not query.objInEvidence(dep.child, child_ID):
                        # child obj is not in evidence -> add sampling vertex to the GBN
                        GBN.addSamplingVertex(child_ID, dep.child, child_obj)
                        # Having just added the vertex, we can add the edge
                        # directly, as we know it wasn't there before.
                        # add parent information to child node
                        GBN[child_ID].parents[GBN[parent_ID].attr][parent_ID] = GBN[parent_ID]
                        # add child information to parent node
                        GBN[parent_ID].children[GBN[child_ID].attr][child_ID] = GBN[child_ID]
                        # push vertex onto queue (also push vertices that are in
                        # the evidence because of possible V-structures)
                        gbnQ.push(GBN[child_ID])
                    elif child_val is not None:
                        # child obj is in evidence -> add evidence vertex to the GBN
                        GBN.addEvidenceVertex(child_ID, dep.child, child_obj, child_val)
                        # add the edges directly (see above)
                        GBN[child_ID].parents[GBN[parent_ID].attr][parent_ID] = GBN[parent_ID]
                        GBN[parent_ID].children[GBN[child_ID].attr][child_ID] = GBN[child_ID]
                        # push vertex onto queue (also push vertices that are in
                        # the evidence because of possible V-structures)
                        gbnQ.push(GBN[child_ID])
                else:
                    # child vertex already in GBN
                    GBN[child_ID].addParent(GBN[parent_ID])
                    # TODO is this correct? Or do we need to push the vertex even
                    # if it is already in the GBN? If a vertex is reached from two
                    # 'sides', one of which opens a V-structure, the vertex is
                    # already in the GBN and only the edge is added, even though
                    # the V-structure should be loaded.
                    # gbnQ.push(GBN[child_ID])


# @time_analysis
def initReferenceUncertainty(dep):
    '''
    When an uncertain relationship is first encountered, all the attribute
    objects that could be associated with the object that initiated the
    reference have to be loaded. Additionally, all the parent attributes of
    the exist attributes need to be loaded as well.

    :arg dep: :class:`.UncertainDependency` instance
    '''
    # TODO : FIX THE CROSS VALIDATION. WE DON'T WANT TO QUERY K DIFFERENT FOLDS
    # WHEN UNROLLING. JUST ONE, THE TRAINING SET FOLD. AT THIS POINT WE ARE
    # ONLY WORKING WITH ONE FOLD.
    dsi = DI.DSI[0]

    # variables used to access the result sets, redefined for code readability
    n_pk = len(dep.kAttribute.erClass.pk)

    # load all attribute objects
    dsi.loadAttributeObjects(dep.kAttribute)
    for row in dsi.resultSet():
        # extract id information
        attr_obj = row[0:n_pk]
        attr_ID = computeID(dep.kAttribute, attr_obj)
        attr_val = row[-1]

        # Add the vertex to the GBN. Note that we don't add parent/child
        # information, as the reference attribute needs to be instantiated first
        if attr_ID not in GBN:
            if not query.objInEvidence(dep.kAttribute, attr_ID):
                # attr obj not in evidence -> push onto the queue
                GBN.addSamplingVertex(attr_ID, dep.kAttribute, attr_obj)
                gbnQ.push(GBN[attr_ID])
            elif attr_val is not None:
                # attr obj is in evidence -> d-separates -> not pushed onto queue
                GBN.addEvidenceVertex(attr_ID, dep.kAttribute, attr_obj, attr_val)

    # Next, all the potential parents of the exist attributes need to be added
    # to `dep.uncertainRelationship.existParents`
    for existdep in dep.uncertainRelationship.existAttribute.dependenciesChild:
        dsi.loadExistParents(dep, existdep)

        # variables for code readability
        k_entity_pk = dep.kAttribute.erClass.pk
        len_k_entity_pk = len(k_entity_pk)
        parent_pk = existdep.parent.erClass.pk
        len_parent_pk = len(parent_pk)
        end_parent_pk = len_k_entity_pk + len_parent_pk

        # Add initial empty data structures to the static ReferenceVertex.existParents
        for row in dsi.resultSet():
            # extract the object id (e.g. student.1); note this is not an
            # attribute object id (e.g. student.success.1)
            k_entity_obj = row[0:len_k_entity_pk]
            k_entity_id = computeERID(dep.kAttribute.erClass, k_entity_obj)
            # extract parent information
            parent_obj = row[len_k_entity_pk:end_parent_pk]
            parent_val = row[-1]
            parent_ID = computeID(existdep.parent, parent_obj)

            # Add the vertex to the GBN
            if parent_ID not in GBN:
                if not query.objInEvidence(existdep.parent, parent_ID):
                    # parent obj not in evidence -> push onto the queue
                    GBN.addSamplingVertex(parent_ID, existdep.parent, parent_obj)
                    gbnQ.push(GBN[parent_ID])
                elif parent_val is not None:
                    # parent obj is in evidence -> d-separates -> not pushed onto queue
                    GBN.addEvidenceVertex(parent_ID, existdep.parent, parent_obj, parent_val)

            # add the exist parent information to a static dictionary accessible
            # by all reference vertices (keyed by k_entity_id, not k_entity_obj)
            if k_entity_id not in ReferenceVertex.existParents:
                ReferenceVertex.existParents[k_entity_id] = {}
            if existdep.parent not in ReferenceVertex.existParents[k_entity_id]:
                ReferenceVertex.existParents[k_entity_id][existdep.parent] = {}
            ReferenceVertex.existParents[k_entity_id][existdep.parent][parent_ID] = GBN[parent_ID]
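# Illustrative shape of the static ReferenceVertex.existParents mapping built
# above. The keys are examples only, following the ID conventions in the
# comments (computeERID for entity objects, computeID for attribute objects):
#
#     ReferenceVertex.existParents = {
#         'student.1': {                             # k-entity object id (computeERID)
#             <Attribute 'student.success'>: {       # parent attribute of the exist attribute
#                 'student.success.1': <GBNvertex>,  # attribute object id -> GBN vertex
#             },
#         },
#     }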