import numpy as np
#~ from numpy cimport *
cimport numpy as np
cimport cython
from cython.parallel import prange, parallel, threadid
from libc.stdlib cimport malloc, free, calloc

#~ long random()


cdef import from "math.h":
    double log10(double) nogil 
    double log(double) nogil 
    double exp(double) nogil 
    double sqrt(double) nogil 
    double fabs(double) nogil 
    int abs(int) nogil
    double floor(double) nogil
    double powf(double, double) nogil


# External definitions from openmp headers
cimport openmp
cdef import from "omp.h":
    void omp_set_num_threads(int num_threads)
    ctypedef struct omp_lock_t: 
         pass


# External definitions from numpy headers
cdef extern from "numpy/arrayobject.h":
    ctypedef int npy_intp
    ctypedef int intp
    ctypedef extern class numpy.ndarray [object PyArrayObject]:
        cdef char *data
        cdef int nd
        cdef intp *dimensions
        cdef intp *strides
        cdef int flags
    #~ void* PyArray_GetPtr(ndarray aobj, npy_intp* ind) nogil
    void* PyArray_GETPTR2(ndarray m, npy_intp i, npy_intp j) nogil
    void* PyArray_GETPTR4(ndarray m, npy_intp i, npy_intp j, npy_intp k, npy_intp l) nogil

cdef double INFINITY = np.inf
cdef double NEG_INFINITY = -np.inf
    
cdef extern from "box_triangle_intersect.c":
    int triBoxOverlap(float, float, float, float, float, float, float, float, float, float, float, float, float, float, float) nogil
    

import time
from random import random
import logging
from itertools import product
import math

_log = logging.getLogger()

# 
cdef enum:
    UN_BLOCKED = 0
    BLOCKED = 1
    UNPASSABLE = 2
    AIR = 3
    

# viterbi decoder pruning constants
cdef enum:
    PRUNED = 1
    UNPRUNED = 0

cdef class SortedXYZList:
    ''' skiplist like structure that ensures inserted elements are always sorted and that removes
    the smallest entries if capacity is exceeded
    
    http://igoro.com/archive/skip-lists-are-fascinating/
    http://code.activestate.com/recipes/576930-efficient-running-median-using-an-indexable-skipli/
    
    '''
    
    cdef public:
        double minimum
        char isfilled
        
    cdef:
        int maxlevels, capacity, insertion_count
        
        int *nodelinks
        double *nodevalues
        
        int *xyz
        int fill
        
    def __init__(self, int capacity):
        self.fill = 2 # start at position 2 as the head and the tail are allready in lists
        self.maxlevels = int(1 + log(capacity) / log(2))
        self.capacity = capacity + 2
        self.clear(True)
        
    
    @cython.cdivision(True)
    cdef void insert(SortedXYZList self, double value, int x, int y, int z) nogil:
        
        cdef int level, maxlevels, curr, next, new, smallest
        cdef int offset_curr, offset_smallest, offset_xyz
        
        maxlevels = self.maxlevels
        
        
        if not self.isfilled:
            # insert a new element
            new = self.fill
            self.fill += 1
            
            if self.fill == self.capacity:
                smallest = self.nodelinks[0] # the head node points to the smallest element
                offset_smallest = smallest * maxlevels
                
                # update the minimum to the value of the smallest element
                self.minimum = self.nodevalues[self.nodelinks[offset_smallest]]
                
                # check if curent value is even smaller
                if value < self.minimum:
                    self.minimum = value
                
                self.isfilled = 1
                
            # update minimum
            #~ if value < self.minimum:
                #~ self.minimum = value
        else:
            if value <= self.minimum:
                # nothing to do - we will not insert a element 
                # that is too small
                return
                
                
            # replace the smallest element
            smallest = self.nodelinks[0] # the head node points to the smallest element
            offset_smallest = smallest * maxlevels
            
            # update the minimum to the value of the second-smallest element
            self.minimum = self.nodevalues[self.nodelinks[offset_smallest]]
            
            if value < self.minimum: 
                # the new incoming value is still smaller than the *currently* second smallest
                self.minimum =  value
                
            # now update all next-links on head node to the outgoing links of the "node to be removed = the smallest"
            for level in range(maxlevels):
                next_on_level = self.nodelinks[offset_smallest + level]
                if next_on_level > 0: # will be 0 if the "historical" level_for_insert is reached
                    self.nodelinks[level] = next_on_level
                    # clear old entries!
                    self.nodelinks[offset_smallest + level] = 0
                else:
                    break
            # the new node will get the position of the smallest node (in the array)
            new = smallest
        
        
        # With a probability of 1/2, make the node a part of the lowest-level list only. 
        # With 1/4 probability, the node will be a part of the lowest two lists. 
        # With 1/8 probability, the node will be a part of three lists. And so forth. 
        
        cdef int level_for_insert, i
        #~ level_for_insert = min(self.maxlevels, 1 - int(log(random()) / log(2)))        
        for i in range(self.maxlevels - 1, -1, -1):
            if self.insertion_count % (1 << i) == 0:
                level_for_insert = i + 1
                break
        self.insertion_count += 1
        
        curr = 0 # the head node    
        self.nodevalues[new] = value
        
        # store xyz values
        offset_xyz = new*3
        self.xyz[offset_xyz] = x
        self.xyz[offset_xyz + 1] = y
        self.xyz[offset_xyz + 2] = z
        
            
        for level in range(self.maxlevels - 1, -1, -1):
            offset_curr = curr * maxlevels + level
            next = self.nodelinks[offset_curr]
            while value > self.nodevalues[next]:
                curr = next
                offset_curr = curr * maxlevels + level
                next = self.nodelinks[offset_curr]
            
            if level < level_for_insert:
                self.nodelinks[new * maxlevels + level] = self.nodelinks[offset_curr]
                self.nodelinks[offset_curr] = new
            
    
    def getall(self):
        ''' debug function'''
        cdef int curr = 0, next, offset_curr = curr * self.maxlevels
        next = self.nodelinks[offset_curr]
        while self.nodevalues[next] < INFINITY:
            v = self.nodevalues[self.nodelinks[curr * self.maxlevels]]
            curr = next
            print v, curr, self.nodelinks[curr * self.maxlevels]
            
            next = self.nodelinks[curr * self.maxlevels]
    
    @cython.boundscheck(False)
    def xyzvAsArray(self):
        '''transform stored list to ascending sorted np.array'''
        
        cdef np.ndarray[np.double_t, ndim=2] result = np.zeros((self.fill - 2, 4), dtype=np.double)
        cdef int i, offset
        
        cdef int curr = 0, next, offset_curr = curr * self.maxlevels
        next = self.nodelinks[offset_curr]
        i = 0
        
        while self.nodevalues[next] < INFINITY:
            
            offset = self.nodelinks[curr * self.maxlevels] * 3
            
            result[i, 0]  = self.xyz[offset]
            result[i, 1]  = self.xyz[offset+1]
            result[i, 2]  = self.xyz[offset+2]
            result[i, 3]  = self.nodevalues[self.nodelinks[curr * self.maxlevels]]
            
            curr = next
            
            next = self.nodelinks[curr * self.maxlevels]
            i += 1
            
        return result
    
    @cython.boundscheck(False)
    def particleDataAsArray(self):
        '''transform stored list to ascending sorted np.array for ParticleFilter'''
        
        cdef np.ndarray[np.double_t, ndim=2] result = np.zeros((4, self.fill - 2), dtype=np.double)
                    
        cdef int i, offset, sum_x = 0, sum_y = 0, sum_z = 0
        
        cdef int curr = 0, next, offset_curr = curr * self.maxlevels
        cdef double w, sum_w = 0
        next = self.nodelinks[offset_curr]
        i = 0
        
        while self.nodevalues[next] < INFINITY:
            
            offset = self.nodelinks[curr * self.maxlevels] * 3
            
            result[0, i]  = self.xyz[offset]
            result[1, i]  = self.xyz[offset+1]
            result[2, i]  = self.xyz[offset+2]
            sum_x += self.xyz[offset]
            sum_y += self.xyz[offset+1]
            sum_z += self.xyz[offset+2]
            
            w = self.nodevalues[self.nodelinks[curr * self.maxlevels]]
            sum_w += w
            result[3, i]  = w
            
            curr = next
            
            next = self.nodelinks[curr * self.maxlevels]
            i += 1
            
        return result, sum_w, sum_x, sum_y, sum_z
        
    @cython.boundscheck(False)
    cdef int updatePruningArray(SortedXYZList self, np.ndarray[np.int8_t, ndim=3] pruned,
        int t_shape_x, int t_shape_y, int t_shape_z, int s_shape_x, int s_shape_y, int s_shape_z, 
        int offset_x, int offset_y, int offset_z):
        ''' this should be a method of the ViterbiDecoder but have no clue how to get
        access to self.xyz'''
        
        cdef int i, offset, sx, sy, sz, tx, ty, tz
        cdef int prev_idx_x, prev_idx_y, prev_idx_z
            
        # skip first 2 entries (head/tail)
        for i in prange(2, self.fill, nogil=True):
            offset = i*3
            sx = self.xyz[offset]
            sy = self.xyz[offset+1]
            sz = self.xyz[offset+2]
            
            # now unprune environment
            for tx in range(t_shape_x):
                prev_idx_x = sx + tx - offset_x
                
                if not 0 <= prev_idx_x < s_shape_x:
                    continue
                    
                for ty in range(t_shape_y):
                    prev_idx_y = sy + ty - offset_y
                    if not 0 <= prev_idx_y < s_shape_y:
                        continue
                        
                    for tz in range(t_shape_z):
                        
                        prev_idx_z = sz + tz - offset_z
                        if not 0 <= prev_idx_z < s_shape_z:
                            continue
                        
                        # do not overwrite other PRUNED marker
                        #~ if pruned[prev_idx_x, prev_idx_y, prev_idx_z] == PRUNED:
                        pruned[prev_idx_x, prev_idx_y, prev_idx_z] = UNPRUNED
                        
        return i
        
    cdef clear(SortedXYZList self, realloc=False):
        
        cdef int i
        
        if realloc:
            self.nodelinks = <int*>calloc(self.capacity * self.maxlevels, sizeof(int))
            self.nodevalues = <double*>calloc(self.capacity, sizeof(double))
            self.xyz = <int*>calloc(self.capacity *  3, sizeof(int))
        else:
            # only set links to 0
            for i in range(self.capacity * self.maxlevels):
                self.nodelinks[i] = 0
        
        # start at position 2 as the head and the tail are allready in lists
        self.fill = 2 
        self.isfilled = 0
        
        # initialize head node
        for i in range(self.maxlevels): 
            self.nodelinks[0 + i] = 1
        
        # initialize tail node
        self.nodevalues[1] = INFINITY
        
        self.insertion_count = 1 # used for generating probabilities
        
        # will be set automatically to valid values if initial loading happens
        self.minimum = NEG_INFINITY
                
            
    def __dealloc__(SortedXYZList self):
        free(self.nodelinks)
        free(self.nodevalues)


@cython.boundscheck(False)
cdef inline double emission_all(int numaps,  double* mus, double* xs, double no_signal_delta) nogil:
    cdef double emission_cost = 0.0, mu
    cdef int j
        
    for j in range(numaps):
        mu = mus[j]
        #~ sigma = eprobs[j*2+1]
        if xs[j] == 2:
            continue
        elif xs[j] == 1: # no measurement
            if mu > -100:
                emission_cost = emission_cost + no_signal_delta
            continue
        
        emission_cost = emission_cost + fabs(mu - xs[j])
        
    return emission_cost

@cython.boundscheck(False)
cdef inline double emission_euclid_all(int numaps,  double* mus, double* xs, double no_signal_delta) nogil:
    cdef double emission_cost = 0.0, mu, cost = 0.0
    cdef int j
        
    for j in range(numaps):
        mu = mus[j]
        if xs[j] == 2:
            continue
        elif xs[j] == 1: # no measurement
            if mu > -100:
                emission_cost = emission_cost + no_signal_delta * no_signal_delta
            continue
        cost = mu - xs[j]
        emission_cost = emission_cost + cost * cost
        
    return sqrt(emission_cost)


@cython.boundscheck(False)
cdef inline double emission_top3_mu(int numaps, double* mus, double* xs) nogil:
    cdef double emission_cost = 0.0, mu
    cdef int j
    
    cdef double top1_e, top2_e, top3_e
    cdef double top1_mu, top2_mu, top3_mu
    top1_mu = top2_mu = top3_mu = -1000
    
    for j in range(numaps):
        mu = mus[j]
        #~ sigma = eprobs[j*2+1]
        if xs[j] == 2:
            continue
        elif xs[j] == 1: # no measurement
            if mu > -100:
                emission_cost = emission_cost + 4
            continue
        
        emission_cost = fabs(mu - xs[j])
        if mu > top1_mu:
            top3_mu = top2_mu
            top2_mu = top1_mu
            top1_mu = mu
            top3_e = top2_e
            top2_e = top1_e
            top1_e = emission_cost
        elif mu > top2_mu:
            top3_mu = top2_mu
            top2_mu = mu
            top3_e = top2_e
            top2_e = emission_cost
        elif mu > top3_mu:
            top3_mu = mu
            top3_e = emission_cost
    emission_cost = top1_e + top2_e + top3_e
    
    return emission_cost

@cython.boundscheck(False)
cdef inline double emission_top3_deltas(int numaps, double* mus, double* xs) nogil:
    ''' use top3 costly deltas '''
    
    cdef int j
    cdef double top1_e = 0, top2_e = 0, top3_e = 0
        
    for j in range(numaps):
        if xs[j] > 0: # no measurement (1) or globally unseen ap (2)
            continue

        emission_cost = fabs(mus[j] - xs[j])
            
        if emission_cost > top1_e:
            top3_e = top2_e
            top2_e = top1_e
            top1_e = emission_cost
        elif emission_cost > top2_e:
            top3_e = top2_e
            top2_e = emission_cost
        elif emission_cost > top3_e:
            top3_e = emission_cost
            
    #~ return sqrt(top1_e * top1_e + top2_e * top2_e + top3_e * top3_e)
    return top1_e + top2_e + top3_e

@cython.boundscheck(False)
cdef inline double emission_euclid_top3_deltas(int numaps, double* mus, double* xs) nogil:
    ''' use top3 costly deltas '''
    
    cdef int j
    cdef double top1_e = 0, top2_e = 0, top3_e = 0
        
    for j in range(numaps):
        if xs[j] > 0: # no measurement (1) or globally unseen ap (2)
            continue

        emission_cost = fabs(mus[j] - xs[j])
            
        if emission_cost > top1_e:
            top3_e = top2_e
            top2_e = top1_e
            top1_e = emission_cost
        elif emission_cost > top2_e:
            top3_e = top2_e
            top2_e = emission_cost
        elif emission_cost > top3_e:
            top3_e = emission_cost
    
    return sqrt(top1_e * top1_e + top2_e * top2_e + top3_e * top3_e)
    

cdef class ViterbiDecoder:
    cdef public verbose, store_pruning
    cdef public list estimated_paths
    cdef public list history
    cdef public double no_signal_delta
        
    cdef:
        int s_shape_x, s_shape_y, s_shape_z
        int t_shape_x, t_shape_y, t_shape_z
        int offset_x, offset_y, offset_z
        long numstates
        int len_measurements
        int emissionCostMode
        SortedXYZList pl
        
        np.ndarray blockedCubes
        np.ndarray emission_probs
        np.ndarray transition_probs
        np.ndarray last_probs
        
    def __init__(self, np.ndarray[np.int_t, ndim=3] blockedCubes, np.ndarray[np.double_t, ndim=5] emission_probs, 
                    np.ndarray[np.double_t, ndim=6] transition_probs, verbose=True, store_pruning=False, no_signal_delta=10):
        self.verbose = verbose
        self.blockedCubes = blockedCubes
        self.emission_probs = emission_probs
        
        self.transition_probs = transition_probs
        
        self.s_shape_x = emission_probs.shape[0]
        self.s_shape_y = emission_probs.shape[1]
        self.s_shape_z = emission_probs.shape[2]
        
        self.t_shape_x = transition_probs.shape[3]
        self.t_shape_y = transition_probs.shape[4]
        self.t_shape_z = transition_probs.shape[5]
        
        self.estimated_paths = []
        
        self.store_pruning = store_pruning
        self.history = []
        
        self.no_signal_delta = no_signal_delta
        #~ self.emissionCostMode = 1 # manhattan top 3 
        self.emissionCostMode = 3 # euclid all
        #~ self.emissionCostMode = 4 # euclid top 3
        
        #cdef np.ndarray[np.double_t, ndim=3] 
        self.last_probs = np.empty((self.s_shape_x, self.s_shape_y, self.s_shape_z), dtype=np.double)
        
        
    @cython.boundscheck(False)
    def getUnblockedIndices(self):
        cdef np.ndarray[np.int_t, ndim=3] blockedCubes = self.blockedCubes
        cdef np.ndarray[np.int_t, ndim=2] indices = np.zeros((self.s_shape_x * self.s_shape_y * self.s_shape_z, 3), dtype=np.int)
        cdef int i, sx, sy, sz
        i = 0
        for sx in range(self.s_shape_x):
            for sy in range(self.s_shape_y):
                for sz in range(self.s_shape_z):
                    if blockedCubes[sx, sy, sz] == UN_BLOCKED:
                        indices[i, 0] = sx
                        indices[i, 1] = sy
                        indices[i, 2] = sz
                        i = i + 1
        indices = np.resize(indices, (i, 3))
        return indices
            
    cdef inline double _getEmissionCost(ViterbiDecoder self, int numaps, double* mus, double* xs) nogil:
        ''' calc emission cost for vector of rss values and vector of mus'''
        if self.emissionCostMode == 0:
            return emission_all(numaps, mus, xs, self.no_signal_delta)
        elif self.emissionCostMode == 1:
            return emission_top3_deltas(numaps, mus, xs)
        elif self.emissionCostMode == 2:
            return emission_top3_mu(numaps, mus, xs)
        elif self.emissionCostMode == 3:
            return emission_euclid_all(numaps, mus, xs, self.no_signal_delta)
        elif self.emissionCostMode == 4:
            return emission_euclid_top3_deltas(numaps, mus, xs)
    
    def getEmissionCost(self, np.ndarray[np.double_t, ndim=1] mus, np.ndarray[np.double_t, ndim=1] xs):
        ''' pure python access to emission costs'''
        res = self._getEmissionCost(len(mus), <np.double_t*>mus.data, <np.double_t*>xs.data)
        return res
        
    @cython.boundscheck(False)
    @cython.cdivision(True)
    def decode_with_pruning(self, np.ndarray[np.double_t, ndim=2] measurements, 
                                     np.ndarray[np.double_t, ndim=3] curr_probs,
                                     np.ndarray[np.uint8_t, ndim=4] back_track,
                                     prune_to, int continueDecoding):
        ''' only use the TOP-N hypothesis during search (given by prune_to)'''
        
        cdef np.ndarray[np.double_t, ndim=3] last_probs = self.last_probs
            
        cdef int s_shape_x=self.s_shape_x, s_shape_y=self.s_shape_y, s_shape_z=self.s_shape_z 
        cdef int t_shape_x=self.t_shape_x, t_shape_y=self.t_shape_y, t_shape_z=self.t_shape_z 
        cdef int offset_x=self.offset_x, offset_y=self.offset_y, offset_z=self.offset_z
        
        cdef double mu, sq_sigma, x, transition_cost, min_transition_cost, emission_cost, cost, prev_cost
        cdef int sx, sy, sz, tx, ty, tz, offset, shape_idx, best_prev_state=0, i, j, k, numaps=measurements.shape[1]
      
      
        cdef int loggging_at = max(1, self.len_measurements / 10)
            
        cdef int num_unpruned, num_emitted = 0
            
        cdef SortedXYZList pl = SortedXYZList(prune_to)
        
        cdef np.ndarray[np.double_t, ndim=2] xyz_a
        cdef double sum_x, sum_y, sum_z
            
        cdef omp_lock_t mylock
        openmp.omp_init_lock(&mylock) # initialize 
        
        
        cdef np.ndarray[np.int8_t, ndim=3] pruned = np.zeros((s_shape_x, s_shape_y, s_shape_z), dtype=np.int8)
        
        cdef np.ndarray[np.int_t, ndim=3] blockedCubes = self.blockedCubes
        cdef np.ndarray[np.double_t, ndim=5] emission_probs = self.emission_probs
        cdef np.ndarray[np.double_t, ndim=6] transition_probs = self.transition_probs
            
        # start only with unblocked states
        cdef np.ndarray[np.int_t, ndim=2] indices = self.getUnblockedIndices()
        num_unpruned = len(indices)
        
        seq_result = []
        avg_seq_result = []
        
        sum_time2 = sum_time1 = 0
        t = time.time()
        for i in range(0, self.len_measurements):
            #~ t2 = time.time()
            
            if self.verbose and (i % loggging_at == 0 or i == 0 or i == self.len_measurements - 1):
                _log.debug('calc probs col %s of %s [%dk states/sec] [%s]' % (i, self.len_measurements, (self.numstates / 1000.0 * i) / (time.time() - t + 0.000001), num_unpruned))
            
            
            for k in prange(num_unpruned, nogil=True):
                #~ tid = threadid()
                
                sx = indices[k, 0]
                sy = indices[k, 1]
                sz = indices[k, 2]
                
                #~ emission_cost = self._getEmissionCost()
                emission_cost = self._getEmissionCost(numaps, <double*>PyArray_GETPTR4(<ndarray>emission_probs, sx, sy, sz, 0),  
                                                <double*>PyArray_GETPTR2(<ndarray>measurements, i, 0))
        
       
                if i == 0 and continueDecoding == 0:
                    # first column only emissions
                    curr_probs[sx, sy, sz] = emission_cost
                    if -emission_cost > pl.minimum:
                        openmp.omp_set_lock(&mylock) # acquire 
                        pl.insert(-emission_cost, sx, sy, sz)
                        openmp.omp_unset_lock(&mylock) # release
                    continue
                    
                min_transition_cost = INFINITY
                shape_idx = -1
                with gil:
                    for tx in range(t_shape_x):
                        prev_idx_x = sx + tx - offset_x
                        
                        if not 0 <= prev_idx_x < s_shape_x:
                            shape_idx = shape_idx  + t_shape_y * t_shape_z
                            continue
                            
                        for ty in range(t_shape_y):
                            prev_idx_y = sy + ty - offset_y
                            if not 0 <= prev_idx_y < s_shape_y:
                                shape_idx = shape_idx  + t_shape_z
                                continue
                                
                            for tz in range(t_shape_z):
                                shape_idx = shape_idx + 1
                                
                                prev_idx_z = sz + tz - offset_z
                                if not 0 <= prev_idx_z < s_shape_z:
                                    continue
                                
                                prev_cost = last_probs[prev_idx_x, prev_idx_y, prev_idx_z]
                                transition_cost = transition_probs[sx, sy, sz, tx, ty, tz]
                                
                                if transition_cost == INFINITY:
                                    cost = INFINITY
                                else:
                                    cost = prev_cost + transition_cost# * 0
                                
                                if cost < min_transition_cost:
                                    min_transition_cost = cost
                                    best_prev_state = shape_idx
                                    
                    
                cost = min_transition_cost + emission_cost
                curr_probs[sx, sy, sz] = cost
                
                if i > 0: # continueDecoding == 1
                    back_track[i, sx, sy, sz] = best_prev_state
                
                if -cost > pl.minimum:
                    openmp.omp_set_lock(&mylock) # acquire 
                    pl.insert(-cost, sx, sy, sz)
                    openmp.omp_unset_lock(&mylock) # release
                
            #~ sum_time1 += (time.time() - t2)
            
            #~ t2 = time.time()
            # toggle arrays
            last_probs, curr_probs = curr_probs, last_probs
            
            curr_probs.fill(INFINITY)
            pruned.fill(PRUNED)
            
            #~ if self.store_pruning:
            xyz_a = pl.xyzvAsArray()
            
            sum_x = sum_y = sum_z = 0
            for k in range(len(xyz_a)-50, len(xyz_a)):
                sum_x += xyz_a[k, 0]
                sum_y += xyz_a[k, 1]
                sum_z += xyz_a[k, 2]
            
            seq_result.append(tuple(xyz_a[-1][:3]))
            avg_seq_result.append((sum_x / 50.0, sum_y / 50.0, sum_z / 50.0))
            
                
            self.history.append(xyz_a)
                
            pl.updatePruningArray(pruned, t_shape_x, t_shape_y, t_shape_z, 
                                    s_shape_x, s_shape_y, s_shape_z,
                                    offset_x, offset_y, offset_z)
            
            # not parallelizable
            num_unpruned = 0
            for sx in range(s_shape_x):
                for sy in range(s_shape_y):
                    for sz in range(s_shape_z):
                        
                        if pruned[sx, sy, sz] != PRUNED:
                            indices[num_unpruned, 0] = sx
                            indices[num_unpruned, 1] = sy
                            indices[num_unpruned, 2] = sz
                            
                            num_unpruned = num_unpruned + 1
            
            # maybe we get a speedup by reducing the memory that has to be hold in cpu cache
            if len(indices) > num_unpruned * 3:
                indices = np.resize(indices, (num_unpruned * 2, 3))
                
            pl.clear()
            #~ sum_time2 += (time.time() - t2)
            
        openmp.omp_destroy_lock(&mylock) # deallocate the lock
        
        self.last_probs[:] = last_probs
        
        #~ print 'phase1: %.3f phase2: %.3f' % (sum_time1, sum_time2)
        return seq_result, avg_seq_result
        
    @cython.boundscheck(False)
    @cython.cdivision(True)
    def decode_all(self, np.ndarray[np.double_t, ndim=2] measurements, 
                      np.ndarray[np.double_t, ndim=3] last_probs,
                      np.ndarray[np.double_t, ndim=3] curr_probs,
                      np.ndarray[np.uint8_t, ndim=4] back_track):
        ''' decode the complete searchspace without pruning 
            (Out of Date? - only with pruning was used in the end in Dirks DA)
        '''
        
        cdef int s_shape_x=self.s_shape_x, s_shape_y=self.s_shape_y, s_shape_z=self.s_shape_z 
        cdef int t_shape_x=self.t_shape_x, t_shape_y=self.t_shape_y, t_shape_z=self.t_shape_z 
        cdef int offset_x=self.offset_x, offset_y=self.offset_y, offset_z=self.offset_z 
        
        cdef double mu, sq_sigma, x, min_cost, min_transition_cost, emission_cost, cost, prev_cost
        cdef int sx, sy, sz, tx, ty, tz, len_indices, shape_idx, best_prev_state=0, i, j, k, numaps=measurements.shape[1]
        
        cdef int loggging_at = max(1, self.len_measurements / 10)
        
        cdef np.ndarray[np.double_t, ndim=5] emission_probs = self.emission_probs
        cdef np.ndarray[np.double_t, ndim=6] transition_probs = self.transition_probs
        
        #~ len_indices = 
        cdef np.ndarray[np.int_t, ndim=2] indices = self.getUnblockedIndices()
        
        t = time.time()
        # xs -> emitted vector
        for i in range(0, self.len_measurements):
            if self.verbose and (i % loggging_at == 0 or i == 1 or i == self.len_measurements - 1):
                _log.debug('calc probs col %s of %s [%dk states/sec] [%s]' % (i, self.len_measurements, (self.numstates * i / 1000) / (time.time() - t + 0.000001), indices.shape[0]))
            
            for k in prange(indices.shape[0], nogil=True):
                sx = indices[k, 0]
                sy = indices[k, 1]
                sz = indices[k, 2]
                            
                # calc emission prob for vector with rss values
                emission_cost = 0.0
                for j in range(numaps):
                    mu = emission_probs[sx, sy, sz, 0, j]
                    sq_sigma = emission_probs[sx, sy, sz, 1, j]
                    x = measurements[i, j]
                    
                    if x == -1: # no measurement
                        continue 

                    emission_cost = emission_cost + fabs(mu - x)
                
                if i == 0:
                    # first column only emissions
                    curr_probs[sx, sy, sz] = emission_cost
                    continue
                
                min_transition_cost = INFINITY
                shape_idx = -1
                with gil:
                    for tx in range(t_shape_x):
                        prev_idx_x = sx + tx - offset_x
                        
                        if not 0 <= prev_idx_x < s_shape_x:
                            shape_idx = shape_idx  + t_shape_y * t_shape_z
                            continue
                            
                        for ty in range(t_shape_y):
                            prev_idx_y = sy + ty - offset_y
                            if not 0 <= prev_idx_y < s_shape_y:
                                shape_idx = shape_idx  + t_shape_z
                                continue
                                
                            for tz in range(t_shape_z):
                                shape_idx = shape_idx + 1
                                
                                prev_idx_z = sz + tz - offset_z
                                if not 0 <= prev_idx_z < s_shape_z:
                                    continue
                                
                                #~ prev_prob = all_probs[i-1, prev_idx_x, prev_idx_y, prev_idx_z]
                                prev_cost = last_probs[prev_idx_x, prev_idx_y, prev_idx_z]
                                cost = prev_cost + transition_probs[sx, sy, sz, tx, ty, tz]
                                
                                if cost < min_transition_cost:
                                    min_transition_cost = cost
                                    best_prev_state = shape_idx
                                
                
                curr_probs[sx, sy, sz] = min_transition_cost + emission_cost
                
                back_track[i, sx, sy, sz] = best_prev_state
            
            last_probs, curr_probs = curr_probs, last_probs
        
        
        return last_probs
        
    def decode(self, np.ndarray[np.double_t, ndim=2] measurements, int prune_to=0, int num_threads=0, full=False, continueDecoding=False):
        self.history[:] = []
        
        cdef int s_shape_x=self.s_shape_x, s_shape_y=self.s_shape_y, s_shape_z=self.s_shape_z
        
        cdef int t_shape_x=self.t_shape_x, t_shape_y=self.t_shape_y, t_shape_z=self.t_shape_z
         
        # we have to locate the (previous) state referenced by the transition index
        # if the transition_shape is (5, 5, 3) then (tx=2, ty2, tz=1) then previous state is the current state
        cdef int offset_x, offset_y, offset_z
        self.offset_x = t_shape_x / 2
        self.offset_y = t_shape_y / 2
        self.offset_z = t_shape_z / 2
            
        # last column in dynamic programming matrix
        
        # curr column in dynamic programming matrix
        cdef np.ndarray[np.double_t, ndim=3] curr_probs = np.empty((s_shape_x, s_shape_y, s_shape_z), dtype=np.double)
        
        if not continueDecoding:
            self.last_probs.fill(INFINITY)
        
        # TODO: rename curr_probs to curr_costs
        curr_probs.fill(INFINITY)
        
        # backtracking pointers to find best path through dynamic programming matrix
        # we store the only the index of transition shape
        assert t_shape_x * t_shape_y * t_shape_z < 256, 'you have to increase the backpointer array from uint8 to uint16'
        cdef np.ndarray[np.uint8_t, ndim=4] back_track = np.zeros((len(measurements), s_shape_x, s_shape_y, s_shape_z), dtype=np.uint8)
        if self.verbose:
            _log.info('using %.3f GB Memory for backpointer array' % (back_track.size / 1024.0**3))
        
        self.numstates = s_shape_x * s_shape_y * s_shape_z
        self.len_measurements = len(measurements)
        if self.verbose:
            _log.debug('%sk states per col, total: %.1f mio states' % (self.numstates / 1000, self.numstates / 10.0**6 * self.len_measurements))
        
        omp_set_num_threads(num_threads)
        try:
            if prune_to == 0:
                #~ last_probs = self.decode_all(measurements, last_probs, curr_probs, back_track)
                seq_res, avg_seq_res = [], []
            else:
                seq_res, avg_seq_res = self.decode_with_pruning(measurements, curr_probs, back_track, prune_to, continueDecoding)
        finally:
            omp_set_num_threads(0)
        
        
        # find starting point of best backtracking path
        cdef int i, j, best_last_state_x = 0, best_last_state_y = 0, best_last_state_z = 0
        cdef double min_cost = INFINITY, cost
        
        cdef SortedXYZList best_estimates
        if full:
            best_estimates = SortedXYZList(s_shape_x * s_shape_y * s_shape_z)
        
        for sx in range(s_shape_x):
            for sy in range(s_shape_y):
                for sz in range(s_shape_z):
                    cost = self.last_probs[sx, sy, sz]
                    if full:
                        best_estimates.insert(-cost, sx, sy, sz)
                    else:
                        if cost < min_cost:
                            best_last_state_x, best_last_state_y, best_last_state_z = sx, sy, sz
                            min_cost = cost
        
        if full:
            a = best_estimates.xyzvAsArray()
            #~ np.savetxt('r:/out.txt', a, fmt=('%d', '%d', '%d', '%.1f'))
            
            self.estimated_paths[:] = []
            # follow backtracking links from best path
            for i in range(-1, - (prune_to if prune_to > 0 else len(a)), -1):
                
                estimated_path = []
                curr_state = a[i, 0], a[i, 1], a[i, 2]
                curr_prob = min_cost
                for j in range(self.len_measurements-1, -1, -1):
                    estimated_path.append((curr_state, 0))
                    if j > -1:
                        sx, sy, sz = curr_state[0], curr_state[1], curr_state[2]
                        shape_idx = back_track[j, sx, sy, sz]
                        tx = shape_idx / (t_shape_y * t_shape_z)
                        ty = (shape_idx / t_shape_z) % t_shape_y
                        tz = shape_idx % t_shape_z
                        
                        curr_state = sx + tx - self.offset_x, sy + ty - self.offset_y, sz + tz - self.offset_z
                        
                self.estimated_paths.append(list(reversed(estimated_path)))
            estimated_path = [e[0] for e in self.estimated_paths[0]]
        else:
            if self.verbose:
                _log.debug('best last state: (%s, %s, %s) with cost %s' % (best_last_state_x, best_last_state_y, best_last_state_z, min_cost))
     
            # follow backtracking links from best path
            estimated_path = []
            curr_state = best_last_state_x, best_last_state_y, best_last_state_z
            curr_prob = min_cost
            
            for j in range(self.len_measurements-1, -1, -1):
                estimated_path.append(curr_state)
                if j > -1:
                    sx, sy, sz = curr_state[0], curr_state[1], curr_state[2]
                    shape_idx = back_track[j, sx, sy, sz]
                    tx = shape_idx / (t_shape_y * t_shape_z)
                    ty = (shape_idx / t_shape_z) % t_shape_y
                    tz = shape_idx % t_shape_z
                    
                    curr_state = sx + tx - self.offset_x, sy + ty - self.offset_y, sz + tz - self.offset_z
            
            estimated_path = list(reversed(estimated_path))
        
        return estimated_path, seq_res, avg_seq_res


cdef class LMSE:
    cdef public int verbose
    cdef public int with_history
    cdef public list history
            
    cdef:
        int s_shape_x, s_shape_y, s_shape_z
        np.ndarray emission_probs
    
    def __init__(self, np.ndarray[np.double_t, ndim=5] emission_probs, verbose=True, with_history=False):  
        self.emission_probs = emission_probs
        
        self.s_shape_x = emission_probs.shape[0]
        self.s_shape_y = emission_probs.shape[1]
        self.s_shape_z = emission_probs.shape[2]
        self.verbose = verbose
        
        self.with_history = with_history
        self.history = []
    
    @cython.boundscheck(False)
    @cython.cdivision(True)
    def decode(self, np.ndarray[np.double_t, ndim=2]  measurements):
        t = time.time()
        
        cdef int s_shape_x=self.s_shape_x, s_shape_y=self.s_shape_y, s_shape_z=self.s_shape_z
        
        cdef np.ndarray[np.double_t, ndim=5] emission_probs = self.emission_probs
        cdef int numaps = len(measurements[0])
        
        # we dont  track XYZ but only the particle index on X
        cdef SortedXYZList top_mse = SortedXYZList(1000)
        #~ cdef np.ndarray[np.double_t, ndim=2] top_mse_a
        
        
        cdef int sx, sy, sz
        cdef int i, j
        cdef double d, sum_d, sqrt_d, min_d, x
        cdef int best_x, best_y, best_z
        
        seq_res = []
        for j in range(len(measurements)):
            with nogil:
                min_d = INFINITY
                for sx in range(s_shape_x):
                    for sy in range(s_shape_y):
                        for sz in range(s_shape_z):
                            sum_d = 0
                            for i in range(numaps):
                                x = measurements[j, i]
                                if x > 0: # either 1, 2
                                    continue
                                d = emission_probs[sx, sy, sz, 0, i] - x
                                sum_d = sum_d + d * d
                            
                            # sqrt does not change maximum
                            #~ sqrt_d = sqrt(sum_d)
                            sqrt_d = sum_d
                            
                            if sqrt_d < min_d:
                                min_d = sqrt_d
                                best_x = sx; best_y = sy; best_z = sz
                            
                            if self.with_history and -sqrt_d > top_mse.minimum:
                                #~ openmp.omp_set_lock(&mylock) # acquire 
                                top_mse.insert(-sqrt_d, sx, sy, sz)
                                #~ openmp.omp_unset_lock(&mylock) # release
                
            
            seq_res.append((best_x, best_y, best_z))
            if self.with_history:
                self.history.append(top_mse.xyzvAsArray())
                top_mse.clear()
        
        return seq_res
        
cdef inline long lround(double d) nogil:
    return <long>floor(d + 0.5)
    
cdef class ParticleFilter:
    cdef public int verbose
    cdef public int with_history
    cdef public list history
        
    cdef:
        int s_shape_x, s_shape_y, s_shape_z
        
        np.ndarray blockedCubes
        np.ndarray emission_probs
        np.ndarray sampling_pool
        
    def __init__(self, np.ndarray[np.int_t, ndim=3] blockedCubes, np.ndarray[np.double_t, ndim=5] emission_probs, 
            np.ndarray[np.double_t, ndim=2] sampling_pool, verbose=True, with_history=False,
            ):
        self.blockedCubes = blockedCubes
        self.emission_probs = emission_probs
        self.sampling_pool = sampling_pool
        
        self.s_shape_x = emission_probs.shape[0]
        self.s_shape_y = emission_probs.shape[1]
        self.s_shape_z = emission_probs.shape[2]
        self.verbose = verbose
        self.with_history = with_history
        self.history = []
        
    @cython.boundscheck(False)
    @cython.cdivision(True)
    def decode(self, np.ndarray[np.double_t, ndim=2]  measurements, int num_particles, 
                int do_blocking=1, int over_sample_limit=50, int average_over_particles=0, 
                double smooth=1.0, double replace_below=1e-10, int max_replace=100, double sigma=3.0, double turnoff_blocking=1e3):
        t = time.time()
        
        cdef int s_shape_x=self.s_shape_x, s_shape_y=self.s_shape_y, s_shape_z=self.s_shape_z
        
        cdef np.ndarray[np.double_t, ndim=5] emission_probs = self.emission_probs
        cdef np.ndarray[np.int_t, ndim=3] blockedCubes = self.blockedCubes
            
        seq_res = []
        if average_over_particles > 0:
            seq_avg_res = []
        else:
            seq_avg_res = None
        
        cdef int numaps = len(measurements[0])
        cdef int i, j, k, ii
            
        #~ cdef np.ndarray[np.double_t, ndim=1] sigmas = np.empty((numaps, ), dtype=np.double)
        #~ sigmas.fill(9)
        cdef int with_history = self.with_history
        cdef SortedXYZList sorted_weights = SortedXYZList(num_particles)
        self.history[:] = []
        
        # we dont  track XYZ but only the particle index on X
        cdef SortedXYZList top_particles = SortedXYZList(average_over_particles)
        cdef np.ndarray[np.double_t, ndim=2] top_particles_a
            
        cdef np.ndarray[np.int_t, ndim=3] local_block_map = np.zeros((s_shape_x, s_shape_y, s_shape_z), dtype=np.int)
        
        cdef np.ndarray[np.uint32_t, ndim=2] back_track_links = np.zeros((len(measurements), num_particles), dtype=np.uint32)
        cdef np.ndarray[np.uint32_t, ndim=2] back_track_positions = np.zeros((len(measurements), num_particles), dtype=np.uint32)
        
        cdef int OVER_ALLOCATE = num_particles * 2 # if more than half the scene is freespace (UN_BLOCKED), this should be enough
        cdef np.ndarray[np.int_t, ndim=2] particles, new_particles, random_particle_pool
        
        random_particle_pool = np.dstack(np.random.randint(0, size, OVER_ALLOCATE) 
                                    for size in (s_shape_x, s_shape_y, s_shape_z))[0]
        cdef int random_particle_pool_idx = 0, num_replaced = 0
            
        particles = random_particle_pool.copy()
        new_particles = np.zeros((num_particles, 3), dtype=np.int)
        j = 0
        for i in range(OVER_ALLOCATE):
            if blockedCubes[random_particle_pool[i, 0], random_particle_pool[i, 1], random_particle_pool[i, 2]] != UN_BLOCKED:
                continue
            particles[j, 0] = random_particle_pool[i, 0] 
            particles[j, 1] = random_particle_pool[i, 1]
            particles[j, 2] = random_particle_pool[i, 2]
            j += 1
            if j == num_particles:
                break
        
        
        cdef np.ndarray[np.double_t, ndim=1] weights = np.zeros((num_particles, ), dtype=np.double)
        
        cdef np.ndarray[np.double_t, ndim=2] sampling_pool = self.sampling_pool
        cdef int pool_size = len(sampling_pool), curr_pool_idx = 0, sampling_pool_idx = 0, over_sample_limit_reached = 0
        
        cdef int total_spawned=0, idx, num_spawned, num_spawn_particles
        
        cdef int px, py, pz, best_idx, best_px, best_py, best_pz, new_px, new_py, new_pz
        cdef int blocked_path, p_back_track_idx, curr_x_i, curr_y_i, curr_z_i, dxi, dyi, dzi
            
        cdef double sum_w_new, w_new, mu, sq_sigma, x, dx, dy, dz, w
        cdef double step_x, step_y, step_z, curr_x, curr_y, curr_z, distance
        
        cdef double gauss_C1 = sqrt(2 * math.pi * sigma**2)
        cdef double gauss_C2 = -(2 * sigma**2)
        
        for j in range(len(measurements)):
            if self.verbose and (j % 10 == 0 or j == len(measurements) - 1):
                _log.debug('m: %s of %s' % (j, len(measurements)))
            
            with nogil:
                
                w_max = NEG_INFINITY
                sum_w_new = 0
                #~ idx = random_particle_pool_idx
                num_replaced = 0
                for k in range(num_particles):
                    px = particles[k, 0]
                    py = particles[k, 1]
                    pz = particles[k, 2]
                    
                    if px == -1:
                        continue
                        
                    w_new = 1
                    for i in range(numaps):
                        x = measurements[j, i]
                        if x > 0: # either 1, 2
                            continue
                        mu = emission_probs[px, py, pz, 0, i]
                        w_new = w_new * gauss_C1 * exp((x - mu) * (x - mu) / gauss_C2)
                    
                    if w_new == 1:
                        # mo measurements at this position
                        w_new = 0 # hmpf, ugly
                        
                    if w_new < replace_below and num_replaced < max_replace:
                        # replace low prob sample with with random from pool
                        # this will prevent "starving" since new random particels
                        # are permanently fed into the system
                        i = random_particle_pool_idx % OVER_ALLOCATE
                        random_particle_pool_idx += 1
                        num_replaced += 1
                        
                        px = particles[k, 0] = random_particle_pool[i, 0]
                        py = particles[k, 1] = random_particle_pool[i, 1]
                        pz = particles[k, 2] = random_particle_pool[i, 2]
                        
                        w_new = 1
                        for i in range(numaps):
                            x = measurements[j, i]
                            if x > 0: # either 1, 2
                                continue
                            mu = emission_probs[px, py, pz, 0, i]
                            w_new = w_new * gauss_C1 * exp((x - mu) * (x - mu) / gauss_C2)
                        
                        if w_new == 1:
                            w_new = 0 # hmpf, ugly
                        
                    #XXX: how can we formalize this sqrt
                    weights[k] = powf(w_new, smooth)
                    
                    #~ weights[k] = w_new
                    sum_w_new = sum_w_new + weights[k]
                    
                    if average_over_particles > 0:
                        top_particles.insert(w_new, px, py, pz)
                    
                    if with_history:
                        sorted_weights.insert(w_new, px, py, pz)
                        
                    if w_new > w_max:
                        w_max = w_new
                        best_px = px
                        best_py = py
                        best_pz = pz
                        best_idx = k
                #~ print random_particle_pool_idx -idx
                #~ print sum_w_new
                        
                #~ w_max = 10**9
                #~ sum_w_new = 0
                
                #~ for k in range(num_particles):
                    #~ px = particles[k, 0]
                    #~ py = particles[k, 1]
                    #~ pz = particles[k, 2]
                    
                    #~ if px == -1:
                        #~ continue
                        
                    #~ w_new = 0
                    #~ for i in range(numaps):
                        #~ x = measurements[j, i]
                        #~ if x == 2:
                            #~ continue
                        #~ elif x == 1: # no measurement
                            #~ #if mu > -100:
                                #~ #w_new = w_new + 4 * 4
                            #~ continue
                        
                        #~ mu = emission_probs[px, py, pz, 0, i]
                        
                        #~ w_new = w_new + (x - mu) * (x - mu)
                    
                    #~ w_new = sqrt(w_new)
                    
                    #~ if w_new == 0:
                       #~ w_new = INFINITY
                       
                    #~ weights[k] = 1.0 / w_new
                    #~ sum_w_new = sum_w_new + weights[k]
                    
                    #~ if average_over_particles > 0:
                        #~ top_particles.insert(-w_new, px, py, pz)
                    
                    #~ if with_history:
                        #~ sorted_weights.insert(-w_new, px, py, pz)
                        
                    #~ if w_new < w_max:
                        #~ w_max = w_new
                        #~ best_px = px
                        #~ best_py = py
                        #~ best_pz = pz
                        #~ best_idx = k
                
                #~ print w_max
                
                # renormalize
                for k in range(num_particles):
                    weights[k] = weights[k] / sum_w_new
                
                total_spawned = 0
                for k in range(num_particles):
                    w = weights[k]
                    
                    px = particles[k, 0]
                    py = particles[k, 1]
                    pz = particles[k, 2]
                    
                    num_spawn_particles = lround(num_particles * w)
                    
                    # prevent overflow, due to rounding
                    if total_spawned + num_spawn_particles > num_particles:
                        num_spawn_particles = num_particles - total_spawned
                    
                    idx = total_spawned
                    num_spawned = 0
                    for i in range(num_spawn_particles * over_sample_limit): # limit retries
                        
                        if num_spawned == num_spawn_particles:
                            break
                            
                        curr_pool_idx = sampling_pool_idx % pool_size
                        dx = sampling_pool[curr_pool_idx, 0]
                        dy = sampling_pool[curr_pool_idx, 1]
                        dz = sampling_pool[curr_pool_idx, 2]
                        sampling_pool_idx += 1
                        
                        dxi = lround(dx)
                        dyi = lround(dy)
                        dzi = lround(dz)
                        new_px = px + dxi
                        new_py = py + dyi
                        new_pz = pz + dzi
                        
                        # check if we are still in the allowed space
                        if not (0 <= new_px < s_shape_x and 0 <= new_py < s_shape_y and 0 <= new_pz < s_shape_z):
                            continue # try again
                        
                        # check if if we know that the new_p cannot be reached
                        if local_block_map[new_px, new_py, new_pz] == k:
                            continue # try again
        
                        # check if destination is blocked
                        if blockedCubes[new_px, new_py, new_pz] != UN_BLOCKED:
                            continue # try again
                        
                        if do_blocking == 1 and sum_w_new > turnoff_blocking:
                            #~ print
                            # scan space between p and new_p for BLOCKED cubes
                            # and reject sample if True
                            distance = sqrt(dxi * dxi + dyi * dyi + dzi * dzi) # euclidian
                            if distance == 0:
                                continue
                                
                            step_x = dxi / distance
                            step_y = dyi / distance
                            step_z = dzi / distance
                            
                            curr_x = px
                            curr_y = py
                            curr_z = pz
                            curr_x_i = int(curr_x); curr_y_i = int(curr_y); curr_z_i = int(curr_z)
                            
                            blocked_path = 0
                            for ii in range(<int>distance):
                            #~ while not(curr_x_i == new_px or curr_y_i == new_py or curr_z_i == new_pz):
                                curr_x = curr_x + step_x
                                curr_y = curr_y + step_y
                                curr_z = curr_z + step_z
                                curr_x_i = int(curr_x); curr_y_i = int(curr_y); curr_z_i = int(curr_z)
                                #~ print curr_x, curr_y, curr_z, ':', dx, dy, dz, ':', px, py, pz
                                if 0 <= curr_x_i < s_shape_x and 0 <= curr_y_i < s_shape_y and 0 <= curr_z_i < s_shape_z:
                                    if blockedCubes[curr_x_i, curr_y_i, curr_z_i] != UN_BLOCKED:
                                        blocked_path = 1
                                        #mark es blocked with current block indicator
                                        local_block_map[curr_x_i, curr_y_i, curr_z_i] = k
                                        break
                                
                            if blocked_path == 1:
                                # mark all cubes after the blocked hit also with current block indicator
                                for ii in range(ii+1, <int>distance):
                                    curr_x = curr_x + step_x
                                    curr_y = curr_y + step_y
                                    curr_z = curr_z + step_z
                                    curr_x_i = int(curr_x); curr_y_i = int(curr_y); curr_z_i = int(curr_z)
                                    if 0 <= curr_x_i < s_shape_x and 0 <= curr_y_i < s_shape_y and 0 <= curr_z_i < s_shape_z:
                                        local_block_map[curr_x_i, curr_y_i, curr_z_i] = k
                                        
                                #~ print 'blocked..., resample'
                                continue # try again
                            
                        # ok, we got a valid sample
                        new_particles[idx, 0] = new_px
                        new_particles[idx, 1] = new_py
                        new_particles[idx, 2] = new_pz
                        
                        # link to predessor
                        back_track_links[j, idx] = k
                        # current encoded position
                        back_track_positions[j, idx] = new_px * s_shape_y * s_shape_z + new_py * s_shape_z + new_pz
                        
                        idx += 1
                        num_spawned += 1
                        
                    total_spawned += num_spawn_particles
                    
                    if i == num_spawn_particles * over_sample_limit - 1:
                        over_sample_limit_reached += 1
                
                for i in range(total_spawned, num_particles):
                    new_particles[i, 0] = -1
                        
            # END nogil
            
            seq_res.append((best_px, best_py, best_pz))
            
            if with_history:
                self.history.append(sorted_weights.xyzvAsArray())
                sorted_weights.clear()
                
            #~ print best_px, best_py, best_pz
            if average_over_particles > 0:
                top_particles_a, sum_w, sum_x, sum_y, sum_z = top_particles.particleDataAsArray()
                #~ for i in range(len(top_particles_a[0])):
                    #~ print top_particles_a[0, i],
                    #~ print top_particles_a[1, i],
                    #~ print top_particles_a[2, i],
                    #~ print top_particles_a[3, i]
                #~ print len(top_particles_a[0]), sum_w, sum_x, sum_y, sum_z
                tpx, tpy, tpz = (int(top_particles_a[0, -1]), int(top_particles_a[1, -1]), int(top_particles_a[2, -1]))
                assert seq_res[-1] == (tpx, tpy, tpz), '%s != %s' % (seq_res[-1], (tpx, tpy, tpz))
                
                k = len(top_particles_a[0])
                seq_avg_res.append((int(sum_x / k), int(sum_y / k), int(sum_z / k)))
            
            # toggle
            new_particles, particles = particles, new_particles
            # reset
            new_particles.fill(0)
            weights.fill(0)
            local_block_map.fill(0)
            top_particles.clear()
            
            
        back_tracked_res = []
        
        curr_idx = best_idx
        curr_px, curr_py, curr_pz = seq_res[-1]
        
        for j in range(len(measurements)-1, -1, -1):
            position = back_track_positions[j, curr_idx]
            curr_px = position / (s_shape_y * s_shape_z)
            curr_py = (position / s_shape_z) % s_shape_y
            curr_pz = position % s_shape_z
            back_tracked_res.append((curr_px, curr_py, curr_pz))
            curr_idx = back_track_links[j, curr_idx]
        
        if self.verbose:
            needed_samples = num_particles * len(measurements)
            _log.info('sampling stats: poolaccess: %.2e, rejected: %.2e, reject factor: %.1f, oversample limit reached: %s' % 
                        (sampling_pool_idx, sampling_pool_idx - needed_samples, 
                        (sampling_pool_idx - needed_samples) / needed_samples,
                        over_sample_limit_reached)                       
                    )
            _log.info('duration: %.3f secs' % (time.time() -t ))
        
        back_tracked_res = list(reversed(back_tracked_res))
        return back_tracked_res, seq_res, seq_avg_res
    
@cython.boundscheck(False)
def uncube(np.ndarray[np.double_t, ndim=3] data, orgshape):
    cdef np.ndarray[np.double_t, ndim=3] result
    cdef int size_x = orgshape[0]
    cdef int size_y = orgshape[1]
    cdef int size_z = orgshape[2]
        
    result = np.zeros((size_x, size_y, size_z), dtype=np.double)
    cdef int x, y, z, _x, _y, _z
    cdef int cube_width = orgshape[0] / data.shape[0]
    for x in range(data.shape[0]):
        for y in range(data.shape[1]):
            for z in range(data.shape[2]):
                
                for _x in range(x * cube_width, x * cube_width + cube_width):
                    for _y in range(y * cube_width, y * cube_width + cube_width):
                        for _z in range(z * cube_width, z * cube_width + cube_width):
                            result[_x, _y, _z] = data[x, y, z]
    return result
    
def normalize(x, deviceAdaptionValues, adaptMin=-150.0, isInDB=True):
    '''this is slow of course'''
    if isInDB:
        x = 10**(x / 10.0)
        
    return toNormalizedDecibelwithDeviceAdaption(np.array([[[x]]]).astype(np.float32), deviceAdaptionValues, adaptMin)[0][0][0]

@cython.boundscheck(False)
@cython.cdivision(True)
def compareForDeviceAdaption(da_values, np.ndarray[np.float32_t, ndim=3] estimated, station2apid_locid2measurement):
    cdef int i, j
    cdef double delta, total = 0
    cdef int num_deltas = 0
    cdef np.ndarray[np.double_t, ndim=3] normalized_estimated
    cdef np.ndarray[np.double_t, ndim=2] measured
    for station in da_values:
        
        normalized_estimated = toNormalizedDecibelwithDeviceAdaption(estimated, da_values[station])
        measured = station2apid_locid2measurement[station]
        with nogil:
            for i in range(normalized_estimated.shape[0]):
                for j in range(normalized_estimated.shape[1]):
                        
                    delta = fabs(normalized_estimated[i, j, 0] - measured[i, j])
                    
                    # delta != delta -> isnan() check
                    if delta != delta or delta > 100:
                        continue
                    if not normalized_estimated[i, j, 0] > -100:
                        continue
                
                    total = total + delta
                    num_deltas = num_deltas + 1

    return total / num_deltas
    

@cython.boundscheck(False)
@cython.cdivision(True)
def toNormalizedDecibelwithDeviceAdaption(np.ndarray[np.float32_t, ndim=3] data, deviceAdaptionValues, double adaptMin=-150.0):
    cdef int x, y, z, i
    cdef int shape_x=data.shape[0], shape_y=data.shape[1], shape_z=data.shape[2]
    
    cdef double v, left_intv, right_intv, left_val, right_val, m
    cdef int dacount = len(deviceAdaptionValues)
        
    assert dacount < 500
    cdef double[500] daintervals
    cdef double[500] davalues
        
    deviceAdaptionValues = [(adaptMin, -100 - adaptMin)] + deviceAdaptionValues
    
    for i, (dai, dav) in enumerate(deviceAdaptionValues):
        daintervals[i] = dai
        davalues[i] = dav
    cdef np.ndarray[np.double_t, ndim=3] _data = np.zeros((shape_x, shape_y, shape_z), dtype=np.double)
    
    for x in prange(shape_x, nogil=True):
        for y in range(shape_y):
            for z in range(shape_z):
                
                v = data[x, y, z]
                # convert to db
                if v > 0:
                    v = 10 * log10(v)
                    if v < adaptMin:
                        v = -100
                    else:
                        #apply deviceAdaptation
                        with gil:
                            for i in range(dacount):
                                left_intv = daintervals[i]
                                right_intv = daintervals[i+1]
                                left_val = davalues[i]
                                right_val = davalues[i+1]
                                
                                if left_intv <= v < right_intv:
                                    # it would be easy to cache m as well
                                    diff1 = right_intv - left_intv
                                    diff2 = right_intv + right_val - (left_intv + left_val)
                                    m = diff2 / diff1
                                    
                                    v = left_intv + left_val + (v - left_intv) * m
                                    break
                else:
                    
                    v = -100.0
                    
                _data[x, y, z] = v
    
    return _data

def findpath(x, y, z, level):
    #~ print '--- !', x, y, z 
    res = []
    for i, j, k in product([-1, 0, 1], [-1, 0, 1], [-1, 0, 1]):
        #~ print i, j, k
        if x + i == 0 and y + j == 0 and z + k == 0:
            return [[(x,y,z)]]
        elif level > 0:
            children = findpath(x + i, y + j, z + k, level-1)
            for l in children:
                l = list(l)
                l.insert(0, (x, y, z))
                res.append(l)
    return res


@cython.boundscheck(False)
@cython.cdivision(True)
def buildTransitionProbs(cubed_data_shape, transition_shape, np.ndarray[np.int_t, ndim=3] blockedCubes, int cubewidth, int freespace_scan):
    cdef int s_shape_x, s_shape_y, s_shape_z, t_shape_x, t_shape_y, t_shape_z
    
    s_shape_x, s_shape_y, s_shape_z = cubed_data_shape
    t_shape_x, t_shape_y, t_shape_z = transition_shape
    
    
    assert (t_shape_x, t_shape_y, t_shape_z) in {(3,3,1), (3,3,3), (5,5,3)}, 'unsuported transition_shape %s' % transition_shape
    
    cdef np.ndarray[np.double_t, ndim=6] transition_probs = np.zeros((s_shape_x, s_shape_y, s_shape_z, t_shape_x, t_shape_y, t_shape_z), dtype=np.double)
    
    # the combined transition probs leading TO this cube
    #~ cdef np.ndarray[np.double_t, ndim=3] combined_transition_probs = np.zeros((s_shape_x, s_shape_y, s_shape_z), dtype=np.double)
 
    # distribute probability evenly over all states s with sum(p(s) for s in S) = 1.0
    cdef double p_s = 1.0 / (s_shape_x * s_shape_y * s_shape_z)
    cdef double cost_p_s = -log(1.0 / (s_shape_x * s_shape_y * s_shape_z))
    
    #~ cdef double p_s_s
    
        
    cdef int sx, sy, sz, tx, ty, tz, numblocked, numunblocked, curr_x, curr_y, curr_z
    #~ cdef double 
    cdef int freespace, shape_idx, max_freespace_for_transition
        
    max_freespace = 0 # 20 highest res cubes - 4 meter if resolution is 0.2m
    cdef double renormalize_target, renormalize_ratio, renormalize_sum # debugging stuff
    
    cdef int prev_idx_x, prev_idx_y, prev_idx_z
        
    cdef int offset_x = t_shape_x / 2
    cdef int offset_y = t_shape_y / 2
    cdef int offset_z = t_shape_z / 2

    # for (5, 5, 3) model
    MAX_PATH_LENGTH = 1
    MAX_PATHS = 9
    cdef int unblocked_path_found = 0, x, y, z, i
    cdef np.ndarray[np.int_t, ndim=6] path_to_center = np.zeros((t_shape_x, t_shape_y, t_shape_z, MAX_PATHS, MAX_PATH_LENGTH, 3), dtype=np.int)
    
    cdef np.ndarray[np.int_t, ndim=1] numblocked_on_level = np.zeros((3), dtype=np.int)
    cdef np.ndarray[np.double_t, ndim=1] p_s_s_on_level = np.zeros((3), dtype=np.double)
    
    cdef np.ndarray[np.int_t, ndim=1] freespace_for_transition = np.zeros((t_shape_x * t_shape_y * t_shape_z), dtype=np.int)
        
    for tx in range(t_shape_x):
        for ty in range(t_shape_y):
            for tz in range(t_shape_z):
                if tx == 0 or tx == 4 or ty == 0 or ty == 4:

                    all_p = findpath(tx-offset_x, ty-offset_y, tz-offset_z, MAX_PATH_LENGTH)
                    for i, p in enumerate(all_p):
                        for j, (x, y, z) in enumerate(p[1:]):
                            path_to_center[tx, ty, tz, i, j, 0] = x + offset_x
                            path_to_center[tx, ty, tz, i, j, 1] = y + offset_y
                            path_to_center[tx, ty, tz, i, j, 2] = z + offset_z
    
    
    cdef int level0_cubes = 1
    cdef int level1_cubes = 27 - level0_cubes
    cdef int level2_cubes = (5 * 5 * 3) - level1_cubes - level0_cubes
        
    for sx in range(s_shape_x):
        for sy in range(s_shape_y):
            for sz in range(s_shape_z):
                
                numblocked = 0
                for tx in range(t_shape_x):
                    prev_idx_x = sx + tx - offset_x
                    if not 0 <= prev_idx_x < s_shape_x:
                        numblocked = numblocked + t_shape_y * t_shape_z
                        continue
                        
                    for ty in range(t_shape_y):
                        prev_idx_y = sy + ty - offset_y
                        if not 0 <= prev_idx_y < s_shape_y:
                            numblocked = numblocked + t_shape_z
                            continue
                                
                        for tz in range(t_shape_z):
                            prev_idx_z = sz + tz - offset_z
                            if not 0 <= prev_idx_z < s_shape_z:
                                numblocked += 1
                                continue
                            
                            if blockedCubes[prev_idx_x, prev_idx_y, prev_idx_z] != UN_BLOCKED:
                                # from cube is directly blocked
                                transition_probs[sx, sy, sz, tx, ty, tz] = INFINITY
                                numblocked += 1
                            elif blockedCubes[sx, sy, sz] != UN_BLOCKED:
                                # current cube is blocked
                                transition_probs[sx, sy, sz, tx, ty, tz] = INFINITY
                                numblocked += 1
                            elif (t_shape_x == 5 and t_shape_y == 5) and (tx == 0 or tx == 4 or ty == 0 or ty == 4):
                                unblocked_path_found = 0
                                for i in range(MAX_PATH_LENGTH):
                                    x = path_to_center[tx, ty, tz, i, 0, 0]
                                    y = path_to_center[tx, ty, tz, i, 0, 1]
                                    z = path_to_center[tx, ty, tz, i, 0, 2]
                                    if x == 0 and y == 0 and z == 0:
                                        continue
                                    x = x - offset_x
                                    y = y - offset_y
                                    z = z - offset_z
                                    if blockedCubes[sx + x, sy + y, sz + z] == UN_BLOCKED:
                                        unblocked_path_found = 1
                                        break
                                        
                                if unblocked_path_found == 0:
                                    transition_probs[sx, sy, sz, tx, ty, tz] = INFINITY
                                    numblocked += 1
                                        
                
                if freespace_scan == -1:
                    # mode blocked cubes infinity and unblocked cubes zero costs
                    continue
                # elif == 0: no forward scan, constant costs 1/3 1/3 1/3 costs for each hull level
                    
                numblocked_on_level[0] = 0
                numblocked_on_level[1] = 0
                numblocked_on_level[2] = 0
                
                #~ if numblocked < t_shape_x * t_shape_y * t_shape_z :
                    #~ p_s_s = -log(p_s / (t_shape_x * t_shape_y * t_shape_z - numblocked))
                #~ else:
                    #~ p_s_s = INFINITY
                
                for tx in range(t_shape_x):
                    for ty in range(t_shape_y):
                        for tz in range(t_shape_z):
                            #~ if transition_probs[sx, sy, sz, tx, ty, tz] == 0:
                                #~ transition_probs[sx, sy, sz, tx, ty, tz] = p_s_s
                                
                            if abs(tx - offset_x) == 2 or abs(ty - offset_y) == 2:
                                if transition_probs[sx, sy, sz, tx, ty, tz] == INFINITY:
                                    numblocked_on_level[2] += 1
                            elif abs(tx - offset_x) == 1 or abs(ty - offset_y) == 1:
                                if transition_probs[sx, sy, sz, tx, ty, tz] == INFINITY:
                                    numblocked_on_level[1] += 1
                            else:
                                if transition_probs[sx, sy, sz, tx, ty, tz] == INFINITY:
                                    numblocked_on_level[0] += 1
                
                if numblocked < t_shape_x * t_shape_y * t_shape_z :
                    #~ p_s_s_on_level[0] = -log(p_s / level0_cubes / 3.0)
                    #~ p_s_s_on_level[1] = -log(p_s / (level1_cubes - numblocked_on_level[1]) / 3.0)
                    #~ p_s_s_on_level[2] = -log(p_s / (level2_cubes - numblocked_on_level[2]) / 3.0)
                    
                    p_s_s_on_level[0] = -log(p_s / level0_cubes / 3.0) # numblocked_on_level[0] is handled by local INFINITY check
                    if numblocked_on_level[0] == 1:
                        # move probability mass of first level (add all mass from zero level)
                        # double p, half costs 
                        p_s_s_on_level[1] = -log((p_s + p_s) / (level1_cubes - numblocked_on_level[1]) / 3.0)
                    else:
                        p_s_s_on_level[1] = -log(p_s / (level1_cubes - numblocked_on_level[1]) / 3.0)

                    p_s_s_on_level[2] = -log(p_s / (level2_cubes - numblocked_on_level[2]) / 3.0)
                
                
                    #~ print level1_cubes, numblocked_on_level[1], level2_cubes, numblocked_on_level[2]
                    #~ print p_s_s_on_level[0], p_s_s_on_level[1], p_s_s_on_level[2]
                    
                    #~ p_s_s_on_level[0] = 0#-log((1.0 / 3) * p_s / level0_cubes)
                    #~ p_s_s_on_level[1] = 0#-log((1.0 / 3) * p_s / (level1_cubes - numblocked_on_level[1]))
                    #~ p_s_s_on_level[2] = 0#-log((1.0 / 3) * p_s / (level2_cubes - numblocked_on_level[2]))
                    
                    # find max freespace and normalize in the next step by that maximum
                    shape_idx = 0
                    max_freespace_for_transition = 0
                    for tx in range(t_shape_x):
                        for ty in range(t_shape_y):
                            for tz in range(t_shape_z):
                                
                                # forward scan to determine number of unblocked cubes ahead (in direction of jump) to modify cost/p_s_s
                                delta_x = tx - offset_x
                                delta_y = ty - offset_y
                                delta_z = tz - offset_z
                                
                                if delta_x != 0 or delta_y != 0 or delta_z != 0:
                                    numunblocked = 0
                                    curr_x = sx
                                    curr_y = sy
                                    curr_z = sz
                                    
                                    while 0 <= curr_x < s_shape_x and 0 <= curr_y < s_shape_y and 0 <= curr_z < s_shape_z and numunblocked < freespace_scan / cubewidth:
                                        if blockedCubes[curr_x, curr_y, curr_z] == UN_BLOCKED:
                                            numunblocked = numunblocked + 1
                                            curr_x = curr_x - delta_x
                                            curr_y = curr_y - delta_y
                                            # if we dont manipulate the z-delta handling jumping in z-order 
                                            # will be very unlikely as there is normally no free space
                                            if abs(delta_x) > 0 or abs(delta_y) > 0:
                                                pass # do not touch z
                                            else:
                                                curr_z = curr_z - delta_z
                                                
                                        else:
                                            break
                                    
                                    freespace = numunblocked * cubewidth
                                    #~ if abs(delta_z) > 0:
                                        #~ freespace = freespace * 3
                                    if freespace > max_freespace_for_transition:
                                        max_freespace_for_transition = freespace
                                        
                                    freespace_for_transition[shape_idx] = freespace
                                    
                                shape_idx = shape_idx + 1
                    
                    shape_idx = 0
                    renormalize_target = 0
                    cost_p_s_1 = 0
                    for tx in range(t_shape_x):
                        for ty in range(t_shape_y):
                            for tz in range(t_shape_z):
                                delta_x = tx - offset_x
                                delta_y = ty - offset_y
                                delta_z = tz - offset_z
                                
                                freespace = freespace_for_transition[shape_idx]
                                
                                if abs(delta_x) == 2 or abs(delta_y) == 2:
                                    if transition_probs[sx, sy, sz, tx, ty, tz] != INFINITY:
                                        transition_probs[sx, sy, sz, tx, ty, tz] = p_s_s_on_level[2] * (max_freespace_for_transition - freespace + 1)
                                        cost_p_s_1 = cost_p_s_1 + p_s_s_on_level[2]
                                elif abs(delta_x) == 1 or abs(delta_y) == 1 or abs(delta_z) == 1:
                                    if transition_probs[sx, sy, sz, tx, ty, tz] != INFINITY:
                                        transition_probs[sx, sy, sz, tx, ty, tz] = p_s_s_on_level[1] * (max_freespace_for_transition - freespace + 1) #* (1.0 - freespace / <double>max_freespace)
                                        cost_p_s_1 = cost_p_s_1 + p_s_s_on_level[1]
                                else:
                                    if transition_probs[sx, sy, sz, tx, ty, tz] != INFINITY:
                                        transition_probs[sx, sy, sz, tx, ty, tz] = p_s_s_on_level[0] * 3
                                        cost_p_s_1 = cost_p_s_1 + p_s_s_on_level[0]
                            
                                #~ if transition_probs[sx, sy, sz, tx, ty, tz] != INFINITY:
                                    #~ print delta_x, delta_y, delta_z, transition_probs[sx, sy, sz, tx, ty, tz]
                                if transition_probs[sx, sy, sz, tx, ty, tz] != INFINITY:
                                    renormalize_target = renormalize_target + transition_probs[sx, sy, sz, tx, ty, tz]
                                
                                shape_idx = shape_idx + 1
                    
                    renormalize_ratio = cost_p_s_1 / renormalize_target
                    renormalize_sum = 0
                    for tx in range(t_shape_x):
                        for ty in range(t_shape_y):
                            for tz in range(t_shape_z):
                                if transition_probs[sx, sy, sz, tx, ty, tz] != INFINITY:
                                    transition_probs[sx, sy, sz, tx, ty, tz] = transition_probs[sx, sy, sz, tx, ty, tz] * renormalize_ratio
                                
                                #~ if transition_probs[sx, sy, sz, tx, ty, tz] != INFINITY:
                                    #~ renormalize_sum = renormalize_sum + transition_probs[sx, sy, sz, tx, ty, tz]
                    
                    #~ assert renormalize_sum == 0 or (renormalize_sum - 0.1 < cost_p_s < renormalize_sum + 0.1), '%s %s' % (cost_p_s, renormalize_sum)
                               
                                    
    return transition_probs


@cython.boundscheck(False)
@cython.cdivision(True)
def buildStatesWithEmissionProbs(data_shape, apdata_list, int cube_width=3, double default_sq_variance=1, pooled_variance=0):
    ''' build states from accesspoint volume images (apvis should have equal dimensions)
    @param apdata: list of accesspoints volumeimages
    @param cube_width: size of cube for state
    '''
    assert len(apdata_list) > 0
    
    # number of voxels in a state
    cdef int cube_count = cube_width * cube_width * cube_width
        
    cdef int vidi_x = data_shape[0], vidi_y = data_shape[1], vidi_z = data_shape[2]
    cdef int size_x = vidi_x / cube_width, size_y = vidi_y / cube_width, size_z = vidi_z / cube_width
    
    # first 3 dims x, y, z in state-space
    # dim 4 (mean, variance)
    # dim 5 apidx
    cdef np.ndarray[np.double_t, ndim=5] states
    states = np.zeros((size_x, size_y, size_z, 2, len(apdata_list)), dtype=np.double)
    
    cdef np.ndarray[np.double_t, ndim=3] vidata = np.zeros((vidi_x, vidi_y, vidi_z), dtype=np.double)
    
    # in state space
    cdef int sx, sy, sz
    
    # in volume image space
    cdef int vx, vy, vz
    cdef int bound_x1, bound_y1, bound_z1, bound_x2, bound_y2, bound_z2
    
    # auxilary vars
    cdef int apidx # accesspoint index
    cdef double s, delta, mean, squared_variance
    
    for apidx, vidata in enumerate(apdata_list):
        for sx in range(size_x):
            bound_x1 = cube_width * sx
            bound_x2 = bound_x1 + cube_width
            if (bound_x2 >= vidi_x):
                # degenerated boundary cube - will contain less voxels
                # meaybe it would be better to sort them out
                bound_x2 = vidi_x 
            
            for sy in range(size_y):
                bound_y1 = cube_width * sy
                bound_y2 = bound_y1 + cube_width
                if (bound_y2 >= vidi_y):
                    # degenerated boundary cube
                    bound_y2 = vidi_y 
                    
                for sz in range(size_z):
                    bound_z1 = cube_width * sz
                    bound_z2 = bound_z1 + cube_width
                    if (bound_z2 >= vidi_z):
                        # degenerated boundary cube
                        bound_z2 = vidi_z
                    
                    # traverse cube coordinates in volume image
                    #to calc mean
                    s = 0
                    for vx in range(bound_x1, bound_x2):
                        for vy in range(bound_y1, bound_y2):
                            for vz in range(bound_z1, bound_z2):
                                s += vidata[vx, vy, vz]
                                
                    
                    mean = s / cube_count
                    states[sx, sy, sz, 0, apidx] = mean
                    
                    if pooled_variance > 0:
                        states[sx, sy, sz, 1, apidx] = pooled_variance
                    else:
                        # now calculate variance
                        s = 0
                        for vx in range(bound_x1, bound_x2):
                            for vy in range(bound_y1, bound_y2):
                                for vz in range(bound_z1, bound_z2):
                                    delta = mean - vidata[vx, vy, vz]
                                    s += delta * delta
                        
                        # store squared variance
                        squared_variance = s / cube_count
                        
                        if squared_variance == 0:
                            # fully flat voxel data - will lead to singular gaussian
                            squared_variance = default_sq_variance
                        
                        states[sx, sy, sz, 1, apidx] = squared_variance
    
    return states


@cython.boundscheck(False)
@cython.cdivision(True)
def buildTriangleCubeIntersection(np.ndarray[np.float_t, ndim=3] bl_triangles, 
                                    np.ndarray[np.float_t, ndim=3] unpass_triangles, 
                                    np.ndarray[np.float_t, ndim=3] unbl_triangles, 
                                    cubed_data_shape, int cube_width, vi_dimensions):
    di = vi_dimensions
    # do not confuse disx and sx
    # disx is the x-width of a voxel in the volume image dimension and sx is the x-loop var for the state space (the cubes)
    cdef float disx=di.sx, disy=di.sy, disz=di.sz, ditx=di.tx, dity=di.ty, ditz=di.tz
    
    cdef int size_x, size_y, size_z 
    size_x, size_y, size_z = cubed_data_shape[0], cubed_data_shape[1], cubed_data_shape[2]
    
    cdef np.ndarray[np.int_t, ndim=3] blockedCubes = np.zeros((size_x, size_y, size_z), dtype=np.int)
    
    cdef int sx, sy, sz # in state space
    cdef int vx, vy, vz # in volume image space (raytracer space)
    cdef float x, y, z # in real world space
    
    cdef float real_cube_half_width_x = (cube_width * disx) / 2.0
    cdef float real_cube_half_width_y = (cube_width * disy) / 2.0
    cdef float real_cube_half_width_z = (cube_width * disz) / 2.0

    cdef int i, len_triangles, set_to_val
    
    cdef float bc0, bc1, bc2, bhs0, bhs1, bhs2, tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, xx
    bhs0 = real_cube_half_width_x
    bhs1 = real_cube_half_width_y
    bhs2 = real_cube_half_width_z
    
    cdef np.ndarray[np.float_t, ndim=3] triangles
    
        
    for triangles, set_to_val in ((bl_triangles, BLOCKED), (unpass_triangles, UNPASSABLE), (unbl_triangles, UN_BLOCKED)):
        len_triangles = len(triangles)
        for i in prange(len_triangles, nogil=True):
            tv0 = triangles[i, 0, 0]
            tv1 = triangles[i, 0, 1]
            tv2 = triangles[i, 0, 2]
            tv3 = triangles[i, 1, 0]
            tv4 = triangles[i, 1, 1]
            tv5 = triangles[i, 1, 2]
            tv6 = triangles[i, 2, 0]
            tv7 = triangles[i, 2, 1]
            tv8 = triangles[i, 2, 2]
            
            for sx in range(size_x):
                vx = sx * cube_width
                x = vx * disx + ditx
                bc0 = x + real_cube_half_width_x
                for sy in range(size_y):
                    vy = sy * cube_width
                    y = vy * disy + dity
                    bc1 = y + real_cube_half_width_y
                    for sz in range(size_z):
                        vz = sz * cube_width
                        z = vz * disz + ditz
                        bc2 = z + real_cube_half_width_z
                        
                        if triBoxOverlap(bc0, bc1, bc2, bhs0, bhs1, bhs2, tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8):
                            blockedCubes[sx, sy, sz] = set_to_val
    
    return blockedCubes


@cython.boundscheck(False)
@cython.cdivision(True)
def evalUnreachableAirCubes(np.ndarray[np.int_t, ndim=3] blockedCubes, cube_height, max_reachable_height):
    
    cdef int size_x, size_y, size_z 
    size_x, size_y, size_z = blockedCubes.shape[0], blockedCubes.shape[1], blockedCubes.shape[2]
    
    cdef int max_cubes_above = max_reachable_height / cube_height
    
    cdef int sx, sy, sz, zi # in state space
    for sz in range(size_z):
        for sx in range(size_x):
            for sy in range(size_y):
                # find a currently unblocked/unmarked element
                if blockedCubes[sx, sy, sz] == UN_BLOCKED:
                    # scan in z+ direction and mark everything > max_cubes_above as unreachable
                    # stop if next blocking element hit, probably the floor above us
                    zi = sz
                    while zi < size_z and blockedCubes[sx, sy, zi] == UN_BLOCKED:
                        if zi - sz > max_cubes_above:
                            blockedCubes[sx, sy, zi] = AIR # mark as ureachable air cube
                        zi = zi + 1
                elif blockedCubes[sx, sy, sz] == UNPASSABLE:
                    # scan in z+ direction and if a UN_BLOCKED chain is starting - mark every UN_BLOCKED to UNPASSABLE
                    # stop if next blocking element hit, probably the floor above us
                    zi = sz + 1
                    while zi < size_z and blockedCubes[sx, sy, zi] == UN_BLOCKED:
                        blockedCubes[sx, sy, zi] = UNPASSABLE # mark as unpassable air cube
                        zi = zi + 1


@cython.boundscheck(False)
def viewPruningHistory(histories, idx, orgshape, int cubewidth, count, int view_values=1, int raw_values=0):
    cdef np.ndarray[np.double_t, ndim=2] history = histories[idx]
    cdef np.ndarray[np.double_t, ndim=3] result
    cdef int size_x = orgshape[0]
    cdef int size_y = orgshape[1]
    cdef int size_z = orgshape[2]
        
    result = np.zeros((size_x, size_y, size_z), dtype=np.double)
    if count > history.shape[0]:
        count = history.shape[0] - 1
        
    cdef double cost, min_cost = INFINITY
    cdef int i
    for i in range(history.shape[0] - count, history.shape[0]):
        cost = history[i, 3]
        if cost < min_cost:
            min_cost = cost
    
    cdef int x, y, z, j1, j2, j3, start, end
    start = history.shape[0] - count
    end = history.shape[0]
    for i in prange(start, end, nogil=True):
        x = <int>history[i, 0] * cubewidth
        y = <int>history[i, 1] * cubewidth
        z = <int>history[i, 2] * cubewidth
        cost = history[i, 3]
        for j1 in range(cubewidth):
            if x + j1 >= size_x:
                continue
            for j2 in range(cubewidth):
                if y + j2 >= size_y:
                    continue
                for j3 in range(cubewidth):
                    if z + j3 >= size_z:
                        continue
                    
                    if view_values:
                        if raw_values:
                            result[x + j1, y + j2, z + j3] = -min_cost
                        else:
                            result[x + j1, y + j2, z + j3] = cost - min_cost
                    else:
                        # view positions
                        result[x + j1, y + j2, z + j3] = i - start
                    
    return result