Hypergraph Binning 
+++++++++

Source Code
------------

.. code-block:: python

  import numpy as np
  from math import lgamma

.. _logchoose:

.. code-block:: python

  def logchoose(n, k):
      """
      logarithm of binomial coefficient
      """
      return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1)

.. _logmult:

.. code-block:: python

  def logmult(counts):
      """
      logarithm of multinomial coefficient over count data 'counts'
      """
      total = sum(counts)
      if total == 0:
          return 0.
      result = lgamma(total + 1)
      for count in counts:
          result -= lgamma(count + 1)
      return result

.. _logomega:

.. code-block:: python

  def logOmega(rs, cs, swap=True):
      
      """
      logarithm of number of non-negative integer matrices with row sums 'rs' and column sums 'cs'
      'swap' swaps the definition of rows and columns in the effective columns approximation of Jerdee et al for a minor accuracy improvement
      """
      
      rs = np.array([r for r in rs if r > 0])
      cs = np.array([c for c in cs if c > 0])
      if swap and len(cs) < len(rs): # Swap definitions so that (# rows) <= (# columns) to improve performance
          rs = np.array(cs)
          cs = np.array(rs)
      m = len(rs)
      N = np.sum(rs)
      
      if N == 0:
          return 0.
      if len(rs)*len(cs) == 0:
          return 0
      elif (max(rs) == 1) and (max(cs) == 1):
          return 0.
      elif (max(rs) == 1):
          return logmult(cs)
      elif (max(cs) == 1):
          return logmult(rs)
      
      alphaC = ((1-1/N)+(1-np.sum((cs/N)**2))/(m+1e-100))/(np.sum((cs/N)**2)-1/N+1e-100)
      result = -logchoose(N + m*alphaC - 1, m*alphaC - 1)
      for r in rs:
          result += logchoose(r + alphaC - 1, alphaC-1)
      for c in cs:
          result += logchoose(c + m - 1, m - 1)
          
      return result

.. _mdl-hypergraph-binning:

.. code-block:: python

  def MDL_hypergraph_binning(X,dt,exact=True):
      """
      inputs:
          -X is list with entries of form x_i=[s_i,d_i,w_i,t_i] 
              -w_i is 'weight' of event i, counting number of interactions at a single t_i. set to 1's for unweighted event data
          -dt = (t_N-t_1)/T is time step width
      outputs:
          -best_MDL/L0: compression ratio eta for MDL-optimal temporally contiguous partition of event data X
          -labels: partition of the event data into event clusters
          -number of time steps T corresponding to width dt
      """
      
      """initialize S,D,N,T and fill time steps with corresponding event data"""
      N = int(sum([x[2] for x in X]))
      ts = [x[-1] for x in X]
      tmin,tmax = min(ts),max(ts)+1e-10
      S = len(set([x[0] for x in X]))
      D = len(set([x[1] for x in X]))
      
      cellxs,celldata = {},{}
      for i,x in enumerate(X):
          s,d,w,t = x
          cell = int(np.floor((t-tmin)/dt)) + 1
          if not(cell in celldata):
              celldata[cell] = {}
          if not((s,d) in celldata[cell]):
              celldata[cell][(s,d)] = 0
          celldata[cell][(s,d)] += w
          if not(cell in cellxs):
              cellxs[cell] = []
          cellxs[cell].append(i)
          
      T = max(list(celldata.keys()))
      
      if T == 1:
          return 1.,[0]*N,1

      for l in range(1,T+1):
          if not(l in cellxs):
              cellxs[l] = []
          if not(l in celldata):
              celldata[l] = []
      
      past_dls,past_nks = {},{} #saves past computed values for the cluster description lengths and number of events in each cluster
      
      def DL(i,j):
          """
          inputs interval indices i,j that define boundary of cluster
          returns cluster-level description length, accounting for the empty time intervals
          """
          tauk = j - i + 1
          
          if (i,j) in past_dls: 
              return past_dls[(i,j)]
          
          elif ((i-1,j) in past_dls) and (len(celldata[i-1]) == 0): 
              nk = past_nks[(i-1,j)]
              dl = past_dls[(i-1,j)] - logchoose(nk + (tauk+1) - 1, (tauk+1) - 1) + logchoose(nk + tauk - 1, tauk - 1)
              past_dls[(i,j)] = dl
              past_nks[(i,j)] = nk
              return dl
          
          elif ((i,j-1) in past_dls) and (len(celldata[j]) == 0): 
              nk = past_nks[(i,j-1)]
              dl = past_dls[(i,j-1)] - logchoose(nk + (tauk-1) - 1, (tauk-1) - 1) + logchoose(nk + tauk - 1, tauk - 1) 
              past_dls[(i,j)] = dl
              past_nks[(i,j)] = nk
              return dl
          
          G,nkts,ss,ds = {},{},{},{}
          for l in range(i,j+1):
              
              data = celldata[l]
              for edge in data:
                  
                  n = data[edge]
                  
                  if not(edge in G):
                      G[edge] = 0
                  if not (l in nkts):
                      nkts[l] = 0
                  if not (s in ss):
                      ss[s] = 0
                  if not (d in ds):
                      ds[d] = 0
                  
                  G[edge] += n
                  nkts[l] += n
                  ss[s] += n
                  ds[d] += n
                  
          nk = sum(nkts.values())
          num_sds,num_kts,num_ss,num_ds = list(G.values()),list(nkts.values()),list(ss.values()),list(ds.values())
          dl = np.log((N-1)*(T-1)) + logchoose(nk + tauk - 1, tauk - 1) + logchoose(nk + S - 1, S - 1) + logchoose(nk + D - 1, D - 1) \
                  + logOmega(num_ss,num_ds) + logOmega(num_sds,num_kts)
          past_dls[(i,j)] = dl
          past_nks[(i,j)] = nk

          return dl
      
      if exact: 
          """
          exact dynamic programming solution to identify MDL configuration
          """
          LMDL,LMDL_int = {0:0},{}
          for j in range(1,T+1):
              best_MDL = np.inf
              for i in range(1,j+1):
                  L = LMDL[i-1] + DL(i,j) 
                  if L < best_MDL:
                      best_MDL = L
                      LMDL_int[j] = i
              LMDL[j] = best_MDL

          j = T
          i = np.inf
          MDL_boundaries = []
          while i > 1:
              i = LMDL_int[j]
              MDL_boundaries.append((i,j))
              j = i-1
          MDL_boundaries = sorted(MDL_boundaries,key = lambda I:I[0])
          best_MDL = LMDL[T]
          
      else:
          """
          approximate greedy agglomerative solution to identify MDL configuration
          """
          MDL_boundaries = [(i,i) for i in range(1,T+1)]
          current_boundaries = [(i,i) for i in range(1,T+1)]
          for K in np.arange(T,1,-1).astype('int'):
              
              best_k,best_delta_dl = -100,np.inf
              for k in range(K-1):
                  i1,i2 = current_boundaries[k]
                  i3,i4 = current_boundaries[k+1]
                  delta_dl = DL(i1,i4) - DL(i1,i2) - DL(i3,i4)
                  if delta_dl < best_delta_dl:
                      best_delta_dl = delta_dl
                      best_k = k
              
              i1,i2 = current_boundaries[best_k]
              i3,i4 = current_boundaries[best_k+1]
              current_boundaries = current_boundaries[:best_k] + [(i1,i4)] + current_boundaries[best_k+2:]
              if best_delta_dl < 0:
                  MDL_boundaries = [b for b in current_boundaries]
          
          best_MDL = sum([DL(b[0],b[1]) for b in MDL_boundaries])
          
          MDLK1 = DL(1,T)       
          if MDLK1 < best_MDL:
              best_MDL = MDLK1
              MDL_boundaries = [(1,T)]
      
      labels = np.zeros(N).astype('int')
      for label,bds in enumerate(MDL_boundaries):
          c = []
          for cell in range(bds[0],bds[1]+1):
              for i in cellxs[cell]:
                  labels[i] = label
          
      L0 = DL(1,T) - np.log((N-1)*(T-1)) #subtract off previously ignored constant from description length 
      best_MDL -= np.log((N-1)*(T-1))
          
      return best_MDL/L0,labels,T