Urban Boundary Delineation from Commuting Data with Bayesian Stochastic Blockmodeling: Scale, Contiguity, and Hierarchy

Source Code"""
Module containing the code for the greedy regionalization algorithm
"""

import numpy as np
from scipy.special import loggamma
class DefaultDict(dict):
    """
    default dict that does not add new key when querying a key that does not exist
    """

    def __init__(self, default_factory, **kwargs):
        super().__init__(**kwargs)

        self.default_factory = default_factory

    def __getitem__(self, key):
        try:
            return super().__getitem__(key)
        except KeyError:
            return self.default_factory()
def logchoose(N, K):
    """
    computes log of binomial coefficient

    Parameters
    ----------
    N : int
        total number of elements
    K : int
        number of elements to choose

    Returns
    -------
    float
        log of binomial coefficient
    """
    if (K <= 0) or (N <= 0):
        return 0
    return loggamma(N + 1) - loggamma(K + 1) - loggamma(N - K + 1)
def logmultiset(N, K):
    """
    computes log of multiset coefficient

    Parameters
    ----------
    N : int
        total number of elements
    K : int
        number of elements to choose

    Returns
    -------
    float
        log of multiset coefficient
    """
    return logchoose(N + K - 1, K)
def greedy_opt(N, spatial_elist, flow_elist):
    """
    fast greedy regionalization for objective functions of the form:

                C(B) + sum_r g(r) + sum_rs f(r,s),

    where r,s index clusters and B is the number of clusters.

    Parameters
    ----------
    N : int
        number of nodes
    spatial_elist : list of tuples
        list of edges (i,j) defined by the spatial adjacency between i and j (no repeats)
    flow_elist : list of tuples
        list of weighted edges defined by flows (i,j,w), where flow is from i --> j and has weight w (no repeats)

    Returns
    -------
    DLs : list of floats
        list of description length values at each iteration
    partitions : list of lists
        list of partitions at each iteration

    Notes
    -----
    Make sure nodes are indexed as 0,....,N-1 so as to handle nodes with no flows.
    """

    E = len(flow_elist)  # number of edges
    W = sum([e[-1] for e in flow_elist])  # total flow
    B = N  # initial number of clusters

    clusters, n_c = {}, {}  # dictionaries to track clusters and their sizes
    for i in range(N):
        clusters[i] = set([i])
        n_c[i] = 1

    # directed dictionaries for the number of edges between clusters and total flow between clusters
    # For example:
    #           e_out[i][j] is the number of edges going out from cluster i to cluster j
    #           e_in[i][j] is the number  edges coming into cluster i from cluster j
    # Both dictionaries are needed since we sometimes iterate over in-neighbors, and sometimes over out-neighbors
    ein_c, win_c, eout_c, wout_c = (
        DefaultDict(dict),
        DefaultDict(dict),
        DefaultDict(dict),
        DefaultDict(dict),
    )
    for i in range(N):
        ein_c[i], win_c[i], eout_c[i], wout_c[i] = (
            DefaultDict(int),
            DefaultDict(int),
            DefaultDict(int),
            DefaultDict(int),
        )

    for e in flow_elist:

        i, j, w = e

        eout_c[i][j] = 1
        wout_c[i][j] = w
        ein_c[j][i] = 1
        win_c[j][i] = w
# function definitions for C, g, and f. Can be changed depending on the clustering objective of interest
def C(B):
    """
    Global contribution to the description length

    Parameters
    ----------
    B : int
        number of clusters

    Returns
    -------
    float
        global contribution to the description length
    """
    return (
        logmultiset(B**2, E)
        + logchoose(N - 1, B - 1)
        + logmultiset(B**2, W)
        + np.log(N)
        + loggamma(N + 1)
    )
def g(r):
    """
    Computes the cluster-level contribution to the description length for
    cluster r. If a tuple is entered as the cluster index, it adds the
    corresponding terms for those indices for the merge.

    Parameters
    ----------
    r : int or tuple
        cluster index or tuple of cluster indices

    Returns
    -------
    float
        cluster-level contribution to the description length
    """

    if isinstance(r, tuple):
        n_r = n_c[r[0]] + n_c[r[1]]
    else:
        n_r = n_c[r]

    # return -N*np.log(n_r/N) #use stirling approximation of log binomial
    return -loggamma(n_r + 1)
def f(r, s):
    """
    Cluster-to-cluster contribution to the description length.
    Computes the term for r --> s.
    If a tuple is entered as the cluster index, it adds the corresponding
    terms for those indices for the merge.

    Parameters
    ----------
    r : int or tuple
        cluster index or tuple of cluster indices
    s : int or tuple
        cluster index or tuple of cluster indices

    Returns
    -------
    float
        cluster-to-cluster contribution to the description length
    """
    if isinstance(r, tuple) and isinstance(s, tuple):
        n_r = n_c[r[0]] + n_c[r[1]]
        n_s = n_c[s[0]] + n_c[s[1]]
        e_rs = (
            eout_c[r[0]][s[0]]
            + eout_c[r[0]][s[1]]
            + eout_c[r[1]][s[0]]
            + eout_c[r[1]][s[1]]
        )
        w_rs = (
            wout_c[r[0]][s[0]]
            + wout_c[r[0]][s[1]]
            + wout_c[r[1]][s[0]]
            + wout_c[r[1]][s[1]]
        )

    elif isinstance(r, tuple):
        n_r = n_c[r[0]] + n_c[r[1]]
        n_s = n_c[s]
        e_rs = eout_c[r[0]][s] + eout_c[r[1]][s]
        w_rs = wout_c[r[0]][s] + wout_c[r[1]][s]

    elif isinstance(s, tuple):
        n_r = n_c[r]
        n_s = n_c[s[0]] + n_c[s[1]]
        e_rs = eout_c[r][s[0]] + eout_c[r][s[1]]
        w_rs = wout_c[r][s[0]] + wout_c[r][s[1]]

    else:
        n_r = n_c[r]
        n_s = n_c[s]
        e_rs = eout_c[r][s]
        w_rs = wout_c[r][s]

    return logchoose(n_r * n_s, e_rs) + logchoose(w_rs - 1, e_rs - 1)
def total_dl():
    """
    Computes the total description length or objective value of interest

    Returns
    -------
    float
        total description length
    """
    dl = C(B) + sum([g(r) for r in n_c])
    for r in eout_c:
        for s in eout_c[r]:
            dl += f(r, s)

    return dl
def delta_dl(r, s):
    """
    Computes the change in description length after merging clusters r and s.
    Only needs to be computed entirely when the merge (r,s) is not stored
    in the ddl_c dictionary used to track merges.

    Parameters
    ----------
    r : int
        cluster index
    s : int
        cluster index

    Returns
    -------
    total_change : float
        total change in description length after merging clusters r and s
    """

    if r in ddl_c:
        if s in ddl_c[r]:
            return ddl_c[r][s]

    # cluster-level change in the description length
    dg = g((r, s)) - g(r) - g(s)

    # compute in and out neighbors of the merged cluster
    r_in_neigs, r_out_neigs, s_in_neigs, s_out_neigs = (
        set(ein_c[r].keys()),
        set(eout_c[r].keys()),
        set(ein_c[s].keys()),
        set(eout_c[s].keys()),
    )
    rs_in_neigs = r_in_neigs.union(s_in_neigs) - set([r, s])
    rs_out_neigs = r_out_neigs.union(s_out_neigs) - set([r, s])
    rs_all_neigs = rs_in_neigs.union(rs_out_neigs)

    # changes from neighboring nodes
    df_external = 0
    for u in rs_all_neigs:
        df_external += (
            f((r, s), u) + f(u, (r, s)) - f(r, u) - f(s, u) - f(u, r) - f(u, s)
        )

    # change from flows from r to s
    df_internal = f((r, s), (r, s)) - f(r, s) - f(s, r) - f(r, r) - f(s, s)

    # compute the total change in the description length
    total_change = dg + df_external + df_internal

    # store the change in description length in the ddl_c dictionary
    if not (r in ddl_c):
        ddl_c[r] = {}
    if not (s in ddl_c):
        ddl_c[s] = {}
    ddl_c[r][s] = total_change
    ddl_c[s][r] = total_change

    return total_change
def merge_updates(r, s, DL):
    """
    Merges clusters r and s and updates the ddl_c dictionary of changes in
    description length for all nodes with flows into or out of r or s.
    r and s are the two clusters with best description length change after
    checking all possible merges. Deletes all obsolete information to save memory.

    Parameters
    ----------
    r : int
        cluster index
    s : int
        cluster index
    DL : float
        current description length

    Returns
    -------
    DL : float
        updated description length
    """

    # initialize a new key for the merged cluster key
    rs = np.random.randint(100000000)

    # compute the in and out neighbors of the merged cluster
    r_in_neigs, r_out_neigs, s_in_neigs, s_out_neigs = (
        set(ein_c[r].keys()),
        set(eout_c[r].keys()),
        set(ein_c[s].keys()),
        set(eout_c[s].keys()),
    )
    rs_in_neigs = r_in_neigs.union(s_in_neigs) - set([r, s])
    rs_out_neigs = r_out_neigs.union(s_out_neigs) - set([r, s])
    all_rs_neigs = rs_in_neigs.union(rs_out_neigs)

    # update the dictionaries for the clusters and their sizes
    clusters[rs] = clusters[r].union(clusters[s])
    n_c[rs] = n_c[r] + n_c[s]

    # create a new entry for the merged cluster in the dictionaries for the number of edges and total flow
    ein_c[rs], eout_c[rs], win_c[rs], wout_c[rs] = (
        DefaultDict(int),
        DefaultDict(int),
        DefaultDict(int),
        DefaultDict(int),
    )
    # contributions from neighbors of merged cluster
    for u in all_rs_neigs:

        ein_c[rs][u] = ein_c[r][u] + ein_c[s][u]
        win_c[rs][u] = win_c[r][u] + win_c[s][u]
        ein_c[u][rs] = ein_c[u][r] + ein_c[u][s]
        win_c[u][rs] = win_c[u][r] + win_c[u][s]

        eout_c[rs][u] = eout_c[r][u] + eout_c[s][u]
        wout_c[rs][u] = wout_c[r][u] + wout_c[s][u]
        eout_c[u][rs] = eout_c[u][r] + eout_c[u][s]
        wout_c[u][rs] = wout_c[u][r] + wout_c[u][s]

    # contribution from the clusters being merged
    ein_c[rs][rs] = ein_c[r][r] + ein_c[r][s] + ein_c[s][r] + ein_c[s][s]
    win_c[rs][rs] = win_c[r][r] + win_c[r][s] + win_c[s][r] + win_c[s][s]
    eout_c[rs][rs] = eout_c[r][r] + eout_c[r][s] + eout_c[s][r] + eout_c[s][s]
    wout_c[rs][rs] = wout_c[r][r] + wout_c[r][s] + wout_c[s][r] + wout_c[s][s]

    # remove references to old clusters from the ddl_c dictionary
    rs_ddls = set(ddl_c[r].keys()).union(set(ddl_c[s].keys())) - set([r, s])
    for u in rs_ddls:
        ddl_c[u].pop(r, None)
        ddl_c[r].pop(u, None)
        ddl_c[u].pop(s, None)
        ddl_c[s].pop(u, None)

    # update past merges that involve the (flow) neighbors of the merged cluster
    checked = []
    for u in all_rs_neigs:
        try:
            for v in ddl_c[u]:
                if (u, v) in checked or (v, u) in checked:
                    pass
                else:
                    relevant_terms_after_rs_merge = (
                        f((u, v), (r, s))
                        + f((r, s), (u, v))
                        - f(u, (r, s))
                        - f(v, (r, s))
                        - f((r, s), u)
                        - f((r, s), v)
                    )
                    relevant_terms_before_rs_merge = (
                        f((u, v), r)
                        + f(r, (u, v))
                        - f(u, r)
                        - f(v, r)
                        - f(r, u)
                        - f(r, v)
                        + f((u, v), s)
                        + f(s, (u, v))
                        - f(u, s)
                        - f(v, s)
                        - f(s, u)
                        - f(s, v)
                    )

                    ddl_c[u][v] += (
                        relevant_terms_after_rs_merge
                        - relevant_terms_before_rs_merge
                    )
                    ddl_c[v][u] = ddl_c[u][v]
                    checked.append((u, v))
        except KeyError:
            pass

    # include the global-level change and update the description length
    DL += ddl_c[r][s] + C(B - 1) - C(B)

    # remove references to old clusters from the neighbors' dictionaries
    for u in all_rs_neigs:
        ein_c[u].pop(r, None)
        ein_c[u].pop(s, None)
        win_c[u].pop(r, None)
        win_c[u].pop(s, None)
        eout_c[u].pop(r, None)
        eout_c[u].pop(s, None)
        wout_c[u].pop(r, None)
        wout_c[u].pop(s, None)

    for u in rs_ddls:
        ddl_c[rs][u] = delta_dl(rs, u)
        ddl_c[u][rs] = ddl_c[rs][u]

    # clean up: remove obsolete entries in neighbors' dictionaries and remove all entries correponding to r and s
    del (
        clusters[r],
        clusters[s],
        n_c[r],
        n_c[s],
        ein_c[r],
        ein_c[s],
        eout_c[r],
        eout_c[s],
        win_c[r],
        win_c[s],
        wout_c[r],
        wout_c[s],
        ddl_c[r],
        ddl_c[s],
    )

    return DL

ddl_c = (
    {}
)  # dictionary for past merges, indexed by only spatially adjacent neighbors
for e in spatial_elist:

    i, j = e
    ddl_c[i][j] = delta_dl(i, j)
    ddl_c[j][i] = ddl_c[i][j]

# Initialize the description length and the list of partitions
DL = total_dl()
DLs, partitions = [DL], [clusters.copy()]

# Iterate over the number of clusters and find the best pair to merge
for B in range(N, 1, -1):
    best_pair = (np.inf, np.inf)
    best_ddl = np.inf
    for i in ddl_c:
        for j in ddl_c[i]:
            if i < j:
                ddl = delta_dl(i, j)
                if ddl < best_ddl:
                    best_ddl = ddl
                    best_pair = (i, j)

    r, s = best_pair
    if r == s == np.inf:  # If no more merges are possible return the partitions
        return DLs, partitions

    # Merge the best pair and update the description length
    DL = merge_updates(r, s, DL)

    # Uncomment to stop algorithm when the description length increases
    if DL > DLs[-1]:
        return DLs, partitions

    # Store the description length and the partition
    DLs.append(DL)
    partitions.append(list(clusters.copy().values()))

return DLs, partitions