Urban Boundary Delineation from Commuting Data with Bayesian Stochastic Blockmodeling: Scale, Contiguity, and Hierarchy
+++++++++

Source Code
------------

.. code-block:: python

    """
    Module containing the code for the greedy regionalization algorithm
    """

    import numpy as np
    from scipy.special import loggamma

.. _init:

.. code-block:: python

    class DefaultDict(dict):
        """
        default dict that does not add new key when querying a key that does not exist
        """

        def __init__(self, default_factory, **kwargs):
            super().__init__(**kwargs)

            self.default_factory = default_factory

        def __getitem__(self, key):
            try:
                return super().__getitem__(key)
            except KeyError:
                return self.default_factory()

.. _logchoose:

.. code-block:: python


    def logchoose(N, K):
        """
        computes log of binomial coefficient

        Parameters
        ----------
        N : int
            total number of elements
        K : int
            number of elements to choose

        Returns
        -------
        float
            log of binomial coefficient
        """
        if (K <= 0) or (N <= 0):
            return 0
        return loggamma(N + 1) - loggamma(K + 1) - loggamma(N - K + 1)

.. _logmultiset:

.. code-block:: python

    def logmultiset(N, K):
        """
        computes log of multiset coefficient

        Parameters
        ----------
        N : int
            total number of elements
        K : int
            number of elements to choose

        Returns
        -------
        float
            log of multiset coefficient
        """
        return logchoose(N + K - 1, K)

.. _greedy-opt:

.. code-block:: python

    def greedy_opt(N, spatial_elist, flow_elist):
        """
        fast greedy regionalization for objective functions of the form:

                    C(B) + sum_r g(r) + sum_rs f(r,s),

        where r,s index clusters and B is the number of clusters.

        Parameters
        ----------
        N : int
            number of nodes
        spatial_elist : list of tuples
            list of edges (i,j) defined by the spatial adjacency between i and j (no repeats)
        flow_elist : list of tuples
            list of weighted edges defined by flows (i,j,w), where flow is from i --> j and has weight w (no repeats)

        Returns
        -------
        DLs : list of floats
            list of description length values at each iteration
        partitions : list of lists
            list of partitions at each iteration

        Notes
        -----
        Make sure nodes are indexed as 0,....,N-1 so as to handle nodes with no flows.
        """

        E = len(flow_elist)  # number of edges
        W = sum([e[-1] for e in flow_elist])  # total flow
        B = N  # initial number of clusters

        clusters, n_c = {}, {}  # dictionaries to track clusters and their sizes
        for i in range(N):
            clusters[i] = set([i])
            n_c[i] = 1

        # directed dictionaries for the number of edges between clusters and total flow between clusters
        # For example:
        #           e_out[i][j] is the number of edges going out from cluster i to cluster j
        #           e_in[i][j] is the number  edges coming into cluster i from cluster j
        # Both dictionaries are needed since we sometimes iterate over in-neighbors, and sometimes over out-neighbors
        ein_c, win_c, eout_c, wout_c = (
            DefaultDict(dict),
            DefaultDict(dict),
            DefaultDict(dict),
            DefaultDict(dict),
        )
        for i in range(N):
            ein_c[i], win_c[i], eout_c[i], wout_c[i] = (
                DefaultDict(int),
                DefaultDict(int),
                DefaultDict(int),
                DefaultDict(int),
            )

        for e in flow_elist:

            i, j, w = e

            eout_c[i][j] = 1
            wout_c[i][j] = w
            ein_c[j][i] = 1
            win_c[j][i] = w

.. _c:

.. code-block:: python

        # function definitions for C, g, and f. Can be changed depending on the clustering objective of interest
        def C(B):
            """
            Global contribution to the description length

            Parameters
            ----------
            B : int
                number of clusters

            Returns
            -------
            float
                global contribution to the description length
            """
            return (
                logmultiset(B**2, E)
                + logchoose(N - 1, B - 1)
                + logmultiset(B**2, W)
                + np.log(N)
                + loggamma(N + 1)
            )

.. _g:

.. code-block:: python

        def g(r):
            """
            Computes the cluster-level contribution to the description length for
            cluster r. If a tuple is entered as the cluster index, it adds the
            corresponding terms for those indices for the merge.

            Parameters
            ----------
            r : int or tuple
                cluster index or tuple of cluster indices

            Returns
            -------
            float
                cluster-level contribution to the description length
            """

            if isinstance(r, tuple):
                n_r = n_c[r[0]] + n_c[r[1]]
            else:
                n_r = n_c[r]

            # return -N*np.log(n_r/N) #use stirling approximation of log binomial
            return -loggamma(n_r + 1)

.. _f:

.. code-block:: python

        def f(r, s):
            """
            Cluster-to-cluster contribution to the description length.
            Computes the term for r --> s.
            If a tuple is entered as the cluster index, it adds the corresponding
            terms for those indices for the merge.

            Parameters
            ----------
            r : int or tuple
                cluster index or tuple of cluster indices
            s : int or tuple
                cluster index or tuple of cluster indices

            Returns
            -------
            float
                cluster-to-cluster contribution to the description length
            """
            if isinstance(r, tuple) and isinstance(s, tuple):
                n_r = n_c[r[0]] + n_c[r[1]]
                n_s = n_c[s[0]] + n_c[s[1]]
                e_rs = (
                    eout_c[r[0]][s[0]]
                    + eout_c[r[0]][s[1]]
                    + eout_c[r[1]][s[0]]
                    + eout_c[r[1]][s[1]]
                )
                w_rs = (
                    wout_c[r[0]][s[0]]
                    + wout_c[r[0]][s[1]]
                    + wout_c[r[1]][s[0]]
                    + wout_c[r[1]][s[1]]
                )

            elif isinstance(r, tuple):
                n_r = n_c[r[0]] + n_c[r[1]]
                n_s = n_c[s]
                e_rs = eout_c[r[0]][s] + eout_c[r[1]][s]
                w_rs = wout_c[r[0]][s] + wout_c[r[1]][s]

            elif isinstance(s, tuple):
                n_r = n_c[r]
                n_s = n_c[s[0]] + n_c[s[1]]
                e_rs = eout_c[r][s[0]] + eout_c[r][s[1]]
                w_rs = wout_c[r][s[0]] + wout_c[r][s[1]]

            else:
                n_r = n_c[r]
                n_s = n_c[s]
                e_rs = eout_c[r][s]
                w_rs = wout_c[r][s]

            return logchoose(n_r * n_s, e_rs) + logchoose(w_rs - 1, e_rs - 1)

.. _total-dl:

.. code-block:: python

        def total_dl():
            """
            Computes the total description length or objective value of interest

            Returns
            -------
            float
                total description length
            """
            dl = C(B) + sum([g(r) for r in n_c])
            for r in eout_c:
                for s in eout_c[r]:
                    dl += f(r, s)

            return dl

.. _delta-dl:

.. code-block:: python

        def delta_dl(r, s):
            """
            Computes the change in description length after merging clusters r and s.
            Only needs to be computed entirely when the merge (r,s) is not stored
            in the ddl_c dictionary used to track merges.

            Parameters
            ----------
            r : int
                cluster index
            s : int
                cluster index

            Returns
            -------
            total_change : float
                total change in description length after merging clusters r and s
            """

            if r in ddl_c:
                if s in ddl_c[r]:
                    return ddl_c[r][s]

            # cluster-level change in the description length
            dg = g((r, s)) - g(r) - g(s)

            # compute in and out neighbors of the merged cluster
            r_in_neigs, r_out_neigs, s_in_neigs, s_out_neigs = (
                set(ein_c[r].keys()),
                set(eout_c[r].keys()),
                set(ein_c[s].keys()),
                set(eout_c[s].keys()),
            )
            rs_in_neigs = r_in_neigs.union(s_in_neigs) - set([r, s])
            rs_out_neigs = r_out_neigs.union(s_out_neigs) - set([r, s])
            rs_all_neigs = rs_in_neigs.union(rs_out_neigs)

            # changes from neighboring nodes
            df_external = 0
            for u in rs_all_neigs:
                df_external += (
                    f((r, s), u) + f(u, (r, s)) - f(r, u) - f(s, u) - f(u, r) - f(u, s)
                )

            # change from flows from r to s
            df_internal = f((r, s), (r, s)) - f(r, s) - f(s, r) - f(r, r) - f(s, s)

            # compute the total change in the description length
            total_change = dg + df_external + df_internal

            # store the change in description length in the ddl_c dictionary
            if not (r in ddl_c):
                ddl_c[r] = {}
            if not (s in ddl_c):
                ddl_c[s] = {}
            ddl_c[r][s] = total_change
            ddl_c[s][r] = total_change

            return total_change

.. _merge-updates:

.. code-block:: python

        def merge_updates(r, s, DL):
            """
            Merges clusters r and s and updates the ddl_c dictionary of changes in
            description length for all nodes with flows into or out of r or s.
            r and s are the two clusters with best description length change after
            checking all possible merges. Deletes all obsolete information to save memory.

            Parameters
            ----------
            r : int
                cluster index
            s : int
                cluster index
            DL : float
                current description length

            Returns
            -------
            DL : float
                updated description length
            """

            # initialize a new key for the merged cluster key
            rs = np.random.randint(100000000)

            # compute the in and out neighbors of the merged cluster
            r_in_neigs, r_out_neigs, s_in_neigs, s_out_neigs = (
                set(ein_c[r].keys()),
                set(eout_c[r].keys()),
                set(ein_c[s].keys()),
                set(eout_c[s].keys()),
            )
            rs_in_neigs = r_in_neigs.union(s_in_neigs) - set([r, s])
            rs_out_neigs = r_out_neigs.union(s_out_neigs) - set([r, s])
            all_rs_neigs = rs_in_neigs.union(rs_out_neigs)

            # update the dictionaries for the clusters and their sizes
            clusters[rs] = clusters[r].union(clusters[s])
            n_c[rs] = n_c[r] + n_c[s]

            # create a new entry for the merged cluster in the dictionaries for the number of edges and total flow
            ein_c[rs], eout_c[rs], win_c[rs], wout_c[rs] = (
                DefaultDict(int),
                DefaultDict(int),
                DefaultDict(int),
                DefaultDict(int),
            )
            # contributions from neighbors of merged cluster
            for u in all_rs_neigs:

                ein_c[rs][u] = ein_c[r][u] + ein_c[s][u]
                win_c[rs][u] = win_c[r][u] + win_c[s][u]
                ein_c[u][rs] = ein_c[u][r] + ein_c[u][s]
                win_c[u][rs] = win_c[u][r] + win_c[u][s]

                eout_c[rs][u] = eout_c[r][u] + eout_c[s][u]
                wout_c[rs][u] = wout_c[r][u] + wout_c[s][u]
                eout_c[u][rs] = eout_c[u][r] + eout_c[u][s]
                wout_c[u][rs] = wout_c[u][r] + wout_c[u][s]

            # contribution from the clusters being merged
            ein_c[rs][rs] = ein_c[r][r] + ein_c[r][s] + ein_c[s][r] + ein_c[s][s]
            win_c[rs][rs] = win_c[r][r] + win_c[r][s] + win_c[s][r] + win_c[s][s]
            eout_c[rs][rs] = eout_c[r][r] + eout_c[r][s] + eout_c[s][r] + eout_c[s][s]
            wout_c[rs][rs] = wout_c[r][r] + wout_c[r][s] + wout_c[s][r] + wout_c[s][s]

            # remove references to old clusters from the ddl_c dictionary
            rs_ddls = set(ddl_c[r].keys()).union(set(ddl_c[s].keys())) - set([r, s])
            for u in rs_ddls:
                ddl_c[u].pop(r, None)
                ddl_c[r].pop(u, None)
                ddl_c[u].pop(s, None)
                ddl_c[s].pop(u, None)

            # update past merges that involve the (flow) neighbors of the merged cluster
            checked = []
            for u in all_rs_neigs:
                try:
                    for v in ddl_c[u]:
                        if (u, v) in checked or (v, u) in checked:
                            pass
                        else:
                            relevant_terms_after_rs_merge = (
                                f((u, v), (r, s))
                                + f((r, s), (u, v))
                                - f(u, (r, s))
                                - f(v, (r, s))
                                - f((r, s), u)
                                - f((r, s), v)
                            )
                            relevant_terms_before_rs_merge = (
                                f((u, v), r)
                                + f(r, (u, v))
                                - f(u, r)
                                - f(v, r)
                                - f(r, u)
                                - f(r, v)
                                + f((u, v), s)
                                + f(s, (u, v))
                                - f(u, s)
                                - f(v, s)
                                - f(s, u)
                                - f(s, v)
                            )

                            ddl_c[u][v] += (
                                relevant_terms_after_rs_merge
                                - relevant_terms_before_rs_merge
                            )
                            ddl_c[v][u] = ddl_c[u][v]
                            checked.append((u, v))
                except KeyError:
                    pass

            # include the global-level change and update the description length
            DL += ddl_c[r][s] + C(B - 1) - C(B)

            # remove references to old clusters from the neighbors' dictionaries
            for u in all_rs_neigs:
                ein_c[u].pop(r, None)
                ein_c[u].pop(s, None)
                win_c[u].pop(r, None)
                win_c[u].pop(s, None)
                eout_c[u].pop(r, None)
                eout_c[u].pop(s, None)
                wout_c[u].pop(r, None)
                wout_c[u].pop(s, None)

            for u in rs_ddls:
                ddl_c[rs][u] = delta_dl(rs, u)
                ddl_c[u][rs] = ddl_c[rs][u]

            # clean up: remove obsolete entries in neighbors' dictionaries and remove all entries correponding to r and s
            del (
                clusters[r],
                clusters[s],
                n_c[r],
                n_c[s],
                ein_c[r],
                ein_c[s],
                eout_c[r],
                eout_c[s],
                win_c[r],
                win_c[s],
                wout_c[r],
                wout_c[s],
                ddl_c[r],
                ddl_c[s],
            )

            return DL

        ddl_c = (
            {}
        )  # dictionary for past merges, indexed by only spatially adjacent neighbors
        for e in spatial_elist:

            i, j = e
            ddl_c[i][j] = delta_dl(i, j)
            ddl_c[j][i] = ddl_c[i][j]

        # Initialize the description length and the list of partitions
        DL = total_dl()
        DLs, partitions = [DL], [clusters.copy()]

        # Iterate over the number of clusters and find the best pair to merge
        for B in range(N, 1, -1):
            best_pair = (np.inf, np.inf)
            best_ddl = np.inf
            for i in ddl_c:
                for j in ddl_c[i]:
                    if i < j:
                        ddl = delta_dl(i, j)
                        if ddl < best_ddl:
                            best_ddl = ddl
                            best_pair = (i, j)

            r, s = best_pair
            if r == s == np.inf:  # If no more merges are possible return the partitions
                return DLs, partitions

            # Merge the best pair and update the description length
            DL = merge_updates(r, s, DL)

            # Uncomment to stop algorithm when the description length increases
            if DL > DLs[-1]:
                return DLs, partitions

            # Store the description length and the partition
            DLs.append(DL)
            partitions.append(list(clusters.copy().values()))

        return DLs, partitions