Source code for networkinference.utils.tools

import numpy as np, networkx as nx, math
from scipy.sparse.linalg import inv
from scipy.stats import binom
from scipy.spatial import cKDTree
from scipy.sparse import csgraph, csc_matrix, identity

def nhbr_mean(X, A, distance=1, weight=None):
    """Returns array where the ith entry is the average X of network neighbors at a certain distance from node i.

    This is used, for example, to create a data matrix for estimating linear-in-means models. For distance=2, this returns the average X of network two-neighbors / friends-of-friends, a common instrument used for estimating linear-in-means models (see [1]_, [2]_). Note: the function converts the network to an unweighted, undirected version by dropping edge weights and directionality of links.

    Parameters
    ----------
    X : numpy array
         n-dimensional array of scalar observations or an n x k matrix of n k-dimensional observations.
    A : NetworkX graph
        Network on n nodes. Can be weighted or directed. NOTE: Assumes nodes are labeled 0 through n-1, so that the data for node i is given by the ith component of X. 
    weight : string
            Label of edge weights in A, if A is a weighted graph. Default value: None.

    Returns
    -------
    Xbar : numpy array
        n-dimensional array of scalar observations or an n x k matrix of n k-dimensional observations, where the ith row is the average X of i's friends.

    Examples
    --------
    >>> from networkinference.utils import FakeData, nhbr_mean
    >>> import numpy as np
    >>> A = FakeData.erdos_renyi()
    >>> X = np.random.normal(100)
    >>> Xbar = nhbr_mean(X, A)

    References
    ----------
    .. [1] Bramoullé, Y., H. Djebbari, and B. Fortin, "Identification of Peer Effects Through Social Networks," Journal of Econometrics, 2009, 150 (1), 41-55.
    .. [2] De Giorgi, G., M. Pellizzari, and S. Redaelli, "Identification of Social Interactions Through Partially Overlapping Peer Groups," American Economic Journal: Applied Economics, 2010, 2 (2), 241-75.
    """
    n = A.number_of_nodes()
    if distance == 1:
        A_mat = nx.to_scipy_sparse_matrix(A, nodelist=range(n), weight=weight, format='csc')
        delta = A_mat.dot(np.ones(n)) # number of friends of each node
        Xbar = (A_mat.dot(X)).astype('float')
    else:
        A_mat = nx.to_scipy_sparse_matrix(A.to_undirected(as_view=True), nodelist=range(n), weight=None, format='csc')
        dist_matrix = csgraph.dijkstra(csgraph=A_mat, directed=False, unweighted=True)
        delta = (dist_matrix == distance).dot(np.ones(n)) 
        Xbar = ((dist_matrix == distance).dot(X)).astype('float')

    if X.ndim == 1:
        Xbar[delta != 0] = Xbar[delta != 0] / delta[delta != 0]
    elif X.ndim == 2:
        Xbar[delta != 0,:] = Xbar[delta != 0,:] / delta[delta != 0, np.newaxis]
    else:
        raise ValueError('X must be 1 or 2-dimensional.')
    return Xbar

def adjrownorm(A, weight=None):
    """Returns row-normalized adjacency matrix of NetworkX graph.
    
    Parameters
    ----------
    A : NetworkX graph
        Can be weighted or directed.
    weight : string
        Label of edge weights in A, if A is a weighted graph. Default value: None.

    Returns
    -------
    scipy sparse matrix
        Row-normalized adjacency matrix.

    Examples
    --------
    >>> from networkinference.utils import FakeData, adjrownorm 
    >>> A = FakeData.erdos_renyi()
    >>> A_norm = adjrownorm(A)
    """
    A_mat = nx.to_scipy_sparse_matrix(A, nodelist=range(A.number_of_nodes()), weight=weight, format='csc')
    deg_seq_sim = A_mat.dot(np.ones(A.number_of_nodes()))
    r,c = A_mat.nonzero() 
    rD_sp = csc_matrix(((1.0/np.maximum(deg_seq_sim,1))[r], (r,c)), shape=(A_mat.shape))
    return A_mat.multiply(rD_sp) 

[docs]class FakeData: """Methods for simulating data. """
[docs] @staticmethod def linear_in_means(A, theta=np.array([1,0.5,3,1])): """Returns a scalar outcome generated from a linear-in-means model with scalar covariate. Covariates and errors are i.i.d. standard normal. Outcomes are generated from the following linear-in-means model: .. math:: Y_i = \\alpha + \\beta \\frac{\sum_{j=1}^n A_{ij} Y_j}{\sum_{j=1}^n A_{ij}} + \delta \\frac{\sum_{j=1}^n A_{ij} X_j}{\sum_{j=1}^n A_{ij}} + \gamma X_i + \\varepsilon_i, where :math:`A_{ij}` is an indicator for whether nodes i and j are linked, :math:`\\alpha` is the intercept, :math:`\\beta` the endogenous peer effect, and :math:`\delta` the exogenous peer effect. The default parameter values are 1, 0.5, 3, and 1, respectively. Parameters ---------- A : NetworkX graph Network on n units. Can be weighted or directed. theta : numpy array 4-dimensional array of model parameters. Default value: np.array([1,0.5,3,1]). Returns ------- Y : numpy array n-dimensional array of outcomes. X : numpy array n-dimensional array of covariates. Examples -------- >>> from networkinference.utils import FakeData >>> A = FakeData.erdos_renyi() >>> Y, X = FakeData.linear_in_means(A) """ n = A.number_of_nodes() X = np.random.normal(size=n) errors = np.random.normal(size=n) A_norm = adjrownorm(A) eps = errors + A_norm.dot(errors) LIM_inv = inv( identity(n,format='csc') - theta[1]*A_norm ) Y = LIM_inv.dot( (theta[0] + theta[2]*A_norm.dot(X) + theta[3]*X + eps) ) return Y, X
[docs] @staticmethod def random_geometric(n=500, avg_deg=5, seed=None): """Returns a random geometric graph on n nodes. Nodes are randomly positioned in :math:`[0,1]^2` and form links with all alters within a certain radius. Parameters ---------- n : int Number of nodes. Default value: 500. avg_deg : int Desired average degree of output graph. Default value: 5. seed : int Seed for random positions. Set to None to not set a seed. Default value: None. Returns ------- RGG : NetworkX graph Unweighted and undirected graph on n nodes. Examples -------- >>> from networkinference.utils import FakeData >>> A = FakeData.random_geometric() """ np.random.seed(seed=seed) r = (avg_deg/math.pi/n)**(1/2) positions = np.random.uniform(size=(n,2)) kdtree = cKDTree(positions) pairs = kdtree.query_pairs(r) # Euclidean norm RGG = nx.empty_graph(n=positions.shape[0], create_using=nx.Graph()) RGG.add_edges_from(list(pairs)) return RGG
[docs] @staticmethod def erdos_renyi(n=500, avg_deg=5, seed=None): """Returns an Erdos-Renyi graph on n nodes with linking probability avg_deg / n. Just a wrapper for a NetworkX function. Parameters ---------- n : int Number of nodes. Default value: 500. avg_deg : int Desired average degree of output graph. Default value: 5. seed : int Seed for random links. Set to None to not set a seed. Default value: None. Returns ------- NetworkX graph Undirected and unweighted graph on n nodes. Examples -------- >>> from networkinference.utils import FakeData >>> A = FakeData.erdos_renyi() """ return nx.fast_gnp_random_graph(n, avg_deg/n, seed=seed)
[docs] @staticmethod def ipw(n=500, network='RGG', avg_deg=5, p = 0.15, seed=None): """Returns data (Y, ind1, ind2, pscores1, pscores2, A) to input into IPW class. Outcome model is .. math:: Y_i = \left( \\beta_i + \\frac{\sum_{j=1}^n A_{ij} \\beta_j}{\sum_{j=1}^n A_{ij}} \\right) + \mathbf{1}\left\{\sum_{j=1}^n A_{ij} D_j > 0\\right\} + \left( \\varepsilon_i + \\frac{\sum_{j=1}^n A_{ij} \\varepsilon_j}{\sum_{j=1}^n A_{ij}} \\right) where :math:`A_{ij}` is an indicator for whether nodes i and j are linked, :math:`\\beta_i \stackrel{iid}\sim \mathcal{N}(1,1)`, :math:`\\varepsilon_i` is i.i.d. standard normal, and :math:`D_i` is i.i.d. Bernoulli with success probability p. Parameters ---------- n : int Number of observations. Default value: 500. network : string Type of network to generate. Options: 'ER' (Erdos-Renyi) and 'RGG' (random geometric graph). Default value: 'RGG'. avg_deg : int Desired average degree of output graph. Default value: 5. p : float [0,1] Treatment probability. seed : int Seed for randomness. Set to None to not set a seed. Default value: None. Returns ------- Y : numpy array n-dimensional array of outcomes generated from a linear-in-means model. ind1 : numpy int array n-dimensional array of indicators for having at least one treated friend. ind2 : numpy int array n-dimensional array of indicators for having no treated friends. pscores1 : numpy float array n-dimensional array of probabilities of having at least one treated friend. pscores2 : numpy float array n-dimensional array of probabilities of having no treated friends. A : NetworkX graph Undirected, unweighted graph on n nodes. Examples -------- >>> from networkinference.utils import FakeData >>> Y, ind1, ind2, pscores1, pscores2, A = FakeData.ipw() """ if network=='ER': A = FakeData.erdos_renyi(n, avg_deg, seed) else: A = FakeData.random_geometric(n, avg_deg, seed) np.random.seed(seed=seed) A_mat = nx.to_scipy_sparse_matrix(A, nodelist=range(n), format='csc') A_norm = adjrownorm(A) D = np.random.binomial(1,p,n) # treatments b0 = np.random.normal(1,1,size=n) eps = np.random.normal(0,1,size=n) friends_treated = A_mat.dot(D) # number of friends treated Y = (b0+A_norm.dot(b0)) * (friends_treated > 0).astype('float') + (eps+A_norm.dot(eps)) # outcomes degrees = A_mat.dot(np.ones(n)) # number of friends pscores2 = binom(degrees,p).pmf(0) pscores1 = 1 - binom(degrees,p).pmf(0) ind1 = (friends_treated > 0).astype('float') # exposure mapping indicators for spillover effect ind2 = 1 - ind1 return Y, ind1, ind2, pscores1, pscores2, A
[docs] @staticmethod def ols(n=500, network='RGG', avg_deg=5, seed=None): """Returns data (Y, X, A) to input into OLS class. Outcome model is .. math :: Y_i = 1 + \left( X_i + \\frac{\sum_{j=1}^n A_{ij} X_j}{\sum_{j=1}^n A_{ij}} \\right) + \left( \\varepsilon_i + \\frac{\sum_{j=1}^n A_{ij} \\varepsilon_j}{\sum_{j=1}^n A_{ij}} \\right) where :math:`A_{ij}` is an indicator for whether nodes i and j are linked, :math:`X_i`, and :math:`\\varepsilon_i` are i.i.d. standard normal. Parameters ---------- n : int Number of observations. Default value: 500. network : string Type of network to generate. Options: 'ER' (Erdos-Renyi) and 'RGG' (random geometric graph). Default value: 'RGG'. avg_deg : int Desired average degree of output graph. Default value: 5. seed : int Seed for randomness. Set to None to not set a seed. Default value: None. Returns ------- Y : numpy array n-dimensional array of outcomes. X : numpy array n-dimensional array of covariates. A : NetworkX undirected graph Undirected, unweighted network on n nodes. Examples -------- >>> from networkinference.utils import FakeData >>> Y, X, A = FakeData.ols() """ if network=='ER': A = FakeData.erdos_renyi(n, avg_deg, seed) else: A = FakeData.random_geometric(n, avg_deg, seed) np.random.seed(seed=seed) X = np.random.normal(size=n) eps = np.random.normal(size=n) A_norm = adjrownorm(A) X = X + A_norm.dot(X) errors = eps + A_norm.dot(eps) Y = 1 + X + errors return Y, X, A
[docs] @staticmethod def tsls(n=500, network='RGG', avg_deg=5, seed=None): """Returns data (Y, X, W, A) to input into TSLS class. Outcomes are generated using the linear_in_means() method of this class. X = (average outcomes of neighbors, average covariate of neighbors, own covariate). W = (average covariate of friends of friends, average covariate of neighbors, own covariate). Parameters ---------- n : int Number of observations. Default value: 500. network : string Type of network to generate. Options: 'ER' (Erdos-Renyi) and 'RGG' (random geometric graph). Default value: 'RGG'. avg_deg : int Desired average degree of output graph. Default value: 5. seed : int Seed for randomness. Set to None to not set a seed. Default value: None. Returns ------- Y : numpy array n-dimensional array of outcomes. X : numpy array n x 3 array of regressors. First column is average outcomes of network neighbors, second is average covariate of neighbors, third is own covariate, where covariates are binary. W : numpy array n x 3 array of instruments. First column is average covariate of friends of friends, second is average covariate of neighbors, third is own covariate, where covariates are binary. A : NetworkX undirected graph Undirected, unweighted network on n nodes. Examples -------- >>> from networkinference.utils import FakeData >>> Y, X, W, A = FakeData.tsls() """ if network=='ER': A = FakeData.erdos_renyi(n, avg_deg, seed) else: A = FakeData.random_geometric(n, avg_deg, seed) np.random.seed(seed=seed) A_norm = adjrownorm(A) Y, D = FakeData.linear_in_means(A) Dbar = A_norm.dot(D) Ybar = A_norm.dot(Y) D2bar = nhbr_mean(D, A, distance=2) X = np.vstack([Ybar, Dbar, D]).T W = np.vstack([D2bar, Dbar, D]).T return Y, X, W, A