{ "cells": [ { "cell_type": "code", "execution_count": 21, "id": "a2480175-ba31-4e02-bb87-16697c17c075", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from scipy import stats, sparse\n", "import bottleneck\n", "\n", "def run_egad(go, nw, **kwargs):\n", " \"\"\"EGAD running function\n", " \n", " Wrapper to lower level functions for EGAD\n", "\n", " EGAD measures modularity of gene lists in co-expression networks. \n", "\n", " This was translated from the MATLAB version, which does tiled Cross Validation\n", " \n", " The useful kwargs are:\n", " int - nFold : Number of CV folds to do, default is 3, \n", " int - {min,max}_count : limits for number of terms in each gene list, these are exclusive values\n", "\n", "\n", " Arguments:\n", " go {pd.DataFrame} -- dataframe of genes x terms of values [0,1], where 1 is included in gene lists\n", " nw {pd.DataFrame} -- dataframe of co-expression network, genes x genes\n", " **kwargs \n", " \n", " Returns:\n", " pd.DataFrame -- dataframe of terms x metrics where the metrics are \n", " ['AUC', 'AVG_NODE_DEGREE', 'DEGREE_NULL_AUC', 'P_Value']\n", " \"\"\"\n", " assert nw.shape[0] == nw.shape[1] , 'Network is not square'\n", " #print(nw.index)\n", " #nw.columns = nw.columns.astype(int)\n", " #print(nw.columns.astype(int))\n", " assert np.all(nw.index == nw.columns) , 'Network index and columns are not in the same order'\n", "\n", " #nw_mask = nw.isna().sum(axis=1) != nw.shape[1]\n", " #nw = nw.loc[nw_mask, nw_mask].astype('float')\n", " #np.fill_diagonal(nw.values, 1)\n", " return _runNV(go, nw, **kwargs)\n", "\n", "def _runNV(go, nw, nFold=3, min_count=20, max_count=1000):\n", "\n", " #Make sure genes are same in go and nw\n", " #go.index = go.index.map(str) \n", " #nw.index = nw.index.map(str)\n", " #nw.index = nw.index.str.replace('_', '')\n", " #go.index = go.index.str.replace('_', '')\n", " #print (nw)\n", " genes_intersect = go.index.intersection(nw.index)\n", "\n", "\n", " #print (genes_intersect)\n", " go = go.loc[genes_intersect, :]\n", " nw = nw.loc[genes_intersect, genes_intersect]\n", " #print (go)\n", " print (nw.shape)\n", " print (go.shape)\n", " sparsity = 1.0 - np.count_nonzero(go) / go.size\n", " print (sparsity)\n", " sparsity = 1.0 - np.count_nonzero(nw) / nw.size\n", " print (sparsity)\n", " #print(nw\n", " #print(go\n", " nw_mask = nw.isna().sum(axis=1) != nw.shape[1]\n", " nw = nw.loc[nw_mask, nw_mask].astype('float')\n", " np.fill_diagonal(nw.values, 1)\n", " #Make sure there aren't duplicates\n", " duplicates = nw.index.duplicated(keep='first')\n", " nw = nw.loc[~duplicates, ~duplicates]\n", "\n", " go = go.loc[:, (go.sum(axis=0) > min_count) & (go.sum(axis=0) < max_count)]\n", " go = go.loc[~go.index.duplicated(keep='first'), :]\n", " #print(go)\n", "\n", " roc = _new_egad(go.values, nw.values, nFold)\n", "\n", " col_names = ['AUC', 'AVG_NODE_DEGREE', 'DEGREE_NULL_AUC', 'P_Value']\n", " #Put output in dataframe\n", " return pd.DataFrame(dict(zip(col_names, roc)), index=go.columns)\n", "\n", "def _new_egad(go, nw, nFold):\n", "\n", " #Build Cross validated Positive\n", " x, y = np.where(go)\n", " #print(x, y)\n", " cvgo = {}\n", " for i in np.arange(nFold):\n", " a = x[i::nFold]\n", " #print(a)\n", " b = y[i::nFold]\n", " dat = np.ones_like(a)\n", " mask = sparse.coo_matrix((dat, (a, b)), shape=go.shape)\n", " cvgo[i] = go - mask.toarray()\n", "\n", " CVgo = np.concatenate(list(cvgo.values()), axis=1)\n", " #print(CVgo)\n", "\n", " sumin = np.matmul(nw.T, CVgo)\n", "\n", " degree = np.sum(nw, axis=0)\n", " #print(degree)\n", " #print(degree[:, None])\n", "\n", " predicts = sumin / degree[:, None]\n", " #print(predicts)\n", "\n", " np.place(predicts, CVgo > 0, np.nan)\n", "\n", " #print(predicts)\n", "\n", " #Calculate ranks of positives\n", " rank_abs = lambda x: stats.rankdata(np.abs(x))\n", " predicts2 = np.apply_along_axis(rank_abs, 0, predicts)\n", " #print(predicts2)\n", "\n", " #Masking Nans that were ranked (how tiedrank works in matlab)\n", " predicts2[np.isnan(predicts)] = np.nan\n", " #print(predicts2)\n", "\n", " filtering = np.tile(go, nFold)\n", " #print(filtering)\n", "\n", " #negatives :filtering == 0\n", " #Sets Ranks of negatives to 0\n", " np.place(predicts2, filtering == 0, 0)\n", "\n", " #Sum of ranks for each prediction\n", " p = bottleneck.nansum(predicts2, axis=0)\n", " n_p = np.sum(filtering, axis=0) - np.sum(CVgo, axis=0)\n", "\n", " #Number of negatives\n", " #Number of GO terms - number of postiive\n", " n_n = filtering.shape[0] - np.sum(filtering, axis=0)\n", "\n", " roc = (p / n_p - (n_p + 1) / 2) / n_n\n", " U = roc * n_p * n_n\n", " Z = (np.abs(U - (n_p * n_n / 2))) / np.sqrt(n_p * n_n *\n", " (n_p + n_n + 1) / 12)\n", " roc = roc.reshape(nFold, go.shape[1])\n", " Z = Z.reshape(nFold, go.shape[1])\n", " #Stouffer Z method\n", " Z = bottleneck.nansum(Z, axis=0) / np.sqrt(nFold)\n", " #Calc ROC of Neighbor Voting\n", " roc = bottleneck.nanmean(roc, axis=0)\n", " P = stats.norm.sf(Z)\n", "\n", " #Average degree for nodes in each go term\n", " avg_degree = degree.dot(go) / np.sum(go, axis=0)\n", "\n", " #Calc null auc for degree\n", " ranks = np.tile(stats.rankdata(degree), (go.shape[1], 1)).T\n", "\n", " np.place(ranks, go == 0, 0)\n", "\n", " n_p = bottleneck.nansum(go, axis=0)\n", " nn = go.shape[0] - n_p\n", " p = bottleneck.nansum(ranks, axis=0)\n", "\n", " roc_null = (p / n_p - ((n_p + 1) / 2)) / nn\n", " #print(roc)\n", " return roc, avg_degree, roc_null, P" ] }, { "cell_type": "code", "execution_count": 2, "id": "390162db-e8b8-4a30-821e-d5ef1ece8965", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/software/CoCoCoNet/gene2go/drosophila_gene2go.csv', delim_whitespace=True)\n", "\n", "df['val'] = 1\n", "\n", "go_table = pd.pivot_table(df, index=['NetworkIDs'], values=['val'],columns=['GO_term'])\n", "\n", "go_table = go_table.fillna(0)\n", "\n", "from hicmatrix import HiCMatrix as hm\n", "from hicmatrix.lib import MatrixFileHandler\n", "exp_file_path = \"/grid/gillis/data/lohia/hi_c_data_processing/genomes_jlee/drosophila_dist.h5\" \n", "exp_file = hm.hiCMatrix(exp_file_path)\n", "\n", "exp_genes_all = [x[3].decode() for x in exp_file.cut_intervals]\n", "exp_matrix = exp_file.matrix.toarray()\n", "np.fill_diagonal(exp_matrix , 1)\n", "df_exp = pd.DataFrame(exp_matrix , index=exp_genes_all , columns = exp_genes_all )\n", "\n", "\n", "df_2d = run_egad(go_table, df_exp)\n", "\n", "df_2d.to_csv('./del.csv', sep='\\t')" ] }, { "cell_type": "code", "execution_count": 27, "id": "02fd8bda-1c3b-46a0-8d17-6caa24639db7", "metadata": {}, "outputs": [], "source": [ "from hicmatrix import HiCMatrix as hm\n", "from hicmatrix.lib import MatrixFileHandler\n", "exp_file_path = \"/grid/gillis/data/lohia/hi_c_data_processing/genomes_jlee/drosophila_dist.h5\" \n", "exp_file = hm.hiCMatrix(exp_file_path)" ] }, { "cell_type": "code", "execution_count": 31, "id": "c12cf51c-cdcc-4209-b6b2-1066a3285ce5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([7.23276279e+10, 7.16760457e+10, 7.03841510e+10, ...,\n", " 6.80401200e+07, 7.14493540e+07, 7.14627610e+07])" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "exp_file.matrix.toarray().sum(axis=0)" ] }, { "cell_type": "code", "execution_count": 32, "id": "68d6e89d-6518-468c-845b-ccf2cef6bb3d", "metadata": {}, "outputs": [], "source": [ "exp_genes_all = [x[3].decode() for x in exp_file.cut_intervals]" ] }, { "cell_type": "code", "execution_count": 33, "id": "1f6b90d5-f0c1-497c-a898-1956274b16db", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | genes | \n", "degree | \n", "
---|---|---|
0 | \n", "FBgn0267431 | \n", "7.232763e+10 | \n", "
1 | \n", "FBgn0085804 | \n", "7.167605e+10 | \n", "
2 | \n", "FBgn0039987 | \n", "7.038415e+10 | \n", "
3 | \n", "FBgn0267798 | \n", "6.898037e+10 | \n", "
4 | \n", "FBgn0267797 | \n", "6.689135e+10 | \n", "
... | \n", "... | \n", "... | \n", "
14871 | \n", "FBgn0250819 | \n", "6.668846e+07 | \n", "
14872 | \n", "FBgn0039924 | \n", "6.782980e+07 | \n", "
14873 | \n", "FBgn0263112 | \n", "6.804012e+07 | \n", "
14874 | \n", "FBgn0027101 | \n", "7.144935e+07 | \n", "
14875 | \n", "FBgn0053653 | \n", "7.146276e+07 | \n", "
14876 rows × 2 columns
\n", "