"""Per-chromosome Pearson correlation between contact-derived columns and ``exp``.

Three gene-pair tables (labelled ``agg``, ``rao`` and ``rao_kr`` below) are
loaded from HDF5, indexed by a gene-pair key, restricted to the pairs present
in all three, and then compared chromosome by chromosome.

NOTE(review): the column names (``VC_tss``, ``VC_max``, ``exp``, ``tss_tss`` ...)
are taken as-is from the input files; their exact biological meaning is not
visible in this file -- confirm against whatever produced the ``.h5`` tables.
"""
import re
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse, io, stats

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
DATA_DIR = Path('/data/lohia/gene_distance_expresseion/dist_files/norm_dist_files')
AGG_FILE = 'combined_dist_500_agg.h5'
RAO_FILE = 'combined_dist_500_rao.h5'
RAO_KR_FILE = 'combined_dist_with_georg_hic_rao_kr_vc_500.h5'

# Lower bound applied to the `tss_tss` column before correlating the
# rank/library-normalised columns of the `agg` table.
MIN_TSS_DISTANCE = 10000000

# Column layout of every per-chromosome result table.
RECORD_COLUMNS = ['chrm_list', 'net_type', 'corr', 'p_val', 'agg_method']


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def sorted_nicely(l):
    """Sort the given iterable in the way that humans expect.

    Runs of ASCII digits compare numerically and everything else compares as
    text, so ``chr2`` sorts before ``chr10``.

    Parameters
    ----------
    l : iterable
        Items to sort. Each item is converted with ``str()`` only to build its
        sort key; the returned list holds the original items.

    Returns
    -------
    list
        The items of ``l`` in natural order.
    """
    def alphanum_key(key):
        # With a single capturing group, re.split alternates text, digits,
        # text, ... so the odd positions are exactly the `[0-9]+` matches.
        # Deciding by position (rather than str.isdigit) avoids calling int()
        # on characters such as '²' that isdigit() accepts but int() rejects.
        parts = re.split('([0-9]+)', str(key))
        return [int(part) if position % 2 else part
                for position, part in enumerate(parts)]

    return sorted(l, key=alphanum_key)


def add_pair_index(frame):
    """Return a copy of ``frame`` indexed by ``"<Gene stable ID_y>_<Gene stable ID_x>"``.

    The input frame is left untouched, so re-running the caller is idempotent.
    The new index is named ``pairs``.
    """
    pair_keys = (frame['Gene stable ID_y'].astype(str)
                 + "_"
                 + frame['Gene stable ID_x'].astype(str))
    return frame.set_index(pair_keys.rename('pairs'))


def restrict_to_common_pairs(*frames):
    """Keep only the rows whose index label occurs in every frame.

    The shared labels are sorted so that all returned frames have the same,
    reproducible row order (a plain ``set`` intersection would give a
    hash-dependent order).

    Returns
    -------
    list of pandas.DataFrame
        One restricted frame per input, in the order given.
    """
    if not frames:
        return []
    shared = set(frames[0].index)
    for frame in frames[1:]:
        shared &= set(frame.index)
    shared_sorted = sorted(shared)
    return [frame.loc[shared_sorted, :] for frame in frames]


def correlate_by_chromosome(frame, network_types, label,
                            min_tss_distance=None, drop_missing=False):
    """Pearson-correlate each column in ``network_types`` with ``exp`` per chromosome.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``chrom_x``, ``exp`` and every column in ``network_types``
        (plus ``tss_tss`` when ``min_tss_distance`` is given).
    network_types : iterable of str
        Columns to correlate against ``exp``.
    label : str
        Value written to the ``agg_method`` column of every record; it should
        name the table that ``frame`` came from.
    min_tss_distance : number, optional
        When given, only rows with ``tss_tss >= min_tss_distance`` are used.
    drop_missing : bool, default False
        When true, rows whose network column is NaN are dropped first.

    Returns
    -------
    pandas.DataFrame
        One row per (network type, chromosome) with columns
        ``chrm_list, net_type, corr, p_val, agg_method``. Chromosomes appear
        in natural order within each network type.
    """
    # Scalar key (not ['chrom_x']): group labels stay plain values rather than
    # 1-tuples across pandas versions, which sorted_nicely/get_group rely on.
    by_chromosome = frame.groupby('chrom_x')
    chromosomes = sorted_nicely(by_chromosome.groups.keys())

    records = []
    for network_type in network_types:
        for chrom in chromosomes:
            subset = by_chromosome.get_group(chrom)
            if min_tss_distance is not None:
                subset = subset[subset['tss_tss'] >= min_tss_distance]
            if drop_missing:
                subset = subset.dropna(subset=[network_type])

            if len(subset) < 2:
                # pearsonr needs at least two observations; report "undefined"
                # instead of aborting the remaining chromosomes.
                r_value, p_value = np.nan, np.nan
            else:
                r_value, p_value = stats.pearsonr(subset[network_type].to_numpy(),
                                                  subset['exp'].to_numpy())
            records.append({
                'chrm_list': chrom,
                'net_type': network_type,
                'corr': r_value,
                'p_val': p_value,
                'agg_method': label,
            })
    return pd.DataFrame(records, columns=RECORD_COLUMNS)


def pivot_correlations(records, columns):
    """Pivot correlation records to chromosomes (rows) x ``columns`` (columns).

    ``pd.pivot_table`` sorts its index lexicographically (chr1, chr10, chr11,
    ..., chr2), so the rows are put back into natural chromosome order before
    the table is returned.
    """
    table = pd.pivot_table(records, values='corr', index=['chrm_list'],
                           columns=[columns])
    return table.loc[sorted_nicely(table.index)]


def plot_bars(data, title, figsize, legend=True, ylabel='Pearson r'):
    """Draw ``data`` (DataFrame or Series) as a labelled bar chart; return the Axes."""
    ax = data.plot.bar(figsize=figsize, legend=legend)
    ax.set_title(title)
    ax.set_xlabel('chromosome')
    ax.set_ylabel(ylabel)
    return ax


# ---------------------------------------------------------------------------
# Analysis (skipped when this file is imported, e.g. to test the helpers)
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # --- Load data ---------------------------------------------------------
    df_2_or_agg = pd.read_hdf(str(DATA_DIR / AGG_FILE))
    df_2_or_rao = pd.read_hdf(str(DATA_DIR / RAO_FILE))
    df_2_or_rao_kr = (
        pd.read_hdf(str(DATA_DIR / RAO_KR_FILE))
        .rename(columns={"hi-c-rao": "VC_tss_median"})
    )

    # --- Align the three tables on their shared gene pairs -----------------
    # Only gene pairs present in all three tables are kept (original note:
    # only the genes for which expression and TSS are present are of interest).
    df_2_or_agg, df_2_or_rao, df_2_or_rao_kr = restrict_to_common_pairs(
        add_pair_index(df_2_or_agg),
        add_pair_index(df_2_or_rao),
        add_pair_index(df_2_or_rao_kr),
    )

    # --- rao (VC_tss) vs rao_kr (VC_tss_median) ----------------------------
    df = pd.concat(
        [
            correlate_by_chromosome(df_2_or_rao, ['VC_tss'], 'rao'),
            correlate_by_chromosome(df_2_or_rao_kr, ['VC_tss_median'], 'rao_kr'),
        ],
        ignore_index=True,
    )
    df_melted = pivot_correlations(df, 'agg_method')
    plot_bars(df_melted, 'rao (VC_tss) vs rao_kr (VC_tss_median)', figsize=(10, 5))

    # --- agg vs rao on VC_tss ----------------------------------------------
    df_agg = pd.concat(
        [
            correlate_by_chromosome(df_2_or_agg, ['VC_tss'], 'agg'),
            correlate_by_chromosome(df_2_or_rao, ['VC_tss'], 'rao'),
        ],
        ignore_index=True,
    )
    df_melted_agg = pivot_correlations(df_agg, 'agg_method')
    plot_bars(df_melted_agg, 'agg vs rao (VC_tss)', figsize=(8, 4), legend=False)
    plot_bars(df_melted_agg['agg'] - df_melted_agg['rao'],
              'agg minus rao (VC_tss)', figsize=(8, 4), legend=False,
              ylabel='difference in Pearson r')

    # --- agg: VC_tss vs VC_max vs VC_mean ----------------------------------
    df_map = correlate_by_chromosome(
        df_2_or_agg, ['VC_tss', 'VC_max', 'VC_mean'], 'agg')
    df_melted_map = pivot_correlations(df_map, 'net_type')
    plot_bars(df_melted_map, 'agg: VC_tss / VC_max / VC_mean', figsize=(10, 5))

    # --- agg: normalisation variants, pairs with tss_tss >= MIN_TSS_DISTANCE
    df_norm = correlate_by_chromosome(
        df_2_or_agg, ['VC_rank_max', 'VC_max', 'VC_lib_max'], 'agg',
        min_tss_distance=MIN_TSS_DISTANCE)
    df_melted_norm = pivot_correlations(df_norm, 'net_type')
    plot_bars(df_melted_norm,
              'agg: VC_rank_max / VC_max / VC_lib_max (tss_tss >= %d)' % MIN_TSS_DISTANCE,
              figsize=(10, 5))

    # --- rao: normalisation variants, all pairs ----------------------------
    # These records are labelled 'rao': they are computed from df_2_or_rao.
    df_norm_rao = correlate_by_chromosome(
        df_2_or_rao, ['VC_rank_max', 'VC_max', 'VC_lib_max'], 'rao')
    df_melted_norm_rao = pivot_correlations(df_norm_rao, 'net_type')
    plot_bars(df_melted_norm_rao, 'rao: VC_rank_max / VC_max / VC_lib_max',
              figsize=(10, 5))