"""Permutation benchmark for pairwise gene scores.

For every gene on a fixed set of chromosomes, the gene's table of pairs is
labelled from one score column (``dist_tp``) and ``calc_auroc`` (imported from
``calculate_auc``) is evaluated against another score column (``prediction``).
The same evaluation is then repeated with scores borrowed, row by row, from
every other gene on the chromosome.
"""
# NOTE: the import order below is kept as in the original script so that names
# brought in by the star import resolve exactly as before.
import numpy as np
import pandas as pd
import warnings
from calculate_auc import *  # noqa: F401,F403 -- provides calc_auroc

# SettingWithCopyWarning is exposed from different modules depending on the
# installed pandas version (and may be absent altogether); try each location.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
from create_corr_network import rank
import random  # kept: previously used by the (ineffective) ``shuffle`` option

# Chromosomes that are evaluated; all others in the input are ignored.
_CHROMOSOMES = ('chr11', 'chr16', 'chrX', 'chr17')
# Pairs with a 'tss_tss' value below this are discarded before scoring.
_MIN_TSS_DISTANCE = 10000000
# Thresholds applied to the 'dist' column (also the number of rows flipped in
# the 'tp' / 'tn' cases).
_DIST_THRESHOLDS = (1,)

_SCORE_COLUMNS = ['chrm', 'num_pairs', 'dist_thresh', 'auc', 'plot', 'pr_curve',
                  'true_pos', 'true_neg', 'exp_median', 'exp_mean', 'exp_var',
                  'Gene stable ID', 'mp_precited', 'auc_or', 'auc_hic']


def _label_pairs(pairs, case, dist_thresh):
    """Return a copy of *pairs* with True_sim / true_pos / true_neg columns.

    * ``'simple'``: a row is positive when its 'dist' exceeds *dist_thresh*.
    * ``'tp'``: rows are sorted by 'dist' descending; only the first
      *dist_thresh* rows are positive.
    * anything else: rows are sorted by 'dist' ascending; only the first
      *dist_thresh* rows are negative.

    *pairs* must have a unique index (callers pass a freshly reset index).
    """
    if case == 'simple':
        labelled = pairs.copy()
        labelled['True_sim'] = (labelled['dist'] > dist_thresh).astype(int)
    elif case == 'tp':
        labelled = pairs.sort_values(by=['dist'], ascending=False)
        labelled['True_sim'] = 0
        labelled.loc[labelled.index[:dist_thresh], 'True_sim'] = 1
    else:
        labelled = pairs.sort_values(by=['dist'], ascending=True)
        labelled['True_sim'] = 1
        labelled.loc[labelled.index[:dist_thresh], 'True_sim'] = 0
    labelled['true_pos'] = labelled['True_sim']
    labelled['true_neg'] = (labelled['True_sim'] == 0).astype(int)
    return labelled


def _score(pairs, case, dist_thresh, score_column):
    """Label *pairs*, score them with ``calc_auroc`` and return (auc, labelled frame)."""
    labelled = _label_pairs(pairs, case, dist_thresh)
    labelled['predicted_sim_from_exp'] = labelled[score_column]
    auc = calc_auroc(labelled, predicted_score='predicted_sim_from_exp')
    return auc, labelled


def _write_scores(records, path):
    """Build the result table from *records*, write it to *path* and return it."""
    df_scores = pd.DataFrame(records, columns=_SCORE_COLUMNS)
    df_scores.to_hdf(path, key='df', mode='w')
    return df_scores


def calc_auc_hic(resoulution_in_kb, case='simple', dist_tp='exp', prediction='hi-c-rao', shuffle=False):
    """Score every gene's pair table against itself and against every other gene.

    Parameters
    ----------
    resoulution_in_kb
        Substituted into the input and output HDF5 file names.
    case
        Labelling scheme, see ``_label_pairs`` ('simple', 'tp', or anything
        else for the bottom-ranked-negative scheme).
    dist_tp
        Column from which the ground-truth labels are derived.
    prediction
        Column used as the predicted score.
    shuffle
        Accepted for backward compatibility. The original implementation
        shuffled a list that was overwritten before use, so this flag has
        never influenced the results and is ignored here.

    Returns
    -------
    pandas.DataFrame
        One row per (gene, partner gene, threshold) with columns
        ``_SCORE_COLUMNS``. The same table is written to two HDF5 files.

    Raises
    ------
    KeyError
        If one of the hard-coded chromosomes is missing from the input.
    ValueError
        If the per-gene chunks of a chromosome do not all have the same
        number of rows (values are borrowed between genes by row position).
    """
    pairs_all = pd.read_hdf('/data/lohia/gene_distance_expresseion/dist_files/combined_dist_with_georg_hic_sub_median_hic_%s.h5' %resoulution_in_kb)
    # Limit the matrix to the chosen values before rank standardisation.
    pairs_all = pairs_all[pairs_all['hi-c-rao'] >= 0].copy()
    pairs_all['exp'] = rank(pairs_all['exp'])

    records = []
    # Scalar key (not a one-element list) so get_group accepts a plain label
    # on every pandas version.
    by_chrom = pairs_all.groupby('chrom_x')
    for chrm in _CHROMOSOMES:
        print(chrm)
        chrom_pairs = by_chrom.get_group(chrm)
        num_pairs = chrom_pairs['Gene stable ID_x'].nunique()
        # One equally sized chunk per gene.
        # NOTE(review): this relies on the rows being grouped by
        # 'Gene stable ID_x' with the same number of rows per gene -- confirm.
        gene_chunks = np.array_split(chrom_pairs, num_pairs, axis=0)

        for i, chunk in enumerate(gene_chunks):
            # Work on a copy: the original code wrote borrowed columns back
            # into the shared chunk, corrupting every later permutation that
            # used this gene as a partner.
            own = chunk.copy()
            own['dist'] = own[dist_tp]
            keep = (
                (own['tss_tss'] >= _MIN_TSS_DISTANCE)
                & (own['Gene stable ID_x'] != own['Gene stable ID_y'])  # drop self pairs
            ).to_numpy()
            distal = own[keep].reset_index()
            if distal.empty:
                # Nothing left to score for this gene (previously an IndexError).
                continue
            # NOTE(review): this is the first 'Gene stable ID_y' of the chunk,
            # stored under the 'Gene stable ID' column -- confirm that _y (not
            # _x) is the intended identifier.
            mp = distal['Gene stable ID_y'].values[0]

            for dist_thresh in _DIST_THRESHOLDS:
                c_original, _ = _score(distal, case, dist_thresh, prediction)

                for shuf_i, partner in enumerate(gene_chunks):
                    if shuf_i == i:
                        continue
                    if len(partner) != len(own):
                        raise ValueError(
                            'gene chunks %d and %d on %s differ in length (%d vs %d); '
                            'scores cannot be matched by row position'
                            % (i, shuf_i, chrm, len(own), len(partner)))
                    mp_precited = partner['Gene stable ID_y'].values[0]

                    # Truth from this gene, prediction = partner's dist_tp
                    # values taken at the same row positions.
                    cross = distal.assign(dist_prediction=partner[dist_tp].to_numpy()[keep])
                    c_hic, _ = _score(cross, case, dist_thresh, 'dist_prediction')

                    # Truth from this gene, 'exp' replaced by the partner's
                    # 'exp' at the same row positions ('dist' was fixed before
                    # the swap, as in the original statement order).
                    swapped = distal.assign(exp=partner['exp'].to_numpy()[keep])
                    exp_median = swapped['exp'].median()
                    exp_mean = swapped['exp'].mean()
                    exp_var = swapped['exp'].var()
                    ca, labelled = _score(swapped, case, dist_thresh, prediction)

                    m_curve = {}
                    pr_curve = {}
                    records.append((chrm, num_pairs, dist_thresh, ca, m_curve, pr_curve,
                                    labelled["true_pos"].sum(), labelled["true_neg"].sum(),
                                    exp_median, exp_mean, exp_var, mp, mp_precited,
                                    c_original, c_hic))

        # Checkpoint after each chromosome.
        # NOTE(review): the original nesting of this write could not be
        # recovered; the final file content is the same wherever it sits.
        _write_scores(records, '/data/lohia/gene_distance_expresseion/dist_files/custom_combined_%s_%s_%s_%s_permutations_includes_adjacent_hic.h5' %(resoulution_in_kb, case, dist_tp, prediction))

    df_scores = _write_scores(records, '/data/lohia/gene_distance_expresseion/custom_dist_files/combined_%s_%s_%s_%s_permutations_includes_adjacent_hic.h5' %(resoulution_in_kb, case, dist_tp, prediction))
    return df_scores


if __name__ == '__main__':
    # Labels from 'hi-c-rao', predictions from 'exp'. The reverse
    # configuration (dist_tp='exp', prediction='hi-c-rao') at resolutions
    # [100, 500] was used previously.
    for resoultion in [100]:
        for case in ['tn']:
            df_scores = calc_auc_hic(resoultion, case=case, dist_tp='hi-c-rao', prediction='exp')