# import numpy as np import pandas as pd import warnings from calculate_auc import * from pandas.core.common import SettingWithCopyWarning warnings.simplefilter(action="ignore", category=SettingWithCopyWarning) from create_corr_network import rank def calc_auc_hic(resoulution_in_kb, case='simple', dist_tp='exp', prediction='hi-c-rao'): df_2_or = pd.read_hdf('/data/lohia/gene_distance_expresseion/dist_files/combined_dist_with_georg_hic_sub_median_hic_%s.h5' %resoulution_in_kb) #df_2_or = df_2_or[df_2_or['exp_georg'] >= 0] # liming the matrix to only chosen values for rank standerization df_2_or = df_2_or[df_2_or['hi-c-rao'] >= 0] # liming the matrix to only chosen values for rank standerization df_2_or_u = df_2_or[df_2_or['Gene stable ID_x'] != df_2_or['Gene stable ID_y']] #ranked_matirx = rank(df_2_or['exp_georg']) #df_2_or['exp_georg'] = ranked_matirx #df_2_or.rename(columns={"exp_georg": "exp (GK)"}, inplace=True) ranked_matirx = rank(df_2_or['exp']) df_2_or['exp'] = ranked_matirx df_2_or['gene_freq'] = df_2_or['gene_occurence_frequency_y'] + df_2_or['gene_occurence_frequency_x'] #ranked_matirx = rank(df_2_or['hi-c-rao']) #df_2_or['hi-c-rao'] = ranked_matirx m_l = [] change_group_level_1 = df_2_or.groupby(['chrom_x']) for chrm in change_group_level_1.groups.keys(): df = change_group_level_1.get_group(chrm) num_pairs = df['Gene stable ID_x'].nunique() prot_list_sp = np.array_split(df, num_pairs, axis=0) for i in range(0,num_pairs): long_form_top = prot_list_sp[int(i)] long_form_top['dist'] = long_form_top[dist_tp] long_form_top = long_form_top[long_form_top['tss_tss'] >= 10000000] # liming the matrix to only chosen values for rank standerization long_form_top = long_form_top[long_form_top['Gene stable ID_x'] != long_form_top['Gene stable ID_y']] # remove all the self pairs from each set mp = long_form_top['Gene stable ID_y'].values[0] #print (long_form_top.shape) exp_median = long_form_top['exp'].median() exp_mean = long_form_top['exp'].mean() exp_var = long_form_top['exp'].var() long_form_top = long_form_top.reset_index() if exp_median >=0: for dist_thresh in [1,10,100]: #for dist_thresh in [0.5,0.8]: #for dist_thresh in [100000,1000000,10000000,100000000]: #for dist_thresh in [4000]: #for dist_thresh in [df_2_or_u["hi-c-rao"].min(), df_2_or["hi-c-rao"].max()-1, df_2_or["hi-c-rao"].mean(), df_2_or["hi-c-rao"].median()]: if case == 'simple': long_form_top["True_sim"] = [1 if score > dist_thresh else 0 for score in long_form_top["dist"]] elif case == 'tp': long_form_top = long_form_top.sort_values(by=['dist'], ascending=False) long_form_top["True_sim"] = [0 if score > dist_thresh else 0 for score in long_form_top["dist"]] for ind_val in long_form_top.index.values[0:dist_thresh]: long_form_top.at[ind_val, 'True_sim'] = 1 else: long_form_top = long_form_top.sort_values(by=['dist'], ascending=True) long_form_top["True_sim"] = [1 if score > dist_thresh else 1 for score in long_form_top["dist"]] for ind_val in long_form_top.index.values[0:dist_thresh]: long_form_top.at[ind_val, 'True_sim'] = 0 # #long_form_top["True_sim"] = [1 if score <= dist_thresh else 0 for score in long_form_top["dist"]] #long_form_top["True_sim"] = [1 if score >= dist_thresh else 1 if score2 <= 1000 else 0 for score, score2 in zip(long_form_top["dist"],long_form_top["tss_tss"])] long_form_top["true_pos"] = [score for score in long_form_top["True_sim"]] long_form_top["true_neg"] = [1 if score==0 else 0 for score in long_form_top["True_sim"]] long_form_top["predicted_sim_from_exp"] = [score for score in long_form_top[prediction]] ca = calc_auroc (long_form_top,predicted_score='predicted_sim_from_exp') m_curve = calc_auc_curve (long_form_top,predicted_score='predicted_sim_from_exp') pr_curve = prec_recall (long_form_top,predicted_score='predicted_sim_from_exp') tpd = pd.DataFrame(m_curve) if m_curve: tpd[0] = tpd[0].astype(float).round(2) tpd = tpd.groupby([0]).mean() m_curve = dict(zip(tpd.index, tpd[1])) else: m_curve = {} tpd = pd.DataFrame(pr_curve) if pr_curve: tpd[0] = tpd[0].astype(float).round(2) tpd = tpd.groupby([0]).mean() pr_curve = dict(zip(tpd.index, tpd[1])) else: pr_curve = {} m_l.append((chrm, num_pairs,dist_thresh, ca, m_curve, pr_curve, long_form_top["true_pos"].sum(), long_form_top["true_neg"].sum(), exp_median, exp_mean, exp_var, mp)) else: pass df_scores = pd.DataFrame(m_l, columns =['chrm', 'num_pairs','dist_thresh', 'auc', 'plot', 'pr_curve', 'true_pos', 'true_neg', 'exp_median', 'exp_mean', 'exp_var', 'Gene stable ID']) df_scores.to_hdf('/data/lohia/gene_distance_expresseion/dist_files/combined_%s_%s_%s_%s.h5' %(resoulution_in_kb, case, dist_tp, prediction), key='df', mode='w') return df_scores if __name__ == '__main__': for resoultion in [1, 500]: for case in ['tp', 'tn']: df_scores = calc_auc_hic(resoultion, case=case, dist_tp='hi-c-rao', prediction='exp') for resoultion in [1, 500]: for case in ['tp', 'tn']: df_scores = calc_auc_hic(resoultion, case=case, dist_tp='exp', prediction='hi-c-rao') for resoultion in [1, 500]: for case in ['tp', 'tn']: df_scores = calc_auc_hic(resoultion, case=case, dist_tp='hi-c-rao-common_elements', prediction='exp') for resoultion in [1, 500]: for case in ['tp', 'tn']: df_scores = calc_auc_hic(resoultion, case=case, dist_tp='gene_freq', prediction='exp')