# import numpy as np
import pandas as pd
import warnings
from calculate_auc import *
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
from create_corr_network import rank
import random

# Minimum TSS-to-TSS separation for a gene pair to be evaluated.
_MIN_TSS_DISTANCE = 10000000

# Single labelling threshold.  NOTE: it is used both as a value cut-off on
# ``dist`` (case='simple') and as the *number* of top-ranked rows that get the
# minority label (every other case).
_DIST_THRESH = 1

_SCORE_COLUMNS = ['chrm', 'num_pairs','dist_thresh',  'auc', 'plot', 'pr_curve', 'true_pos', 'true_neg', 'exp_median', 'exp_mean', 'exp_var', 'Gene stable ID', 'mp_precited', 'auc_or', 'auc_hic']


def _label_true_sim(frame, case, dist_thresh):
    """Add ``True_sim``/``true_pos``/``true_neg`` label columns derived from ``dist``.

    * ``case == 'simple'``: a row is positive when ``dist > dist_thresh``.
    * ``case == 'tp'``: rows are sorted by ``dist`` descending and only the
      first ``dist_thresh`` rows are positive.
    * any other value: rows are sorted by ``dist`` ascending and only the
      first ``dist_thresh`` rows are negative.

    Returns the labelled frame (re-ordered for the sorting cases).
    """
    if case == 'simple':
        frame["True_sim"] = (frame["dist"] > dist_thresh).astype(int)
    else:
        positives_on_top = case == 'tp'
        frame = frame.sort_values(by=['dist'], ascending=not positives_on_top)
        frame["True_sim"] = 0 if positives_on_top else 1
        # The index is unique here (callers pass a freshly reset index), so
        # label-based assignment hits exactly the first ``dist_thresh`` rows.
        frame.loc[frame.index[:dist_thresh], 'True_sim'] = 1 if positives_on_top else 0
    frame["true_pos"] = frame["True_sim"]
    frame["true_neg"] = (frame["True_sim"] == 0).astype(int)
    return frame


def _score(frame, keep, case, score_col):
    """Filter ``frame`` by the boolean array ``keep``, label it and return ``(auroc, labelled_frame)``.

    ``score_col`` names the column copied into ``predicted_sim_from_exp``,
    which is what ``calc_auroc`` is asked to evaluate.
    """
    distal = frame[keep].reset_index()
    distal = _label_true_sim(distal, case, _DIST_THRESH)
    distal["predicted_sim_from_exp"] = distal[score_col]
    return calc_auroc(distal, predicted_score='predicted_sim_from_exp'), distal


def calc_auc_hic(resoulution_in_kb, case='simple', dist_tp='exp', prediction='hi-c-rao', shuffle=False):
    """Score every gene against every other gene's profile on selected chromosomes.

    For each gene (one row chunk per distinct ``Gene stable ID_x``) and each
    other gene on the same chromosome, three AUROCs are computed with the
    ground truth always derived from the gene's own ``dist_tp`` column:

    * ``auc_or``  - predictor is the gene's own ``prediction`` column;
    * ``auc_hic`` - predictor is the partner gene's ``dist_tp`` values;
    * ``auc``     - the gene's ``exp`` column is replaced by the partner's
      ``exp`` values, then the ``prediction`` column is used as predictor.

    Only pairs with ``tss_tss >= 10000000`` that are not self pairs are scored.
    Results are checkpointed to HDF after every chromosome and written once
    more at the end.

    Assumptions that are not checked here (TODO confirm against the input
    file): rows are grouped by ``Gene stable ID_x`` and every gene has the
    same number of rows in the same partner order, so that an equal split
    yields one chunk per gene and chunks can be swapped positionally.  A
    length mismatch raises ``ValueError`` from pandas.

    Parameters
    ----------
    resoulution_in_kb : value substituted into the input/output file names.
    case : 'simple', 'tp', or anything else (treated as the "true negative"
        labelling); see ``_label_true_sim``.
    dist_tp : column used as ground-truth signal.
    prediction : column used as predictor.
    shuffle : accepted for backward compatibility; it has no effect on the
        results (the shuffled order was never used).

    Returns
    -------
    pandas.DataFrame with one row per (gene, partner gene) combination.
    """
    # The module-level numpy import is commented out; import locally instead
    # of relying on a star import to provide ``np``.
    import numpy as np

    df_2_or = pd.read_hdf('/data/lohia/gene_distance_expresseion/dist_files/combined_dist_with_georg_hic_sub_median_hic_%s.h5' %resoulution_in_kb)

    # Restrict the matrix to rows with a non-negative Hi-C value before the
    # expression column is passed through ``rank``.
    df_2_or = df_2_or[df_2_or['hi-c-rao'] >= 0].copy()
    df_2_or['exp'] = rank(df_2_or['exp'])

    m_l = []
    by_chrom = df_2_or.groupby('chrom_x')
    for chrm in ['chr11','chr16', 'chrX', 'chr17']:

        print(chrm)
        df = by_chrom.get_group(chrm)

        num_pairs = df['Gene stable ID_x'].nunique()
        gene_chunks = np.array_split(df, num_pairs, axis=0)

        for i in range(num_pairs):
            anchor = gene_chunks[i]

            # The distal-pair filter depends only on the anchor gene, so it is
            # computed once and reused for every partner.
            keep = ((anchor['tss_tss'] >= _MIN_TSS_DISTANCE)
                    & (anchor['Gene stable ID_x'] != anchor['Gene stable ID_y'])).values
            if not keep.any():
                # Nothing left to score for this gene.
                continue
            mp = anchor['Gene stable ID_y'].values[keep][0]

            # Work on a private copy: the chunks are shared between iterations
            # and must never be modified, otherwise later genes would read
            # another gene's values.
            base = anchor.copy()
            base['dist'] = base[dist_tp]

            c_original, _ = _score(base, keep, case, prediction)

            for shuf_i in range(num_pairs):
                if shuf_i == i:
                    continue
                partner = gene_chunks[shuf_i]
                mp_precited = partner['Gene stable ID_y'].values[0]

                # Predictor = partner gene's ground-truth signal.
                with_partner_dist = base.assign(dist_prediction=partner[dist_tp].to_list())
                c_hic, _ = _score(with_partner_dist, keep, case, 'dist_prediction')

                # Predictor read after swapping in the partner's expression.
                with_partner_exp = base.assign(exp=partner['exp'].to_list())
                swapped_exp = with_partner_exp['exp'][keep]
                exp_median = swapped_exp.median()
                exp_mean = swapped_exp.mean()
                exp_var = swapped_exp.var()

                ca, scored = _score(with_partner_exp, keep, case, prediction)
                m_curve  = {}
                pr_curve  = {}
                m_l.append((chrm, num_pairs, _DIST_THRESH, ca, m_curve, pr_curve, scored["true_pos"].sum(), scored["true_neg"].sum(), exp_median, exp_mean, exp_var, mp, mp_precited, c_original, c_hic))

        # Checkpoint everything accumulated so far after each chromosome.
        df_scores = pd.DataFrame(m_l, columns=_SCORE_COLUMNS)
        df_scores.to_hdf('/data/lohia/gene_distance_expresseion/dist_files/custom_combined_%s_%s_%s_%s_permutations_includes_adjacent_hic.h5' %(resoulution_in_kb, case, dist_tp, prediction), key='df', mode='w')
    df_scores = pd.DataFrame(m_l, columns=_SCORE_COLUMNS)
    df_scores.to_hdf('/data/lohia/gene_distance_expresseion/custom_dist_files/combined_%s_%s_%s_%s_permutations_includes_adjacent_hic.h5' %(resoulution_in_kb, case, dist_tp, prediction), key='df', mode='w')
    return df_scores

if __name__ == '__main__':
    # Script entry point: run the permutation scoring for every
    # (resolution, labelling case) combination below, using the Hi-C column
    # as ground truth and the expression column as predictor.
    # The reverse configuration (dist_tp='exp', prediction='hi-c-rao', over
    # resolutions 100 and 500) is not run at present.
    run_grid = [(res_kb, label_case) for res_kb in [100] for label_case in ['tn']]
    for res_kb, label_case in run_grid:
        df_scores = calc_auc_hic(res_kb, case=label_case, dist_tp='hi-c-rao', prediction='exp')