# import numpy as np
import pandas as pd
import warnings
from calculate_auc import *
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
from create_corr_network import rank

def calc_auc_hic(resoulution_in_kb, case='simple', dist_tp='exp', prediction='hi-c-rao'):

    df_2_or = pd.read_hdf('/data/lohia/gene_distance_expresseion/dist_files/combined_dist_with_georg_hic_sub_median_hic_%s.h5' %resoulution_in_kb)

    #df_2_or = df_2_or[df_2_or['exp_georg'] >= 0] # liming the matrix to only chosen values for rank standerization
    df_2_or = df_2_or[df_2_or['hi-c-rao'] >= 0] # liming the matrix to only chosen values for rank standerization
    df_2_or_u = df_2_or[df_2_or['Gene stable ID_x'] != df_2_or['Gene stable ID_y']]
    #ranked_matirx = rank(df_2_or['exp_georg'])
    #df_2_or['exp_georg'] = ranked_matirx
    #df_2_or.rename(columns={"exp_georg": "exp (GK)"}, inplace=True)

    ranked_matirx = rank(df_2_or['exp'])
    df_2_or['exp'] = ranked_matirx
    df_2_or['gene_freq'] = df_2_or['gene_occurence_frequency_y'] + df_2_or['gene_occurence_frequency_x']

    #ranked_matirx = rank(df_2_or['hi-c-rao'])
    #df_2_or['hi-c-rao'] = ranked_matirx
    m_l = []
    change_group_level_1 = df_2_or.groupby(['chrom_x'])
    for chrm in change_group_level_1.groups.keys():
        df = change_group_level_1.get_group(chrm)
        num_pairs = df['Gene stable ID_x'].nunique()

        prot_list_sp = np.array_split(df, num_pairs, axis=0)
        for i in range(0,num_pairs):

            long_form_top = prot_list_sp[int(i)]
            long_form_top['dist'] = long_form_top[dist_tp]
            long_form_top = long_form_top[long_form_top['tss_tss'] >= 10000000] # liming the matrix to only chosen values for rank standerization

            long_form_top = long_form_top[long_form_top['Gene stable ID_x'] != long_form_top['Gene stable ID_y']] # remove all the self pairs from each set

            mp =  long_form_top['Gene stable ID_y'].values[0]
            #print (long_form_top.shape)
            
            exp_median = long_form_top['exp'].median()
            exp_mean = long_form_top['exp'].mean()
            exp_var = long_form_top['exp'].var()

            long_form_top = long_form_top.reset_index()
            if exp_median >=0:
        
           

                for dist_thresh in [1,10,100]:
                #for dist_thresh in [0.5,0.8]:
                #for dist_thresh in [100000,1000000,10000000,100000000]:
                #for dist_thresh in [4000]:
                #for dist_thresh in [df_2_or_u["hi-c-rao"].min(), df_2_or["hi-c-rao"].max()-1, df_2_or["hi-c-rao"].mean(), df_2_or["hi-c-rao"].median()]:
                    if case == 'simple':
                        long_form_top["True_sim"] = [1 if score > dist_thresh else 0 for score in long_form_top["dist"]]
                    elif case == 'tp':
                    
                        long_form_top = long_form_top.sort_values(by=['dist'], ascending=False) 
                        long_form_top["True_sim"] = [0 if score > dist_thresh else 0 for score in long_form_top["dist"]]  
                        for ind_val in long_form_top.index.values[0:dist_thresh]:
                            long_form_top.at[ind_val, 'True_sim'] = 1
                    else:    
                        long_form_top = long_form_top.sort_values(by=['dist'], ascending=True) 
                        long_form_top["True_sim"] = [1 if score > dist_thresh else 1 for score in long_form_top["dist"]]  
                        for ind_val in long_form_top.index.values[0:dist_thresh]:
                            long_form_top.at[ind_val, 'True_sim'] = 0
                    #
                    #long_form_top["True_sim"] = [1 if score <= dist_thresh else 0 for score in long_form_top["dist"]] 
                    #long_form_top["True_sim"] = [1 if score >= dist_thresh else 1 if score2 <= 1000 else 0 for score, score2 in zip(long_form_top["dist"],long_form_top["tss_tss"])]    
                    long_form_top["true_pos"] = [score for score in long_form_top["True_sim"]]
                    long_form_top["true_neg"] = [1 if score==0 else 0 for score in long_form_top["True_sim"]]
                    long_form_top["predicted_sim_from_exp"] = [score for score in long_form_top[prediction]]
                    ca = calc_auroc (long_form_top,predicted_score='predicted_sim_from_exp')
                    m_curve = calc_auc_curve (long_form_top,predicted_score='predicted_sim_from_exp')
                    pr_curve = prec_recall (long_form_top,predicted_score='predicted_sim_from_exp')

                    tpd = pd.DataFrame(m_curve)
                    if m_curve:
                        tpd[0] =  tpd[0].astype(float).round(2)
                        tpd = tpd.groupby([0]).mean()
                        m_curve  = dict(zip(tpd.index, tpd[1]))
                    else:
                        m_curve  = {}
                    tpd = pd.DataFrame(pr_curve)
                    if pr_curve:
                        tpd[0] =  tpd[0].astype(float).round(2)
                        tpd = tpd.groupby([0]).mean()
                        pr_curve  = dict(zip(tpd.index, tpd[1]))
                    else:
                        pr_curve  = {}
                    m_l.append((chrm, num_pairs,dist_thresh, ca, m_curve, pr_curve, long_form_top["true_pos"].sum(), long_form_top["true_neg"].sum(), exp_median, exp_mean, exp_var, mp))
            else:
                pass

    df_scores = pd.DataFrame(m_l, columns =['chrm', 'num_pairs','dist_thresh',  'auc', 'plot', 'pr_curve', 'true_pos', 'true_neg', 'exp_median', 'exp_mean', 'exp_var', 'Gene stable ID'])
    df_scores.to_hdf('/data/lohia/gene_distance_expresseion/dist_files/combined_%s_%s_%s_%s.h5' %(resoulution_in_kb, case, dist_tp, prediction), key='df', mode='w')    
    return df_scores

if __name__ == '__main__':  
    
    for resoultion in [1, 500]:
        for case in ['tp', 'tn']:
            df_scores = calc_auc_hic(resoultion, case=case, dist_tp='hi-c-rao', prediction='exp')
            
    for resoultion in [1, 500]:
        for case in ['tp', 'tn']:
            df_scores = calc_auc_hic(resoultion, case=case, dist_tp='exp', prediction='hi-c-rao')
            
    for resoultion in [1, 500]:
        for case in ['tp', 'tn']:
            df_scores = calc_auc_hic(resoultion, case=case, dist_tp='hi-c-rao-common_elements', prediction='exp')
            
    for resoultion in [1, 500]:
        for case in ['tp', 'tn']:
            df_scores = calc_auc_hic(resoultion, case=case, dist_tp='gene_freq', prediction='exp')