#!/home/nfox/miniconda3/bin/python
import os
import json
import pandas as pd
import shutil
from read_disorder import *


def load_biomart_files():
    """This loads the mappping from biomart database and merges then to uniprot database"""
    #get the mapping of terms from biomart
    df_enst_ensg = pd.read_csv('/data/lohia/gene_distance_expresseion/ENST_ENSG_mapping_v2.txt', sep='\t') #this file has a lot of mapping, EntrezID, uniprot, Ensembl ID, genename
    mydict_EST_ESG = dict(zip(df_enst_ensg['Gene stable ID'], df_enst_ensg['Transcript stable ID']))
    
    df_ensg_uni_genetype = pd.read_csv('/data/lohia/gene_distance_expresseion/mart_export_ensg_uni_genetype.txt', sep='\t')
    disorder_content_mobidb_file = '/data/lohia/gene_distance_expresseion/disorder_UP000005640.mjson' 
    dc = read_dc(disorder_content_mobidb_file)

    df_ensg_uni_genetype_t = df_ensg_uni_genetype.merge(how='left', right=dc, left_on='UniProtKB/TrEMBL ID', right_on='acc' )
    df_ensg_uni_genetype_t_s = df_ensg_uni_genetype_t.merge(how='left', right=dc, left_on='UniProtKB/Swiss-Prot ID', right_on='acc' )
    df_ensg_uni_genetype_t_s['Uniprot_dc'] = [swis_id if (trem>=0 and swis >=0) else trem_id if trem >= 0 else swis_id for trem,swis,trem_id, swis_id in zip(df_ensg_uni_genetype_t_s['dc_x'], df_ensg_uni_genetype_t_s['dc_y'], df_ensg_uni_genetype_t_s['UniProtKB/TrEMBL ID'], df_ensg_uni_genetype_t_s['UniProtKB/Swiss-Prot ID'] )]
    df_ensg_uni_genetype_t_s['dc'] = [swis if (trem>=0 and swis >=0) else trem if trem >= 0 else swis for trem,swis in zip(df_ensg_uni_genetype_t_s['dc_x'], df_ensg_uni_genetype_t_s['dc_y'])]
    df_ensg_uni_genetype_t_s['seq_length'] = [swis if (trem>=0 and swis >=0) else trem if trem >= 0 else swis for trem,swis in zip(df_ensg_uni_genetype_t_s['seq_length_x'], df_ensg_uni_genetype_t_s['seq_length_y'])]
    df_ensg_uni_genetype_t_s.drop_duplicates(subset= ['Gene stable ID', 'Uniprot_dc'], inplace=True)
    df_ensg_uni_genetype_dc_len = df_ensg_uni_genetype_t_s.drop(['acc_x', 'seq_length_x', 'dc_x', 'acc_y', 'seq_length_y', 'dc_y'], axis=1)
    df_ensg_uni_genetype_dc_len = df_ensg_uni_genetype_dc_len.sort_values('seq_length', ascending=False).drop_duplicates('Gene stable ID') # same gene can have multiple disorder content, I am taking the maximum
    return df_ensg_uni_genetype_dc_len


def process_tss(gene_file,  save_loc_folder):
    """This files prodess the TSS, takes the outermost TSS per gene"""
    if os.path.isdir(save_loc_folder):
        shutil.rmtree(save_loc_folder)
    os.mkdir(save_loc_folder)
    if not os.path.exists(gene_file):
        raise FileNotFoundError(f'{gene_file} not found!')
    genes = pd.read_csv(gene_file, sep='\t', header=0, index_col=None)
    prefix = '/'.join(gene_file.split('/')[:-1])
    prefix2 = ''.join(gene_file.split('/')[-1])
    df_ensg_uni_genetype_dc_len = load_biomart_files()
    
    for chrom in genes['chrom'].unique():
        
        chr_df = genes.loc[genes['chrom'].isin([chrom]), ['#name', 'txStart', 'txEnd', 'strand']]
        chr_df['#name'] = [x.split(".")[0] for x in chr_df['#name']]
        chr_df_iso = chr_df.merge(right=df_enst_ensg, left_on=['#name'], right_on=['Transcript stable ID'])
        
        chr_df_iso_longest = chr_df_iso.groupby(['Gene stable ID']).agg({'txStart': lambda x: list(set(x)), 'txEnd': lambda x: list(set(x)), 'strand': lambda x: list(set(x))[0]})
        chr_df_iso_longest['txStart_outer'] = [min(x) if z=='+' else max(y) for x,y,z in zip(chr_df_iso_longest['txStart'], chr_df_iso_longest['txEnd'], chr_df_iso_longest['strand'])]
        chr_df_iso_longest['txEnd_outer'] = [max(x) if z=='+' else min(y) for x,y,z in zip(chr_df_iso_longest['txEnd'], chr_df_iso_longest['txStart'], chr_df_iso_longest['strand'])]
        chr_df_iso_longest.sort_values(by='txStart_outer', inplace=True)
        chr_df_iso_longest['gene_order_tss'] = [count+1 for count,x in enumerate(chr_df_iso_longest['txStart_outer'])]
        chr_df_iso_longest.sort_values(by='txEnd_outer', inplace=True)
        chr_df_iso_longest['gene_order_tes'] = [count+1 for count,x in enumerate(chr_df_iso_longest['txEnd_outer'])]

        chr_df_iso_longest_all = chr_df_iso_longest.merge(right=df_ensg_uni_genetype_dc_len, left_on=chr_df_iso_longest.index, right_on='Gene stable ID',how='left')
        chr_df_iso_longest_all['chrom'] = chrom

        chr_df_iso_longest_all.drop(['txStart', 'txEnd'], axis=1).to_csv(
            f'{prefix}/gene_tx_map/{prefix2}_{chrom}_order.tsv', sep='\t',
            header=True, index=False)
if __name__ == '__main__':
    process_tss('/data/lohia/gene_distance_expresseion/gencodev32_hg38_known_gene', '/data/lohia/gene_distance_expresseion/gene_tx_map', map_file1= '/data/lohia/gene_distance_expresseion/ENST_ENSG_mapping_v2.txt', map_file2='/data/lohia/gene_distance_expresseion/mart_export_ensg_uni_genetype.txt',  map_file3='/data/lohia/gene_distance_expresseion/disorder_UP000005640.mjson' )