#!/home/nfox/miniconda3/bin/python
import os
import json
import shutil

import pandas as pd


def read_dc(file):
    """Read the .mjson file downloaded from the MobiDB database and return a pandas DataFrame
    with one row per protein accession, its disorder content (dc) and its sequence length."""
    records = []
    with open(file, 'r') as fh:
        for line in fh:
            records.append(json.loads(line))
    df = pd.DataFrame.from_dict(records)
    # Flatten the nested JSON: mobidb_consensus -> disorder -> predictors.
    df = pd.concat([df.drop(['mobidb_consensus'], axis=1), df['mobidb_consensus'].apply(pd.Series)], axis=1)
    df = pd.concat([df.drop(['disorder'], axis=1), df['disorder'].apply(pd.Series)], axis=1)
    df = pd.concat([df.drop(['predictors'], axis=1), df['predictors'].apply(pd.Series)], axis=1)
    df['seq_length'] = [len(x) for x in df['sequence']]
    # Disorder content of the first predictor entry (integer column 0 after expanding 'predictors').
    df['dc'] = [x['dc'] for x in df[0]]
    return df.drop(['sequence', 'organism', 'derived', 'db', 1, 2, 0, 'ncbi_taxon_id'], axis=1)


def load_biomart_files(map_file1, map_file2, map_file3, map_file1_colname):
    """Load the ID mappings exported from the BioMart database and merge them with the MobiDB
    disorder content via the UniProt accessions."""
    # Transcript/UCSC ID -> Ensembl gene ID mapping; the file also carries Entrez ID, UniProt ID
    # and gene name.
    df_enst_ensg = pd.read_csv(map_file1, sep='\t')
    df_enst_ensg[map_file1_colname] = df_enst_ensg[map_file1_colname].astype(str)
    # Strip version suffixes, e.g. ENST00000456328.2 -> ENST00000456328.
    df_enst_ensg[map_file1_colname] = [x.split(".")[0] for x in df_enst_ensg[map_file1_colname]]
    mydict_EST_ESG = dict(zip(df_enst_ensg['Gene stable ID'], df_enst_ensg[map_file1_colname]))  # currently unused

    # Ensembl gene ID -> UniProt IDs and gene type.
    df_ensg_uni_genetype = pd.read_csv(map_file2, sep='\t')

    # Disorder content per UniProt accession from MobiDB.
    disorder_content_mobidb_file = map_file3
    dc = read_dc(disorder_content_mobidb_file)

    # Merge the disorder content twice: once on the TrEMBL accession (columns suffixed _x) and once
    # on the Swiss-Prot accession (columns suffixed _y).
    df_ensg_uni_genetype_t = df_ensg_uni_genetype.merge(
        how='left', right=dc, left_on='UniProtKB/TrEMBL ID', right_on='acc')
    df_ensg_uni_genetype_t_s = df_ensg_uni_genetype_t.merge(
        how='left', right=dc, left_on='UniProtKB/Swiss-Prot ID', right_on='acc')

    # Prefer the Swiss-Prot value when both accessions have a disorder content, otherwise fall back
    # to TrEMBL (NaN >= 0 is False, so the comparisons act as presence checks).
    df_ensg_uni_genetype_t_s['Uniprot_dc'] = [
        swis_id if (trem >= 0 and swis >= 0) else trem_id if trem >= 0 else swis_id
        for trem, swis, trem_id, swis_id in zip(df_ensg_uni_genetype_t_s['dc_x'],
                                                df_ensg_uni_genetype_t_s['dc_y'],
                                                df_ensg_uni_genetype_t_s['UniProtKB/TrEMBL ID'],
                                                df_ensg_uni_genetype_t_s['UniProtKB/Swiss-Prot ID'])]
    df_ensg_uni_genetype_t_s['dc'] = [
        swis if (trem >= 0 and swis >= 0) else trem if trem >= 0 else swis
        for trem, swis in zip(df_ensg_uni_genetype_t_s['dc_x'], df_ensg_uni_genetype_t_s['dc_y'])]
    df_ensg_uni_genetype_t_s['seq_length'] = [
        swis if (trem >= 0 and swis >= 0) else trem if trem >= 0 else swis
        for trem, swis in zip(df_ensg_uni_genetype_t_s['seq_length_x'],
                              df_ensg_uni_genetype_t_s['seq_length_y'])]

    df_ensg_uni_genetype_t_s.drop_duplicates(subset=['Gene stable ID', 'Uniprot_dc'], inplace=True)
    df_ensg_uni_genetype_dc_len = df_ensg_uni_genetype_t_s.drop(
        ['acc_x', 'seq_length_x', 'dc_x', 'acc_y', 'seq_length_y', 'dc_y'], axis=1)
    # The same gene can have multiple disorder contents; keep the entry with the longest sequence.
    df_ensg_uni_genetype_dc_len = df_ensg_uni_genetype_dc_len.sort_values(
        'seq_length', ascending=False).drop_duplicates('Gene stable ID')
    return df_enst_ensg, df_ensg_uni_genetype_dc_len
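# ---------------------------------------------------------------------------
# Illustrative sketch (not used by the pipeline): a minimal example of the
# Swiss-Prot-first selection rule applied above in load_biomart_files(), on
# invented toy values.
# ---------------------------------------------------------------------------
def _example_dc_preference():
    """Show how the dc/seq_length values are chosen after the two merges."""
    nan = float('nan')
    # (TrEMBL dc, Swiss-Prot dc) pairs; NaN means the accession had no MobiDB entry.
    pairs = [(0.2, 0.5), (0.2, nan), (nan, 0.5), (nan, nan)]
    picked = [swis if (trem >= 0 and swis >= 0) else trem if trem >= 0 else swis
              for trem, swis in pairs]
    return picked  # -> [0.5, 0.2, 0.5, nan]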
def process_tss(gene_file, save_loc_folder, map_file1='', map_file2='', map_file3='',
                map_file1_colname='Transcript stable ID'):
    """Process the TSS file: take the outermost TSS/TES per gene, rank genes along each chromosome,
    merge with the gene mapping and disorder content, and write one table per chromosome."""
    # Recreate the output folder from scratch.
    if os.path.isdir(save_loc_folder):
        shutil.rmtree(save_loc_folder)
    os.mkdir(save_loc_folder)
    if not os.path.exists(gene_file):
        raise FileNotFoundError(f'{gene_file} not found!')
    genes = pd.read_csv(gene_file, sep='\t', header=0, index_col=None)
    prefix = os.path.dirname(gene_file)    # input directory (currently unused)
    prefix2 = os.path.basename(gene_file)  # used to name the per-chromosome output files
    df_enst_ensg, df_ensg_uni_genetype_dc_len = load_biomart_files(
        map_file1, map_file2, map_file3, map_file1_colname)

    for chrom in genes['chrom'].unique():
        chr_df = genes.loc[genes['chrom'].isin([chrom]), ['#name', 'txStart', 'txEnd', 'strand']]
        # Strip transcript version suffixes before joining with the BioMart mapping.
        chr_df['#name'] = [x.split(".")[0] for x in chr_df['#name']]
        chr_df_iso = chr_df.merge(right=df_enst_ensg, left_on=['#name'], right_on=[map_file1_colname])
        # Collapse transcripts to genes, keeping the sets of start/end coordinates per gene.
        chr_df_iso_longest = chr_df_iso.groupby(['Gene stable ID']).agg(
            {'txStart': lambda x: list(set(x)),
             'txEnd': lambda x: list(set(x)),
             'strand': lambda x: list(set(x))[0]})
        # Outermost TSS: smallest txStart on the '+' strand, largest txEnd on the '-' strand.
        chr_df_iso_longest['txStart_outer'] = [
            min(x) if z == '+' else max(y)
            for x, y, z in zip(chr_df_iso_longest['txStart'], chr_df_iso_longest['txEnd'],
                               chr_df_iso_longest['strand'])]
        # Outermost TES: largest txEnd on the '+' strand, smallest txStart on the '-' strand.
        chr_df_iso_longest['txEnd_outer'] = [
            max(x) if z == '+' else min(y)
            for x, y, z in zip(chr_df_iso_longest['txEnd'], chr_df_iso_longest['txStart'],
                               chr_df_iso_longest['strand'])]
        # Rank genes along the chromosome by their outermost TSS and TES coordinates.
        chr_df_iso_longest.sort_values(by='txStart_outer', inplace=True)
        chr_df_iso_longest['gene_order_tss'] = [count + 1 for count, x in enumerate(chr_df_iso_longest['txStart_outer'])]
        chr_df_iso_longest.sort_values(by='txEnd_outer', inplace=True)
        chr_df_iso_longest['gene_order_tes'] = [count + 1 for count, x in enumerate(chr_df_iso_longest['txEnd_outer'])]
        chr_df_iso_longest_all = chr_df_iso_longest.merge(
            right=df_ensg_uni_genetype_dc_len, left_on=chr_df_iso_longest.index,
            right_on='Gene stable ID', how='left')
        chr_df_iso_longest_all['chrom'] = chrom
        chr_df_iso_longest_all.drop(['txStart', 'txEnd'], axis=1).to_csv(
            f'{save_loc_folder}/{prefix2}_{chrom}_order.tsv', sep='\t', header=True, index=False)


if __name__ == '__main__':
    # wget -O mapping_37.txt 'http://grch37.ensembl.org/biomart/martservice?query='
    # After downloading I added the headers manually: http://jan2019.archive.ensembl.org/biomart/martservice?query=
    process_tss('/data/lohia/gene_distance_expresseion/gencodev32_hg38_known_gene',
                '/data/lohia/gene_distance_expresseion/gene_tx_map',
                map_file1='/data/lohia/gene_distance_expresseion/ENST_ENSG_mapping_v2.txt',
                map_file2='/data/lohia/gene_distance_expresseion/mart_export_ensg_uni_genetype.txt',
                map_file3='/data/lohia/gene_distance_expresseion/disorder_UP000005640.mjson',
                map_file1_colname='Transcript stable ID')
    process_tss('/data/lohia/gene_distance_expresseion/gencode32_ucscgenes_hg37_knowngene',
                '/data/lohia/gene_distance_expresseion/gene_tx_map_37',
                map_file1_colname='ucsc id')

# BioMart martview query for the human-mouse homolog attributes (kept for reference):
# http://jan2019.archive.ensembl.org/biomart/martview/32b1c329b6258028ddcb38405877a7e4?VIRTUALSCHEMANAME=default&ATTRIBUTES=hsapiens_gene_ensembl.default.homologs.ensembl_gene_id|hsapiens_gene_ensembl.default.homologs.ensembl_transcript_id|hsapiens_gene_ensembl.default.homologs.mmusculus_homolog_dn|hsapiens_gene_ensembl.default.homologs.mmusculus_homolog_ds|hsapiens_gene_ensembl.default.homologs.mmusculus_homolog_ensembl_gene|hsapiens_gene_ensembl.default.homologs.mmusculus_homolog_ensembl_peptide|hsapiens_gene_ensembl.default.homologs.mmusculus_homolog_goc_score|hsapiens_gene_ensembl.default.homologs.mmusculus_homolog_orthology_confidence&FILTERS=hsapiens_gene_ensembl.default.filters.with_mmusculus_homolog.only&VISIBLEPANEL=resultspanel
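# ---------------------------------------------------------------------------
# Illustrative sketch (not called by the pipeline): a minimal example of the
# outermost-TSS/TES selection performed in process_tss(), on an invented
# two-gene toy table.
# ---------------------------------------------------------------------------
def _example_outer_tss():
    """Show how the outermost TSS and TES are picked for '+' and '-' strand genes."""
    toy = pd.DataFrame({
        'txStart': [[100, 150], [2000, 2100]],  # per-gene sets of transcript starts
        'txEnd':   [[500, 650], [2500, 2600]],  # per-gene sets of transcript ends
        'strand':  ['+', '-'],
    })
    # '+' strand: TSS = smallest txStart; '-' strand: TSS = largest txEnd.
    toy['txStart_outer'] = [min(s) if z == '+' else max(e)
                            for s, e, z in zip(toy['txStart'], toy['txEnd'], toy['strand'])]
    # '+' strand: TES = largest txEnd; '-' strand: TES = smallest txStart.
    toy['txEnd_outer'] = [max(e) if z == '+' else min(s)
                          for e, s, z in zip(toy['txEnd'], toy['txStart'], toy['strand'])]
    return toy  # txStart_outer -> [100, 2600], txEnd_outer -> [650, 2000]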