#!/home/nfox/miniconda3/bin/python import os import json import pandas as pd import shutil from read_disorder import * def load_biomart_files(): """This loads the mappping from biomart database and merges then to uniprot database""" #get the mapping of terms from biomart df_enst_ensg = pd.read_csv('/data/lohia/gene_distance_expresseion/ENST_ENSG_mapping_v2.txt', sep='\t') #this file has a lot of mapping, EntrezID, uniprot, Ensembl ID, genename mydict_EST_ESG = dict(zip(df_enst_ensg['Gene stable ID'], df_enst_ensg['Transcript stable ID'])) df_ensg_uni_genetype = pd.read_csv('/data/lohia/gene_distance_expresseion/mart_export_ensg_uni_genetype.txt', sep='\t') disorder_content_mobidb_file = '/data/lohia/gene_distance_expresseion/disorder_UP000005640.mjson' dc = read_dc(disorder_content_mobidb_file) df_ensg_uni_genetype_t = df_ensg_uni_genetype.merge(how='left', right=dc, left_on='UniProtKB/TrEMBL ID', right_on='acc' ) df_ensg_uni_genetype_t_s = df_ensg_uni_genetype_t.merge(how='left', right=dc, left_on='UniProtKB/Swiss-Prot ID', right_on='acc' ) df_ensg_uni_genetype_t_s['Uniprot_dc'] = [swis_id if (trem>=0 and swis >=0) else trem_id if trem >= 0 else swis_id for trem,swis,trem_id, swis_id in zip(df_ensg_uni_genetype_t_s['dc_x'], df_ensg_uni_genetype_t_s['dc_y'], df_ensg_uni_genetype_t_s['UniProtKB/TrEMBL ID'], df_ensg_uni_genetype_t_s['UniProtKB/Swiss-Prot ID'] )] df_ensg_uni_genetype_t_s['dc'] = [swis if (trem>=0 and swis >=0) else trem if trem >= 0 else swis for trem,swis in zip(df_ensg_uni_genetype_t_s['dc_x'], df_ensg_uni_genetype_t_s['dc_y'])] df_ensg_uni_genetype_t_s['seq_length'] = [swis if (trem>=0 and swis >=0) else trem if trem >= 0 else swis for trem,swis in zip(df_ensg_uni_genetype_t_s['seq_length_x'], df_ensg_uni_genetype_t_s['seq_length_y'])] df_ensg_uni_genetype_t_s.drop_duplicates(subset= ['Gene stable ID', 'Uniprot_dc'], inplace=True) df_ensg_uni_genetype_dc_len = df_ensg_uni_genetype_t_s.drop(['acc_x', 'seq_length_x', 'dc_x', 'acc_y', 'seq_length_y', 'dc_y'], axis=1) df_ensg_uni_genetype_dc_len = df_ensg_uni_genetype_dc_len.sort_values('seq_length', ascending=False).drop_duplicates('Gene stable ID') # same gene can have multiple disorder content, I am taking the maximum return df_ensg_uni_genetype_dc_len def process_tss(gene_file, save_loc_folder): """This files prodess the TSS, takes the outermost TSS per gene""" if os.path.isdir(save_loc_folder): shutil.rmtree(save_loc_folder) os.mkdir(save_loc_folder) if not os.path.exists(gene_file): raise FileNotFoundError(f'{gene_file} not found!') genes = pd.read_csv(gene_file, sep='\t', header=0, index_col=None) prefix = '/'.join(gene_file.split('/')[:-1]) prefix2 = ''.join(gene_file.split('/')[-1]) df_ensg_uni_genetype_dc_len = load_biomart_files() for chrom in genes['chrom'].unique(): chr_df = genes.loc[genes['chrom'].isin([chrom]), ['#name', 'txStart', 'txEnd', 'strand']] chr_df['#name'] = [x.split(".")[0] for x in chr_df['#name']] chr_df_iso = chr_df.merge(right=df_enst_ensg, left_on=['#name'], right_on=['Transcript stable ID']) chr_df_iso_longest = chr_df_iso.groupby(['Gene stable ID']).agg({'txStart': lambda x: list(set(x)), 'txEnd': lambda x: list(set(x)), 'strand': lambda x: list(set(x))[0]}) chr_df_iso_longest['txStart_outer'] = [min(x) if z=='+' else max(y) for x,y,z in zip(chr_df_iso_longest['txStart'], chr_df_iso_longest['txEnd'], chr_df_iso_longest['strand'])] chr_df_iso_longest['txEnd_outer'] = [max(x) if z=='+' else min(y) for x,y,z in zip(chr_df_iso_longest['txEnd'], chr_df_iso_longest['txStart'], chr_df_iso_longest['strand'])] chr_df_iso_longest.sort_values(by='txStart_outer', inplace=True) chr_df_iso_longest['gene_order_tss'] = [count+1 for count,x in enumerate(chr_df_iso_longest['txStart_outer'])] chr_df_iso_longest.sort_values(by='txEnd_outer', inplace=True) chr_df_iso_longest['gene_order_tes'] = [count+1 for count,x in enumerate(chr_df_iso_longest['txEnd_outer'])] chr_df_iso_longest_all = chr_df_iso_longest.merge(right=df_ensg_uni_genetype_dc_len, left_on=chr_df_iso_longest.index, right_on='Gene stable ID',how='left') chr_df_iso_longest_all['chrom'] = chrom chr_df_iso_longest_all.drop(['txStart', 'txEnd'], axis=1).to_csv( f'{prefix}/gene_tx_map/{prefix2}_{chrom}_order.tsv', sep='\t', header=True, index=False) if __name__ == '__main__': process_tss('/data/lohia/gene_distance_expresseion/gencodev32_hg38_known_gene', '/data/lohia/gene_distance_expresseion/gene_tx_map', map_file1= '/data/lohia/gene_distance_expresseion/ENST_ENSG_mapping_v2.txt', map_file2='/data/lohia/gene_distance_expresseion/mart_export_ensg_uni_genetype.txt', map_file3='/data/lohia/gene_distance_expresseion/disorder_UP000005640.mjson' )