#!/home/nfox/miniconda3/bin/python
import os
import json
import shutil

import pandas as pd


def read_dc(file):
    """Read the .mjson file downloaded from the MobiDB database and return a pandas DataFrame
    with one row per protein accession, its disorder content (dc) and its sequence length."""
    records = []
    with open(file, 'r') as fh:
        for line in fh:
            records.append(json.loads(line))
    df = pd.DataFrame.from_dict(records)
    # Flatten the nested JSON: mobidb_consensus -> disorder -> predictors.
    df = pd.concat([df.drop(['mobidb_consensus'], axis=1), df['mobidb_consensus'].apply(pd.Series)], axis=1)
    df = pd.concat([df.drop(['disorder'], axis=1), df['disorder'].apply(pd.Series)], axis=1)
    df = pd.concat([df.drop(['predictors'], axis=1), df['predictors'].apply(pd.Series)], axis=1)
    df['seq_length'] = [len(x) for x in df['sequence']]
    # Disorder content of the first predictor entry (integer column 0 after expanding 'predictors').
    df['dc'] = [x['dc'] for x in df[0]]
    return df.drop(['sequence', 'organism', 'derived', 'db', 1, 2, 0, 'ncbi_taxon_id'], axis=1)


def load_biomart_files(map_file1, map_file2, map_file3, map_file1_colname):
    """Load the ID mappings exported from the BioMart database and merge them with the MobiDB
    disorder content via the UniProt accessions."""
    # Transcript/UCSC ID -> Ensembl gene ID mapping; the file also carries Entrez ID, UniProt ID
    # and gene name.
    df_enst_ensg = pd.read_csv(map_file1, sep='\t')
    df_enst_ensg[map_file1_colname] = df_enst_ensg[map_file1_colname].astype(str)
    # Strip version suffixes, e.g. ENST00000456328.2 -> ENST00000456328.
    df_enst_ensg[map_file1_colname] = [x.split(".")[0] for x in df_enst_ensg[map_file1_colname]]
    mydict_EST_ESG = dict(zip(df_enst_ensg['Gene stable ID'], df_enst_ensg[map_file1_colname]))  # currently unused

    # Ensembl gene ID -> UniProt IDs and gene type.
    df_ensg_uni_genetype = pd.read_csv(map_file2, sep='\t')

    # Disorder content per UniProt accession from MobiDB.
    disorder_content_mobidb_file = map_file3
    dc = read_dc(disorder_content_mobidb_file)

    # Merge the disorder content twice: once on the TrEMBL accession (columns suffixed _x) and once
    # on the Swiss-Prot accession (columns suffixed _y).
    df_ensg_uni_genetype_t = df_ensg_uni_genetype.merge(
        how='left', right=dc, left_on='UniProtKB/TrEMBL ID', right_on='acc')
    df_ensg_uni_genetype_t_s = df_ensg_uni_genetype_t.merge(
        how='left', right=dc, left_on='UniProtKB/Swiss-Prot ID', right_on='acc')

    # Prefer the Swiss-Prot value when both accessions have a disorder content, otherwise fall back
    # to TrEMBL (NaN >= 0 is False, so the comparisons act as presence checks).
    df_ensg_uni_genetype_t_s['Uniprot_dc'] = [
        swis_id if (trem >= 0 and swis >= 0) else trem_id if trem >= 0 else swis_id
        for trem, swis, trem_id, swis_id in zip(df_ensg_uni_genetype_t_s['dc_x'],
                                                df_ensg_uni_genetype_t_s['dc_y'],
                                                df_ensg_uni_genetype_t_s['UniProtKB/TrEMBL ID'],
                                                df_ensg_uni_genetype_t_s['UniProtKB/Swiss-Prot ID'])]
    df_ensg_uni_genetype_t_s['dc'] = [
        swis if (trem >= 0 and swis >= 0) else trem if trem >= 0 else swis
        for trem, swis in zip(df_ensg_uni_genetype_t_s['dc_x'], df_ensg_uni_genetype_t_s['dc_y'])]
    df_ensg_uni_genetype_t_s['seq_length'] = [
        swis if (trem >= 0 and swis >= 0) else trem if trem >= 0 else swis
        for trem, swis in zip(df_ensg_uni_genetype_t_s['seq_length_x'],
                              df_ensg_uni_genetype_t_s['seq_length_y'])]

    df_ensg_uni_genetype_t_s.drop_duplicates(subset=['Gene stable ID', 'Uniprot_dc'], inplace=True)
    df_ensg_uni_genetype_dc_len = df_ensg_uni_genetype_t_s.drop(
        ['acc_x', 'seq_length_x', 'dc_x', 'acc_y', 'seq_length_y', 'dc_y'], axis=1)
    # The same gene can have multiple disorder contents; keep the entry with the longest sequence.
    df_ensg_uni_genetype_dc_len = df_ensg_uni_genetype_dc_len.sort_values(
        'seq_length', ascending=False).drop_duplicates('Gene stable ID')
    return df_enst_ensg, df_ensg_uni_genetype_dc_len
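# ---------------------------------------------------------------------------
# Illustrative sketch (not used by the pipeline): a minimal example of the
# Swiss-Prot-first selection rule applied above in load_biomart_files(), on
# invented toy values.
# ---------------------------------------------------------------------------
def _example_dc_preference():
    """Show how the dc/seq_length values are chosen after the two merges."""
    nan = float('nan')
    # (TrEMBL dc, Swiss-Prot dc) pairs; NaN means the accession had no MobiDB entry.
    pairs = [(0.2, 0.5), (0.2, nan), (nan, 0.5), (nan, nan)]
    picked = [swis if (trem >= 0 and swis >= 0) else trem if trem >= 0 else swis
              for trem, swis in pairs]
    return picked  # -> [0.5, 0.2, 0.5, nan]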
def process_tss(gene_file, save_loc_folder, map_file1='', map_file2='', map_file3='',
                map_file1_colname='Transcript stable ID'):
    """Process the TSS file: take the outermost TSS/TES per gene, rank genes along each chromosome,
    merge with the gene mapping and disorder content, and write one table per chromosome."""
    # Recreate the output folder from scratch.
    if os.path.isdir(save_loc_folder):
        shutil.rmtree(save_loc_folder)
    os.mkdir(save_loc_folder)
    if not os.path.exists(gene_file):
        raise FileNotFoundError(f'{gene_file} not found!')
    genes = pd.read_csv(gene_file, sep='\t', header=0, index_col=None)
    prefix = os.path.dirname(gene_file)    # input directory (currently unused)
    prefix2 = os.path.basename(gene_file)  # used to name the per-chromosome output files
    df_enst_ensg, df_ensg_uni_genetype_dc_len = load_biomart_files(
        map_file1, map_file2, map_file3, map_file1_colname)

    for chrom in genes['chrom'].unique():
        chr_df = genes.loc[genes['chrom'].isin([chrom]), ['#name', 'txStart', 'txEnd', 'strand']]
        # Strip transcript version suffixes before joining with the BioMart mapping.
        chr_df['#name'] = [x.split(".")[0] for x in chr_df['#name']]
        chr_df_iso = chr_df.merge(right=df_enst_ensg, left_on=['#name'], right_on=[map_file1_colname])
        # Collapse transcripts to genes, keeping the sets of start/end coordinates per gene.
        chr_df_iso_longest = chr_df_iso.groupby(['Gene stable ID']).agg(
            {'txStart': lambda x: list(set(x)),
             'txEnd': lambda x: list(set(x)),
             'strand': lambda x: list(set(x))[0]})
        # Outermost TSS: smallest txStart on the '+' strand, largest txEnd on the '-' strand.
        chr_df_iso_longest['txStart_outer'] = [
            min(x) if z == '+' else max(y)
            for x, y, z in zip(chr_df_iso_longest['txStart'], chr_df_iso_longest['txEnd'],
                               chr_df_iso_longest['strand'])]
        # Outermost TES: largest txEnd on the '+' strand, smallest txStart on the '-' strand.
        chr_df_iso_longest['txEnd_outer'] = [
            max(x) if z == '+' else min(y)
            for x, y, z in zip(chr_df_iso_longest['txEnd'], chr_df_iso_longest['txStart'],
                               chr_df_iso_longest['strand'])]
        # Rank genes along the chromosome by their outermost TSS and TES coordinates.
        chr_df_iso_longest.sort_values(by='txStart_outer', inplace=True)
        chr_df_iso_longest['gene_order_tss'] = [count + 1 for count, x in enumerate(chr_df_iso_longest['txStart_outer'])]
        chr_df_iso_longest.sort_values(by='txEnd_outer', inplace=True)
        chr_df_iso_longest['gene_order_tes'] = [count + 1 for count, x in enumerate(chr_df_iso_longest['txEnd_outer'])]
        chr_df_iso_longest_all = chr_df_iso_longest.merge(
            right=df_ensg_uni_genetype_dc_len, left_on=chr_df_iso_longest.index,
            right_on='Gene stable ID', how='left')
        chr_df_iso_longest_all['chrom'] = chrom
        chr_df_iso_longest_all.drop(['txStart', 'txEnd'], axis=1).to_csv(
            f'{save_loc_folder}/{prefix2}_{chrom}_order.tsv', sep='\t', header=True, index=False)


if __name__ == '__main__':
    # wget -O mapping_37.txt 'http://grch37.ensembl.org/biomart/martservice?query='
    # After downloading I added the headers manually: http://jan2019.archive.ensembl.org/biomart/martservice?query=
    process_tss('/data/lohia/gene_distance_expresseion/gencodev32_hg38_known_gene',
                '/data/lohia/gene_distance_expresseion/gene_tx_map',
                map_file1='/data/lohia/gene_distance_expresseion/ENST_ENSG_mapping_v2.txt',
                map_file2='/data/lohia/gene_distance_expresseion/mart_export_ensg_uni_genetype.txt',
                map_file3='/data/lohia/gene_distance_expresseion/disorder_UP000005640.mjson',
                map_file1_colname='Transcript stable ID')
    process_tss('/data/lohia/gene_distance_expresseion/gencode32_ucscgenes_hg37_knowngene',
                '/data/lohia/gene_distance_expresseion/gene_tx_map_37',
                map_file1_colname='ucsc id')

# BioMart martview query for the human-mouse homolog attributes (kept for reference):
# http://jan2019.archive.ensembl.org/biomart/martview/32b1c329b6258028ddcb38405877a7e4?VIRTUALSCHEMANAME=default&ATTRIBUTES=hsapiens_gene_ensembl.default.homologs.ensembl_gene_id|hsapiens_gene_ensembl.default.homologs.ensembl_transcript_id|hsapiens_gene_ensembl.default.homologs.mmusculus_homolog_dn|hsapiens_gene_ensembl.default.homologs.mmusculus_homolog_ds|hsapiens_gene_ensembl.default.homologs.mmusculus_homolog_ensembl_gene|hsapiens_gene_ensembl.default.homologs.mmusculus_homolog_ensembl_peptide|hsapiens_gene_ensembl.default.homologs.mmusculus_homolog_goc_score|hsapiens_gene_ensembl.default.homologs.mmusculus_homolog_orthology_confidence&FILTERS=hsapiens_gene_ensembl.default.filters.with_mmusculus_homolog.only&VISIBLEPANEL=resultspanel
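# ---------------------------------------------------------------------------
# Illustrative sketch (not called by the pipeline): a minimal example of the
# outermost-TSS/TES selection performed in process_tss(), on an invented
# two-gene toy table.
# ---------------------------------------------------------------------------
def _example_outer_tss():
    """Show how the outermost TSS and TES are picked for '+' and '-' strand genes."""
    toy = pd.DataFrame({
        'txStart': [[100, 150], [2000, 2100]],  # per-gene sets of transcript starts
        'txEnd':   [[500, 650], [2500, 2600]],  # per-gene sets of transcript ends
        'strand':  ['+', '-'],
    })
    # '+' strand: TSS = smallest txStart; '-' strand: TSS = largest txEnd.
    toy['txStart_outer'] = [min(s) if z == '+' else max(e)
                            for s, e, z in zip(toy['txStart'], toy['txEnd'], toy['strand'])]
    # '+' strand: TES = largest txEnd; '-' strand: TES = smallest txStart.
    toy['txEnd_outer'] = [max(e) if z == '+' else min(s)
                          for e, s, z in zip(toy['txEnd'], toy['txStart'], toy['strand'])]
    return toy  # txStart_outer -> [100, 2600], txEnd_outer -> [650, 2000]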