from load_lee_matrix import parse_expression_hd5_lee from create_corr_network import * import pandas as pd import rpy2.robjects as robjects from functools import reduce import os def create_combined_dist_file(folder_location, exp_file, chr_list, network_keyord, hi_file_name="path", resoultion_in_kb=100, outdir=""): matrix_all_dist = [] for chrm in chr_list: gene_file = pd.read_csv('%s/gene_tx_map/gencodev32_hg38_known_gene_chr%s_order.tsv' %(folder_location, chrm), sep='\t') #this file has a lot of mapping, EntrezID, uniprot, Ensembl ID, genename #gene_file = gene_file[gene_file['Gene type'] == 'protein_coding'] #pc_genes = gene_file['Gene stable ID'].to_list() df_ensg_matrix = pd.read_csv('%s/sequence_proximity_pairs/tss_tss/chr%s_tss_tss.csv' %(folder_location, chrm)) df_ensg_matrix.set_index('Gene stable ID', inplace = True) tes_matrix = pd.read_csv('%s/sequence_proximity_pairs/tes_tes/chr%s_tes_tes.csv' %(folder_location, chrm)) tes_matrix.set_index('Gene stable ID', inplace = True) hi_file_list = [] col_name_list = [] for norm_type in ["VC", "VC_lib", "VC_rank"]: hi_file = pd.read_hdf('%s/chr%s_tss_tss_%s_%s_entire_genes_%s.hdf' %(hi_file_name, chrm, resoultion_in_kb, norm_type, network_keyord) ) hi_file.rename(columns={"tss": "%s_tss" %norm_type, "max": "%s_max" %norm_type}, inplace=True) hi_file_list.append(hi_file) tmp = norm_type col_name_list.append(["%s_tss" %norm_type, "%s_max" %norm_type]) subset_cols = list (set(exp_file.index) & set(df_ensg_matrix.index)) #df_ensg_subset = df_ensg_matrix.loc[subset_cols, subset_cols] df_exp_subset = exp_file.loc[subset_cols, subset_cols] # only taking the genes present in a given chromosme df = df_ensg_matrix.stack().reset_index() df.columns = ['gene_1', 'gene_2','tss_tss'] df['pairs'] = [x+"_"+y for x,y in zip(df['gene_1'], df['gene_2'])] df.set_index('pairs', inplace=True) for col_name, matrix_name in zip(['exp', 'tes_tes'], [df_exp_subset, tes_matrix] ): long_form = matrix_name.stack().reset_index() long_form.columns = ['gene_1', 'gene_2', col_name] long_form['pairs'] = [str(x)+"_"+str(y) for x,y in zip(long_form['gene_1'], long_form['gene_2'])] long_form.set_index('pairs', inplace=True) df = df.join(long_form[[col_name]], how='left') #merging on the index of the two dataframe for col_name, matrix_name in zip(col_name_list, hi_file_list ): matrix_name['pairs'] = [str(x)+"_"+str(y) for x,y in zip(matrix_name['gene_x'], matrix_name['gene_y'])] matrix_name.set_index('pairs', inplace=True) df = df.join(matrix_name[col_name], how='left') #merging on the index of the two dataframe df['pair_genes'] = [sorted((x,y))[0] + sorted((x,y))[1] for x,y in zip(df['gene_1'], df['gene_2'])] df.drop_duplicates(subset=['pair_genes'], inplace=True) df = df[df['gene_1'] != df['gene_2']] df_1_or = df.merge(right=gene_file, left_on='gene_1', right_on='Gene stable ID') df_2_or = df_1_or.merge(right=gene_file, left_on='gene_2', right_on='Gene stable ID') df_2_or.reset_index(inplace=True) df_2_or.drop(['gene_1', 'gene_2', 'UniProtKB/TrEMBL ID_y', 'UniProtKB/TrEMBL ID_x', 'UniProtKB/Swiss-Prot ID_x', 'UniProtKB/Swiss-Prot ID_y', 'txEnd_outer_x', 'txEnd_outer_y' ,'index', 'pair_genes', 'txStart_outer_x', 'Uniprot_dc_x', 'dc_x', 'txStart_outer_y', 'gene_order_tes_x', 'gene_order_tes_y', 'Uniprot_dc_y', 'dc_y'], inplace=True, axis=1) df_2_or.to_hdf('%s/%s_dist_%s_%s_entire_genes.h5' %(outdir, chrm, resoultion_in_kb, network_keyord) , key='df', mode='w') #contains only values for one set of pairs and do not have self pairs matrix_all_dist.append(df_2_or[['tss_tss', 'exp', 'tes_tes', 'VC_tss', 'VC_max', 'VC_lib_tss', 'VC_lib_max', 'VC_rank_tss', 'VC_rank_max', 'Gene stable ID_x', 'Gene stable ID_y']]) prot_list = pd.concat(matrix_all_dist) prot_list.to_hdf('%s/combined_dist_%s_%s_entire_genes.h5' %(outdir, resoultion_in_kb, network_keyord) , key='df', mode='w') if __name__ == '__main__': exp_file = parse_expression_hd5_lee("/data/johlee/CoCoCoNet/networks/human_prioAggNet.hdf5") chr_list = list(range(1,13)) + list(range(16,20)) +['X'] chr_list.reverse() folder_location = '/grid/gillis/data/lohia/hi_c_data_processing/software/' network_keyord = sys.argv[1] hi_file_name = '/grid/gillis/data/lohia/data/' outdir = sys.argv[2] resolution= sys.argv[3] create_combined_dist_file(folder_location, exp_file, chr_list, network_keyord, hi_file_name=hi_file_name, resoultion_in_kb=resolution, outdir=outdir)