import argparse import os import shutil import numpy as np import pandas as pd import numpy as np import scipy.spatial.distance as ssd import gc import h5py import itertools def make_tss_tss(gene_data, chrom, dist_folder, RAWobserved, KRnorm, resoulution_in_kb): RAWobserved.columns = ["pos1", "pos2", "freq"] #normalizing the dataframe as done in the Rao paper RAWobserved["pos1_norm"]=RAWobserved["pos1"]/(resoulution_in_kb *1000) RAWobserved["pos2_norm"]=RAWobserved["pos2"]/(resoulution_in_kb *1000) RAWobserved["pos1_norm"]=RAWobserved["pos1_norm"].astype(int) RAWobserved["pos2_norm"]=RAWobserved["pos2_norm"].astype(int) RAWobserved_pos1 = RAWobserved.set_index("pos1_norm") raw_df = RAWobserved_pos1.merge(right=KRnorm, left_on=RAWobserved_pos1.index, right_on=KRnorm.index, how='left') raw_df.drop(['key_0'], inplace=True, axis=1) RAWobserved_pos2 = raw_df.set_index("pos2_norm") raw_df = RAWobserved_pos2.merge(right=KRnorm, left_on=RAWobserved_pos2.index, right_on=KRnorm.index, how='left') raw_df.drop(['key_0'], inplace=True, axis=1) raw_df['freq_norm'] = raw_df['freq'] / (raw_df['0_x'] * raw_df['0_y']) raw_df.dropna(inplace=True) raw_df.reset_index(inplace=True) del RAWobserved_pos1, RAWobserved_pos2, RAWobserved, KRnorm gc.collect() freq_norm = raw_df['freq_norm'].to_numpy() freq_norm_dict = dict(enumerate(freq_norm, 0)) print (freq_norm.shape) #this step is done to reduce the memmory consumption #df_tmp = pd.read_csv("/home/lohia/data/gene_distance_expresseion/georg_paper/MSB-17-7548_Dataset_EV1.csv") #GK_genes = df_tmp['Gene_IDs'].to_list() #gene_data = gene_data[gene_data['Gene stable ID'].isin(GK_genes)] #gene_data = gene_data[gene_data['Gene type']== 'protein_coding'] tmp_val = [] gene_occurence_frequency = [] for tss, tes in zip(gene_data['txStart_outer'], gene_data['txStart_outer']): raw_df['tmpv1'] = (abs(raw_df['pos1']- tss)) <= (resoulution_in_kb *1000) raw_df['tmpv2']= (abs(raw_df['pos2']- tss)) <= (resoulution_in_kb *1000) #raw_df['tmpv1'] = (abs(raw_df['pos1']- tss)) <= (resoulution_in_kb *1000*10) #raw_df['tmpv2']= (abs(raw_df['pos2']- tss)) <= (resoulution_in_kb *1000*10) #tmpv3 = (abs(raw_df['pos1']- tes)) <= (resoulution_in_kb *2000) #tmpv4 = (abs(raw_df['pos2']- tes)) <= (resoulution_in_kb *2000) #raw_df['BoolCol'] = [x+y for x, y in zip(tmpv1, tmpv2)] raw_df['BoolCol'] = raw_df[['tmpv1','tmpv2']].sum(axis=1) index_list = raw_df.index[raw_df['BoolCol'] > 0].tolist() tmp_val.append(index_list) gene_occurence_frequency.append(len(index_list)) gene_data['gene_occurence_frequency'] = gene_occurence_frequency gene_data.to_csv(os.path.join(dist_folder, f'{chrom}_tss_tss_{resoulution_in_kb}_freq.csv'), sep=',', columns = ['Gene stable ID', 'gene_occurence_frequency'], index=False) pairwise_distance = ([np.median([freq_norm_dict[x] for x in list(set(tmp_val[pairs[0]]).intersection(tmp_val[pairs[1]]))] ) for pairs in list(itertools.combinations(range(len(tmp_val)),2))]) common_elements = [len(list(set(tmp_val[pairs[0]]).intersection(tmp_val[pairs[1]]))) for pairs in list(itertools.combinations(range(len(tmp_val)),2))] dist_mat = (ssd.squareform(pairwise_distance)) dist_mat_common_elements = (ssd.squareform(common_elements)) #dist_mat = ssd.squareform(ssd.pdist(map_hic_tss_bool, lambda u, v: np.median( (freq_norm)[np.nonzero(u*v)] ))) #dist_mat = ssd.squareform(ssd.pdist(genes_freq, lambda u, v: np.median(np.sqrt( (u * v)[np.nonzero(u*v)] )))) #dist_mat = ssd.squareform(ssd.pdist(genes_freq, lambda u, v: np.nanmedian(np.sqrt((u * v))))) dist_mat = pd.DataFrame(dist_mat, index=gene_data['Gene stable ID'], columns=gene_data['Gene stable ID']) dist_mat.fillna(0, inplace=True) #del dist_mat.index.Gene stable ID dist_mat.to_csv(os.path.join(dist_folder, f'{chrom}_tss_tss_{resoulution_in_kb}.csv'), sep=',', float_format='%d') dist_mat_common_elements = pd.DataFrame(dist_mat_common_elements, index=gene_data['Gene stable ID'], columns=gene_data['Gene stable ID']) #del dist_mat.index.Gene stable ID dist_mat_common_elements.to_csv(os.path.join(dist_folder, f'{chrom}_tss_tss_{resoulution_in_kb}_common_elements.csv'), sep=',', float_format='%d') def make_distance_matrix(gene_folder, dist_folder,resoulution_in_kb=100): print (resoulution_in_kb) if os.path.isdir(dist_folder): # shutil.rmtree(dist_folder) pass else: os.mkdir(dist_folder) #for subdir in ['tss_tss']: # os.mkdir(os.path.join(dist_folder, subdir)) for gene_file in os.scandir(gene_folder): gene_data = pd.read_csv(gene_file.path, sep='\t', header=0, index_col=None) if not gene_data.empty: print (gene_file.path) chrom = gene_data.iloc[0]['chrom'] chrom_list = list(range(1,23)) + ['X'] #chrom_list = [9] chrom_list = ['chr%s' %x for x in chrom_list ] if chrom in chrom_list: prefix = '/'.join(dist_folder.split('/')[:-1]) RAWobserved = pd.read_csv('%s/GM12878_combined/%skb_resolution_intrachromosomal/%s/MAPQG0/%s_%skb.RAWobserved' %(prefix, resoulution_in_kb, chrom, chrom,resoulution_in_kb),sep='\t', header = None) KRnorm = pd.read_csv('%s/GM12878_combined/%skb_resolution_intrachromosomal/%s/MAPQG0/%s_%skb.KRnorm' %(prefix, resoulution_in_kb, chrom, chrom,resoulution_in_kb) ,sep='\t', header = None) make_tss_tss(gene_data, chrom, dist_folder, RAWobserved, KRnorm, resoulution_in_kb) else: pass else: pass if __name__ == '__main__': make_distance_matrix('/data/lohia/gene_distance_expresseion/gene_tx_map_37', '/data/lohia/gene_distance_expresseion/gene_contact_map_rao/processed_hi_c_files_one_buffer', resoulution_in_kb=1)