import argparse import os import shutil import numpy as np import pandas as pd import numpy as np import scipy.spatial.distance as ssd def make_tss_tss(gene_data, chrom, dist_folder, RAWobserved, KRnorm): RAWobserved.columns = ["pos1", "pos2", "freq"] #normalizing the dataframe RAWobserved["pos1_norm"]=RAWobserved["pos1"]/10000 #this is because the file is read at 10kb resolution RAWobserved["pos2_norm"]=RAWobserved["pos2"]/10000 RAWobserved['freq_norm'] = [ freq / (KRnorm[0][pos1_norm] * KRnorm[0][pos2_norm]) for pos1_norm, pos2_norm, freq in zip(RAWobserved["pos1_norm"], RAWobserved["pos2_norm"], RAWobserved["freq"])] z = RAWobserved['freq_norm'].to_numpy() gene_data['row_numbers_pos1'] = [ (abs(RAWobserved['pos1'] - tss)).to_numpy() <= 40000 for tss in gene_data['txStart_outer'] ] gene_data['row_numbers_pos2'] = [ (abs(RAWobserved['pos2'] - tss)).to_numpy() <= 40000 for tss in gene_data['txStart_outer'] ] gene_data['row_numbers'] = gene_data['row_numbers_pos1'] + gene_data['row_numbers_pos2'] y = gene_data['row_numbers'].apply(pd.Series).to_numpy() z_t = np.tile(z, (y.shape[0],1)) z_t[~y] = np.nan dist_mat = ssd.squareform(ssd.pdist(z_t, lambda u, v: np.nanmedian(np.sqrt((u * v))))) dist_mat = pd.DataFrame(dist_mat, index=gene_data['Gene stable ID'], columns=gene_data['Gene stable ID']) #del dist_mat.index.Gene stable ID dist_mat.to_csv(os.path.join(dist_folder, f'tss_tss/{chrom}_tss_tss.csv'), sep=',', float_format='%d') def make_distance_matrix(gene_folder, dist_folder): if os.path.isdir(dist_folder): shutil.rmtree(dist_folder) os.mkdir(dist_folder) for subdir in ['tss_tss']: os.mkdir(os.path.join(dist_folder, subdir)) for gene_file in os.scandir(gene_folder): gene_data = pd.read_csv(gene_file.path, sep='\t', header=0, index_col=None) if not gene_data.empty: print (gene_file.path) chrom = gene_data.iloc[0]['chrom'] chrom_list = list(range(1,23)) + ['X'] chrom_list = ['chr%s' %x for x in chrom_list ] if chrom in chrom_list: prefix = '/'.join(dist_folder.split('/')[:-1]) RAWobserved = pd.read_csv('%s/GM12878_combined/10kb_resolution_intrachromosomal/%s/MAPQG0/%s_10kb.RAWobserved' %(prefix, chrom, chrom),sep='\t', header = None) KRnorm = pd.read_csv('%s/GM12878_combined/10kb_resolution_intrachromosomal/%s/MAPQG0/%s_10kb.KRnorm' %(prefix, chrom, chrom) ,sep='\t', header = None) make_tss_tss(gene_data, chrom, dist_folder, RAWobserved, KRnorm) else: pass else: pass if __name__ == '__main__': make_distance_matrix('/data/lohia/gene_distance_expresseion/gene_tx_map_37', '/data/lohia/gene_distance_expresseion/gene_contact_map_rao/processed_hi_c_files')