import argparse import os import shutil import numpy as np import pandas as pd import numpy as np import scipy.spatial.distance as ssd import gc import h5py import itertools import scipy.stats as stats from scipy.io import mmread from scipy import sparse, io from scipy.sparse import csr_matrix, triu, coo_matrix import numpy as np from load_lee_matrix import parse_expression_hd5_lee import time #have to ensure that the input matrix is a square matrix def lib_3d_norm(arr, gene_bin_list): s = sparse.csr_matrix.get_shape(arr)[0] #arr = sparse.dia_matrix(arr) lib_norm_hic = sparse.dia_matrix((s, s), dtype=np.float32) for K in gene_bin_list: diagonal_array = sparse.csr_matrix.diagonal(arr, k=K) diagonal_sum = np.nanmean(diagonal_array) if diagonal_sum == 0: sparse.dia_matrix.setdiag(lib_norm_hic,diagonal_array, k=K) else: sparse.dia_matrix.setdiag(lib_norm_hic, (diagonal_array/diagonal_sum), k=K) return sparse.csr_matrix(lib_norm_hic) def rank_norm(arr, gene_bin_list): s = sparse.csr_matrix.get_shape(arr)[0] #arr = sparse.dia_matrix(arr) rank_norm_hic = sparse.dia_matrix((s, s), dtype=np.float16) L = 2 * int(250000/ resolution) for K in gene_bin_list: diagonal_array = sparse.csr_matrix.diagonal(arr, k=K) counter = 0 while diagonal_array.shape[0] < L: counter = counter + 1 if (K + counter) < s: diagonal_array = np.concatenate((diagonal_array, sparse.csr_matrix.diagonal(arr, k=(K+counter))), axis=0) else: pass diagonal_array = np.concatenate((diagonal_array, sparse.csr_matrix.diagonal(arr, k=(K-counter))), axis=0) rank_abs = lambda x: stats.rankdata(np.abs(x)) normalized_diag = np.apply_along_axis(rank_abs, 0, diagonal_array[0:L]) sparse.dia_matrix.setdiag(rank_norm_hic, normalized_diag, k=K) return sparse.csr_matrix(rank_norm_hic) def make_tss_tss(gene_data, chrom, dist_folder, RAWobserved, KRnorm, resoulution_in_kb): #raw_matrix = np.where(RAWobserved,RAWobserved,RAWobserved.T) RAWobserved.columns = ["pos1", "pos2", "freq"] RAWobserved["pos1"] = RAWobserved["pos1"] / (resoulution_in_kb * 1000) RAWobserved["pos2"] = RAWobserved["pos2"] / (resoulution_in_kb * 1000) row = RAWobserved["pos1"].astype(int).values col = RAWobserved["pos2"].astype(int).values data = RAWobserved["freq"].values max_row = np.amax(row) max_col = np.amax(col) max_dim = max(max_row, max_col) + 1 print (max_dim) norm_vector = KRnorm.values norm_vector_p1 = sparse.csr_matrix(1/norm_vector) norm_vector_p2 = sparse.csr_matrix(1/(norm_vector.T)) max_dim = norm_vector.shape[0] RAWobserved_coo = sparse.coo_matrix((data, (row, col)), shape=(max_dim, max_dim)) RAWobserved_csr = sparse.csr_matrix(RAWobserved_coo) #converted coordiante format to csr format RAWobserved_csr = RAWobserved_csr.multiply(norm_vector_p1) RAWobserved_csr = RAWobserved_csr.multiply(norm_vector_p2) raw_matrix_ice_sparse = sparse.csr_matrix.transpose(RAWobserved_csr) + RAWobserved_csr gene_data['tss_bin'] = gene_data['txStart_outer'] /(resoulution_in_kb*1000) gene_data['tes_bin'] = gene_data['txEnd_outer'] / (resoulution_in_kb*1000) gene_data['tes_bin'] = gene_data['tes_bin'].astype('int') gene_data['tss_bin'] = gene_data['tss_bin'].astype('int') gene_data['num_range'] = [list(range(min(tss,tes), max(tss,tes)+1)) for tss, tes in zip(gene_data['tss_bin'] , gene_data['tes_bin']) ] gene_bin_list = list(set(gene_data['num_range'].sum())) tss = gene_data['tss_bin'].to_list() tes = gene_data['tes_bin'].to_list() gene_range = gene_data['num_range'].to_list() gene_name = gene_data.index.to_list() gene_len = len(tes) #raw_matrix = raw_matrix.astype(dtype='float32') #raw_matrix = RAWobserved #print (raw_matrix.shape) #print (raw_matrix) #mat_ic = HiCNorm(raw_matrix) #mat_ic.iterative_correction() #raw_matrix_ice_sparse = mat_ic.norm_matrix #print (mat_ic._bias_var) #print (raw_matrix_ice_sparse.shape) #print (raw_matrix_ice.shape) # for normalization #L = int(249200000/(resoulution_in_kb*1000)) #length of chromosome 1 #raw_matrix_ice_sparse = sparse.csr_matrix(raw_matrix_ice) lib_n = lib_3d_norm(raw_matrix_ice_sparse, gene_bin_list) print ("lib_n") rank_n = rank_norm(raw_matrix_ice_sparse, gene_bin_list) print ("here") print ("norm_type") rank_n = sparse.csr_matrix.transpose(rank_n) + rank_n lib_n = sparse.csr_matrix.transpose(lib_n) + lib_n for norm_type, norm_name in zip([raw_matrix_ice_sparse, rank_n, lib_n], ["VC_rank", "VC_lib"]): freq_tss_list = [] freq_max_list = [] freq_mean_list = [] gene_x = [] gene_y = [] counter = 0 for tss_x, tes_x, gene_a, gene_range_x in zip(tss, tes, gene_name, gene_range): counter = counter + 1 #print (counter) if counter <= gene_len: row_num_tss = tss_x row_num_tes = tes_x row_num_range = gene_range_x col_mat_ind = sparse.csc_matrix(norm_type[row_num_tss,:]) col_mat = sparse.csc_matrix(norm_type[row_num_range,:]) for tss_y, tes_y, gene_b, gene_range_y in zip(tss[counter::], tes[counter::], gene_name[counter::], gene_range[counter::]): col_num_tss = tss_x col_num_tes = tes_y #row_num_range = [range(min(i,j), max(i,j)+1) for i,j in zip(row_num_tss, row_num_tes)] col_num_range = gene_range_y freq_tss_list.append(col_mat_ind[:,col_num_tss][0,0]) #print (row_num_range,col_num_range) #print (norm_type[row_num_range,col_num_range]) block = col_mat[:,col_num_range] freq_max_list.append(np.amax(block)) freq_mean_list.append(np.mean(block)) gene_x.append(gene_a) gene_y.append(gene_b) else: break print ("1type") df = pd.DataFrame(list(zip(gene_x, gene_y, freq_tss_list, freq_max_list, freq_mean_list)), columns =['gene_x', 'gene_y', 'tss', 'max', 'mean']) df.to_csv(os.path.join(dist_folder, f'{chrom}_tss_tss_{resoulution_in_kb}_{norm_name}.csv'), sep=',', float_format='%.5f', index=False) def make_distance_matrix(gene_folder, dist_folder,resoulution_in_kb=100): print (resoulution_in_kb) if os.path.isdir(dist_folder): # shutil.rmtree(dist_folder) pass else: os.mkdir(dist_folder) chrom_list = list(range(1,21)) + ['X'] chrom_list.reverse() #chrom_list = list(range(3,23)) + ['X'] #chrom_list = list(range(1,3)) #chrom_list = ['X'] #chrom_list = ['X'] chrom_list = ['chr%s' %x for x in chrom_list ] gene_file_list = ["gencode32_ucscgenes_hg37_knowngene_%s_order.tsv" %x for x in chrom_list] for gene_file in gene_file_list: gene_data = pd.read_csv("%s/%s" %(gene_folder, gene_file), sep='\t', header=0, index_col=None) gene_data.set_index('Gene stable ID', inplace=True) exp_file = parse_expression_hd5_lee("/data/johlee/CoCoCoNet/networks/human_prioAggNet.hdf5") subset_cols = list (set(gene_data.index) & set(exp_file.index)) gene_data = gene_data.loc[subset_cols, :] if not gene_data.empty: print (gene_file) chrom = gene_data.iloc[0]['chrom'] if chrom in chrom_list: prefix = '/'.join(dist_folder.split('/')[:-1]) RAWobserved = pd.read_csv('%s/GM12878_combined/%skb_resolution_intrachromosomal/%s/MAPQG0/%s_%skb.RAWobserved' %(prefix, resoulution_in_kb, chrom, chrom,resoulution_in_kb),sep='\t', header = None) KRnorm = pd.read_csv('%s/GM12878_combined/%skb_resolution_intrachromosomal/%s/MAPQG0/%s_%skb.KRnorm' %(prefix, resoulution_in_kb, chrom, chrom,resoulution_in_kb) ,sep='\t', header = None) make_tss_tss(gene_data, chrom, dist_folder, RAWobserved, KRnorm, resoulution_in_kb) else: pass else: pass if __name__ == '__main__': #for resolution in [500, 50, 10, 1, 100, 5, 250 ,1000]: for resolution in [10]: make_distance_matrix('/data/lohia/gene_distance_expresseion/gene_tx_map_37', '/data/lohia/gene_distance_expresseion/gene_contact_map_rao/processed_hi_c_files_rao_kr_buffer', resoulution_in_kb=resolution)