#!/grid/gillis/home/nfox/software/miniconda3/bin/python
"""Hi-C normalization and gene-pair (TSS-TSS) contact extraction.

Reads raw per-chromosome Hi-C contact matrices (MatrixMarket format), applies
iterative correction (ICE), distance ("library") normalization and per-diagonal
rank normalization, then writes gene-by-gene contact tables to HDF5.

Usage: script.py <ind_network> <dist_folder> <resolution_in_kb>
"""
import argparse
import gc
import itertools
import os
import shutil
import sys

import numpy as np
import pandas as pd
import scipy.spatial.distance as ssd
import scipy.stats as stats
from scipy import sparse, io
from scipy.io import mmread
from scipy.sparse import csr_matrix, triu, coo_matrix


class HiCNorm(object):
    """Iterative correction (ICE) balancing of a square Hi-C contact matrix."""

    def __init__(self, matrix):
        self.bias = None          # per-bin multiplicative bias, set by iterative_correction
        self.matrix = matrix      # raw (square, symmetric) contact matrix
        self.norm_matrix = None   # bias-corrected matrix, set by iterative_correction
        self._bias_var = None     # bias-variance history, one entry per iteration

    def iterative_correction(self, max_iter=400):
        """Run ICE balancing for at most ``max_iter`` iterations.

        Rows (and matching columns) whose total count falls below the 15th
        percentile are zeroed out before balancing; per the original author's
        note, the 0.15 cutoff gives the best correlation between KR/SP/VC.
        Sets ``self.bias`` and ``self.norm_matrix``.
        """
        mat_lil = sparse.lil_matrix(self.matrix)

        # Mask low-count rows/columns: they destabilize the correction.
        row_sum = np.asarray(np.sum(mat_lil, axis=1)).ravel()
        low_count = np.quantile(row_sum, 0.15)
        mask_row = row_sum < low_count
        mat_lil[mask_row, :] = 0
        mat_lil[:, mask_row] = 0

        self.bias = np.ones(mat_lil.shape[0])
        self._bias_var = []
        x, y = np.nonzero(mat_lil)
        # Force the matrix sum to stay constant; otherwise ICE may not
        # converge as fast.
        mat_sum = np.sum(mat_lil)
        for _ in range(max_iter):
            bias = np.asarray(np.sum(mat_lil, axis=1)).ravel()
            bias_mean = np.mean(bias[bias > 0])
            bias_var = np.var(bias[bias > 0])
            self._bias_var.append(bias_var)
            bias = bias / bias_mean
            bias[bias == 0] = 1  # masked bins: divide by 1 (leave untouched)
            mat = sparse.csr_matrix(mat_lil)
            mat[x, y] = mat[x, y] / (bias[x] * bias[y])
            new_sum = np.sum(mat)
            mat = mat * (mat_sum / new_sum)
            mat_lil = sparse.lil_matrix(mat)
            # Accumulate total bias, folding in the sum-rescaling factor.
            self.bias = self.bias * bias * np.sqrt(new_sum / mat_sum)
            if bias_var < .001:  # converged
                break
        self.norm_matrix = csr_matrix(self.matrix, dtype=np.float64)
        self.norm_matrix[x, y] = self.norm_matrix[x, y] / (self.bias[x] * self.bias[y])

    def vanilla_coverage(self):
        """Vanilla-coverage normalization: a single ICE iteration."""
        self.iterative_correction(max_iter=1)


def lib_3d_norm_old(arr):
    """Divide each super-diagonal of square matrix ``arr`` by its own mean.

    All-zero diagonals are left at zero. Returns a CSR matrix (float32).
    """
    s = sparse.csr_matrix.get_shape(arr)[0]
    arr = sparse.dia_matrix(arr, dtype=np.float32)
    lib_norm_hic = sparse.dia_matrix((s, s), dtype=np.float32)
    for K in range(1, s):
        diagonal_array = sparse.dia_matrix.diagonal(arr, k=K)
        diagonal_mean = np.nanmean(diagonal_array)
        if diagonal_mean != 0:  # skip empty diagonals (avoid div by zero)
            sparse.dia_matrix.setdiag(lib_norm_hic, diagonal_array / diagonal_mean, k=K)
    return sparse.csr_matrix(lib_norm_hic)


def lib_3d_norm(arr, resolution):
    """Distance-normalize ``arr``: divide diagonal K by a mean estimated from
    at least L = 250kb/resolution entries.

    Short diagonals (near the matrix corner) are padded with values from
    neighboring diagonals (K+c, then K-c) until L values are available; only
    the first L values enter the mean. Raises ValueError if ``arr`` is not
    square. Returns a CSR matrix (float32).
    """
    s = arr.shape
    if s[0] != s[1]:
        raise ValueError('network must be a square matrix')
    s = s[0]
    arr = sparse.dia_matrix(arr, dtype=np.float32)
    lib_norm_hic = sparse.dia_matrix((s, s), dtype=np.float32)
    L = int(250000 / resolution)
    for K in range(1, s):
        diagonal_array = sparse.dia_matrix.diagonal(arr, k=K)
        counter = 0
        # Borrow neighboring diagonals until at least L values are collected.
        while diagonal_array.shape[0] < L:
            counter = counter + 1
            if (K + counter) < s:
                diagonal_array = np.concatenate(
                    (diagonal_array, sparse.dia_matrix.diagonal(arr, k=(K + counter))), axis=0)
            diagonal_array = np.concatenate(
                (diagonal_array, sparse.dia_matrix.diagonal(arr, k=(K - counter))), axis=0)
        diagonal_mean = np.nanmean(diagonal_array[0:L])
        if diagonal_mean != 0:  # skip empty diagonals (avoid div by zero)
            sparse.dia_matrix.setdiag(lib_norm_hic, diagonal_array / diagonal_mean, k=K)
    return sparse.csr_matrix(lib_norm_hic)


def rank_norm(arr, resolution):
    """Rank-normalize each super-diagonal of ``arr``.

    Like :func:`lib_3d_norm`, short diagonals are padded from neighbors until
    L = 250kb/resolution values exist; the first L values are replaced by the
    ranks of their absolute values. Raises ValueError if ``arr`` is not square.
    Returns a CSR matrix (float16).
    """
    s = arr.shape
    if s[0] != s[1]:
        raise ValueError('network must be a square matrix')
    s = s[0]
    arr = sparse.dia_matrix(arr, dtype=np.float32)
    rank_norm_hic = sparse.dia_matrix((s, s), dtype=np.float16)
    L = int(250000 / resolution)
    for K in range(1, s):
        diagonal_array = sparse.dia_matrix.diagonal(arr, k=K)
        counter = 0
        # Borrow neighboring diagonals until at least L values are collected.
        while diagonal_array.shape[0] < L:
            counter = counter + 1
            if (K + counter) < s:
                diagonal_array = np.concatenate(
                    (diagonal_array, sparse.dia_matrix.diagonal(arr, k=(K + counter))), axis=0)
            diagonal_array = np.concatenate(
                (diagonal_array, sparse.dia_matrix.diagonal(arr, k=(K - counter))), axis=0)
        rank_abs = lambda x: stats.rankdata(np.abs(x))
        normalized_diag = np.apply_along_axis(rank_abs, 0, diagonal_array[0:L])
        sparse.dia_matrix.setdiag(rank_norm_hic, normalized_diag, k=K)
    return sparse.csr_matrix(rank_norm_hic)


def make_tss_tss(gene_data, chrom, dist_folder, RAWobserved, resoulution_in_kb, ind_network):
    """For every gene pair on ``chrom``, extract TSS-TSS and gene-body-max
    contact frequencies under three normalizations (ICE, lib, rank) and write
    one HDF5 table per normalization into ``dist_folder``.

    ``gene_data`` must be indexed by gene ID with 'txStart_outer' /
    'txEnd_outer' columns; ``RAWobserved`` is the raw (coordinate-format)
    upper-triangular contact matrix for the chromosome.
    """
    RAWobserved_csr = sparse.csr_matrix(RAWobserved)  # coordinate -> CSR
    RAWobserved_csr.data = np.nan_to_num(RAWobserved_csr.data)
    RAWobserved_csr = RAWobserved_csr.astype('float32')
    # Symmetrize: transpose (lower + diag) + strict upper triangle.
    raw_matrix_ice_sparse = sparse.csr_matrix.transpose(RAWobserved_csr) \
        + sparse.triu(RAWobserved_csr, k=1, format='csr')
    mat_ic = HiCNorm(raw_matrix_ice_sparse)
    mat_ic.iterative_correction()
    raw_matrix_ice_sparse = mat_ic.norm_matrix

    # Derived normalizations work on the strict upper triangle, then are
    # re-symmetrized below.
    RAWobserved_csr_triu = sparse.triu(raw_matrix_ice_sparse, k=1, format='csr')
    lib_n = lib_3d_norm(RAWobserved_csr_triu, resoulution_in_kb)
    print("lib_n")
    rank_n = rank_norm(RAWobserved_csr_triu, resoulution_in_kb)
    print("rank")
    rank_n = sparse.csr_matrix.transpose(rank_n) + sparse.triu(rank_n, k=1, format='csr')
    lib_n = sparse.csr_matrix.transpose(lib_n) + sparse.triu(lib_n, k=1, format='csr')

    tss = gene_data['txStart_outer'].to_list()
    tes = gene_data['txEnd_outer'].to_list()
    gene_name = gene_data.index.to_list()
    gene_len = len(tes)
    # One (tss, max) accumulator pair per normalization: ICE, lib, rank.
    freq_tss_list_all = [[], [], []]
    freq_max_list_all = [[], [], []]
    gene_x = []
    gene_y = []
    counter = 0
    for tss_x, tes_x, gene_a in zip(tss, tes, gene_name):
        counter = counter + 1
        if counter <= gene_len:
            # Bins spanned by gene A (TSS bin and full TSS..TES range).
            row_num_tss = [int(tss_x / (resoulution_in_kb * 1000))]
            row_num_tes = [int(tes_x / (resoulution_in_kb * 1000))]
            row_num_range = list(range(min(row_num_tss[0], row_num_tes[0]),
                                       max(row_num_tss[0], row_num_tes[0]) + 1))
            # Pre-slice gene A's rows once per normalization (CSC for fast
            # column indexing in the inner loop).
            col_mat_ind_list = []
            col_mat_list = []
            for norm_type in [raw_matrix_ice_sparse, lib_n, rank_n]:
                col_mat_ind = sparse.csc_matrix(norm_type[row_num_tss, :])
                col_mat = sparse.csc_matrix(norm_type[row_num_range, :])
                col_mat_ind_list.append(col_mat_ind)
                col_mat_list.append(col_mat)
            # Pair gene A with every downstream gene B (upper-triangular pairs).
            for tss_y, tes_y, gene_b in zip(tss[counter::], tes[counter::], gene_name[counter::]):
                col_num_tss = [int(tss_y / (resoulution_in_kb * 1000))]
                col_num_tes = [int(tes_y / (resoulution_in_kb * 1000))]
                col_num_range = list(range(min(col_num_tss[0], col_num_tes[0]),
                                           max(col_num_tss[0], col_num_tes[0]) + 1))
                gene_x.append(gene_a)
                gene_y.append(gene_b)
                for col_mat_ind, col_mat, freq_tss_list, freq_max_list in zip(
                        col_mat_ind_list, col_mat_list, freq_tss_list_all, freq_max_list_all):
                    # TSS-bin-to-TSS-bin contact.
                    freq_tss_list.append(col_mat_ind[:, col_num_tss][0, 0])
                    # Maximum contact over the full gene-body x gene-body block.
                    block = col_mat[:, col_num_range]
                    freq_max_list.append(np.amax(block))
        else:
            break
    print("1type")
    for norm_name, freq_tss_list, freq_max_list in zip(
            ["VC", "VC_lib", "VC_rank"], freq_tss_list_all, freq_max_list_all):
        df = pd.DataFrame(list(zip(gene_x, gene_y, freq_tss_list, freq_max_list)),
                          columns=['gene_x', 'gene_y', 'tss', 'max'])
        df.to_hdf(os.path.join(
            dist_folder,
            f'{chrom}_tss_tss_{resoulution_in_kb}_{norm_name}_entire_genes_{ind_network}.hdf'),
            index=False, key='df', mode='w')


def make_distance_matrix(gene_folder, dist_folder, ind_network, resoulution_in_kb=100):
    """Drive :func:`make_tss_tss` over a fixed set of chromosomes.

    Reads per-chromosome gene tables from ``gene_folder`` and raw contact
    matrices from the hard-coded data directory; writes results into
    ``dist_folder`` (created if missing).
    """
    print(resoulution_in_kb)
    if not os.path.isdir(dist_folder):
        os.mkdir(dist_folder)
    # chr1-12, chr16-19 and chrX, processed in reverse order.
    chrom_list = list(range(1, 13)) + list(range(16, 20)) + ['X']
    chrom_list.reverse()
    chrom_list = ['chr%s' % x for x in chrom_list]
    gene_file_list = ["gencodev32_hg38_known_gene_%s_order.tsv" % x for x in chrom_list]
    for gene_file in gene_file_list:
        print(gene_file)
        gene_data = pd.read_csv("%s/%s" % (gene_folder, gene_file),
                                sep='\t', header=0, index_col=None)
        gene_data.set_index('Gene stable ID', inplace=True)
        if not gene_data.empty:
            chrom = gene_data.iloc[0]['chrom']
            if chrom in chrom_list:
                RAWobserved = mmread(
                    "/grid/gillis/data/lohia/hi_c_data_processing/data/%s_%sKB/%s.mtx"
                    % (ind_network, resoulution_in_kb, chrom))
                make_tss_tss(gene_data, chrom, dist_folder, RAWobserved,
                             resoulution_in_kb, ind_network)


if __name__ == '__main__':
    print("temp")
    # argv: [1] network name, [2] output folder, [3] resolution in kb.
    for resolution in [sys.argv[3]]:
        for ind_network in [sys.argv[1]]:
            make_distance_matrix(
                '/grid/gillis/data/lohia/hi_c_data_processing/software/gene_tx_map',
                sys.argv[2], ind_network, resoulution_in_kb=int(resolution))