import h5py import scipy.sparse as ss import os import sys import numpy as np import pandas as pd from hicmatrix import HiCMatrix as hm from hicmatrix.lib import MatrixFileHandler from itertools import combinations from scipy.sparse import csr_matrix, dia_matrix, triu, tril, coo_matrix from sklearn.metrics.pairwise import pairwise_distances def jac_sim_arr(arr, percentile=90): rank_n = coo_matrix(arr) rank_n.setdiag(0, k=0) thresh = np.percentile(rank_n.data, percentile) arr = np.where(rank_n.toarray() < thresh, 0, 1) zero_row_indices = np.where(~arr.any(axis=1))[0] #[when the gene row sum is zero, its jaccard similarity is zero] jac_sim = 1 - pairwise_distances(arr, metric = "jaccard") #[calculates the jaccard coefficient for each bin pair based on the , allbins X allbins where values are number of common neighbours] jac_sim[zero_row_indices,:] = 0 return jac_sim, arr if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument('--base_folder', default='/grid/gillis/data/', help='The base folder for rugen vs cluster') parser.add_argument('--resolution_network', default='10kbp_raw', help='resolution in kb format') parser.add_argument('--SRP', default='', help='SRP project name') parser.add_argument('--candidate_h5_file', default='aggregate.hdf5', help='name of the hdf5 file') parser.add_argument('--chr_list', default='tss', help='tss or tss_max') parser.add_argument('--input_file', default='tss', help='tss or tss_max') parser.add_argument('--outfile', default='tss', help='tss or tss_max') parser.add_argument('--outfile_thresh', default='tss', help='tss or tss_max') parser.add_argument('--percentile_value', default='tss', help='tss or tss_max') args = parser.parse_args() chr_list = args.chr_list.split(',') hic = hm.hiCMatrix(args.input_file) jac_list = [] thresholded_list = [] cut_intervals_list = [] cut_intervals = hic.cut_intervals for chrom in chr_list: gene_chr = hic.matrix[hic.getChrBinRange(chrom)[0]:hic.getChrBinRange(chrom)[1], hic.getChrBinRange(chrom)[0]:hic.getChrBinRange(chrom)[1]] jac_sim_matrix, thresolded_matrix = jac_sim_arr(gene_chr, percentile=int(args.percentile_value)) jac_list.append(jac_sim_matrix) cut_intervals_list.extend(cut_intervals[hic.getChrBinRange(chrom)[0]:hic.getChrBinRange(chrom)[1]]) thresholded_list.append(thresolded_matrix) matrix_genome_wide = ss.block_diag(jac_list) #initialize hic object again for saving hic = hm.hiCMatrix() hic.nan_bins = [] # make matrix symmetric hic.setMatrix(matrix_genome_wide, cut_intervals_list) #hic.fillLowerTriangle() hic.save(args.outfile) matrix_genome_wide = ss.block_diag(thresholded_list) # hic = hm.hiCMatrix() hic.nan_bins = [] # make matrix symmetric hic.setMatrix(matrix_genome_wide, cut_intervals_list) ##hic.fillLowerTriangle() hic.save(args.outfile_thresh)