#!/grid/gillis/home/nfox/software/miniconda3/bin/python """Used to build a Hi-C contact network in a pipeline networks HDF5 file. The pipeline governed by hicup_data_processing.py and executed on each sample by process_srr.py ultimately stores Hi-C contacts in chromosome x chromosome sparse contact matrices of varying resolutions. These matrices with accompanying metadata are stored in each sample's Run directory as an HDF5 file called "networks.hdf5". In the step preceding HDF5 file building, the contacts are converted from a SAM file to a "pairs" file, a file format defined by the 4DN consortium. This is a tabular sparse triplets format for storing Hi-C contacts. It is compressed and indexed. The "source" data so to speak. This program parses compressed and indexed pairs files to convert them into collections of chromosome x chromosome sparse contact matrices. The raw data is stored in the 1bp_raw network, and then binned and/or normalized networks can be stored arbitrarily. """ import h5py import itertools import logging import numpy as np import os import pandas as pd import scipy.sparse as ss import subprocess as sp from subprocess import CalledProcessError import sys def setup_loggers(global_level=logging.DEBUG, logfile='', splitouterr=False, reset=[]): """Set up loggers. Powerful function to set up the loggers require for this programs. Includes filters for .out files and .err files. Includes a reset function to avoid any carryover from parent programs or external calls. Can parse a target log file for default names, a default name in a passed location, or a given filepath. Included Functions and Classes: reset_logging() This particular function builds 2 loggers: a generic logger that includes the timestamp and the message type, and a "clean" logger that functions the same way as a bare print function with no annotation. Args: global_level: int. One of [0, 10, 20, 30, 40, 50], corresponding the logging levels in the standard Python logging module: NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL. Indicates the lowest level to include in output. e.g. a value of INFO will ignore all logging messages that are DEBUG and lower. logfile: str. The file to which log messages will be written. If '', the __file__ variable will be used to generate a default SCRIPT_NAME.log file in the current directory. If `logfile` is a directory, the same default file will be used in that directory instead of the current directory. If `logfile` is a file, it will be used. splitouterr: bool. An option to use typical stdout and stderr files instead of a *.log file. e.g. your_program.out and your_program.err instead of your_program.log. Messages of INFO and below will be sent to the outfile and messages of WARNING and above will be sent to the errfile. reset: list. Loggers to be reset/cleared and rebuilt from scratch. Empty list indicates all active loggers. Can optionally pass a single logger name as a string. e.g. 'logger' instead of ['logger']. """ # Functions and Classes def reset_logging(loggers=[]): """Reset and clear loggers. Loggers can sometimes produce unexpected effects when called in submodules, from external functions, or across multiple sessions. This function completely clears a logger from the existing logging environment to avoid any side effects. Args: reset: list. Loggers to be reset/cleared and rebuilt from scratch. Empty list indicates all active loggers. Can optionally pass a single logger name as a string. e.g. 'logger' instead of ['logger']. """ manager = logging.Logger.manager manager.disabled = logging.NOTSET if type(loggers) is str: loggers = [loggers] if len(loggers) == 0: lkeys = list(manager.loggerDict.keys()) else: lkeys = [] for lkey in loggers: if lkey not in manager.loggerDict.keys(): print(f'{lkey} is not a valid logger. Will not be reset.', file=sys.stderr) else: lkeys.append(lkey) for lkey in lkeys: logger = manager.loggerDict[lkey] if isinstance(logger, logging.Logger): logger.setLevel(logging.NOTSET) logger.propagate = True logger.disabled = False logger.filters.clear() handlers = logger.handlers.copy() for handler in handlers: # Copied from `logging.shutdown`. try: handler.acquire() handler.flush() handler.close() except (OSError, ValueError): pass finally: handler.release() logger.removeHandler(handler) class Out_Filter(logging.Filter): """Extension of logging.Filter to get messages for stdout. Filters for messages that are INFO or less severe, which would be expected to go to an *.out file, as opposed to an *.err file. """ def filter(self, rec): """Filter for messages of WARNING and more severe.""" if rec.levelno <= logging.INFO: return(True) else: return(False) class Err_Filter(logging.Filter): """Extension of logging.Filter to get messages for stderr. Filters for messages that are WARNING or more severe, which would be expected to go to an *.err file, as opposed to an *.out file. """ def filter(self, rec): """Filter for messages of WARNING and more severe.""" if rec.levelno >= logging.WARNING: return(True) else: return(False) reset_logging(loggers=reset) # Formatters std_format = logging.Formatter(fmt=('%(asctime)s [%(levelname)s] ' '%(message)s'), datefmt='%Y-%m-%d %H:%M:%S') clean_format = logging.Formatter(fmt='') # Parse logfile name if logfile == '': logfile = os.path.basename(__file__) if logfile.endswith('.py'): logfile = logfile[:-3] logfile = logfile + '.log' elif os.path.isdir(logfile): logdir = logfile logfile = os.path.basename(__file__) if logfile.endswith('.py'): logfile = logfile[:-3] logfile = logfile + '.log' logfile = os.path.join(logdir, logfile) else: # No effect, here for readability logfile = logfile # Build Handlers std_handlers = {} clean_handlers = {} if splitouterr: logfile_name = os.path.splitext(os.path.basename(logfile))[0] logout_file = os.path.join(os.path.dirname(logfile), f'{logfile_name}.out') logerr_file = os.path.join(os.path.dirname(logfile), f'{logfile_name}.err') std_handlers['logout'] = logging.FileHandler(filename=logout_file, mode='a') std_handlers['logerr'] = logging.FileHandler(filename=logerr_file, mode='a') std_handlers['logout'].addFilter(Out_Filter()) std_handlers['logerr'].addFilter(Err_Filter()) clean_handlers['logout'] = logging.FileHandler(filename=logout_file, mode='a') clean_handlers['logerr'] = logging.FileHandler(filename=logerr_file, mode='a') clean_handlers['logout'].addFilter(Out_Filter()) clean_handlers['logerr'].addFilter(Err_Filter()) else: std_handlers['logfile'] = logging.FileHandler(filename=logfile, mode='a') clean_handlers['logfile'] = logging.FileHandler(filename=logfile, mode='a') for hdler in std_handlers: std_handlers[hdler].setFormatter(std_format) for hdler in clean_handlers: clean_handlers[hdler].setFormatter(clean_format) # Building loggers logger = logging.getLogger('standard') if splitouterr: logger.addHandler(std_handlers['logout']) logger.addHandler(std_handlers['logerr']) else: logger.addHandler(std_handlers['logfile']) logger.setLevel(global_level) logger = logging.getLogger('clean') if splitouterr: logger.addHandler(clean_handlers['logout']) logger.addHandler(clean_handlers['logerr']) else: logger.addHandler(clean_handlers['logfile']) logger.setLevel(global_level) def generate_regions(interval, chrom_names, chrom_sizes): """Generate a regions DataFrame from an integer interval. From a list of chromosomes and a bin size, creates a table containing the chromosome regions or bins in base pairs. Stored as a triplet of the chromosome, bin start index and bin end index. Essentially binning a genome. Args: interval: int. Interval between regions. The "bin size" for binning a chromosome length into fixed size bins. chrom_names: Names of the chromosomes to be binned. Must be matched to `chrom_sizes`. chrom_sizes: Sizes of the chromosomes to be binned. Must be matched to `chrom_names`. Returns: pandas DataFrame. 3 columns: chrom, start, end. Each row is a region or bin. The chromosome it belongs to, and the start and end position. The interval indices begin at 1 and are closed. e.g. The first two intervals of 50 for chromosome chr1 will be [('chr1', 1, 50), ('chr1', 51, 100)]. """ logger = logging.getLogger('standard') assert(type(interval) == int) chrom_regions = [] for c, size in zip(chrom_names, chrom_sizes): start = np.arange(1, size+1, interval) end = np.arange(interval, size+1, interval) if start.size == end.size: pass elif (start.size - end.size) == 1: end = np.append(end, size) else: logger.error(f'unexpected start and end arrays for {c}') continue chrom_regions.append((pd.DataFrame({'chrom': c, 'start': start, 'end': end}))) return(pd.concat(chrom_regions, ignore_index=True)) def get_chrom_sizes(chroms): """Parse a standard UCSC chromosome sizes file. A standard UCSC chromosome sizes file includes two tab-separated values per line, each corresponding to a chromosome name and a chromosome size in base pairs. Args: chroms: str. Path to the chromosome sizes file. Returns: dict. Chromosome sizes as integers, keyed by chromosome name. """ chrom_sizes = {} with open(chroms, 'r') as f: for line in f: values = line.split('\t') chrom_sizes[values[0]] = int(values[1]) return(chrom_sizes) def get_main_chroms(chroms): """Filter for main chromosomes. Most genomes have main chromosomes as well as a host of incomplete or partial chromosome constructs. Filters an iterable of chromosome names for the main chromosomes, under the assumption that main chromosomes exclusively do not contain an underscore character '_' in their name. Returns: list. List of main chromosome names. """ main_chroms = [c for c in chroms if '_' not in c] return(main_chroms) def get_chrom_combs(chroms): """Create 2-chromosome combinations. Given a list of chromosomes, create all possible combinations of chromosomes as a list of strings in the format "CHROM1_vs_CHROM2". Includes same chromosome pairs. e.g. chr1_vs_chr1. Filters for main chromosomes first. Args: chroms: iterable. Chromosome names. Will be filtered for main chromosomes only. Returns: list. Chromosome combinations as CHROM1_vs_CHROM2 strings. """ chrom_combs = [f'{c}_vs_{c}' for c in chroms] main_chroms = get_main_chroms(chroms) for c1, c2 in itertools.combinations(main_chroms, 2): if c1 > c2: temp = c1 c1 = c2 c2 = temp chrom_combs.append(f'{c1}_vs_{c2}') return(chrom_combs) def build_filter_mat(regions, chrom=None): """Build a filter matrix for binning a data matrix. To bin a 2-dimensional matrix, the bin regions can be one-hot encoded. e.g. for a square matrix of size 6, binned at intervals of 2, this binning can be represented as a matrix of positions x bins: [[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 1, 0], [0, 0, 1], [0, 0, 1]] Args: regions: pass chrom: str. Name of the chromosome for which the filter matrix will be built. Default is None, which defaults to the first chromosome found in the `regions`. Returns: scipy.sparse.csc_matrix. Size is CHROM_SIZE x NUM_BINS. Each column encodes all positions that belong to that bin as 1. All other values are 0. """ if chrom is None: # Defaults to the first chromosome it can find chrom = list(regions.iloc[:, 0].unique())[0] else: chrom = [chrom] bins = regions[regions.iloc[:, 0].isin(chrom)].iloc[:, 1:3] chrom_size = bins.iloc[:, -1].max() fm = None fm_cols = [] for start, end in bins.itertuples(index=False, name=None): start = int(start) end = int(end) # Could be changed to a COO construction, but not # strictly necessary fm_cols.append(ss.csc_matrix((np.ones(end - start + 1), np.arange(start - 1, end), (0, end - start + 1)), shape=(chrom_size, 1))) fm = ss.hstack(fm_cols) return(fm) def build_network(chroms, sizes, run_dir): """Build a sparse contact matrix from a pairix-index pairs file. The 4DN consortium defines a file format for holding Hi-C contacts: the pairs file. Essentially a table holding contact ID, chromosome 1, position 1, chromosome 2, position 2, and optionally strand 1 and strand 2. There are often header lines at the top. 4DN provides a tool, pairix, that compresses the pairs file and creates an index for fast access. Given two chromosomes, this function queries a compressed and indexed pairs file using pairix and creates a 2-dimensional sparse matrix containing the contact counts for the chromosome pair. Args: chroms: tuple of 2 str. The two chromosome names for which contacts will be retrieved. May be identical for cis- chromosomal contacts. Will be reordered if necessary so that chromosome 1 >= chromosome 2 alphabetically. sizes: tuple of 2 int. The sizes of the two chromosomes passed in `chroms`. run_dir: str. Path to the directory for the SRA Run sample. Should be named the SRA Run ID, e.g. SRR123456. Should hold a pairs file named RUN_ID.bsorted.pairs.gz with an accompanying RUN_ID.bsorted.pairs.gz.px2 index file. Returns: tuple of length 2. First item is a scipy.sparse.coo_matrix holding the contacts for the chromosome pair. The matrix is CHROM_1_LENGTH x CHROM_2_LENGTH. Upon failure, two different types of values will be returned: (None, {}): Generic failure. ('nopairs', {}): Pairs file not found. """ logger = logging.getLogger('standard') c1, c2 = chroms s1, s2 = sizes if c1 > c2: temp = c1 c1 = c2 c2 = temp c = f'{c1}_vs_{c2}' process = get_contacts(c1, c2, run_dir) if not process: return None, {} elif str(process) == 'nopairs': return 'nopairs', {} new_triplets, contact_counts = get_triplets(process, c1, c2, s1, s2) cis_contacts, trans_contacts, invalid_locus_contacts = contact_counts if not new_triplets: return None, {} new_mat = ss.coo_matrix((new_triplets['data'], (new_triplets['rows'], new_triplets['cols'])), shape=(s1, s2)) if (cis_contacts == 0) and (trans_contacts == 0): stats = {'cis': cis_contacts, 'trans': trans_contacts, 'invalid_locus': invalid_locus_contacts} return new_mat, stats # assert that cis_contacts == 0 xor trans_contacts == 0, if either not 0 elif not (cis_contacts == 0) ^ (trans_contacts == 0): logger.error(f'Chromosome pair "{c}" returned cis AND trans contacts. ' 'Skipping.') return None, {} else: # Same as the first case stats = {'cis': cis_contacts, 'trans': trans_contacts, 'invalid_locus': invalid_locus_contacts} return new_mat, stats def get_contacts(c1, c2, run_dir): """Retrieve contacts for a chromosome pair from a pairs file. Args: c1: str. First chromosome of the pair for which to retrieve contacts. Order does not matter. c2: str. Second chromosome of the pair for which to retrieve contacts. Order does not matter. run_dir: str. Path to the directory for the SRA Run sample. Should be named the SRA Run ID, e.g. SRR123456. Should hold a pairs file named RUN_ID.bsorted.pairs.gz with an accompanying RUN_ID.bsorted.pairs.gz.px2 index file. Returns: str. Contains the stdout and stderr of the pairix retrieval process combined. On failure, two different outputs may be returned: None: Generic failure 'nopairs': Pairs file not found """ PAIRIX = '/grid/gillis/home/nfox/software/pairix/bin/pairix' logger = logging.getLogger('standard') logger_c = logging.getLogger('clean') run = os.path.basename(run_dir) pairs_file = os.path.join(run_dir, f'{run}.bsorted.pairs.gz') if not os.path.exists(pairs_file): logger.error(f'No pairs file for {run}. Exiting.') return 'nopairs' command = ' '.join([PAIRIX, pairs_file, '-a', f'"{c1}|{c2}"']) try: # subprocess.check_output is used instead of the more # typical subprocess.run because of an undiagnosed # incompatibility with the pairix program. When using # run(), the correct output was not obtained. # # Though stderr is redirected, a non-zero exit code # will throw an exception. Intended to catch unusual # practice of providing non-error messages to stderr. process = sp.check_output(command, text=True, shell=True, stderr=sp.STDOUT) return process except CalledProcessError as e: logger.error('pairix returned a non-zero error code! Skipping.') logger_c.error(f'return code : {e.returncode}') logger_c.error(f'error message : {e.output}') return def get_triplets(process, c1, c2, s1, s2): """Get Hi-C contacts in sparse triplet format. Args: process: str. Combined stdout and stderr from the pairix retrieval process. c1: str. First chromosome of the pair for which to retrieve contacts. Order does not matter. c2: str. Second chromosome of the pair for which to retrieve contacts. Order does not matter. s1: int. Size of `c1` in base pairs. s2: int. Size of `c2` in base pairs. return new_triplets, (cis_contacts, trans_contacts, invalid_locus_contacts) Returns: tuple with 2 items. The first item is a dict containing the 3 components needed to build a scipy.sparse.coo_matrix object: rows, cols, data. The dict holds those three lists keyed by name. Upon construction, the matrix will be an s1 x s2 matrix holding the Hi-C contact counts. The second item is a 3-member tuple containing integers for the number of cis contacts, trans contacts, and contacts with an invalid locus. Upon failure, (None, tuple()) is returned. """ logger = logging.getLogger('standard') cis_contacts = 0 trans_contacts = 0 invalid_locus_contacts = 0 new_triplets = {'rows': [], 'cols': [], 'data': []} for line in process.split('\n'): if line == '': break else: values = line.split('\t') # Ignoring contact ID # read_id = values[0] chrom_1, chrom_2 = (values[1], values[3]) if chrom_1 == c1 and chrom_2 == c2: locus_1, locus_2 = (int(values[2]), int(values[4])) elif chrom_1 == c2 and chrom_2 == c1: locus_2, locus_1 = (int(values[2]), int(values[4])) else: logger.error('Chromosomes in pairix output don\'t ' 'match the chromosomes in this ' 'iteration! Skipping.') return None, tuple() # Ignoring strand information # strnd_1, strnd_2 = (values[5], values[6]) # -1 is because loci are 1-indexed but the matrix is 0-indexed. if c1 == c2: # ensures entries for cis contacts are on the upper triangular row = min(locus_1, locus_2) - 1 col = max(locus_1, locus_2) - 1 else: row = locus_1 - 1 col = locus_2 - 1 if (locus_1 > s1 or locus_2 > s2): invalid_locus_contacts += 1 logger.error(f'ERROR: {line.strip()} has a locus ' 'larger than the size of the chromosome! ' 'Skipping.') continue if c1 == c2: cis_contacts += 1 else: trans_contacts += 1 # Append a count of 1. Duplicate counts will be # summed upon conversion to CSC format. new_triplets['rows'].append(row) new_triplets['cols'].append(col) new_triplets['data'].append(1) return new_triplets, (cis_contacts, trans_contacts, invalid_locus_contacts) def bin_network(network, filter_mats): """Bin a matrix using a pair of one-hot encoded filter matrices. A matrix can be binned, summing the counts for a range of values. e.g. binning a 4 x 6 matrix into a 2 x 2 binned matrix would look like this: [[1, 1, 0, 1, 0, 1], [0, 0, 0, 0, 1, 0], to [[2, 3], [1, 0, 1, 0, 0, 0], [4, 0]] [0, 1, 1, 0, 0, 0]] This function bins a matrix using two one-hot-encoded bin matrices. e.g. for that same matrix binned as above, the two filter matrices would be: [[1, 0], [[1, 0], [1, 0], [1, 0], and [1, 0], [0, 1], [0, 1], [0, 1]] [0, 1], [0, 1]] Args: network: scipy.sparse.csc_matrix. The matrix to be binned. filter_mats: tuple of 2 scipy.sparse.csc_matrix objects. Each is a one-hot encoded matrix representation of the bin intervals, returned from build_filter_mat(). The first matrix must be size A x B where A is the number of rows in `network` and B is the number of bins for those rows to be combined into. The second matrix must be size C x D where C is the number of columns in `network` and D is the number of bins for those columns to be combined into. Returns: A scipy.sparse.csc_matrix containing the binned version of the `network` matrix. It will be size B x D. """ return(filter_mats[0].transpose().dot(network.dot(filter_mats[1]))) def build_1bp_raw(run_dir, filepath, chrom_names, chrom_sizes, size_lookup): """Build an initial 1 base pair resolution raw contact counts network. This pipeline ultimately stores all contact matrices as separate sparse matrices inside an HDF5. For a network, e.g. 1 base pair resolution without any normalization or transformation (1bp_raw), the chromosome names and sizes will be stored. All pairwise combinations, including identical pairs, of chromosomes will be stored as a sparse matrix containing the Hi-C contacts for that pair. Args: run_dir: str. The path to the directory holding the SRA Run sample. Should be named the SRA Run ID. filepath: str. The path to the HDF5 file holding networks for this sample. Should be named 'networks.hdf5'. chrom_names: list of str. The names of the chromosomes that will be paired to store contacts for that pair. chrom_sizes: list of int. The sizes of the chromosomes named in `chrom_names`. Must match up. size_lookup: dict. Chromosome sizes as integers, keyed by chromosome name. Must contain at least all the chromosomes and sizes in `chrom_names` and `chrom_sizes`. NOTE: This is redundant information. Should be refactored to construct this dictionary from `chrom_names` and `chrom_sizes` or vice versa. Returns: None. Writes network and metadata to the file indicated by `filepath`. """ logger = logging.getLogger('standard') logger_c = logging.getLogger('clean') with h5py.File(filepath, 'a') as hf: if '/networks/1bp_raw' in hf: logger.info('Deleting existing 1bp_raw network') del hf['/networks/1bp_raw'] new_raw_net = hf.create_group('/networks/1bp_raw') new_raw_net.create_dataset('chrom_names', data=chrom_names) new_raw_net.create_dataset('chrom_sizes', data=chrom_sizes) contact_counts = {'cis': 0, 'trans': 0, 'invalid_locus': 0, 'total': 0} logger.info('Building 1bp raw network.') for c in list(get_chrom_combs(get_main_chroms(chrom_names))): c1, c2 = c.split('_vs_') logger.info(f'Building {c1} x {c2}') network, stats = build_network((c1, c2), (size_lookup[c1], size_lookup[c2]), run_dir) # check for no contacts or build_network failure if str(network) == 'nopairs': # error message produced inside build_network return elif (network is None or not stats): # error message produced inside build_network continue stats["total"] = (stats["cis"] + stats["trans"] + stats["invalid_locus"]) if stats["total"] == 0: logger.info('No contacts found!') else: logger.info(f'Reporting {c1} x {c2} statistics ...') logger_c.info(f'{"Cis":14s} : {stats["cis"]} ' f'({stats["cis"]/stats["total"]:.2%})') logger_c.info(f'{"Trans":14s} : {stats["trans"]} ' f'({stats["trans"]/stats["total"]:.2%})') logger_c.info(f'{"Invalid Locus":14s} : ' f'{stats["invalid_locus"]} ' f'({stats["invalid_locus"]/stats["total"]:.2%})') logger_c.info(f'{"Total Contacts":14s} : {stats["total"]}') for k in contact_counts.keys(): contact_counts[k] += stats[k] network = network.tocoo() data, row, col, shape = (network.data, network.row, network.col, network.shape) new_raw_net.create_group(c) new_raw_net.create_dataset(f'{c}/data', data=data) new_raw_net.create_dataset(f'{c}/row', data=row) new_raw_net.create_dataset(f'{c}/col', data=col) new_raw_net.create_dataset(f'{c}/shape', data=shape) if contact_counts['total'] == 0: logger.info('No contacts found!') else: logger.info('Reporting full statistics ...') logger_c.info(f'{"Cis":14s} : {contact_counts["cis"]} ' f"""({contact_counts["cis"] / contact_counts["total"]:.2%})""") logger_c.info(f'{"Trans":14s} : {contact_counts["trans"]} ' f"""({contact_counts["trans"] / contact_counts["total"]:.2%})""") logger_c.info(f'{"Invalid Locus":14s} : ' f'{contact_counts["invalid_locus"]} ' f"""({contact_counts["invalid_locus"] / contact_counts["total"]:.2%})""") logger_c.info(f'{"Total Contacts":14s} : ' f'{contact_counts["total"]}') def bin_to_interval(filepath, regions, humanres, chrom_names): """Bin a 1 base pair resolution raw contact counts network. This pipeline ultimately stores all contact matrices as separate sparse matrices inside an HDF5. For a network, e.g. 1 base pair resolution without any normalization or transformation (1bp_raw), the chromosome names and sizes will be stored. All pairwise combinations, including identical pairs, of chromosomes will be stored as a sparse matrix containing the Hi-C contacts for that pair. This pipeline also bins that base 1bp_raw matrix to lower resolutions, e.g. binning the two chromosomes to 10 kilobase-pair intervals and summing the contacts between any loci in a pair of intervals. See bin_network() docstring for details. Those binned networks are stored identically to the 1bp_raw network, with the addition of the regions DataFrame (see the generate_regions() docstring), stored as separate columns. Args: filepath: str. The path to the HDF5 file holding networks for this sample. Should be named 'networks.hdf5'. regions: pandas DataFrame. A valid regions dataframe returned by generate_regions(). Holds the indices defining the bins for each chromosome. humanres: The human readable version of the bin interval, e.g. an appropriate value for a bin size of 10000 would be '10kbp'. chrom_names: list of str. The names of the chromosomes that are paired to generate the 1bp_raw network in this file. Must all be present in `regions`. Returns: None. Writes binned network and metadata to the file indicated by `filepath`. """ logger = logging.getLogger('standard') logger.info(f'Binning 1bp raw networks to {humanres} raw networks') with h5py.File(filepath, 'a') as hf: new_raw_net = hf['/networks/1bp_raw'] if f'/networks/{humanres}_raw' in hf: logger.info(f'Deleting existing {humanres}_raw network') del hf[f'/networks/{humanres}_raw'] new_bin_net = hf.create_group(f'/networks/{humanres}_raw') new_bin_net.create_group('binning_data') logger.info('Generating filter matrices') filter_mats = {} for cname in new_raw_net['chrom_names'][:]: # FLAG cname = cname.decode('utf-8') logger.info(f'Building filter matrix for {cname}') filt = build_filter_mat(regions, cname) filt = filt.tocoo() filter_mats[cname] = filt # Assume regions has 3 columns: chrom, start, end # Assume the dtypes are right (string, int, int) new_bin_net.create_group('binning_data/regions') for col in ['chrom', 'start', 'end']: new_bin_net['binning_data/regions'].create_dataset( col, data=regions[col].values) for c in list(get_chrom_combs(get_main_chroms(chrom_names))): if c not in new_raw_net.keys(): logger.error(f'Internal logic error. {c} not in the keys' 'for the newly created 1bp raw network. ' f'Skipping binning {c}.') continue c1, c2 = c.split('_vs_') logger.info(f'Binning {c1} x {c2}') orig_mat = ss.coo_matrix((new_raw_net[f'{c}/data'][:], (new_raw_net[f'{c}/row'][:], new_raw_net[f'{c}/col'][:])), new_raw_net[f'{c}/shape'][:]).tocsc() filter_1 = filter_mats[c1].tocsc() filter_2 = filter_mats[c2].tocsc() binned_network = bin_network(orig_mat, (filter_1, filter_2)) binned_network = binned_network.tocoo() data, row, col, shape = (binned_network.data, binned_network.row, binned_network.col, binned_network.shape) new_bin_net.create_group(c) new_bin_net.create_dataset(f'{c}/data', data=data) new_bin_net.create_dataset(f'{c}/row', data=row) new_bin_net.create_dataset(f'{c}/col', data=col) new_bin_net.create_dataset(f'{c}/shape', data=shape) def main(regions, run_dir, chroms, interval=False, humanres='', delim='\t', columns=[0, 1, 2], comment='', logfile='', splitouterr=False): if not logfile: logs_dir = os.path.join(run_dir, 'logs') if not os.path.exists(logs_dir): os.makedirs(logs_dir) logfile = os.path.join(run_dir, 'logs', 'build_network.log') setup_loggers(logfile=logfile, global_level=logging.INFO, splitouterr=splitouterr) logger = logging.getLogger('standard') logger.info('Starting.') run_dir = run_dir.rstrip('/') temp = get_chrom_sizes(chroms) size_lookup = {k: v for k, v in temp.items() if '_' not in k} filepath = os.path.join(run_dir, 'networks.hdf5') chrom_names = list(size_lookup.keys()) chrom_sizes = [size_lookup[k] for k in chrom_names] if interval: try: regions = int(regions) except ValueError: logger.error('If --interval is passed, regions must be ' 'a valid integer! Exiting.') return regions_interval = regions regions = generate_regions(regions, chrom_names, chrom_sizes) else: logger.error('Currently does not support not using --interval. ' 'Exiting.') return # DEBUG >>> # if comment == '': # comment = None # columns = [int(c) for c in columns] # regions = pd.read_csv(regions, sep=delim, comment=comment, # header=0, usecols=columns) # regions_interval = None # if humanres != '': # logger.warning('humanres is ignored when --interval is ' # 'not passed.') # <<< DEBUG if humanres == '' and interval: humanres = str(regions_interval) logger.info(f'Writing to {filepath}') if regions_interval == 1: build_1bp_raw(run_dir, filepath, chrom_names, chrom_sizes, size_lookup) elif type(regions_interval) is int: bin_to_interval(filepath, regions, humanres, chrom_names) logger.info('Finished.') if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument('regions', help='Path to file containing the bins. The file ' 'must be tabular, with one region per line ' 'with at least 3 columns: chromosome, start, ' 'and end. The first line will be taken as a ' 'header and cannot start with the comment char. ' 'If --interval is passed, this argument should ' 'just be a single integer, representing the ' 'size of the interval in base pairs.') parser.add_argument('run_dir', help='Path to the Run folder containing the ' 'source pairs file.') parser.add_argument('chroms', help='Path to the chrom sizes file used for ' 'these data') parser.add_argument('--interval', action='store_true', help='Pass if each chromosome should be binned at a ' 'regular interval, instead of by specific ' 'regions. If passed, the "regions" argument ' 'should be a single integer, denoting the ' 'size of the interval in base pairs.') parser.add_argument('--humanres', default='', help='The human readable version of the resolution, ' 'e.g. 10Kbp for 10000. Used for file naming. ' 'Only applicable when --interval is passed.') parser.add_argument('-d', '--delim', default='\t', help='Character string separating columns in ' 'the regions file. TAB character by default. ') parser.add_argument('-c', '--columns', nargs=3, default=[0, 1, 2], help='Columns in the regions file for ' 'chromosome, start, and end, in that order. ' 'First column is 0, and so on. Default is ' '"-c 0 1 2"') parser.add_argument('--comment', default='', help='Single character that prefixes all lines ' 'that should be ignored.') parser.add_argument('-l', '--logfile', help='The file to which all log messages should be ' 'appended. If it is a directory, the log will ' 'be "build_network.log"') parser.add_argument('--splitouterr', action='store_true', help='If set, the single logfile will instead be ' 'split from LOGFILE.EXTENSION to LOGFILE.out ' 'and LOGFILE.err') # Needs open/closed interval spec args = parser.parse_args() main(args.regions, args.run_dir, args.chroms, args.interval, args.humanres, args.delim, args.columns, args.comment, args.logfile, args.splitouterr)