#!/grid/gillis/home/nfox/software/miniconda3/bin/python

"""Used to build a Hi-C contact network in a pipeline networks HDF5 file.

The pipeline governed by hicup_data_processing.py and executed
on each sample by process_srr.py ultimately stores Hi-C contacts
in chromosome x chromosome sparse contact matrices of varying
resolutions. These matrices with accompanying metadata are stored
in each sample's Run directory as an HDF5 file called "networks.hdf5".
In the step preceding HDF5 file building, the contacts are converted
from a SAM file to a "pairs" file, a file format defined by the
4DN consortium. This is a tabular sparse triplets format for storing
Hi-C contacts. It is compressed and indexed. The "source" data so
to speak.

This program parses compressed and indexed pairs files to convert
them into collections of chromosome x chromosome sparse contact
matrices. The raw data is stored in the 1bp_raw network, and then
binned and/or normalized networks can be stored arbitrarily.
"""

import h5py
import itertools
import logging
import numpy as np
import os
import pandas as pd
import scipy.sparse as ss
import subprocess as sp
from subprocess import CalledProcessError
import sys


def setup_loggers(global_level=logging.DEBUG, logfile='',
                  splitouterr=False, reset=[]):
    """Set up loggers.

    Powerful function to set up the loggers require for
    this programs. Includes filters for .out files and .err
    files. Includes a reset function to avoid any carryover
    from parent programs or external calls. Can parse a
    target log file for default names, a default name in
    a passed location, or a given filepath.

    Included Functions and Classes:
        reset_logging()


    This particular function builds 2 loggers: a generic
    logger that includes the timestamp and the message type,
    and a "clean" logger that functions the same way as
    a bare print function with no annotation.

    Args:
        global_level: int. One of [0, 10, 20, 30, 40, 50], corresponding
            the logging levels in the standard Python logging module:
            NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL. Indicates
            the lowest level to include in output. e.g. a value of INFO
            will ignore all logging messages that are DEBUG and lower.
        logfile: str. The file to which log messages will be written.
            If '', the __file__ variable will be used to generate
            a default SCRIPT_NAME.log file in the current directory.
            If `logfile` is a directory, the same default file will be
            used in that directory instead of the current directory.
            If `logfile` is a file, it will be used.
        splitouterr: bool. An option to use typical stdout and stderr
            files instead of a *.log file. e.g. your_program.out and
            your_program.err instead of your_program.log. Messages
            of INFO and below will be sent to the outfile and messages
            of WARNING and above will be sent to the errfile.
        reset: list. Loggers to be reset/cleared and rebuilt from
            scratch. Empty list indicates all active loggers. Can
            optionally pass a single logger name as a string.
            e.g. 'logger' instead of ['logger'].
    """
    # Functions and Classes
    def reset_logging(loggers=[]):
        """Reset and clear loggers.

        Loggers can sometimes produce unexpected effects when
        called in submodules, from external functions, or across
        multiple sessions. This function completely clears a logger
        from the existing logging environment to avoid any
        side effects.

        Args:
            reset: list. Loggers to be reset/cleared and rebuilt from
                scratch. Empty list indicates all active loggers. Can
                optionally pass a single logger name as a string.
                e.g. 'logger' instead of ['logger'].
        """
        manager = logging.Logger.manager
        manager.disabled = logging.NOTSET
        if type(loggers) is str:
            loggers = [loggers]
        if len(loggers) == 0:
            lkeys = list(manager.loggerDict.keys())
        else:
            lkeys = []
            for lkey in loggers:
                if lkey not in manager.loggerDict.keys():
                    print(f'{lkey} is not a valid logger. Will not be reset.',
                          file=sys.stderr)
                else:
                    lkeys.append(lkey)
        for lkey in lkeys:
            logger = manager.loggerDict[lkey]
            if isinstance(logger, logging.Logger):
                logger.setLevel(logging.NOTSET)
                logger.propagate = True
                logger.disabled = False
                logger.filters.clear()
                handlers = logger.handlers.copy()
                for handler in handlers:
                    # Copied from `logging.shutdown`.
                    try:
                        handler.acquire()
                        handler.flush()
                        handler.close()
                    except (OSError, ValueError):
                        pass
                    finally:
                        handler.release()
                    logger.removeHandler(handler)

    class Out_Filter(logging.Filter):
        """Extension of logging.Filter to get messages for stdout.

        Filters for messages that are INFO or less severe,
        which would be expected to go to an *.out file, as opposed
        to an *.err file.
        """
        def filter(self, rec):
            """Filter for messages of WARNING and more severe."""
            if rec.levelno <= logging.INFO:
                return(True)
            else:
                return(False)

    class Err_Filter(logging.Filter):
        """Extension of logging.Filter to get messages for stderr.

        Filters for messages that are WARNING or more severe,
        which would be expected to go to an *.err file, as opposed
        to an *.out file.
        """
        def filter(self, rec):
            """Filter for messages of WARNING and more severe."""
            if rec.levelno >= logging.WARNING:
                return(True)
            else:
                return(False)

    reset_logging(loggers=reset)
    # Formatters
    std_format = logging.Formatter(fmt=('%(asctime)s [%(levelname)s] '
                                        '%(message)s'),
                                   datefmt='%Y-%m-%d %H:%M:%S')
    clean_format = logging.Formatter(fmt='')

    # Parse logfile name
    if logfile == '':
        logfile = os.path.basename(__file__)
        if logfile.endswith('.py'):
            logfile = logfile[:-3]
        logfile = logfile + '.log'
    elif os.path.isdir(logfile):
        logdir = logfile
        logfile = os.path.basename(__file__)
        if logfile.endswith('.py'):
            logfile = logfile[:-3]
        logfile = logfile + '.log'
        logfile = os.path.join(logdir, logfile)
    else:
        # No effect, here for readability
        logfile = logfile

    # Build Handlers
    std_handlers = {}
    clean_handlers = {}
    if splitouterr:
        logfile_name = os.path.splitext(os.path.basename(logfile))[0]
        logout_file = os.path.join(os.path.dirname(logfile),
                                   f'{logfile_name}.out')
        logerr_file = os.path.join(os.path.dirname(logfile),
                                   f'{logfile_name}.err')

        std_handlers['logout'] = logging.FileHandler(filename=logout_file,
                                                     mode='a')
        std_handlers['logerr'] = logging.FileHandler(filename=logerr_file,
                                                     mode='a')
        std_handlers['logout'].addFilter(Out_Filter())
        std_handlers['logerr'].addFilter(Err_Filter())

        clean_handlers['logout'] = logging.FileHandler(filename=logout_file,
                                                       mode='a')
        clean_handlers['logerr'] = logging.FileHandler(filename=logerr_file,
                                                       mode='a')
        clean_handlers['logout'].addFilter(Out_Filter())
        clean_handlers['logerr'].addFilter(Err_Filter())
    else:
        std_handlers['logfile'] = logging.FileHandler(filename=logfile,
                                                      mode='a')
        clean_handlers['logfile'] = logging.FileHandler(filename=logfile,
                                                        mode='a')
    for hdler in std_handlers:
        std_handlers[hdler].setFormatter(std_format)
    for hdler in clean_handlers:
        clean_handlers[hdler].setFormatter(clean_format)

    # Building loggers
    logger = logging.getLogger('standard')
    if splitouterr:
        logger.addHandler(std_handlers['logout'])
        logger.addHandler(std_handlers['logerr'])
    else:
        logger.addHandler(std_handlers['logfile'])
    logger.setLevel(global_level)

    logger = logging.getLogger('clean')
    if splitouterr:
        logger.addHandler(clean_handlers['logout'])
        logger.addHandler(clean_handlers['logerr'])
    else:
        logger.addHandler(clean_handlers['logfile'])
    logger.setLevel(global_level)


def generate_regions(interval, chrom_names, chrom_sizes):
    """Generate a regions DataFrame from an integer interval.

    From a list of chromosomes and a bin size,
    creates a table containing the chromosome regions or bins
    in base pairs. Stored as a triplet of the chromosome,
    bin start index and bin end index.

    Essentially binning a genome.

    Args:
        interval: int. Interval between regions. The "bin size"
            for binning a chromosome length into fixed size bins.
        chrom_names: Names of the chromosomes to be binned.
            Must be matched to `chrom_sizes`.
        chrom_sizes: Sizes of the chromosomes to be binned.
            Must be matched to `chrom_names`.

    Returns:
        pandas DataFrame. 3 columns: chrom, start, end. Each row
        is a region or bin. The chromosome it belongs to, and the
        start and end position. The interval indices begin at 1
        and are closed. e.g. The first two intervals of 50 for
        chromosome chr1 will be [('chr1', 1, 50), ('chr1', 51, 100)].
    """
    logger = logging.getLogger('standard')
    assert(type(interval) == int)
    chrom_regions = []
    for c, size in zip(chrom_names, chrom_sizes):
        start = np.arange(1, size+1, interval)
        end = np.arange(interval, size+1, interval)
        if start.size == end.size:
            pass
        elif (start.size - end.size) == 1:
            end = np.append(end, size)
        else:
            logger.error(f'unexpected start and end arrays for {c}')
            continue
        chrom_regions.append((pd.DataFrame({'chrom': c,
                                            'start': start,
                                            'end': end})))
    return(pd.concat(chrom_regions, ignore_index=True))


def get_chrom_sizes(chroms):
    """Parse a standard UCSC chromosome sizes file.

    A standard UCSC chromosome sizes file includes two
    tab-separated values per line, each corresponding
    to a chromosome name and a chromosome size in base pairs.

    Args:
        chroms: str. Path to the chromosome sizes file.

    Returns:
        dict. Chromosome sizes as integers, keyed by
        chromosome name.
    """
    chrom_sizes = {}
    with open(chroms, 'r') as f:
        for line in f:
            values = line.split('\t')
            chrom_sizes[values[0]] = int(values[1])
    return(chrom_sizes)


def get_main_chroms(chroms):
    """Filter for main chromosomes.

    Most genomes have main chromosomes as well as a host
    of incomplete or partial chromosome constructs.
    Filters an iterable of chromosome names for the main
    chromosomes, under the assumption that main chromosomes
    exclusively do not contain an underscore character '_'
    in their name.

    Returns:
        list. List of main chromosome names.
    """
    main_chroms = [c for c in chroms if '_' not in c]
    return(main_chroms)


def get_chrom_combs(chroms):
    """Create 2-chromosome combinations.

    Given a list of chromosomes, create all possible
    combinations of chromosomes as a list of strings
    in the format "CHROM1_vs_CHROM2". Includes same
    chromosome pairs. e.g. chr1_vs_chr1. Filters for
    main chromosomes first.

    Args:
        chroms: iterable. Chromosome names. Will be
            filtered for main chromosomes only.

    Returns:
        list. Chromosome combinations as CHROM1_vs_CHROM2
        strings.
    """
    chrom_combs = [f'{c}_vs_{c}' for c in chroms]
    main_chroms = get_main_chroms(chroms)
    for c1, c2 in itertools.combinations(main_chroms, 2):
        if c1 > c2:
            temp = c1
            c1 = c2
            c2 = temp
        chrom_combs.append(f'{c1}_vs_{c2}')
    return(chrom_combs)


def build_filter_mat(regions, chrom=None):
    """Build a filter matrix for binning a data matrix.

    To bin a 2-dimensional matrix, the bin regions can
    be one-hot encoded. e.g. for a square matrix of size
    6, binned at intervals of 2, this binning can be
    represented as a matrix of positions x bins:
        [[1, 0, 0],
         [1, 0, 0],
         [0, 1, 0],
         [0, 1, 0],
         [0, 0, 1],
         [0, 0, 1]]

    Args:
        regions: pass
        chrom: str. Name of the chromosome for which the
            filter matrix will be built. Default is None,
            which defaults to the first chromosome found
            in the `regions`.

    Returns:
        scipy.sparse.csc_matrix. Size is CHROM_SIZE x NUM_BINS.
        Each column encodes all positions that belong to that
        bin as 1. All other values are 0.
    """
    if chrom is None:
        # Defaults to the first chromosome it can find
        chrom = list(regions.iloc[:, 0].unique())[0]
    else:
        chrom = [chrom]
    bins = regions[regions.iloc[:, 0].isin(chrom)].iloc[:, 1:3]
    chrom_size = bins.iloc[:, -1].max()
    fm = None
    fm_cols = []
    for start, end in bins.itertuples(index=False, name=None):
        start = int(start)
        end = int(end)
        # Could be changed to a COO construction, but not
        # strictly necessary
        fm_cols.append(ss.csc_matrix((np.ones(end - start + 1),
                                      np.arange(start - 1, end),
                                      (0, end - start + 1)),
                                     shape=(chrom_size, 1)))
    fm = ss.hstack(fm_cols)
    return(fm)


def build_network(chroms, sizes, run_dir):
    """Build a sparse contact matrix from a pairix-index pairs file.

    The 4DN consortium defines a file format for holding Hi-C
    contacts: the pairs file. Essentially a table holding contact ID,
    chromosome 1, position 1, chromosome 2, position 2, and optionally
    strand 1 and strand 2. There are often header lines at the top.
    4DN provides a tool, pairix, that compresses the pairs file and
    creates an index for fast access.

    Given two chromosomes, this function queries a compressed and
    indexed pairs file using pairix and creates a 2-dimensional
    sparse matrix containing the contact counts for the chromosome pair.

    Args:
        chroms: tuple of 2 str. The two chromosome names for which
            contacts will be retrieved. May be identical for cis-
            chromosomal contacts. Will be reordered if necessary
            so that chromosome 1 >= chromosome 2 alphabetically.
        sizes: tuple of 2 int. The sizes of the two chromosomes
            passed in `chroms`.
        run_dir: str. Path to the directory for the SRA Run sample.
            Should be named the SRA Run ID, e.g. SRR123456.
            Should hold a pairs file named RUN_ID.bsorted.pairs.gz
            with an accompanying RUN_ID.bsorted.pairs.gz.px2 index
            file.

    Returns:
        tuple of length 2. First item is a scipy.sparse.coo_matrix
        holding the contacts for the chromosome pair. The matrix
        is CHROM_1_LENGTH x CHROM_2_LENGTH. Upon failure, two
        different types of values will be returned:
            (None, {}): Generic failure.
            ('nopairs', {}): Pairs file not found.
    """
    logger = logging.getLogger('standard')
    c1, c2 = chroms
    s1, s2 = sizes
    if c1 > c2:
        temp = c1
        c1 = c2
        c2 = temp
    c = f'{c1}_vs_{c2}'
    process = get_contacts(c1, c2, run_dir)
    if not process:
        return None, {}
    elif str(process) == 'nopairs':
        return 'nopairs', {}
    new_triplets, contact_counts = get_triplets(process, c1, c2, s1, s2)
    cis_contacts, trans_contacts, invalid_locus_contacts = contact_counts
    if not new_triplets:
        return None, {}
    new_mat = ss.coo_matrix((new_triplets['data'],
                             (new_triplets['rows'],
                              new_triplets['cols'])),
                            shape=(s1, s2))
    if (cis_contacts == 0) and (trans_contacts == 0):
        stats = {'cis': cis_contacts,
                 'trans': trans_contacts,
                 'invalid_locus': invalid_locus_contacts}
        return new_mat, stats
    # assert that cis_contacts == 0 xor trans_contacts == 0, if either not 0
    elif not (cis_contacts == 0) ^ (trans_contacts == 0):
        logger.error(f'Chromosome pair "{c}" returned cis AND trans contacts. '
                     'Skipping.')
        return None, {}
    else:
        # Same as the first case
        stats = {'cis': cis_contacts,
                 'trans': trans_contacts,
                 'invalid_locus': invalid_locus_contacts}
        return new_mat, stats


def get_contacts(c1, c2, run_dir):
    """Retrieve contacts for a chromosome pair from a pairs file.

    Args:
        c1: str. First chromosome of the pair for which to
            retrieve contacts. Order does not matter.
        c2: str. Second chromosome of the pair for which to
            retrieve contacts. Order does not matter.
        run_dir: str. Path to the directory for the SRA Run sample.
            Should be named the SRA Run ID, e.g. SRR123456.
            Should hold a pairs file named RUN_ID.bsorted.pairs.gz
            with an accompanying RUN_ID.bsorted.pairs.gz.px2 index
            file.

    Returns:
        str. Contains the stdout and stderr of the pairix retrieval
        process combined. On failure, two different outputs may be
        returned:
            None: Generic failure
            'nopairs': Pairs file not found
    """
    PAIRIX = '/grid/gillis/home/nfox/software/pairix/bin/pairix'
    logger = logging.getLogger('standard')
    logger_c = logging.getLogger('clean')
    run = os.path.basename(run_dir)
    pairs_file = os.path.join(run_dir, f'{run}.bsorted.pairs.gz')
    if not os.path.exists(pairs_file):
        logger.error(f'No pairs file for {run}. Exiting.')
        return 'nopairs'

    command = ' '.join([PAIRIX, pairs_file, '-a', f'"{c1}|{c2}"'])
    try:
        # subprocess.check_output is used instead of the more
        # typical subprocess.run because of an undiagnosed
        # incompatibility with the pairix program. When using
        # run(), the correct output was not obtained.
        #
        # Though stderr is redirected, a non-zero exit code
        # will throw an exception. Intended to catch unusual
        # practice of providing non-error messages to stderr.
        process = sp.check_output(command, text=True,
                                  shell=True, stderr=sp.STDOUT)
        return process
    except CalledProcessError as e:
        logger.error('pairix returned a non-zero error code! Skipping.')
        logger_c.error(f'return code   : {e.returncode}')
        logger_c.error(f'error message : {e.output}')
        return


def get_triplets(process, c1, c2, s1, s2):
    """Get Hi-C contacts in sparse triplet format.

    Args:
        process: str. Combined stdout and stderr from the
            pairix retrieval process.
        c1: str. First chromosome of the pair for which to
            retrieve contacts. Order does not matter.
        c2: str. Second chromosome of the pair for which to
            retrieve contacts. Order does not matter.
        s1: int. Size of `c1` in base pairs.
        s2: int. Size of `c2` in base pairs.
    return new_triplets, (cis_contacts, trans_contacts, invalid_locus_contacts)

    Returns:
        tuple with 2 items. The first item is a dict containing the 3
        components needed to build a scipy.sparse.coo_matrix object:
        rows, cols, data. The dict holds those three lists keyed by
        name. Upon construction, the matrix will be an s1 x s2 matrix
        holding the Hi-C contact counts. The second item is a 3-member
        tuple containing integers for the number of cis contacts,
        trans contacts, and contacts with an invalid locus. Upon failure,
        (None, tuple()) is returned.
    """
    logger = logging.getLogger('standard')
    cis_contacts = 0
    trans_contacts = 0
    invalid_locus_contacts = 0
    new_triplets = {'rows': [], 'cols': [], 'data': []}
    for line in process.split('\n'):
        if line == '':
            break
        else:
            values = line.split('\t')
            # Ignoring contact ID
            # read_id = values[0]
            chrom_1, chrom_2 = (values[1], values[3])
            if chrom_1 == c1 and chrom_2 == c2:
                locus_1, locus_2 = (int(values[2]), int(values[4]))
            elif chrom_1 == c2 and chrom_2 == c1:
                locus_2, locus_1 = (int(values[2]), int(values[4]))
            else:
                logger.error('Chromosomes in pairix output don\'t '
                             'match the chromosomes in this '
                             'iteration! Skipping.')
                return None, tuple()
            # Ignoring strand information
            # strnd_1, strnd_2 = (values[5], values[6])

            # -1 is because loci are 1-indexed but the matrix is 0-indexed.
            if c1 == c2:
                # ensures entries for cis contacts are on the upper triangular
                row = min(locus_1, locus_2) - 1
                col = max(locus_1, locus_2) - 1
            else:
                row = locus_1 - 1
                col = locus_2 - 1

            if (locus_1 > s1 or locus_2 > s2):
                invalid_locus_contacts += 1
                logger.error(f'ERROR: {line.strip()} has a locus '
                             'larger than the size of the chromosome! '
                             'Skipping.')
                continue
            if c1 == c2:
                cis_contacts += 1
            else:
                trans_contacts += 1
            # Append a count of 1. Duplicate counts will be
            # summed upon conversion to CSC format.
            new_triplets['rows'].append(row)
            new_triplets['cols'].append(col)
            new_triplets['data'].append(1)
    return new_triplets, (cis_contacts, trans_contacts, invalid_locus_contacts)


def bin_network(network, filter_mats):
    """Bin a matrix using a pair of one-hot encoded filter matrices.

    A matrix can be binned, summing the counts for a range of values.
    e.g. binning a 4 x 6 matrix into a 2 x 2 binned matrix would
    look like this:

        [[1, 1, 0, 1, 0, 1],
         [0, 0, 0, 0, 1, 0],   to   [[2, 3],
         [1, 0, 1, 0, 0, 0],         [4, 0]]
         [0, 1, 1, 0, 0, 0]]

    This function bins a matrix using two one-hot-encoded bin matrices.
    e.g. for that same matrix binned as above, the two filter matrices
    would be:

                         [[1, 0],
        [[1, 0],          [1, 0],
         [1, 0],   and    [1, 0],
         [0, 1],          [0, 1],
         [0, 1]]          [0, 1],
                          [0, 1]]

    Args:
        network: scipy.sparse.csc_matrix. The matrix to be binned.
        filter_mats: tuple of 2 scipy.sparse.csc_matrix objects. Each
            is a one-hot encoded matrix representation of the bin intervals,
            returned from build_filter_mat(). The first matrix must be size
            A x B where A is the number of rows in `network` and B is
            the number of bins for those rows to be combined into. The
            second matrix must be size C x D where C is the number
            of columns in `network` and D is the number of bins for
            those columns to be combined into.

    Returns:
        A scipy.sparse.csc_matrix containing the binned version
        of the `network` matrix. It will be size B x D.
    """
    return(filter_mats[0].transpose().dot(network.dot(filter_mats[1])))


def build_1bp_raw(run_dir, filepath, chrom_names, chrom_sizes, size_lookup):
    """Build an initial 1 base pair resolution raw contact counts network.

    This pipeline ultimately stores all contact matrices as separate
    sparse matrices inside an HDF5. For a network, e.g. 1 base pair
    resolution without any normalization or transformation (1bp_raw),
    the chromosome names and sizes will be stored. All pairwise combinations,
    including identical pairs, of chromosomes will be stored as a sparse
    matrix containing the Hi-C contacts for that pair.

    Args:
        run_dir: str. The path to the directory holding the SRA Run
            sample. Should be named the SRA Run ID.
        filepath: str. The path to the HDF5 file holding networks
            for this sample. Should be named 'networks.hdf5'.
        chrom_names: list of str. The names of the chromosomes that
            will be paired to store contacts for that pair.
        chrom_sizes: list of int. The sizes of the chromosomes named
            in `chrom_names`. Must match up.
        size_lookup: dict. Chromosome sizes as integers, keyed by
            chromosome name. Must contain at least all the chromosomes
            and sizes in `chrom_names` and `chrom_sizes`.
            NOTE: This is redundant information. Should be refactored
                to construct this dictionary from `chrom_names` and
                `chrom_sizes` or vice versa.

    Returns:
        None. Writes network and metadata to the file
        indicated by `filepath`.
    """
    logger = logging.getLogger('standard')
    logger_c = logging.getLogger('clean')
    with h5py.File(filepath, 'a') as hf:
        if '/networks/1bp_raw' in hf:
            logger.info('Deleting existing 1bp_raw network')
            del hf['/networks/1bp_raw']
        new_raw_net = hf.create_group('/networks/1bp_raw')
        new_raw_net.create_dataset('chrom_names', data=chrom_names)
        new_raw_net.create_dataset('chrom_sizes', data=chrom_sizes)
        contact_counts = {'cis': 0, 'trans': 0, 'invalid_locus': 0, 'total': 0}
        logger.info('Building 1bp raw network.')
        for c in list(get_chrom_combs(get_main_chroms(chrom_names))):
            c1, c2 = c.split('_vs_')
            logger.info(f'Building {c1} x {c2}')
            network, stats = build_network((c1, c2),
                                           (size_lookup[c1], size_lookup[c2]),
                                           run_dir)
            # check for no contacts or build_network failure
            if str(network) == 'nopairs':
                # error message produced inside build_network
                return
            elif (network is None or not stats):
                # error message produced inside build_network
                continue
            stats["total"] = (stats["cis"]
                              + stats["trans"]
                              + stats["invalid_locus"])
            if stats["total"] == 0:
                logger.info('No contacts found!')
            else:
                logger.info(f'Reporting {c1} x {c2} statistics ...')
                logger_c.info(f'{"Cis":14s} : {stats["cis"]} '
                              f'({stats["cis"]/stats["total"]:.2%})')
                logger_c.info(f'{"Trans":14s} : {stats["trans"]} '
                              f'({stats["trans"]/stats["total"]:.2%})')
                logger_c.info(f'{"Invalid Locus":14s} : '
                              f'{stats["invalid_locus"]} '
                              f'({stats["invalid_locus"]/stats["total"]:.2%})')
                logger_c.info(f'{"Total Contacts":14s} : {stats["total"]}')
                for k in contact_counts.keys():
                    contact_counts[k] += stats[k]
            network = network.tocoo()
            data, row, col, shape = (network.data, network.row,
                                     network.col, network.shape)
            new_raw_net.create_group(c)
            new_raw_net.create_dataset(f'{c}/data', data=data)
            new_raw_net.create_dataset(f'{c}/row', data=row)
            new_raw_net.create_dataset(f'{c}/col', data=col)
            new_raw_net.create_dataset(f'{c}/shape', data=shape)
        if contact_counts['total'] == 0:
            logger.info('No contacts found!')
        else:
            logger.info('Reporting full statistics ...')
            logger_c.info(f'{"Cis":14s} : {contact_counts["cis"]} '
                          f"""({contact_counts["cis"] /
                                contact_counts["total"]:.2%})""")
            logger_c.info(f'{"Trans":14s} : {contact_counts["trans"]} '
                          f"""({contact_counts["trans"] /
                                contact_counts["total"]:.2%})""")
            logger_c.info(f'{"Invalid Locus":14s} : '
                          f'{contact_counts["invalid_locus"]} '
                          f"""({contact_counts["invalid_locus"] /
                                contact_counts["total"]:.2%})""")
            logger_c.info(f'{"Total Contacts":14s} : '
                          f'{contact_counts["total"]}')


def bin_to_interval(filepath, regions, humanres, chrom_names):
    """Bin a 1 base pair resolution raw contact counts network.

    This pipeline ultimately stores all contact matrices as separate
    sparse matrices inside an HDF5. For a network, e.g. 1 base pair
    resolution without any normalization or transformation (1bp_raw),
    the chromosome names and sizes will be stored. All pairwise combinations,
    including identical pairs, of chromosomes will be stored as a sparse
    matrix containing the Hi-C contacts for that pair.

    This pipeline also bins that base 1bp_raw matrix to lower resolutions,
    e.g. binning the two chromosomes to 10 kilobase-pair intervals and
    summing the contacts between any loci in a pair of intervals. See
    bin_network() docstring for details. Those binned networks are stored
    identically to the 1bp_raw network, with the addition of the
    regions DataFrame (see the generate_regions() docstring),
    stored as separate columns.

    Args:
        filepath: str. The path to the HDF5 file holding networks
            for this sample. Should be named 'networks.hdf5'.
        regions: pandas DataFrame. A valid regions dataframe returned
            by generate_regions(). Holds the indices defining the
            bins for each chromosome.
        humanres: The human readable version of the bin interval,
            e.g. an appropriate value for a bin size of 10000 would
            be '10kbp'.
        chrom_names: list of str. The names of the chromosomes that
            are paired to generate the 1bp_raw network in this file.
            Must all be present in `regions`.

    Returns:
        None. Writes binned network and metadata to the file
        indicated by `filepath`.
    """
    logger = logging.getLogger('standard')
    logger.info(f'Binning 1bp raw networks to {humanres} raw networks')
    with h5py.File(filepath, 'a') as hf:
        new_raw_net = hf['/networks/1bp_raw']
        if f'/networks/{humanres}_raw' in hf:
            logger.info(f'Deleting existing {humanres}_raw network')
            del hf[f'/networks/{humanres}_raw']
        new_bin_net = hf.create_group(f'/networks/{humanres}_raw')
        new_bin_net.create_group('binning_data')
        logger.info('Generating filter matrices')
        filter_mats = {}
        for cname in new_raw_net['chrom_names'][:]:  # FLAG
            cname = cname.decode('utf-8')
            logger.info(f'Building filter matrix for {cname}')
            filt = build_filter_mat(regions, cname)
            filt = filt.tocoo()
            filter_mats[cname] = filt
        # Assume regions has 3 columns: chrom, start, end
        # Assume the dtypes are right (string, int, int)
        new_bin_net.create_group('binning_data/regions')
        for col in ['chrom', 'start', 'end']:
            new_bin_net['binning_data/regions'].create_dataset(
                col, data=regions[col].values)
        for c in list(get_chrom_combs(get_main_chroms(chrom_names))):
            if c not in new_raw_net.keys():
                logger.error(f'Internal logic error. {c} not in the keys'
                             'for the newly created 1bp raw network. '
                             f'Skipping binning {c}.')
                continue
            c1, c2 = c.split('_vs_')
            logger.info(f'Binning {c1} x {c2}')
            orig_mat = ss.coo_matrix((new_raw_net[f'{c}/data'][:],
                                      (new_raw_net[f'{c}/row'][:],
                                       new_raw_net[f'{c}/col'][:])),
                                     new_raw_net[f'{c}/shape'][:]).tocsc()
            filter_1 = filter_mats[c1].tocsc()
            filter_2 = filter_mats[c2].tocsc()
            binned_network = bin_network(orig_mat, (filter_1, filter_2))
            binned_network = binned_network.tocoo()
            data, row, col, shape = (binned_network.data, binned_network.row,
                                     binned_network.col, binned_network.shape)
            new_bin_net.create_group(c)
            new_bin_net.create_dataset(f'{c}/data', data=data)
            new_bin_net.create_dataset(f'{c}/row', data=row)
            new_bin_net.create_dataset(f'{c}/col', data=col)
            new_bin_net.create_dataset(f'{c}/shape', data=shape)


def main(regions, run_dir, chroms, interval=False, humanres='',
         delim='\t', columns=[0, 1, 2], comment='', logfile='',
         splitouterr=False):
    if not logfile:
        logs_dir = os.path.join(run_dir, 'logs')
        if not os.path.exists(logs_dir):
            os.makedirs(logs_dir)
        logfile = os.path.join(run_dir, 'logs', 'build_network.log')
    setup_loggers(logfile=logfile, global_level=logging.INFO,
                  splitouterr=splitouterr)
    logger = logging.getLogger('standard')
    logger.info('Starting.')
    run_dir = run_dir.rstrip('/')
    temp = get_chrom_sizes(chroms)
    size_lookup = {k: v for k, v in temp.items() if '_' not in k}
    filepath = os.path.join(run_dir, 'networks.hdf5')
    chrom_names = list(size_lookup.keys())
    chrom_sizes = [size_lookup[k] for k in chrom_names]
    if interval:
        try:
            regions = int(regions)
        except ValueError:
            logger.error('If --interval is passed, regions must be '
                         'a valid integer! Exiting.')
            return
        regions_interval = regions
        regions = generate_regions(regions, chrom_names, chrom_sizes)
    else:
        logger.error('Currently does not support not using --interval. '
                     'Exiting.')
        return
        # DEBUG >>>
        # if comment == '':
        #     comment = None
        # columns = [int(c) for c in columns]
        # regions = pd.read_csv(regions, sep=delim, comment=comment,
        #                       header=0, usecols=columns)
        # regions_interval = None
        # if humanres != '':
        #     logger.warning('humanres is ignored when --interval is '
        #                    'not passed.')
        # <<< DEBUG
    if humanres == '' and interval:
        humanres = str(regions_interval)
    logger.info(f'Writing to {filepath}')
    if regions_interval == 1:
        build_1bp_raw(run_dir, filepath, chrom_names, chrom_sizes, size_lookup)
    elif type(regions_interval) is int:
        bin_to_interval(filepath, regions, humanres, chrom_names)
    logger.info('Finished.')


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('regions',
                        help='Path to file containing the bins. The file '
                             'must be tabular, with one region per line '
                             'with at least 3 columns: chromosome, start, '
                             'and end. The first line will be taken as a '
                             'header and cannot start with the comment char. '
                             'If --interval is passed, this argument should '
                             'just be a single integer, representing the '
                             'size of the interval in base pairs.')
    parser.add_argument('run_dir',
                        help='Path to the Run folder containing the '
                             'source pairs file.')
    parser.add_argument('chroms',
                        help='Path to the chrom sizes file used for '
                             'these data')
    parser.add_argument('--interval', action='store_true',
                        help='Pass if each chromosome should be binned at a '
                             'regular interval, instead of by specific '
                             'regions. If passed, the "regions" argument '
                             'should be a single integer, denoting the '
                             'size of the interval in base pairs.')
    parser.add_argument('--humanres', default='',
                        help='The human readable version of the resolution, '
                             'e.g. 10Kbp for 10000. Used for file naming. '
                             'Only applicable when --interval is passed.')
    parser.add_argument('-d', '--delim', default='\t',
                        help='Character string separating columns in '
                             'the regions file. TAB character by default. ')
    parser.add_argument('-c', '--columns', nargs=3, default=[0, 1, 2],
                        help='Columns in the regions file for '
                             'chromosome, start, and end, in that order. '
                             'First column is 0, and so on. Default is '
                             '"-c 0 1 2"')
    parser.add_argument('--comment', default='',
                        help='Single character that prefixes all lines '
                             'that should be ignored.')
    parser.add_argument('-l', '--logfile',
                        help='The file to which all log messages should be '
                             'appended. If it is a directory, the log will '
                             'be "build_network.log"')
    parser.add_argument('--splitouterr', action='store_true',
                        help='If set, the single logfile will instead be '
                             'split from LOGFILE.EXTENSION to LOGFILE.out '
                             'and LOGFILE.err')
    # Needs open/closed interval spec
    args = parser.parse_args()
    main(args.regions, args.run_dir, args.chroms, args.interval,
         args.humanres, args.delim, args.columns, args.comment,
         args.logfile, args.splitouterr)