#!/usr/bin/env python
#
# Copyright (c) 2020 10X Genomics, Inc. All rights reserved.
#
"""Utility script to check the fraction of reads from a whole transcriptome experiment matching
a target panel and provide sequencing depth recommendations for a corresponding targeted
experiment on the same (or a similar) sample.
"""

from __future__ import absolute_import, division, print_function
from math import sqrt
import argparse
import os
import sys
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

# pylint: disable=wrong-import-position
from cellranger.csv_utils import CSVParseException
import cellranger.molecule_counter as cr_mc
from cellranger.molecule_counter_extensions import get_indices_for_values
import cellranger.targeted.simple_utils as cr_tgt_utils
from cellranger.rna.library import GENE_EXPRESSION_LIBRARY_TYPE
from cellranger.targeted.targeted_constants import TARGETING_METHOD_HC


MIN_TARGETED_READS = 1000
# Maximum acceptable CV in the estimated panel content of the WTA sample
# (i.e., fraction of reads expected to map to genes in the the target panel).
# Equal to the CV in the estimated Targeted GEX sequencing depths.
MAX_CV = 0.1

SUPPORTED_PRODUCTS = ["cellranger", "spaceranger"]
CELL_NAME = {"cellranger": "Cell", "spaceranger": "Spot"}
RPC_NAME = {"cellranger": "rpc", "spaceranger": "rps"}
# None indicates full depth for the whole transcriptome sample
WTA_DEPTHS = {"cellranger": [None, 20000, 50000], "spaceranger": [None, 30000]}


def print_err(string):
    """Print to stderr"""
    print(string, file=sys.stderr)


def parse_args(product):
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(
        description=(
            "Compute the fraction of reads from a whole transcriptome analysis that were "
            "counted towards genes in a specified target panel. Also, provide sequencing "
            "depth recommendations for a hypothetical targeted experiment on a similar "
            "sample to achieve comparable sensitivity."
        ),
        prog="{} targeted-depth".format(product),
    )
    required = parser.add_argument_group("required arguments")
    required.add_argument(
        "--molecule-h5",
        type=str,
        required=True,
        help="path to molecule_info.h5 file from a whole transcriptome analysis",
    )
    required.add_argument(
        "--target-panel", type=str, required=True, help="path to target panel CSV file"
    )
    return parser.parse_args()


def check_path(path, name):
    """Check that the file at `path` exists and is readable, then return the path.
    Exit on failure.
    """
    if not os.path.exists(path):
        print_err("error: The provided {} does not exist: {}".format(name, path))
        sys.exit(3)
    if not os.access(path, os.R_OK):
        print_err(
            "error: The provided {} is not readable, please check file permissions: {}".format(
                name, path
            )
        )
        sys.exit(4)
    return path


def check_molecule_counter(counter, product):
    """Check that the molecule counter contains exactly one Gene Expression library
    and is not a barnyard sample. Exit on failure, otherwise returns the index of
    the Gene Expression library. Warn if there is a mismatch between `product` and
    the data provided (spatial vs single cell).
    """
    gex_libs = counter.get_library_indices_by_type()[GENE_EXPRESSION_LIBRARY_TYPE]
    if len(gex_libs) == 0:
        print_err("error: The molecule info h5 file does not contain a Gene Expression library.")
        sys.exit(5)
    elif len(gex_libs) > 1:
        # As of CR 4.0, this should not be possible since the merged molecule_info.h5 is not an
        # output of the aggr pipeline
        print_err(
            "error: The molecule info h5 file was aggregated from multiple samples. "
            "Please use a molecule info h5 file from a single sample run."
        )
        sys.exit(5)
    elif len(counter.get_genomes()) > 1:
        print_err("error: The molecule info h5 file was run using a multi-species reference.")
        sys.exit(5)
    gex_lib_idx = gex_libs[0]

    # check that the sample type matches the product being used for targeted-depth
    is_spatial = counter.is_spatial_data()
    if product == "spaceranger" and not is_spatial:
        print(
            "\nWARNING: The molecule info h5 file is from a Cell Ranger analysis."
            "\nPlease use cellranger targeted-depth for metrics and depth"
            "\nrecommendations specific to Single Cell Gene Expression."
        )
    elif product == "cellranger" and is_spatial:
        print(
            "\nWARNING: The molecule info h5 file is from a Space Ranger analysis."
            "\nPlease use spaceranger targeted-depth for metrics and depth"
            "\nrecommendations specific to Spatial Gene Expression."
        )

    return gex_lib_idx


def check_mol_info_format(mol_info_fn):
    """Exit if the given file is not a valid molecule info h5 file from Cellranger 3.0 or later."""
    try:
        h5_file, molecule_h5_version = cr_mc.get_h5py_file_and_version(mol_info_fn)
    # pylint: disable=broad-except
    except Exception:
        print_err("error: Not a valid molecule info h5 file: {}".format(mol_info_fn))
        sys.exit(5)
    if (
        cr_mc.METRICS_JSON_DATASET_NAME not in h5_file.keys()
        and cr_mc.V3_METRICS_GROUP_NAME not in h5_file.keys()
    ):
        print_err("error: Not a valid molecule info h5 file: {}".format(mol_info_fn))
        sys.exit(5)
    elif molecule_h5_version is None or molecule_h5_version < 3:
        print_err(
            "error: Molecule info h5 file is from Cell Ranger 2.1 or older. "
            "Please use results from Cell Ranger 3.0 or later."
        )
        sys.exit(5)
    h5_file.close()


def parse_target_panel(target_panel_fn):
    """Return the targeted gene IDs from a valid target panel csv file, or exit
    if the file is not valid.
    """
    try:
        _, targeted_gene_ids, _ = cr_tgt_utils.parse_target_csv(
            target_panel_fn, expected_targeting_method=TARGETING_METHOD_HC
        )
    except CSVParseException as err:
        print_err("error: {}".format(err))
        sys.exit(5)
    except Exception:  # pylint: disable=broad-except
        print_err("error: Not a valid target panel csv file: {}".format(target_panel_fn))
        sys.exit(5)
    return targeted_gene_ids


def binomial_cv(n, p_est):
    """Compute the CV of an estimated binomial proportion. Not reliable for very small n*p."""
    if n == 0 or p_est == 0:
        return None  # shouldn't happen
    std_error = sqrt(p_est * (1 - p_est) / n)
    return std_error / p_est


# pylint: disable=too-many-locals
def main():
    product = os.environ["TENX_PRODUCT"]
    if product not in SUPPORTED_PRODUCTS:
        print("targeted-depth: Error: Environment variable TENX_PRODUCT is invalid:", product)
        sys.exit(1)

    args = parse_args(product)

    mol_info_fn = check_path(args.molecule_h5, "molecule info h5 file")
    target_panel_fn = check_path(args.target_panel, "target panel csv file")
    check_mol_info_format(mol_info_fn)

    counter = cr_mc.MoleculeCounter.open(mol_info_fn, "r")

    gex_library_idx = check_molecule_counter(counter, product)
    if cr_mc.MoleculeCounter.is_targeted_library(counter.get_library_info()[gex_library_idx]):
        print(
            "\nWARNING: The molecule info h5 file is already from a Targeted\n"
            "Gene Expression analysis."
        )

    targeted_gene_ids = parse_target_panel(target_panel_fn)
    feature_ref = counter.feature_reference
    gene_id_to_index = {
        fdef.id: fdef.index
        for fdef in feature_ref.feature_defs
        if fdef.feature_type == GENE_EXPRESSION_LIBRARY_TYPE
    }
    targeted_gene_idx = [gene_id_to_index.get(gene_id) for gene_id in targeted_gene_ids]
    if any(idx is None for idx in targeted_gene_idx):
        missing_gene_id = targeted_gene_ids[targeted_gene_idx.index(None)]
        print_err(
            """error: The gene {} from the target panel csv is not present
in the reference transcriptome used by the molecule info h5 file.""".format(
                missing_gene_id
            )
        )
        sys.exit(5)

    num_cells = counter.get_num_filtered_barcodes_for_library(gex_library_idx)
    num_raw_reads = counter.get_raw_read_pairs_per_library()[gex_library_idx]
    raw_reads_per_cell = float(num_raw_reads) / float(num_cells)

    gex_mol_idx = get_indices_for_values(counter, [cr_mc.LIBRARY_IDX_COL_NAME], gex_library_idx)
    num_gex_umis = gex_mol_idx.sum()
    num_gex_reads = counter.get_column_with_indices(cr_mc.COUNT_COL_NAME, gex_mol_idx).sum()
    dup_frac = 1.0 - float(num_gex_umis) / float(max(num_gex_reads, 1))
    percent_sequencing_saturation = 100.0 * dup_frac

    targeted_mol_idx = get_indices_for_values(
        counter, [cr_mc.FEATURE_IDX_COL_NAME], targeted_gene_idx
    )
    num_targeted_reads = counter.get_column_with_indices(
        cr_mc.COUNT_COL_NAME, targeted_mol_idx
    ).sum()
    targeted_reads_per_cell = float(num_targeted_reads) / float(num_cells)
    frac_reads_counted_on_target = float(num_targeted_reads) / float(num_raw_reads)
    percent_on_target = 100.0 * frac_reads_counted_on_target

    estimate_cv = binomial_cv(n=num_raw_reads, p_est=frac_reads_counted_on_target)
    if num_targeted_reads < MIN_TARGETED_READS or estimate_cv is None or estimate_cv > MAX_CV:
        print_err(
            "\nWARNING: Not enough data to accurately determine Targeted Gene Expression "
            "sequencing depths. This may reflect low overall read depth and/or low panel "
            "gene content in the whole transcriptome sample."
        )

    cell = CELL_NAME[product]

    print("")
    print("Whole Transcriptome Analysis (WTA) Input Sample Metrics:")
    print("---------------------------------------------------------")
    if product == "cellranger":
        print("Estimated Number of Cells{:32,d}".format(num_cells))
    elif product == "spaceranger":
        print("Number of Spots Under Tissue{:29,d}".format(num_cells))
    else:
        # Shouldn't happen
        print("targeted-depth: Error: Environment variable TENX_PRODUCT is invalid:", product)
        sys.exit(1)
    print("Number of Reads{:42,d}".format(num_raw_reads))
    print("Mean Reads per {}{:38.0f}".format(cell, raw_reads_per_cell))
    print("Sequencing Saturation{:35.1f}%".format(percent_sequencing_saturation))
    print("")

    if percent_on_target >= 1.0:
        print("Fraction of Reads from Targeted Genes{:19.2f}%".format(percent_on_target))
    else:
        print("Fraction of Reads from Targeted Genes{:19.2g}%".format(percent_on_target))
    print("Number of Reads from Targeted Genes{:22,d}".format(num_targeted_reads))
    print("Mean Reads per {} from Targeted Genes{:18.0f}".format(cell, targeted_reads_per_cell))

    rpc = RPC_NAME[product]
    wta_depths = WTA_DEPTHS[product]

    _print_recommended_depths(
        cell, rpc, num_raw_reads, frac_reads_counted_on_target, num_cells, wta_depths
    )
    _print_footer(cell, rpc)
    counter.close()


def _print_recommended_depths(
    cell, rpc, num_raw_reads, frac_reads_counted_on_target, num_cells, wta_depths
):
    wta_depth_name = {}
    reads_equiv = {}
    rpc_equiv = {}
    for depth in wta_depths:
        if depth is None:
            wta_depth_name[depth] = "Original"
            adjusted_raw_reads = num_raw_reads
        else:
            wta_depth_name[depth] = "{}k {}".format(int(depth / 1000), rpc)
            adjusted_raw_reads = depth * num_cells
        reads_equiv[depth] = int(2.0 * adjusted_raw_reads * frac_reads_counted_on_target)
        rpc_equiv[depth] = int(reads_equiv[depth] / num_cells)

    print("\n")
    print("Targeted GEX Recommended Sequencing Depths:")
    print("---------------------------------------------------------")
    print("WTA Depth     Mean Reads per {}     Total Reads".format(cell))
    print("---------------------------------------------------------")
    for depth in wta_depths:
        print(
            "{:<14}{:<24,d}{:,d}".format(
                wta_depth_name[depth], rpc_equiv[depth], reads_equiv[depth]
            )
        )


def _print_footer(cell, rpc):
    print("")
    print("The recommended Targeted Gene Expression sequencing depth")
    print("is calculated as 2.0 * WTA Depth * Fraction of Reads from")
    print("Targeted Genes. The WTA Depth is in terms of Mean Reads")
    print("per {} ({}). The 2.0 depth adjustment factor can help".format(cell, rpc))
    print("compensate for non-uniform read coverage and reads that")
    print("cannot be mapped confidently to targeted genes. These are")
    print("approximate estimates, and final results may vary.")
    print("")


if __name__ == "__main__":
    main()