#!/usr/bin/env python # # Copyright (c) 2020 10X Genomics, Inc. All rights reserved. # """Utility script to check the fraction of reads from a whole transcriptome experiment matching a target panel and provide sequencing depth recommendations for a corresponding targeted experiment on the same (or a similar) sample. """ from __future__ import absolute_import, division, print_function from math import sqrt import argparse import os import sys import warnings warnings.simplefilter(action="ignore", category=FutureWarning) # pylint: disable=wrong-import-position from cellranger.csv_utils import CSVParseException import cellranger.molecule_counter as cr_mc from cellranger.molecule_counter_extensions import get_indices_for_values import cellranger.targeted.simple_utils as cr_tgt_utils from cellranger.rna.library import GENE_EXPRESSION_LIBRARY_TYPE from cellranger.targeted.targeted_constants import TARGETING_METHOD_HC MIN_TARGETED_READS = 1000 # Maximum acceptable CV in the estimated panel content of the WTA sample # (i.e., fraction of reads expected to map to genes in the the target panel). # Equal to the CV in the estimated Targeted GEX sequencing depths. MAX_CV = 0.1 SUPPORTED_PRODUCTS = ["cellranger", "spaceranger"] CELL_NAME = {"cellranger": "Cell", "spaceranger": "Spot"} RPC_NAME = {"cellranger": "rpc", "spaceranger": "rps"} # None indicates full depth for the whole transcriptome sample WTA_DEPTHS = {"cellranger": [None, 20000, 50000], "spaceranger": [None, 30000]} def print_err(string): """Print to stderr""" print(string, file=sys.stderr) def parse_args(product): """Parse command line arguments""" parser = argparse.ArgumentParser( description=( "Compute the fraction of reads from a whole transcriptome analysis that were " "counted towards genes in a specified target panel. Also, provide sequencing " "depth recommendations for a hypothetical targeted experiment on a similar " "sample to achieve comparable sensitivity." ), prog="{} targeted-depth".format(product), ) required = parser.add_argument_group("required arguments") required.add_argument( "--molecule-h5", type=str, required=True, help="path to molecule_info.h5 file from a whole transcriptome analysis", ) required.add_argument( "--target-panel", type=str, required=True, help="path to target panel CSV file" ) return parser.parse_args() def check_path(path, name): """Check that the file at `path` exists and is readable, then return the path. Exit on failure. """ if not os.path.exists(path): print_err("error: The provided {} does not exist: {}".format(name, path)) sys.exit(3) if not os.access(path, os.R_OK): print_err( "error: The provided {} is not readable, please check file permissions: {}".format( name, path ) ) sys.exit(4) return path def check_molecule_counter(counter, product): """Check that the molecule counter contains exactly one Gene Expression library and is not a barnyard sample. Exit on failure, otherwise returns the index of the Gene Expression library. Warn if there is a mismatch between `product` and the data provided (spatial vs single cell). """ gex_libs = counter.get_library_indices_by_type()[GENE_EXPRESSION_LIBRARY_TYPE] if len(gex_libs) == 0: print_err("error: The molecule info h5 file does not contain a Gene Expression library.") sys.exit(5) elif len(gex_libs) > 1: # As of CR 4.0, this should not be possible since the merged molecule_info.h5 is not an # output of the aggr pipeline print_err( "error: The molecule info h5 file was aggregated from multiple samples. " "Please use a molecule info h5 file from a single sample run." ) sys.exit(5) elif len(counter.get_genomes()) > 1: print_err("error: The molecule info h5 file was run using a multi-species reference.") sys.exit(5) gex_lib_idx = gex_libs[0] # check that the sample type matches the product being used for targeted-depth is_spatial = counter.is_spatial_data() if product == "spaceranger" and not is_spatial: print( "\nWARNING: The molecule info h5 file is from a Cell Ranger analysis." "\nPlease use cellranger targeted-depth for metrics and depth" "\nrecommendations specific to Single Cell Gene Expression." ) elif product == "cellranger" and is_spatial: print( "\nWARNING: The molecule info h5 file is from a Space Ranger analysis." "\nPlease use spaceranger targeted-depth for metrics and depth" "\nrecommendations specific to Spatial Gene Expression." ) return gex_lib_idx def check_mol_info_format(mol_info_fn): """Exit if the given file is not a valid molecule info h5 file from Cellranger 3.0 or later.""" try: h5_file, molecule_h5_version = cr_mc.get_h5py_file_and_version(mol_info_fn) # pylint: disable=broad-except except Exception: print_err("error: Not a valid molecule info h5 file: {}".format(mol_info_fn)) sys.exit(5) if ( cr_mc.METRICS_JSON_DATASET_NAME not in h5_file.keys() and cr_mc.V3_METRICS_GROUP_NAME not in h5_file.keys() ): print_err("error: Not a valid molecule info h5 file: {}".format(mol_info_fn)) sys.exit(5) elif molecule_h5_version is None or molecule_h5_version < 3: print_err( "error: Molecule info h5 file is from Cell Ranger 2.1 or older. " "Please use results from Cell Ranger 3.0 or later." ) sys.exit(5) h5_file.close() def parse_target_panel(target_panel_fn): """Return the targeted gene IDs from a valid target panel csv file, or exit if the file is not valid. """ try: _, targeted_gene_ids, _ = cr_tgt_utils.parse_target_csv( target_panel_fn, expected_targeting_method=TARGETING_METHOD_HC ) except CSVParseException as err: print_err("error: {}".format(err)) sys.exit(5) except Exception: # pylint: disable=broad-except print_err("error: Not a valid target panel csv file: {}".format(target_panel_fn)) sys.exit(5) return targeted_gene_ids def binomial_cv(n, p_est): """Compute the CV of an estimated binomial proportion. Not reliable for very small n*p.""" if n == 0 or p_est == 0: return None # shouldn't happen std_error = sqrt(p_est * (1 - p_est) / n) return std_error / p_est # pylint: disable=too-many-locals def main(): product = os.environ["TENX_PRODUCT"] if product not in SUPPORTED_PRODUCTS: print("targeted-depth: Error: Environment variable TENX_PRODUCT is invalid:", product) sys.exit(1) args = parse_args(product) mol_info_fn = check_path(args.molecule_h5, "molecule info h5 file") target_panel_fn = check_path(args.target_panel, "target panel csv file") check_mol_info_format(mol_info_fn) counter = cr_mc.MoleculeCounter.open(mol_info_fn, "r") gex_library_idx = check_molecule_counter(counter, product) if cr_mc.MoleculeCounter.is_targeted_library(counter.get_library_info()[gex_library_idx]): print( "\nWARNING: The molecule info h5 file is already from a Targeted\n" "Gene Expression analysis." ) targeted_gene_ids = parse_target_panel(target_panel_fn) feature_ref = counter.feature_reference gene_id_to_index = { fdef.id: fdef.index for fdef in feature_ref.feature_defs if fdef.feature_type == GENE_EXPRESSION_LIBRARY_TYPE } targeted_gene_idx = [gene_id_to_index.get(gene_id) for gene_id in targeted_gene_ids] if any(idx is None for idx in targeted_gene_idx): missing_gene_id = targeted_gene_ids[targeted_gene_idx.index(None)] print_err( """error: The gene {} from the target panel csv is not present in the reference transcriptome used by the molecule info h5 file.""".format( missing_gene_id ) ) sys.exit(5) num_cells = counter.get_num_filtered_barcodes_for_library(gex_library_idx) num_raw_reads = counter.get_raw_read_pairs_per_library()[gex_library_idx] raw_reads_per_cell = float(num_raw_reads) / float(num_cells) gex_mol_idx = get_indices_for_values(counter, [cr_mc.LIBRARY_IDX_COL_NAME], gex_library_idx) num_gex_umis = gex_mol_idx.sum() num_gex_reads = counter.get_column_with_indices(cr_mc.COUNT_COL_NAME, gex_mol_idx).sum() dup_frac = 1.0 - float(num_gex_umis) / float(max(num_gex_reads, 1)) percent_sequencing_saturation = 100.0 * dup_frac targeted_mol_idx = get_indices_for_values( counter, [cr_mc.FEATURE_IDX_COL_NAME], targeted_gene_idx ) num_targeted_reads = counter.get_column_with_indices( cr_mc.COUNT_COL_NAME, targeted_mol_idx ).sum() targeted_reads_per_cell = float(num_targeted_reads) / float(num_cells) frac_reads_counted_on_target = float(num_targeted_reads) / float(num_raw_reads) percent_on_target = 100.0 * frac_reads_counted_on_target estimate_cv = binomial_cv(n=num_raw_reads, p_est=frac_reads_counted_on_target) if num_targeted_reads < MIN_TARGETED_READS or estimate_cv is None or estimate_cv > MAX_CV: print_err( "\nWARNING: Not enough data to accurately determine Targeted Gene Expression " "sequencing depths. This may reflect low overall read depth and/or low panel " "gene content in the whole transcriptome sample." ) cell = CELL_NAME[product] print("") print("Whole Transcriptome Analysis (WTA) Input Sample Metrics:") print("---------------------------------------------------------") if product == "cellranger": print("Estimated Number of Cells{:32,d}".format(num_cells)) elif product == "spaceranger": print("Number of Spots Under Tissue{:29,d}".format(num_cells)) else: # Shouldn't happen print("targeted-depth: Error: Environment variable TENX_PRODUCT is invalid:", product) sys.exit(1) print("Number of Reads{:42,d}".format(num_raw_reads)) print("Mean Reads per {}{:38.0f}".format(cell, raw_reads_per_cell)) print("Sequencing Saturation{:35.1f}%".format(percent_sequencing_saturation)) print("") if percent_on_target >= 1.0: print("Fraction of Reads from Targeted Genes{:19.2f}%".format(percent_on_target)) else: print("Fraction of Reads from Targeted Genes{:19.2g}%".format(percent_on_target)) print("Number of Reads from Targeted Genes{:22,d}".format(num_targeted_reads)) print("Mean Reads per {} from Targeted Genes{:18.0f}".format(cell, targeted_reads_per_cell)) rpc = RPC_NAME[product] wta_depths = WTA_DEPTHS[product] _print_recommended_depths( cell, rpc, num_raw_reads, frac_reads_counted_on_target, num_cells, wta_depths ) _print_footer(cell, rpc) counter.close() def _print_recommended_depths( cell, rpc, num_raw_reads, frac_reads_counted_on_target, num_cells, wta_depths ): wta_depth_name = {} reads_equiv = {} rpc_equiv = {} for depth in wta_depths: if depth is None: wta_depth_name[depth] = "Original" adjusted_raw_reads = num_raw_reads else: wta_depth_name[depth] = "{}k {}".format(int(depth / 1000), rpc) adjusted_raw_reads = depth * num_cells reads_equiv[depth] = int(2.0 * adjusted_raw_reads * frac_reads_counted_on_target) rpc_equiv[depth] = int(reads_equiv[depth] / num_cells) print("\n") print("Targeted GEX Recommended Sequencing Depths:") print("---------------------------------------------------------") print("WTA Depth Mean Reads per {} Total Reads".format(cell)) print("---------------------------------------------------------") for depth in wta_depths: print( "{:<14}{:<24,d}{:,d}".format( wta_depth_name[depth], rpc_equiv[depth], reads_equiv[depth] ) ) def _print_footer(cell, rpc): print("") print("The recommended Targeted Gene Expression sequencing depth") print("is calculated as 2.0 * WTA Depth * Fraction of Reads from") print("Targeted Genes. The WTA Depth is in terms of Mean Reads") print("per {} ({}). The 2.0 depth adjustment factor can help".format(cell, rpc)) print("compensate for non-uniform read coverage and reads that") print("cannot be mapped confidently to targeted genes. These are") print("approximate estimates, and final results may vary.") print("") if __name__ == "__main__": main()