import sys import argparse import numpy as np import pysam import util import snptable import tables import os def write_results(out_f, chrom_name, snp_tab, ref_matches, alt_matches, oth_matches, geno_sample): haps = None has_haps = False if geno_sample: # get index for this sample in the haplotype table samp_idx_dict = dict(list(zip(snp_tab.samples, list(range(len(snp_tab.samples)))))) if geno_sample in samp_idx_dict: idx = samp_idx_dict[geno_sample] geno_hap_idx = np.array([idx*2, idx*2+1], dtype=np.int) haps = snp_tab.haplotypes[:, geno_hap_idx] has_haps = True sys.stderr.write("geno_hap_idx: %s\n" % repr(geno_hap_idx)) else: sys.stderr.write("WARNING: sample %s is not present for " "chromosome %s\n" % (geno_sample, chrom_name)) haps = None has_haps = False for i in range(snp_tab.n_snp): if has_haps: geno_str = "{}|{}".format(haps[i, 0], haps[i, 1]) else: geno_str = "NA" out_f.write("%s %d %s %s %s %d %d %d\n" % (chrom_name, snp_tab.snp_pos[i], snp_tab.snp_allele1[i], snp_tab.snp_allele2[i], geno_str, ref_matches[i], alt_matches[i], oth_matches[i])) def write_header(out_f): out_f.write("CHROM SNP.POS REF.ALLELE ALT.ALLELE GENOTYPE REF.COUNT " "ALT.COUNT OTHER.COUNT\n") def parse_samples(samples_str): """Gets list of samples from --samples argument. This may be a comma-delimited string or a path to a file. If a file is provided then the first column of the file is assumed to be the sample name""" if samples_str is None: return None # first check if this is a path to a file if os.path.exists(samples_str) and not os.path.isdir(samples_str): samples = [] if util.is_gzipped(samples_str): f = gzip.open(samples_str, "rt") else: f = open(samples_str, "rt") for line in f: # assume first token in line is sample name samples.append(line.split()[0]) sys.stderr.write("read %d sample names from file '%s'\n" % (len(samples), samples_str)) f.close() else: # otherwise assume comma-delimited string if ("," not in samples_str and len(samples_str) > 15) \ or ("/" in samples_str): sys.stderr.write("WARNING: --samples argument (%s) " "does not look like sample name " "but is not path to valid file. " "Assuming it is a sample name anyway." % samples_str) samples = samples_str.split(",") sys.stderr.write("SAMPLES: %s\n"% repr(samples)) return samples def parse_options(): parser = argparse.ArgumentParser(description="This script outputs " "allele-specific counts for SNPs, using " "reads from the provided BAM file. " "Currently indels are not output and " "chromosomes with no mapped reads " "are skipped. Output " "is written to stdout, with a single " "header row and the following " "columns: " " " " " ". Reads that overlap " "multiple SNPs will be counted multiple " "times in the output (this behavior " "differs from the " "extract_haplotype_read_counts.py " "script).") parser.add_argument("--snp_dir", action='store', help=("Directory containing SNP text files " "This directory should contain one file per " "chromosome named like chr<#>.snps.txt.gz. " "Each file should contain 3 columns: position " "RefAllele AltAllele"), default=None) parser.add_argument("--snp_tab", help="Path to HDF5 file to read SNP information " "from. Each row of SNP table contains SNP name " "(rs_id), position, allele1, allele2.", metavar="SNP_TABLE_H5_FILE", default=None) parser.add_argument("--snp_index", help="Path to HDF5 file containing SNP index. The " "SNP index is used to convert the genomic position " "of a SNP to its corresponding row in the haplotype " "and snp_tab HDF5 files.", metavar="SNP_INDEX_H5_FILE", default=None) parser.add_argument("--haplotype", help="Path to HDF5 file to read phased haplotypes " "from. When generating alternative reads " "use known haplotypes from this file rather " "than all possible allelic combinations.", metavar="HAPLOTYPE_H5_FILE", default=None) parser.add_argument("--samples", help="Use only haplotypes and SNPs that are " "polymorphic in these samples. " "SAMPLES can either be a comma-delimited string " "of sample names or a path to a file with one sample " "name per line (file is assumed to be " "whitespace-delimited and first column is assumed to " "be sample name). Sample names should match those " "present in the haplotype HDF5 file. Samples are " "ignored if no haplotype file is provided.", metavar="SAMPLES", default=None) parser.add_argument("--genotype_sample", metavar="GENO_SAMPLE", help="output genotypes for sample with name " "GENO_SAMPLE alongside allele-specific counts. " "GENO_SAMPLE must match one " "of the names present in the haplotype HDF5 file. " "If the --samples argument is provided then " "GENO_SAMPLE must also be one of the specified " "samples. If --genotype_sample is " "not provided or the GENO_SAMPLE does not match any " "of the samples in haplotype file then NA is " "output for genotype.", default=None) parser.add_argument("bam_filename", action='store', help="Coordinate-sorted input BAM file " "containing mapped reads.") options = parser.parse_args() if options.snp_dir: if(options.snp_tab or options.snp_index or options.haplotype): parser.error("expected --snp_dir OR (--snp_tab, --snp_index and " "--haplotype) arguments but not both") else: if not (options.snp_tab and options.snp_index and options.haplotype): parser.error("either --snp_dir OR (--snp_tab, " "--snp_index AND --haplotype) arguments must be " "provided") return options def main(bam_filename, snp_dir=None, snp_tab_filename=None, snp_index_filename=None, haplotype_filename=None, samples=None, geno_sample=None): out_f = sys.stdout bam = pysam.Samfile(bam_filename) cur_chrom = None cur_tid = None seen_chrom = set([]) snp_tab = snptable.SNPTable() read_pair_cache = {} # keep track of number of ref matches, non-ref matches, and other # for each SNP snp_ref_match = None snp_alt_match = None snp_other_match = None if geno_sample and not haplotype_filename: sys.stderr.write("WARNING: cannot obtain genotypes for sample " "%s without --haplotype argument\n") geno_sample = None sys.stderr.write("GENOTYPE_SAMPLE: %s\n" % geno_sample) if snp_tab_filename: if (not snp_index_filename) or (not haplotype_filename): raise ValueError("--snp_index and --haplotype must be provided " "if --snp_tab is provided") snp_tab_h5 = tables.open_file(snp_tab_filename, "r") snp_index_h5 = tables.open_file(snp_index_filename, "r") hap_h5 = tables.open_file(haplotype_filename, "r") else: snp_tab_h5 = None snp_index_h5 = None hap_h5 = None for read in bam: if (cur_tid is None) or (read.tid != cur_tid): # this is a new chromosome if cur_chrom: # write out results from last chromosome write_results(out_f, cur_chrom, snp_tab, snp_ref_match, snp_alt_match, snp_oth_match, geno_sample) cur_chrom = bam.getrname(read.tid) if cur_chrom in seen_chrom: # sanity check that input bam file is sorted raise ValueError("expected input BAM file to be sorted " "but chromosome %s is repeated\n" % cur_chrom) seen_chrom.add(cur_chrom) cur_tid = read.tid sys.stderr.write("starting chromosome %s\n" % cur_chrom) # read SNPs for next chromomsome if snp_tab_h5: # read SNPs from HDF5 files, reduce to set that are # polymorphic in specified samples snp_tab.read_h5(snp_tab_h5, snp_index_h5, hap_h5, cur_chrom, samples=samples) elif snp_dir: # read SNPs from text file snp_filename = "%s/%s.snps.txt.gz" % (snp_dir, cur_chrom) snp_tab.read_file(snp_filename) else: raise ValueError("--snp_dir OR (--snp_tab, --snp_index, " "and --hap_h5) must be defined") sys.stderr.write("read %d SNPs\n" % snp_tab.n_snp) # clear SNP table and results snp_ref_match = np.zeros(snp_tab.n_snp, dtype=np.int16) snp_alt_match = np.zeros(snp_tab.n_snp, dtype=np.int16) snp_oth_match = np.zeros(snp_tab.n_snp, dtype=np.int16) if read.is_secondary: # this is a secondary alignment (i.e. read was aligned more than # once and this has align score that <= best score) continue # loop over all SNP that overlap this read snp_idx, snp_read_pos, \ indel_idx, indel_read_pos = snp_tab.get_overlapping_snps(read) for snp_i, read_pos in zip(snp_idx, snp_read_pos): snp_pos = snp_tab.snp_pos[snp_i] ref_allele = snp_tab.snp_allele1[snp_i] alt_allele = snp_tab.snp_allele2[snp_i] if ref_allele == read.query_sequence[read_pos-1]: snp_ref_match[snp_i] += 1 elif alt_allele == read.query_sequence[read_pos-1]: snp_alt_match[snp_i] += 1 else: snp_oth_match[snp_i] += 1 if cur_chrom: # write results for final chromosome write_results(out_f, cur_chrom, snp_tab, snp_ref_match, snp_alt_match, snp_oth_match, geno_sample) if __name__ == "__main__": sys.stderr.write("command: %s\n" % " ".join(sys.argv)) options = parse_options() samples = parse_samples(options.samples) main(options.bam_filename, snp_dir=options.snp_dir, snp_tab_filename=options.snp_tab, snp_index_filename=options.snp_index, haplotype_filename=options.haplotype, samples=samples, geno_sample=options.genotype_sample)