/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. Copyright (C) 2012-2023 Genome Research Ltd. Author: Petr Danecek Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* Notes and known issues: - SN ts/tv calculation includes all non-ref alleles listed in ALT while per-sample ts/tv takes the first non-ref allele only, something to consider with many non-ref HETs. */ #include #include #include #include #include #include #include #include #include #include #include #include "bcftools.h" #include "filter.h" #include "bin.h" #include "dist.h" // Logic of the filters: include or exclude sites which match the filters? #define FLT_INCLUDE 1 #define FLT_EXCLUDE 2 #define HWE_STATS 1 #define QUAL_STATS 1 #define IRC_STATS 1 #define IRC_RLEN 10 #define NA_STRING "0" typedef struct { char *tag; float min, max; uint64_t *vals_ts, *vals_tv; void *val; int nbins, type, m_val, idx; } user_stats_t; typedef struct { int min, max, step, m_vals; uint64_t *vals; } idist_t; // variant allele frequency (fraction of alt allele in pileup as determined from AD) collected into 0.05 bins typedef struct { int snv[21], indel[21]; } vaf_t; typedef struct { uint64_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; int *af_ts, *af_tv, *af_snps; // first bin of af_* stats are singletons #if HWE_STATS int *af_hwe; #endif #if IRC_STATS int n_repeat[IRC_RLEN][4], n_repeat_na; // number of indels which are repeat-consistent, repeat-inconsistent (dels and ins), and not applicable int *af_repeats[3]; #endif int ts_alt1, tv_alt1; #if QUAL_STATS // Values are rounded to one significant digit and 1 is added (Q*10+1); missing and negative values go in the first bin // Only SNPs that are the 1st alternate allele are counted dist_t *qual_ts, *qual_tv, *qual_indels; #endif int *insertions, *deletions, m_indel; // maximum indel length int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1; int subst[15]; int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl; int *smpl_hapRef, *smpl_hapAlt, *smpl_missing; int *smpl_ins_hets, *smpl_del_hets, *smpl_ins_homs, *smpl_del_homs; int *smpl_frm_shifts; // not-applicable, in-frame, out-frame vaf_t vaf, *smpl_vaf; // total (INFO/AD) and per-sample (FMT/VAF) VAF distributions unsigned long int *smpl_dp; idist_t dp, dp_sites; int nusr; user_stats_t *usr; double *dvaf; // distribution of the mean indel-allele frequency by length: -m_indel,-(m_indel-1),...-1,0,1,..,m_indel uint32_t *nvaf; } stats_t; typedef struct { uint64_t gt2gt[5][5]; // number of RR->RR, RR->RA, etc. matches/mismatches; see type2stats /* Pearson's R^2 is used for aggregate R^2 y, yy .. sum of dosage and squared dosage in the query VCF (second file) x, xx .. sum of squared dosage in the truth VCF (first file) n .. number of genotypes */ double y, yy, x, xx, yx, n; } gtcmp_t; typedef struct { char *seq; int pos, cnt, len; } _idc1_t; typedef struct { faidx_t *ref; _idc1_t *dat; int ndat, mdat; } indel_ctx_t; typedef struct { // stats stats_t stats[3]; int *tmp_iaf, ntmp_iaf, m_af, m_qual, naf_hwe, mtmp_frm; uint8_t *tmp_frm; int dp_min, dp_max, dp_step; gtcmp_t *smpl_gts_snps, *smpl_gts_indels; gtcmp_t *af_gts_snps, *af_gts_indels; // first bin of af_* stats are singletons bin_t *af_bins; float *farr; int32_t *iarr; int mfarr, miarr; int nref_tot, nhet_tot, nalt_tot, n_nref, i_nref; // indel context indel_ctx_t *indel_ctx; char *ref_fname; // user stats int nusr; user_stats_t *usr; // other bcf_srs_t *files; bcf_sr_regions_t *exons; char **argv, *exons_fname, *regions_list, *samples_list, *targets_list, *af_bins_list, *af_tag; int argc, verbose_sites, first_allele_only, samples_is_file; int split_by_id, nstats; filter_t *filter[2]; char *filter_str; int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE int n_threads; } args_t; static int type2dosage[6], type2ploidy[6], type2stats[7]; static void idist_init(idist_t *d, int min, int max, int step) { d->min = min; d->max = max; d->step = step; d->m_vals = 4 + (d->max - d->min)/d->step; d->vals = (uint64_t*) calloc(d->m_vals,sizeof(uint64_t)); } static void idist_destroy(idist_t *d) { if ( d->vals ) free(d->vals); } static inline uint64_t *idist(idist_t *d, int val) { if ( val < d->min ) return &d->vals[0]; if ( val > d->max ) return &d->vals[d->m_vals-1]; return &d->vals[1 + (val - d->min) / d->step]; } static inline int idist_i2bin(idist_t *d, int i) { if ( i<=0 ) return d->min; if ( i>= d->m_vals ) return d->max; return i-1+d->min; } #define IC_DBG 0 #if IC_DBG static void _indel_ctx_print1(_idc1_t *idc) { int i; fprintf(stdout, "%d\t", idc->cnt); for (i=0; ilen; i++) fputc(idc->seq[i], stdout); fputc('\n', stdout); } static void _indel_ctx_print(indel_ctx_t *ctx) { int i; for (i=0; indat; i++) _indel_ctx_print1(&ctx->dat[i]); fputc('\n',stdout); } #endif static int _indel_ctx_lookup(indel_ctx_t *ctx, char *seq, int seq_len, int *hit) { // binary search int min = 0, max = ctx->ndat - 1; while ( min<=max ) { int i = (min+max)/2; int cmp = strncmp(seq, ctx->dat[i].seq, seq_len); if ( cmp<0 ) max = i - 1; else if ( cmp>0 ) min = i + 1; else { if ( seq_len==ctx->dat[i].len ) { *hit = 1; return i; } else if ( seq_lendat[i].len ) max = i - 1; else min = i + 1; } } *hit = 0; return max; } static void _indel_ctx_insert(indel_ctx_t *ctx, char *seq, int seq_len, int pos) { int idat, hit, i; idat = _indel_ctx_lookup(ctx, seq, seq_len, &hit); if ( !hit ) { if ( pos>0 ) return; idat++; ctx->ndat++; hts_expand(_idc1_t, ctx->ndat+1, ctx->mdat, ctx->dat); if ( idatndat && ctx->ndat>1 ) memmove(&ctx->dat[idat+1], &ctx->dat[idat], (ctx->ndat - idat - 1)*sizeof(_idc1_t)); ctx->dat[idat].len = seq_len; ctx->dat[idat].cnt = 1; ctx->dat[idat].pos = pos; ctx->dat[idat].seq = (char*) malloc(sizeof(char)*(seq_len+1)); for (i=0; idat[idat].seq[i] = seq[i]; ctx->dat[idat].seq[i] = 0; return; } if ( ctx->dat[idat].pos + seq_len == pos ) { ctx->dat[idat].cnt++; ctx->dat[idat].pos = pos; } } indel_ctx_t *indel_ctx_init(char *fa_ref_fname) { indel_ctx_t *ctx = (indel_ctx_t *) calloc(1,sizeof(indel_ctx_t)); ctx->ref = fai_load(fa_ref_fname); if ( !ctx->ref ) { free(ctx); return NULL; } return ctx; } void indel_ctx_destroy(indel_ctx_t *ctx) { fai_destroy(ctx->ref); if ( ctx->mdat ) free(ctx->dat); free(ctx); } /** * indel_ctx_type() - determine indel context type * @ctx: * @chr: chromosome name * @pos: position of the first @ref base, 1-based * @ref: reference allele * @alt: alternate allele. Only first of multiple comma-separated alleles is * considered * @nrep: number of repeated elements (w) * @nlen: length of a single repeat element (w) * * Returns the INDEL length, negative for deletions, positive for insertions */ int indel_ctx_type(indel_ctx_t *ctx, char *chr, int pos, char *ref, char *alt, int *nrep, int *nlen) { const int win_size = 50; // hard-wired for now const int rep_len = IRC_RLEN; // hard-wired for now int ref_len = strlen(ref); int alt_len = 0; while ( alt[alt_len] && alt[alt_len]!=',' ) alt_len++; int i, fai_ref_len; char *fai_ref = faidx_fetch_seq(ctx->ref, chr, pos-1, pos+win_size, &fai_ref_len); for (i=0; i96 ) fai_ref[i] -= 32; // Sanity check: the reference sequence must match the REF allele for (i=0; indat = 0; for (i=0; indat; i++) { if ( max_cnt < ctx->dat[i].cnt || (max_cnt==ctx->dat[i].cnt && max_len < ctx->dat[i].len) ) { max_cnt = ctx->dat[i].cnt; max_len = ctx->dat[i].len; } free(ctx->dat[i].seq); } free(fai_ref); *nrep = max_cnt; *nlen = max_len; return alt_len - ref_len; } static void add_user_stats(args_t *args, char *str) { args->nusr++; args->usr = (user_stats_t*) realloc(args->usr,sizeof(user_stats_t)*args->nusr); user_stats_t *usr = &args->usr[args->nusr-1]; memset(usr,0,sizeof(*usr)); usr->min = 0; usr->max = 1; usr->nbins = 100; usr->idx = 0; char *tmp = str; while ( *tmp && *tmp!=':' ) tmp++; // Tag with an index or just tag? (e.g. PV4[1] vs DP) if ( tmp > str && tmp[-1]==']' ) { char *ptr = tmp; while ( ptr>str && *ptr!='[' ) ptr--; if ( *ptr=='[' ) { char *ptr2; usr->idx = strtol(ptr+1, &ptr2, 10); if ( ptr+1==ptr2 || ptr2 != tmp-1 ) error("Could not parse the index in \"%s\" (ptr=%s;ptr2=%s(%p),tmp=%s(%p),idx=%d)\n", str,ptr,ptr2,ptr2,tmp,tmp,usr->idx); if ( usr->idx<0 ) error("Error: negative index is not allowed: \"%s\"\n", str); *ptr = 0; } } usr->tag = (char*)calloc(tmp-str+2,sizeof(char)); memcpy(usr->tag,str,tmp-str); if ( *tmp ) { char *ptr = ++tmp; usr->min = strtod(tmp, &ptr); if ( tmp==ptr ) error("Could not parse %s\n", str); tmp = ptr+1; } if ( *tmp ) { char *ptr = tmp; usr->max = strtod(tmp, &ptr); if ( tmp==ptr ) error("Could not parse %s\n", str); tmp = ptr+1; } if ( *tmp ) { char *ptr = tmp; usr->nbins = strtol(tmp, &ptr, 10); if ( tmp==ptr ) error("Could not parse %s\n", str); if ( usr->nbins<=0 ) error("Number of bins does not make sense (%d): %s.\n", usr->nbins, str); } } static void init_user_stats(args_t *args, bcf_hdr_t *hdr, stats_t *stats) { stats->nusr = args->nusr; stats->usr = (user_stats_t*)malloc(sizeof(user_stats_t)*args->nusr); memcpy(stats->usr,args->usr,args->nusr*sizeof(user_stats_t)); int i; for (i=0; inusr; i++) { user_stats_t *usr = &stats->usr[i]; usr->vals_ts = (uint64_t*)calloc(usr->nbins,sizeof(uint64_t)); usr->vals_tv = (uint64_t*)calloc(usr->nbins,sizeof(uint64_t)); int id = bcf_hdr_id2int(hdr,BCF_DT_ID,usr->tag); if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) error("The INFO tag \"%s\" is not defined in the header\n", usr->tag); usr->type = bcf_hdr_id2type(hdr,BCF_HL_INFO,id); if ( usr->type!=BCF_HT_REAL && usr->type!=BCF_HT_INT ) error("The INFO tag \"%s\" is not of Float or Integer type (%d)\n", usr->tag, usr->type); } } static void init_stats(args_t *args) { int i; args->nstats = args->files->nreaders==1 ? 1 : 3; if ( args->split_by_id ) args->nstats = 2; if ( args->filter_str ) { args->filter[0] = filter_init(bcf_sr_get_header(args->files,0), args->filter_str); if ( args->files->nreaders==2 ) args->filter[1] = filter_init(bcf_sr_get_header(args->files,1), args->filter_str); args->files->max_unpack |= filter_max_unpack(args->filter[0]); } // AF corresponds to AC but is more robust to mixtures of haploid and diploid GTs if ( !args->af_bins_list ) { args->m_af = 101; for (i=0; ifiles->nreaders; i++) if ( bcf_hdr_nsamples(args->files->readers[i].header) + 1> args->m_af ) args->m_af = bcf_hdr_nsamples(args->files->readers[i].header) + 1; } else { args->af_bins = bin_init(args->af_bins_list,0,1); // m_af is used also for other af arrays, where the first bin is for // singletons. However, since the last element is unused in af_bins // (n boundaries form n-1 intervals), the m_af count is good for both. args->m_af = bin_get_size(args->af_bins); } bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); if ( args->af_tag && !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,bcf_hdr_id2int(hdr,BCF_DT_ID,args->af_tag)) ) error("No such INFO tag: %s\n", args->af_tag); int id, has_fmt_ad = ((id=bcf_hdr_id2int(hdr,BCF_DT_ID,"AD"))>=0 && bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id)) ? 1 : 0; #if QUAL_STATS args->m_qual = 999; #endif #if HWE_STATS args->naf_hwe = 100; #endif if ( args->samples_list ) { if ( !bcf_sr_set_samples(args->files,args->samples_list,args->samples_is_file) ) { if ( !bcf_hdr_nsamples(args->files->readers[0].header) ) error("No sample columns in %s\n", args->files->readers[0].fname); error("Unable to parse the samples: \"%s\"\n", args->samples_list); } args->af_gts_snps = (gtcmp_t *) calloc(args->m_af,sizeof(gtcmp_t)); args->af_gts_indels = (gtcmp_t *) calloc(args->m_af,sizeof(gtcmp_t)); args->smpl_gts_snps = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t)); args->smpl_gts_indels = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t)); } for (i=0; instats; i++) { stats_t *stats = &args->stats[i]; stats->m_indel = 60; stats->insertions = (int*) calloc(stats->m_indel,sizeof(int)); stats->deletions = (int*) calloc(stats->m_indel,sizeof(int)); stats->af_ts = (int*) calloc(args->m_af,sizeof(int)); stats->af_tv = (int*) calloc(args->m_af,sizeof(int)); stats->af_snps = (int*) calloc(args->m_af,sizeof(int)); int j; for (j=0; j<3; j++) stats->af_repeats[j] = (int*) calloc(args->m_af,sizeof(int)); #if QUAL_STATS stats->qual_ts = dist_init(5); stats->qual_tv = dist_init(5); stats->qual_indels = dist_init(5); #endif if ( args->files->n_smpl ) { stats->smpl_missing = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_homAA = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_hapRef = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_hapAlt = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_ins_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_del_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_ins_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_del_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_ts = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_tv = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_indels = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_dp = (unsigned long int *) calloc(args->files->n_smpl,sizeof(unsigned long int)); stats->smpl_ndp = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_sngl = (int *) calloc(args->files->n_smpl,sizeof(int)); if ( has_fmt_ad ) stats->smpl_vaf = (vaf_t*) calloc(args->files->n_smpl,sizeof(vaf_t)); #if HWE_STATS stats->af_hwe = (int*) calloc(args->m_af*args->naf_hwe,sizeof(int)); #endif if ( args->exons_fname ) stats->smpl_frm_shifts = (int*) calloc(args->files->n_smpl*3,sizeof(int)); stats->nvaf = (uint32_t*) calloc(stats->m_indel*2+1,sizeof(*stats->nvaf)); stats->dvaf = (double*) calloc(stats->m_indel*2+1,sizeof(*stats->dvaf)); } idist_init(&stats->dp, args->dp_min,args->dp_max,args->dp_step); idist_init(&stats->dp_sites, args->dp_min,args->dp_max,args->dp_step); init_user_stats(args, i!=1 ? args->files->readers[0].header : args->files->readers[1].header, stats); } if ( args->exons_fname ) { args->exons = bcf_sr_regions_init(args->exons_fname,1,0,1,2); if ( !args->exons ) error("Error occurred while reading, was the file compressed with bgzip: %s?\n", args->exons_fname); } #if IRC_STATS if ( args->ref_fname ) args->indel_ctx = indel_ctx_init(args->ref_fname); #endif type2dosage[GT_HOM_RR] = 0; type2dosage[GT_HET_RA] = 1; type2dosage[GT_HOM_AA] = 2; type2dosage[GT_HET_AA] = 2; type2dosage[GT_HAPL_R] = 0; type2dosage[GT_HAPL_A] = 1; type2ploidy[GT_HOM_RR] = 1; type2ploidy[GT_HET_RA] = 1; type2ploidy[GT_HOM_AA] = 1; type2ploidy[GT_HET_AA] = 1; type2ploidy[GT_HAPL_R] = -1; type2ploidy[GT_HAPL_A] = -1; type2stats[GT_HOM_RR] = 0; type2stats[GT_HET_RA] = 1; type2stats[GT_HOM_AA] = 2; type2stats[GT_HET_AA] = 3; type2stats[GT_HAPL_R] = 0; type2stats[GT_HAPL_A] = 2; type2stats[GT_UNKN] = 4; } static void destroy_stats(args_t *args) { int id, j; for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; if (stats->af_ts) free(stats->af_ts); if (stats->af_tv) free(stats->af_tv); if (stats->af_snps) free(stats->af_snps); for (j=0; j<3; j++) if (stats->af_repeats[j]) free(stats->af_repeats[j]); #if QUAL_STATS if (stats->qual_ts) dist_destroy(stats->qual_ts); if (stats->qual_tv) dist_destroy(stats->qual_tv); if (stats->qual_indels) dist_destroy(stats->qual_indels); #endif #if HWE_STATS free(stats->af_hwe); #endif free(stats->insertions); free(stats->deletions); free(stats->smpl_missing); free(stats->smpl_hets); free(stats->smpl_homAA); free(stats->smpl_homRR); free(stats->smpl_hapRef); free(stats->smpl_hapAlt); free(stats->smpl_ins_homs); free(stats->smpl_del_homs); free(stats->smpl_ins_hets); free(stats->smpl_del_hets); free(stats->smpl_ts); free(stats->smpl_tv); free(stats->smpl_indels); free(stats->smpl_dp); free(stats->smpl_ndp); free(stats->smpl_sngl); free(stats->smpl_vaf); idist_destroy(&stats->dp); idist_destroy(&stats->dp_sites); for (j=0; jnusr; j++) { free(stats->usr[j].vals_ts); free(stats->usr[j].vals_tv); free(stats->usr[j].val); } free(stats->usr); if ( args->exons ) free(stats->smpl_frm_shifts); free(stats->nvaf); free(stats->dvaf); } for (j=0; jnusr; j++) free(args->usr[j].tag); if ( args->af_bins ) bin_destroy(args->af_bins); free(args->farr); free(args->iarr); free(args->usr); free(args->tmp_frm); free(args->tmp_iaf); if (args->exons) bcf_sr_regions_destroy(args->exons); free(args->af_gts_snps); free(args->af_gts_indels); free(args->smpl_gts_snps); free(args->smpl_gts_indels); if (args->indel_ctx) indel_ctx_destroy(args->indel_ctx); if (args->filter[0]) filter_destroy(args->filter[0]); if (args->filter[1]) filter_destroy(args->filter[1]); } // The arary tmp_iaf keeps the index of AF bin for each allele, the first bin is for singletons. // The number of bins, either m_af (101) or as given by the user in --af-bins static void init_iaf(args_t *args, bcf_sr_t *reader) { bcf1_t *line = reader->buffer[0]; hts_expand(int32_t,line->n_allele,args->ntmp_iaf,args->tmp_iaf); int i, ret; if ( args->af_tag ) { ret = bcf_get_info_float(reader->header, line, args->af_tag, &args->farr, &args->mfarr); if ( ret<=0 || ret!=line->n_allele-1 ) { // the AF tag is not present or wrong number of values, put in the singletons/unknown bin for (i=0; in_allele; i++) args->tmp_iaf[i] = 0; return; } args->tmp_iaf[0] = 0; for (i=1; in_allele; i++) { float af = args->farr[i-1]; if ( af<0 ) af = 0; else if ( af>1 ) af = 1; int iaf = args->af_bins ? bin_get_idx(args->af_bins,af) : af*(args->m_af-2); args->tmp_iaf[i] = iaf + 1; // the first tmp_iaf bin is reserved for singletons } return; } // tmp_iaf is first filled with AC counts in calc_ac and then transformed to // an index to af_gts_snps ret = bcf_calc_ac(reader->header, line, args->tmp_iaf, args->samples_list ? BCF_UN_INFO|BCF_UN_FMT : BCF_UN_INFO); if ( !ret ) { for (i=0; in_allele; i++) args->tmp_iaf[i] = 0; // singletons/unknown bin return; } int an = 0; for (i=0; in_allele; i++) an += args->tmp_iaf[i]; args->tmp_iaf[0] = 0; for (i=1; in_allele; i++) { if ( args->tmp_iaf[i]==1 ) args->tmp_iaf[i] = 0; // singletons into the first bin else if ( !an ) args->tmp_iaf[i] = 1; // no genotype at all, put to the AF=0 bin else { float af = (float) args->tmp_iaf[i] / an; if ( af<0 ) af = 0; else if ( af>1 ) af = 1; int iaf = args->af_bins ? bin_get_idx(args->af_bins,af) : af*(args->m_af-2); args->tmp_iaf[i] = iaf + 1; } } } static inline void do_mnp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) { stats->n_mnps++; } static inline void do_other_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) { stats->n_others++; } static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) { stats->n_indels++; bcf1_t *line = reader->buffer[0]; #if QUAL_STATS int iqual = (isnan(line->qual) || line->qual<0) ? 0 : 1 + (int)(line->qual*10); dist_insert(stats->qual_indels, iqual); #endif // Check if the indel is near an exon for the frameshift statistics int i, exon_overlap = 0; if ( args->exons ) { if ( !bcf_sr_regions_overlap(args->exons, bcf_seqname(reader->header,line),line->pos,line->pos) ) exon_overlap = 1; hts_expand(uint8_t,line->n_allele,args->mtmp_frm,args->tmp_frm); for (i=0; in_allele; i++) args->tmp_frm[i] = 0; } for (i=1; in_allele; i++) { if ( args->first_allele_only && i>1 ) break; int is_indel = bcf_has_variant_type(line,i,VCF_INDEL); if (is_indel < 0) error("bcf_has_variant_type() failed."); if ( !is_indel ) continue; int len = bcf_variant_length(line, i); #if IRC_STATS // Indel repeat consistency if ( args->indel_ctx ) { int nrep, nlen, ndel; ndel = indel_ctx_type(args->indel_ctx, (char*)reader->header->id[BCF_DT_CTG][line->rid].key, line->pos+1, line->d.allele[0], line->d.allele[i], &nrep, &nlen); if ( nlen<=1 || nrep<=1 ) { // not a repeat or a single base repeat stats->n_repeat_na++; stats->af_repeats[2][ args->tmp_iaf[i] ]++; } else { if ( abs(ndel) % nlen ) { // the length of the inserted/deleted sequence is not consistent with the repeat element stats->n_repeat[nlen-1][ndel<0 ? 1 : 3]++; stats->af_repeats[1][ args->tmp_iaf[i] ]++; } else { // the length consistent with the repeat stats->n_repeat[nlen-1][ndel<0 ? 0 : 2]++; stats->af_repeats[0][ args->tmp_iaf[i] ]++; } } } else stats->af_repeats[2][ args->tmp_iaf[i] ]++; #endif // Check the frameshifts int tlen = 0; if ( args->exons && exon_overlap ) // there is an exon { if ( len>0 ) { // insertion if ( args->exons->start <= line->pos && args->exons->end > line->pos ) tlen = abs(len); } else if ( args->exons->start <= line->pos + abs(len) ) { // deletion tlen = abs(len); if ( line->pos < args->exons->start ) // trim the beginning tlen -= args->exons->start - line->pos + 1; if ( args->exons->end < line->pos + abs(len) ) // trim the end tlen -= line->pos + abs(len) - args->exons->end; } } if ( tlen ) // there are some deleted/inserted bases in the exon { if ( tlen%3 ) { stats->out_frame++; args->tmp_frm[i] = 2; } else { stats->in_frame++; args->tmp_frm[i] = 1; } if ( i==1 ) { if ( tlen%3 ) stats->out_frame_alt1++; else stats->in_frame_alt1++; } } else // no exon affected { if ( i==1 ) stats->na_frame_alt1++; stats->na_frame++; } // Indel length distribution int *ptr = stats->insertions; if ( len<0 ) { len *= -1; ptr = stats->deletions; } if ( --len >= stats->m_indel ) len = stats->m_indel-1; ptr[len]++; } } static void do_user_stats(stats_t *stats, bcf_sr_t *reader, int is_ts) { int i, nval; for (i=0; inusr; i++) { user_stats_t *usr = &stats->usr[i]; uint64_t *vals = is_ts ? usr->vals_ts : usr->vals_tv; float val; if ( usr->type==BCF_HT_REAL ) { if ( (nval=bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue; if ( usr->idx >= nval ) continue; val = ((float*)usr->val)[usr->idx]; } else { if ( (nval=bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue; if ( usr->idx >= nval ) continue; val = ((int32_t*)usr->val)[usr->idx]; } int idx; if ( val<=usr->min ) idx = 0; else if ( val>=usr->max ) idx = usr->nbins - 1; else idx = (val - usr->min)/(usr->max - usr->min) * (usr->nbins-1); vals[idx]++; } } static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) { stats->n_snps++; bcf1_t *line = reader->buffer[0]; int ref = bcf_acgt2int(*line->d.allele[0]); if ( ref<0 ) return; #if QUAL_STATS int iqual = (isnan(line->qual) || line->qual<0) ? 0 : 1 + (int)(line->qual*10); #endif int i; for (i=1; in_allele; i++) { if ( args->first_allele_only && i>1 ) break; if ( !(bcf_get_variant_type(line,i)&VCF_SNP) ) continue; int alt = bcf_acgt2int(*line->d.allele[i]); if ( alt<0 || ref==alt ) continue; stats->subst[ref<<2|alt]++; int iaf = args->tmp_iaf[i]; stats->af_snps[iaf]++; if ( abs(ref-alt)==2 ) { if (i==1) { stats->ts_alt1++; #if QUAL_STATS dist_insert(stats->qual_ts,iqual); #endif do_user_stats(stats, reader, 1); } stats->af_ts[iaf]++; } else { if (i==1) { stats->tv_alt1++; #if QUAL_STATS dist_insert(stats->qual_tv,iqual); #endif do_user_stats(stats, reader, 0); } stats->af_tv[iaf]++; } } } // Returns the max non-ref AD value static inline int get_ad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int *ial) { int iv, ad = 0; *ial = 0; #define BRANCH_INT(type_t,missing,vector_end) { \ type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \ for (iv=1; ivn; iv++) \ { \ if ( ptr[iv]==vector_end ) break; \ if ( ptr[iv]==missing ) continue; \ if ( ad < ptr[iv] ) { ad = ptr[iv]; *ial = iv; }\ } \ } switch (ad_fmt_ptr->type) { case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break; } #undef BRANCH_INT return ad; } static inline int get_iad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int ial) { #define BRANCH_INT(type_t,missing,vector_end) { \ type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \ if ( ptr[ial]==vector_end ) return 0; \ if ( ptr[ial]==missing ) return 0; \ return ptr[ial]; \ } switch (ad_fmt_ptr->type) { case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break; } #undef BRANCH_INT } static inline void update_dvaf(stats_t *stats, bcf1_t *line, int ial, float vaf) { int len = line->d.var[ial].n; if ( len < -stats->m_indel ) len = -stats->m_indel; else if ( len > stats->m_indel ) len = stats->m_indel; int bin = stats->m_indel + len; stats->nvaf[bin]++; stats->dvaf[bin] += vaf; } #define vaf2bin(vaf) ((int)nearbyintf((vaf)/0.05)) static inline void update_vaf(vaf_t *smpl_vaf, bcf1_t *line, int ial, float vaf) { int idx = vaf2bin(vaf); if ( bcf_get_variant_type(line,ial)==VCF_SNP ) smpl_vaf->snv[idx]++; else smpl_vaf->indel[idx]++; } static inline int calc_sample_depth(args_t *args, int ismpl, bcf_fmt_t *ad_fmt_ptr, bcf_fmt_t *dp_fmt_ptr) { if ( dp_fmt_ptr ) { #define BRANCH_INT(type_t,missing,vector_end) { \ type_t *ptr = (type_t *) (dp_fmt_ptr->p + dp_fmt_ptr->size*ismpl); \ if ( *ptr==missing || *ptr==vector_end ) return -1; \ return *ptr; \ } switch (dp_fmt_ptr->type) { case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, dp_fmt_ptr->type); exit(1); break; } #undef BRANCH_INT } if ( ad_fmt_ptr ) { int iv, dp = 0, has_value = 0; #define BRANCH_INT(type_t,missing,vector_end) { \ type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \ for (iv=0; ivn; iv++) \ { \ if ( ptr[iv]==vector_end ) break; \ if ( ptr[iv]==missing ) continue; \ has_value = 1; \ dp += ptr[iv]; \ } \ } switch (ad_fmt_ptr->type) { case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break; } #undef BRANCH_INT if ( !has_value ) return -1; return dp; } return -1; } static inline void sample_gt_stats(args_t *args, stats_t *stats, bcf1_t *line, int ismpl, int gt, int ial, int jal) { if ( gt==GT_UNKN ) { stats->smpl_missing[ismpl]++; return; } int var_type = 0; if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial); if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal); if ( gt==GT_HAPL_R || gt==GT_HAPL_A ) { if ( var_type&VCF_INDEL && stats->smpl_frm_shifts ) { assert( ialn_allele ); stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++; } if ( gt == GT_HAPL_R ) stats->smpl_hapRef[ismpl]++; if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[ismpl]++; return; } if ( gt != GT_HOM_RR ) { args->n_nref++; args->i_nref = ismpl; } #if HWE_STATS switch (gt) { case GT_HOM_RR: args->nref_tot++; break; case GT_HET_RA: args->nhet_tot++; break; case GT_HET_AA: case GT_HOM_AA: args->nalt_tot++; break; } #endif if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. as SNP { if ( gt == GT_HET_RA ) stats->smpl_hets[ismpl]++; else if ( gt == GT_HET_AA ) stats->smpl_hets[ismpl]++; else if ( gt == GT_HOM_RR ) stats->smpl_homRR[ismpl]++; else if ( gt == GT_HOM_AA ) stats->smpl_homAA[ismpl]++; if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called { int ref = bcf_acgt2int(*line->d.allele[0]); int alt = bcf_acgt2int(*line->d.allele[ial]); if ( alt<0 ) return; if ( abs(ref-alt)==2 ) stats->smpl_ts[ismpl]++; else stats->smpl_tv[ismpl]++; } if ( gt != GT_HOM_RR && line->d.var[jal].type&VCF_SNP && ial!=jal ) { int ref = bcf_acgt2int(*line->d.allele[0]); int alt = bcf_acgt2int(*line->d.allele[jal]); if ( alt<0 ) return; if ( abs(ref-alt)==2 ) stats->smpl_ts[ismpl]++; else stats->smpl_tv[ismpl]++; } } if ( var_type&VCF_INDEL ) { if ( gt != GT_HOM_RR ) { stats->smpl_indels[ismpl]++; if ( gt==GT_HET_RA || gt==GT_HET_AA ) { int is_ins = 0, is_del = 0; if ( bcf_get_variant_type(line,ial)&VCF_INDEL ) { if ( line->d.var[ial].n < 0 ) is_del = 1; else is_ins = 1; } if ( bcf_get_variant_type(line,jal)&VCF_INDEL ) { if ( line->d.var[jal].n < 0 ) is_del = 1; else is_ins = 1; } // Note that alt-het genotypes with both ins and del allele are counted twice!! if ( is_del ) stats->smpl_del_hets[ismpl]++; if ( is_ins ) stats->smpl_ins_hets[ismpl]++; } else if ( gt==GT_HOM_AA ) { if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[ismpl]++; else stats->smpl_ins_homs[ismpl]++; } } if ( stats->smpl_frm_shifts ) { assert( ialn_allele && jaln_allele ); stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++; stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[jal]]++; } } } static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched) { bcf_srs_t *files = args->files; bcf1_t *line = reader->buffer[0]; args->nref_tot = 0; args->nhet_tot = 0; args->nalt_tot = 0; args->n_nref = 0; args->i_nref = 0; bcf_fmt_t *gt_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT"); bcf_fmt_t *ad_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD"); bcf_fmt_t *dp_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP"); int is; for (is=0; isfiles->n_smpl; is++) { // Determine depth int dp = calc_sample_depth(args,is,ad_fmt_ptr,dp_fmt_ptr); if ( dp>0 ) { (*idist(&stats->dp, dp))++; stats->smpl_ndp[is]++; stats->smpl_dp[is] += dp; } // Determine genotype int ial, jal, gt=GT_UNKN; if ( gt_fmt_ptr ) { gt = bcf_gt_type(gt_fmt_ptr, reader->samples[is], &ial, &jal); sample_gt_stats(args,stats,line,is,gt,ial,jal); } // Determine variant allele frequency if ( dp>0 && ad_fmt_ptr ) { float iad = 0, jad = 0; if ( gt==GT_UNKN ) // GT not available { iad = get_ad(line,ad_fmt_ptr,is,&ial); } else if ( gt!=GT_UNKN ) { iad = ial==0 ? 0 : get_iad(line,ad_fmt_ptr,is,ial); jad = jal==0 ? 0 : get_iad(line,ad_fmt_ptr,is,jal); } if ( iad ) { update_dvaf(stats,line,ial,(float)iad/dp); update_vaf(&stats->smpl_vaf[is],line,ial,(float)iad/dp); } if ( jad && iad!=jad ) { update_dvaf(stats,line,jal,(float)jad/dp); update_vaf(&stats->smpl_vaf[is],line,jal,(float)jad/dp); } } } if ( args->n_nref==1 ) stats->smpl_sngl[args->i_nref]++; #if HWE_STATS if ( gt_fmt_ptr && line->n_allele > 1 && (args->nref_tot || args->nhet_tot || args->nalt_tot) ) { // Number of heterozygous genotypes observed for any given allele frequency. This is used // by plot-vcfstats to show the observed vs expected number of hets. There the expected number // of hets is calculated from the probability P(het) = 2*AF*(1-AF). // The array af_hwe is organized as follows // m_af .. number of allele frequency bins // naf_hwe .. the number of het genotype frequency bins // iallele_freq*naf_hwe + ihet_freq // float het_frac = (float)args->nhet_tot / (args->nref_tot + args->nhet_tot + args->nalt_tot); int ihet_freq = het_frac * (args->naf_hwe - 1); int idx = ihet_freq + args->tmp_iaf[1] * args->naf_hwe; stats->af_hwe[idx]++; } #endif if ( matched==3 ) { int is; bcf_fmt_t *fmt0, *fmt1; fmt0 = bcf_get_fmt(files->readers[0].header,files->readers[0].buffer[0],"GT"); if ( !fmt0 ) return; fmt1 = bcf_get_fmt(files->readers[1].header,files->readers[1].buffer[0],"GT"); if ( !fmt1 ) return; // only the first ALT allele is considered if (args->ntmp_iaf <= 1) return; // Do not consider invariate sites int iaf = args->tmp_iaf[1]; int line_type = bcf_get_variant_types(files->readers[0].buffer[0]); gtcmp_t *af_stats = line_type&VCF_SNP ? args->af_gts_snps : args->af_gts_indels; gtcmp_t *smpl_stats = line_type&VCF_SNP ? args->smpl_gts_snps : args->smpl_gts_indels; for (is=0; isn_smpl; is++) { // Simplified comparison: only 0/0, 0/1, 1/1 is looked at as the identity of // actual alleles can be enforced by running without the -c option. int gt0 = bcf_gt_type(fmt0, files->readers[0].samples[is], NULL, NULL); int gt1 = bcf_gt_type(fmt1, files->readers[1].samples[is], NULL, NULL); int idx0 = type2stats[gt0]; int idx1 = type2stats[gt1]; af_stats[iaf].gt2gt[idx0][idx1]++; smpl_stats[is].gt2gt[idx0][idx1]++; if ( gt0 == GT_UNKN || gt1 == GT_UNKN ) continue; if ( type2ploidy[gt0]*type2ploidy[gt1] == -1 ) continue; // cannot compare diploid and haploid genotypes float y = type2dosage[gt0]; float x = type2dosage[gt1]; smpl_stats[is].yx += y*x; smpl_stats[is].x += x; smpl_stats[is].xx += x*x; smpl_stats[is].y += y; smpl_stats[is].yy += y*y; smpl_stats[is].n += 1; af_stats[iaf].yx += y*x; af_stats[iaf].x += x; af_stats[iaf].xx += x*x; af_stats[iaf].y += y; af_stats[iaf].yy += y*y; af_stats[iaf].n += 1; } if ( args->verbose_sites ) { int nm = 0, nmm = 0, nrefm = 0; for (is=0; isn_smpl; is++) { int gt = bcf_gt_type(fmt0, files->readers[0].samples[is], NULL, NULL); if ( gt == GT_UNKN ) continue; int gt2 = bcf_gt_type(fmt1, files->readers[1].samples[is], NULL, NULL); if ( gt2 == GT_UNKN ) continue; if ( gt != gt2 ) { nmm++; bcf_sr_t *reader = &files->readers[0]; printf("DBG\t%s\t%"PRId64"\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,(int64_t) reader->buffer[0]->pos+1,files->samples[is],gt,gt2); } else { if ( gt!=GT_HOM_RR ) nrefm++; nm++; } } float nrd = nrefm+nmm ? 100.*nmm/(nrefm+nmm) : 0; printf("PSD\t%s\t%"PRId64"\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,(int64_t) reader->buffer[0]->pos+1,nm,nmm,nrd); } } } static void do_vcf_stats(args_t *args) { bcf_srs_t *files = args->files; assert( sizeof(int)>files->nreaders ); while ( bcf_sr_next_line(files) ) { bcf_sr_t *reader = NULL; bcf1_t *line = NULL; int ret = 0, i, pass = 1; for (i=0; inreaders; i++) { if ( !bcf_sr_has_line(files,i) ) continue; if ( args->filter[i] ) { int is_ok = filter_test(args->filter[i], bcf_sr_get_line(files,i), NULL); if ( args->filter_logic & FLT_EXCLUDE ) is_ok = is_ok ? 0 : 1; if ( !is_ok ) { pass = 0; break; } } ret |= 1<readers[i]; line = bcf_sr_get_line(files,i); } } if ( !pass ) continue; int line_type = bcf_get_variant_types(line); init_iaf(args, reader); stats_t *stats = &args->stats[ret-1]; if ( args->split_by_id && line->d.id[0]=='.' && !line->d.id[1] ) stats = &args->stats[1]; stats->n_records++; if ( line_type==VCF_REF ) stats->n_noalts++; if ( line_type&VCF_SNP ) do_snp_stats(args, stats, reader); if ( line_type&VCF_INDEL ) do_indel_stats(args, stats, reader); if ( line_type&VCF_MNP ) do_mnp_stats(args, stats, reader); if ( line_type&VCF_OTHER ) do_other_stats(args, stats, reader); if ( line->n_allele>2 ) { stats->n_mals++; if ( line_type == VCF_SNP ) stats->n_snp_mals++; // note: this will be fooled by C>C,T } if ( files->n_smpl ) do_sample_stats(args, stats, reader, ret); if ( bcf_get_info_int32(reader->header,line,"DP",&args->iarr,&args->miarr)==1 ) (*idist(&stats->dp_sites, args->iarr[0]))++; } } static void print_header(args_t *args) { int i; printf("# This file was produced by bcftools stats (%s+htslib-%s) and can be plotted using plot-vcfstats.\n", bcftools_version(),hts_version()); printf("# The command line was:\tbcftools %s ", args->argv[0]); for (i=1; iargc; i++) printf(" %s",args->argv[i]); printf("\n#\n"); printf("# Definition of sets:\n# ID\t[2]id\t[3]tab-separated file names\n"); if ( args->files->nreaders==1 ) { const char *fname = strcmp("-",args->files->readers[0].fname) ? args->files->readers[0].fname : ""; if ( args->split_by_id ) { printf("ID\t0\t%s:known (sites with ID different from \".\")\n", fname); printf("ID\t1\t%s:novel (sites where ID column is \".\")\n", fname); } else printf("ID\t0\t%s\n", fname); } else { const char *fname0 = strcmp("-",args->files->readers[0].fname) ? args->files->readers[0].fname : ""; const char *fname1 = strcmp("-",args->files->readers[1].fname) ? args->files->readers[1].fname : ""; printf("ID\t0\t%s\n", fname0); printf("ID\t1\t%s\n", fname1); printf("ID\t2\t%s\t%s\n", fname0,fname1); if ( args->verbose_sites ) { printf( "# Verbose per-site discordance output.\n" "# PSD\t[2]CHROM\t[3]POS\t[4]Number of matches\t[5]Number of mismatches\t[6]NRD\n"); printf( "# Verbose per-site and per-sample output. Genotype codes: %d:HomRefRef, %d:HomAltAlt, %d:HetAltRef, %d:HetAltAlt, %d:haploidRef, %d:haploidAlt\n" "# DBG\t[2]CHROM\t[3]POS\t[4]Sample\t[5]GT in %s\t[6]GT in %s\n", GT_HOM_RR, GT_HOM_AA, GT_HET_RA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A, fname0,fname1); } } } #define T2S(x) type2stats[x] static void print_stats(args_t *args) { int i, j,k, id; printf("# SN, Summary numbers:\n"); printf("# number of records .. number of data rows in the VCF\n"); printf("# number of no-ALTs .. reference-only sites, ALT is either \".\" or identical to REF\n"); printf("# number of SNPs .. number of rows with a SNP\n"); printf("# number of MNPs .. number of rows with a MNP, such as CC>TT\n"); printf("# number of indels .. number of rows with an indel\n"); printf("# number of others .. number of rows with other type, for example a symbolic allele or\n"); printf("# a complex substitution, such as ACT>TCGA\n"); printf("# number of multiallelic sites .. number of rows with multiple alternate alleles\n"); printf("# number of multiallelic SNP sites .. number of rows with multiple alternate alleles, all SNPs\n"); printf("# \n"); printf("# Note that rows containing multiple types will be counted multiple times, in each\n"); printf("# counter. For example, a row with a SNP and an indel increments both the SNP and\n"); printf("# the indel counter.\n"); printf("# \n"); printf("# SN\t[2]id\t[3]key\t[4]value\n"); for (id=0; idfiles->nreaders; id++) printf("SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header)); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; printf("SN\t%d\tnumber of records:\t%"PRIu64"\n", id, stats->n_records); printf("SN\t%d\tnumber of no-ALTs:\t%"PRIu64"\n", id, stats->n_noalts); printf("SN\t%d\tnumber of SNPs:\t%"PRIu64"\n", id, stats->n_snps); printf("SN\t%d\tnumber of MNPs:\t%"PRIu64"\n", id, stats->n_mnps); printf("SN\t%d\tnumber of indels:\t%"PRIu64"\n", id, stats->n_indels); printf("SN\t%d\tnumber of others:\t%"PRIu64"\n", id, stats->n_others); printf("SN\t%d\tnumber of multiallelic sites:\t%"PRIu64"\n", id, stats->n_mals); printf("SN\t%d\tnumber of multiallelic SNP sites:\t%"PRIu64"\n", id, stats->n_snp_mals); } printf("# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; int ts=0,tv=0; for (i=0; im_af; i++) { ts += stats->af_ts[i]; tv += stats->af_tv[i]; } printf("TSTV\t%d\t%d\t%d\t%.2f\t%d\t%d\t%.2f\n", id,ts,tv,tv?(float)ts/tv:0, stats->ts_alt1,stats->tv_alt1,stats->tv_alt1?(float)stats->ts_alt1/stats->tv_alt1:0); } if ( args->exons_fname ) { printf("# FS, Indel frameshifts:\n# FS\t[2]id\t[3]in-frame\t[4]out-frame\t[5]not applicable\t[6]out/(in+out) ratio\t[7]in-frame (1st ALT)\t[8]out-frame (1st ALT)\t[9]not applicable (1st ALT)\t[10]out/(in+out) ratio (1st ALT)\n"); for (id=0; idnstats; id++) { int in=args->stats[id].in_frame, out=args->stats[id].out_frame, na=args->stats[id].na_frame; int in1=args->stats[id].in_frame_alt1, out1=args->stats[id].out_frame_alt1, na1=args->stats[id].na_frame_alt1; printf("FS\t%d\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%.2f\n", id, in,out,na,out?(float)out/(in+out):0,in1,out1,na1,out1?(float)out1/(in1+out1):0); } } if ( args->indel_ctx ) { printf("# ICS, Indel context summary:\n# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio\n"); for (id=0; idnstats; id++) { int nc = 0, ni = 0, na = args->stats[id].n_repeat_na; for (i=0; istats[id].n_repeat[i][0] + args->stats[id].n_repeat[i][2]; ni += args->stats[id].n_repeat[i][1] + args->stats[id].n_repeat[i][3]; } printf("ICS\t%d\t%d\t%d\t%d\t%.4f\n", id, nc,ni,na,nc+ni ? (float)nc/(nc+ni) : 0.0); } printf("# ICL, Indel context by length:\n# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio\n"); for (id=0; idnstats; id++) { for (i=1; istats[id].n_repeat[i][0]+args->stats[id].n_repeat[i][2], ni = args->stats[id].n_repeat[i][1]+args->stats[id].n_repeat[i][3]; printf("ICL\t%d\t%d\t%d\t%d\t%d\t%d\t%.4f\n", id, i+1, args->stats[id].n_repeat[i][0],args->stats[id].n_repeat[i][1],args->stats[id].n_repeat[i][2],args->stats[id].n_repeat[i][3], nc+ni ? (float)nc/(nc+ni) : 0.0); } } } printf("# SiS, Singleton stats:\n# SiS\t[2]id\t[3]allele count\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; printf("SiS\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,1,stats->af_snps[0],stats->af_ts[0],stats->af_tv[0], stats->af_repeats[0][0]+stats->af_repeats[1][0]+stats->af_repeats[2][0],stats->af_repeats[0][0],stats->af_repeats[1][0],stats->af_repeats[2][0]); // put the singletons stats into the first AF bin, note that not all of the stats is transferred (i.e. nrd mismatches) stats->af_snps[1] += stats->af_snps[0]; stats->af_ts[1] += stats->af_ts[0]; stats->af_tv[1] += stats->af_tv[0]; stats->af_repeats[0][1] += stats->af_repeats[0][0]; stats->af_repeats[1][1] += stats->af_repeats[1][0]; stats->af_repeats[2][1] += stats->af_repeats[2][0]; } // move the singletons stats into the first AF bin, singleton stats was collected separately because of init_iaf if ( args->af_gts_snps ) { args->af_gts_snps[1].y += args->af_gts_snps[0].y; args->af_gts_snps[1].yy += args->af_gts_snps[0].yy; args->af_gts_snps[1].xx += args->af_gts_snps[0].xx; args->af_gts_snps[1].yx += args->af_gts_snps[0].yx; args->af_gts_snps[1].n += args->af_gts_snps[0].n; } if ( args->af_gts_indels ) { args->af_gts_indels[1].y += args->af_gts_indels[0].y; args->af_gts_indels[1].yy += args->af_gts_indels[0].yy; args->af_gts_indels[1].xx += args->af_gts_indels[0].xx; args->af_gts_indels[1].yx += args->af_gts_indels[0].yx; args->af_gts_indels[1].n += args->af_gts_indels[0].n; } printf("# AF, Stats by non-reference allele frequency:\n# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=1; im_af; i++) // note that af[1] now contains also af[0], see SiS stats output above { if ( stats->af_snps[i]+stats->af_ts[i]+stats->af_tv[i]+stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i] == 0 ) continue; double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1); printf("AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,af,stats->af_snps[i],stats->af_ts[i],stats->af_tv[i], stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i],stats->af_repeats[0][i],stats->af_repeats[1][i],stats->af_repeats[2][i]); } } #if QUAL_STATS printf("# QUAL, Stats by quality\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; int ndist_ts = dist_nbins(stats->qual_ts); int ndist_tv = dist_nbins(stats->qual_tv); int ndist_in = dist_nbins(stats->qual_indels); int ndist_max = ndist_ts; if ( ndist_max < ndist_tv ) ndist_max = ndist_tv; if ( ndist_max < ndist_in ) ndist_max = ndist_in; uint32_t beg, end; uint32_t nts, ntv, nin; for (i=0; iqual_ts, i, &beg, &end); qval = beg>0 ? 0.1*(beg - 1) : -1; } if ( i < ndist_tv ) { ntv = dist_get(stats->qual_tv, i, &beg, &end); if ( qval==-1 ) qval = beg > 0 ? 0.1*(beg - 1) : -1; } if ( i < ndist_in ) { nin = dist_get(stats->qual_indels, i, &beg, &end); if ( qval==-1 ) qval = beg > 0 ? 0.1*(beg - 1) : -1; } if ( nts+ntv+nin==0 ) continue; printf("QUAL\t%d\t",id); if ( qval==-1 ) printf("."); else printf("%.1f",qval); printf("\t%d\t%d\t%d\t%d\n",nts+ntv,nts,ntv,nin); } } #endif for (i=0; inusr; i++) { printf("# USR:%s/%d\t[2]id\t[3]%s/%d\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n", args->usr[i].tag,args->usr[i].idx,args->usr[i].tag,args->usr[i].idx); for (id=0; idnstats; id++) { user_stats_t *usr = &args->stats[id].usr[i]; int j; for (j=0; jnbins; j++) { if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue; // skip empty bins float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1); const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n" : "USR:%s/%d\t%d\t%.0f\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n"; printf(fmt,usr->tag,usr->idx,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]); } } } printf("# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]number of sites\t[5]number of genotypes\t[6]mean VAF\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=stats->m_indel-1; i>=0; i--) { if ( !stats->deletions[i] ) continue; // whops, differently organized arrow, dels are together with ins int bin = stats->m_indel - i - 1; printf("IDD\t%d\t%d\t%d\t", id,-i-1,stats->deletions[i]); if ( stats->nvaf && stats->nvaf[bin] ) printf("%u\t%.2f",stats->nvaf[bin],stats->dvaf[bin]/stats->nvaf[bin]); else printf("0\t."); printf("\n"); } for (i=0; im_indel; i++) { if ( !stats->insertions[i] ) continue; int bin = stats->m_indel + i + 1; printf("IDD\t%d\t%d\t%d\t", id,i+1,stats->insertions[i]); if ( stats->nvaf && stats->nvaf[bin] ) printf("%u\t%.2f",stats->nvaf[bin],stats->dvaf[bin]/stats->nvaf[bin]); else printf("0\t."); printf("\n"); } } printf("# ST, Substitution types:\n# ST\t[2]id\t[3]type\t[4]count\n"); for (id=0; idnstats; id++) { int t; for (t=0; t<15; t++) { if ( t>>2 == (t&3) ) continue; printf("ST\t%d\t%c>%c\t%d\n", id, bcf_int2acgt(t>>2),bcf_int2acgt(t&3),args->stats[id].subst[t]); } } if ( args->files->nreaders>1 && args->files->n_smpl ) { printf("SN\t%d\tnumber of samples:\t%d\n", 2, args->files->n_smpl); int x; for (x=0; x<2; x++) // x=0: snps, x=1: indels { gtcmp_t *stats; if ( x==0 ) { printf("# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n"); stats = args->af_gts_snps; } else { printf("# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n"); stats = args->af_gts_indels; } uint64_t nrd_m[4] = {0,0,0,0}, nrd_mm[4] = {0,0,0,0}; // across all bins for (i=0; im_af; i++) { int n = 0; uint64_t m[4] = {0,0,0,0}, mm[4] = {0,0,0,0}; // in i-th AF bin for (j=0; j<4; j++) // rr, ra, aa hom, aa het, ./. for (k=0; k<4; k++) { n += stats[i].gt2gt[j][k]; if ( j==k ) { nrd_m[j] += stats[i].gt2gt[j][k]; m[j] += stats[i].gt2gt[j][k]; } else { nrd_mm[j] += stats[i].gt2gt[j][k]; mm[j] += stats[i].gt2gt[j][k]; } } if ( !i || !n ) continue; // skip singleton stats and empty bins // Pearson's r2 double r2 = 0; if ( stats[i].n ) { r2 = (stats[i].yx - stats[i].x*stats[i].y/stats[i].n); r2 /= sqrt((stats[i].xx - stats[i].x*stats[i].x/stats[i].n) * (stats[i].yy - stats[i].y*stats[i].y/stats[i].n)); r2 *= r2; } double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1); printf("GC%cAF\t2\t%f", x==0 ? 's' : 'i', af); printf("\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", m[T2S(GT_HOM_RR)],m[T2S(GT_HET_RA)],m[T2S(GT_HOM_AA)]); printf("\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", mm[T2S(GT_HOM_RR)],mm[T2S(GT_HET_RA)],mm[T2S(GT_HOM_AA)]); if ( stats[i].n && !isnan(r2) ) printf("\t%f", r2); else printf("\t"NA_STRING); printf("\t%.0f\n", stats[i].n); } if ( x==0 ) { printf("# NRD and discordance is calculated as follows:\n"); printf("# m .. number of matches\n"); printf("# x .. number of mismatches\n"); printf("# NRD = 100 * (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n"); printf("# RR discordance = 100 * xRR / (xRR + mRR)\n"); printf("# RA discordance = 100 * xRA / (xRA + mRA)\n"); printf("# AA discordance = 100 * xAA / (xAA + mAA)\n"); printf("# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); } else printf("# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)] + nrd_m[T2S(GT_HET_AA)]; uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)] + nrd_mm[T2S(GT_HET_AA)]; printf("NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i', m+mm ? mm*100.0/(m+mm) : 0, nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)] ? nrd_mm[T2S(GT_HOM_RR)]*100.0/(nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)]) : 0, nrd_m[T2S(GT_HET_RA)]+nrd_mm[T2S(GT_HET_RA)] ? nrd_mm[T2S(GT_HET_RA)]*100.0/(nrd_m[T2S(GT_HET_RA)]+nrd_mm[T2S(GT_HET_RA)]) : 0, nrd_m[T2S(GT_HOM_AA)]+nrd_mm[T2S(GT_HOM_AA)] ? nrd_mm[T2S(GT_HOM_AA)]*100.0/(nrd_m[T2S(GT_HOM_AA)]+nrd_mm[T2S(GT_HOM_AA)]) : 0 ); } for (x=0; x<2; x++) // x=0: snps, x=1: indels { gtcmp_t *stats; if ( x==0 ) { printf("# GCsS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); stats = args->smpl_gts_snps; } else { printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); stats = args->smpl_gts_indels; } for (i=0; ifiles->n_smpl; i++) { uint64_t mm = 0, m = stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)]; for (j=0; j<3; j++) for (k=0; k<3; k++) if ( j!=k ) mm += stats[i].gt2gt[j][k]; // Pearson's r2 double r2 = 0; if ( stats[i].n ) { r2 = (stats[i].yx - stats[i].x*stats[i].y/stats[i].n); r2 /= sqrt((stats[i].xx - stats[i].x*stats[i].x/stats[i].n) * (stats[i].yy - stats[i].y*stats[i].y/stats[i].n)); r2 *= r2; } printf("GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0); printf("\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_RR)], stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)], stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)]); printf("\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_AA)], stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_AA)], stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HET_RA)]); if ( stats[i].n && !isnan(r2) ) printf("\t%f\n", r2); else printf("\t"NA_STRING"\n"); } } for (x=0; x<2; x++) // x=0: snps, x=1: indels { //printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); gtcmp_t *stats; if ( x==0 ) { printf("# GCTs, Genotype concordance table (SNPs)\n# GCTs"); stats = args->smpl_gts_snps; } else { printf("# GCTi, Genotype concordance table (indels)\n# GCTi"); stats = args->smpl_gts_indels; } i = 1; printf("\t[%d]sample", ++i); printf("\t[%d]RR Hom -> RR Hom", ++i); printf("\t[%d]RR Hom -> RA Het", ++i); printf("\t[%d]RR Hom -> AA Hom", ++i); printf("\t[%d]RR Hom -> AA Het", ++i); printf("\t[%d]RR Hom -> missing", ++i); printf("\t[%d]RA Het -> RR Hom", ++i); printf("\t[%d]RA Het -> RA Het", ++i); printf("\t[%d]RA Het -> AA Hom", ++i); printf("\t[%d]RA Het -> AA Het", ++i); printf("\t[%d]RA Het -> missing", ++i); printf("\t[%d]AA Hom -> RR Hom", ++i); printf("\t[%d]AA Hom -> RA Het", ++i); printf("\t[%d]AA Hom -> AA Hom", ++i); printf("\t[%d]AA Hom -> AA Het", ++i); printf("\t[%d]AA Hom -> missing", ++i); printf("\t[%d]AA Het -> RR Hom", ++i); printf("\t[%d]AA Het -> RA Het", ++i); printf("\t[%d]AA Het -> AA Hom", ++i); printf("\t[%d]AA Het -> AA Het", ++i); printf("\t[%d]AA Het -> missing", ++i); printf("\t[%d]missing -> RR Hom", ++i); printf("\t[%d]missing -> RA Het", ++i); printf("\t[%d]missing -> AA Hom", ++i); printf("\t[%d]missing -> AA Het", ++i); printf("\t[%d]missing -> missing\n", ++i); for (i=0; ifiles->n_smpl; i++) { printf("GCT%c\t%s", x==0 ? 's' : 'i', args->files->samples[i]); for (j=0; j<5; j++) for (k=0; k<5; k++) printf("\t%"PRIu64, stats[i].gt2gt[j][k]); printf("\n"); } } } printf("# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; long unsigned int sum = 0, sum_sites = 0; for (i=0; idp.m_vals; i++) { sum += stats->dp.vals[i]; sum_sites += stats->dp_sites.vals[i]; } for (i=0; idp.m_vals; i++) { if ( stats->dp.vals[i]==0 && stats->dp_sites.vals[i]==0 ) continue; printf("DP\t%d\t", id); if ( i==0 ) printf("<%d", stats->dp.min); else if ( i+1==stats->dp.m_vals ) printf(">%d", stats->dp.max); else printf("%d", idist_i2bin(&stats->dp,i)); printf("\t%"PRIu64"\t%f", stats->dp.vals[i], sum ? stats->dp.vals[i]*100./sum : 0); printf("\t%"PRIu64"\t%f\n", stats->dp_sites.vals[i], sum_sites ? stats->dp_sites.vals[i]*100./sum_sites : 0); } } if ( args->files->n_smpl ) { printf("# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. The rest include both SNPs and indels.\n"); printf("# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons" "\t[12]nHapRef\t[13]nHapAlt\t[14]nMissing\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=0; ifiles->n_smpl; i++) { float dp = stats->smpl_ndp[i] ? stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0; printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\t%d\n", id,args->files->samples[i], stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i], stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], stats->smpl_hapAlt[i], stats->smpl_missing[i]); } } printf("# PSI, Per-Sample Indels. Note that alt-het genotypes with both ins and del allele are counted twice, in both nInsHets and nDelHets.\n"); printf("# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nInsHets\t[9]nDelHets\t[10]nInsAltHoms\t[11]nDelAltHoms\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=0; ifiles->n_smpl; i++) { int na = 0, in = 0, out = 0; if ( args->exons ) { na = stats->smpl_frm_shifts[i*3 + 0]; in = stats->smpl_frm_shifts[i*3 + 1]; out = stats->smpl_frm_shifts[i*3 + 2]; } printf("PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0, stats->smpl_ins_hets[i],stats->smpl_del_hets[i],stats->smpl_ins_homs[i],stats->smpl_del_homs[i]); } } #ifdef HWE_STATS printf("# HWE\n# HWE\t[2]id\t[3]1st ALT allele frequency\t[4]Number of observations\t[5]25th percentile\t[6]median\t[7]75th percentile\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=0; inaf_hwe; i++) stats->af_hwe[i+args->naf_hwe] += stats->af_hwe[i]; // singletons for (i=1; im_af; i++) { unsigned int sum_tot = 0, sum_tmp = 0; int j, *ptr = &stats->af_hwe[i*args->naf_hwe]; for (j=0; jnaf_hwe; j++) sum_tot += ptr[j]; if ( !sum_tot ) continue; double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1); int nprn = 3; printf("HWE\t%d\t%f\t%d",id,af,sum_tot); for (j=0; jnaf_hwe; j++) { sum_tmp += ptr[j]; float frac = (float)sum_tmp/sum_tot; if ( frac >= 0.75 ) { while (nprn>0) { printf("\t%f", (float)j/args->naf_hwe); nprn--; } break; } if ( frac >= 0.5 ) { while (nprn>1) { printf("\t%f", (float)j/args->naf_hwe); nprn--; } continue; } if ( frac >= 0.25 ) { while (nprn>2) { printf("\t%f", (float)j/args->naf_hwe); nprn--; } } } assert(nprn==0); printf("\n"); } } #endif } if ( args->stats[0].smpl_vaf ) { printf("# VAF, Variant Allele Frequency determined as fraction of alternate reads in FORMAT/AD\n"); printf("# VAF\t[2]id\t[3]sample\t[4]SNV VAF distribution\t[5]indel VAF distribution\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=0; ifiles->n_smpl; i++) { printf("VAF\t%d\t%s\t", id,args->files->samples[i]); for (j=0; j<21; j++) printf("%s%d",j?",":"",stats->smpl_vaf[i].snv[j]); printf("\t"); for (j=0; j<21; j++) printf("%s%d",j?",":"",stats->smpl_vaf[i].indel[j]); printf("\n"); } } } } static void usage(void) { fprintf(stderr, "\n"); fprintf(stderr, "About: Parses VCF or BCF and produces stats which can be plotted using plot-vcfstats.\n"); fprintf(stderr, " When two files are given, the program generates separate stats for intersection\n"); fprintf(stderr, " and the complements. By default only sites are compared, -s/-S must given to include\n"); fprintf(stderr, " also sample columns.\n"); fprintf(stderr, "Usage: bcftools stats [options] []\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " --af-bins LIST Allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n"); fprintf(stderr, " --af-tag STRING Allele frequency tag to use, by default estimated from AN,AC or GT\n"); fprintf(stderr, " -1, --1st-allele-only Include only 1st allele at multiallelic sites\n"); fprintf(stderr, " -c, --collapse STRING Treat as identical records with , see man page for details [none]\n"); fprintf(stderr, " -d, --depth INT,INT,INT Depth distribution: min,max,bin size [0,500,1]\n"); fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); fprintf(stderr, " -E, --exons FILE.gz Tab-delimited file with exons for indel frameshifts (chr,beg,end; 1-based, inclusive, bgzip compressed)\n"); fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); fprintf(stderr, " -F, --fasta-ref FILE Faidx indexed reference sequence file to determine INDEL context\n"); fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); fprintf(stderr, " -I, --split-by-ID Collect stats for sites with ID separately (known vs novel)\n"); fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " -s, --samples LIST List of samples for sample stats, \"-\" to include all samples\n"); fprintf(stderr, " -S, --samples-file FILE File of samples to include\n"); fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " -u, --user-tstv TAG[:min:max:n] Collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); fprintf(stderr, " A subfield can be selected as e.g. 'PV4[0]', here the first value of the PV4 tag\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(stderr, " -v, --verbose Produce verbose per-site and per-sample output\n"); fprintf(stderr, "\n"); exit(1); } int main_vcfstats(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->files = bcf_sr_init(); args->argc = argc; args->argv = argv; args->dp_min = 0; args->dp_max = 500; args->dp_step = 1; int regions_is_file = 0, targets_is_file = 0; int regions_overlap = 1; int targets_overlap = 0; static struct option loptions[] = { {"af-bins",1,0,1}, {"af-tag",1,0,2}, {"1st-allele-only",0,0,'1'}, {"include",1,0,'i'}, {"exclude",1,0,'e'}, {"help",0,0,'h'}, {"collapse",1,0,'c'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"regions-overlap",required_argument,NULL,3}, {"verbose",0,0,'v'}, {"depth",1,0,'d'}, {"apply-filters",1,0,'f'}, {"exons",1,0,'E'}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, {"split-by-ID",0,0,'I'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"targets-overlap",required_argument,NULL,4}, {"fasta-ref",1,0,'F'}, {"user-tstv",1,0,'u'}, {"threads",1,0,9}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:i:t:T:F:f:1u:vIE:",loptions,NULL)) >= 0) { switch (c) { case 1 : args->af_bins_list = optarg; break; case 2 : args->af_tag = optarg; break; case 'u': add_user_stats(args,optarg); break; case '1': args->first_allele_only = 1; break; case 'F': args->ref_fname = optarg; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'c': if ( !strcmp(optarg,"snps") ) args->files->collapse |= COLLAPSE_SNPS; else if ( !strcmp(optarg,"indels") ) args->files->collapse |= COLLAPSE_INDELS; else if ( !strcmp(optarg,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS; else if ( !strcmp(optarg,"any") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"all") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"some") ) args->files->collapse |= COLLAPSE_SOME; else if ( !strcmp(optarg,"none") ) args->files->collapse = COLLAPSE_NONE; else error("The --collapse string \"%s\" not recognised.\n", optarg); break; case 'v': args->verbose_sites = 1; break; case 'd': if ( sscanf(optarg,"%d,%d,%d",&args->dp_min,&args->dp_max,&args->dp_step)!=3 ) error("Could not parse --depth %s\n", optarg); if ( args->dp_min<0 || args->dp_min >= args->dp_max || args->dp_step > args->dp_max - args->dp_min + 1 ) error("Is this a typo? --depth %s\n", optarg); break; case 'f': args->files->apply_filters = optarg; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 'E': args->exons_fname = optarg; break; case 's': args->samples_list = optarg; break; case 'S': args->samples_list = optarg; args->samples_is_file = 1; break; case 'I': args->split_by_id = 1; break; case 'e': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 3 : regions_overlap = parse_overlap_option(optarg); if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : targets_overlap = parse_overlap_option(optarg); if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); } } char *fname = NULL; if ( optind==argc ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else usage(); } else fname = argv[optind]; if ( argc-optind>2 ) usage(); if ( argc-optind>1 ) { args->files->require_index = 1; if ( args->split_by_id ) error("Only one file can be given with -i.\n"); } if ( !args->samples_list ) args->files->max_unpack = BCF_UN_INFO; else args->files->max_unpack = BCF_UN_FMT; if ( args->targets_list ) { bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } if ( args->regions_list) { bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0) error("Failed to create threads\n"); while (fname) { if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); fname = ++optind < argc ? argv[optind] : NULL; } init_stats(args); print_header(args); do_vcf_stats(args); print_stats(args); destroy_stats(args); bcf_sr_destroy(args->files); free(args); return 0; }