/* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files. Copyright (C) 2013-2023 Genome Research Ltd. Author: Shane McCarthy Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bcftools.h" #include "filter.h" #include "htslib/khash_str2int.h" #define FLT_INCLUDE 1 #define FLT_EXCLUDE 2 #define ALLELE_NONREF 1 #define ALLELE_MINOR 2 #define ALLELE_ALT1 3 #define ALLELE_MAJOR 4 #define ALLELE_NONMAJOR 5 #define GT_NEED_HOM 1 #define GT_NEED_HET 2 #define GT_NO_HOM 3 #define GT_NO_HET 4 #define GT_NEED_MISSING 5 #define GT_NO_MISSING 6 typedef struct _args_t { filter_t *filter; char *filter_str; int filter_logic; // one of FLT_INCLUDE/FLT_EXCLUDE (-i or -e) bcf_srs_t *files; bcf_hdr_t *hdr, *hnull, *hsub; // original header, sites-only header, subset header char **argv, *format, *sample_names, *subset_fname, *targets_list, *regions_list; int regions_overlap, targets_overlap; int argc, clevel, n_threads, output_type, print_header, update_info, header_only, n_samples, *imap, calc_ac; int trim_alts, sites_only, known, novel, min_alleles, max_alleles, private_vars, uncalled, phased; int min_ac, min_ac_type, max_ac, max_ac_type, min_af_type, max_af_type, gt_type; int *ac, mac; float min_af, max_af; char *fn_ref, *fn_out, **samples; int sample_is_file, force_samples; char *include_types, *exclude_types; int include, exclude; int record_cmd_line; char *index_fn; int write_index; htsFile *out; } args_t; static void init_data(args_t *args) { int i; args->hdr = args->files->readers[0].header; if (args->calc_ac && args->update_info) { if (bcf_hdr_append(args->hdr,"##INFO=") < 0) error_errno("[%s] Failed to add \"AC\" INFO header", __func__); if (bcf_hdr_append(args->hdr,"##INFO=") < 0) error_errno("[%s] Failed to add \"AN\" INFO header", __func__); } if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view"); else if (bcf_hdr_sync(args->hdr) < 0) error_errno("[%s] Failed to update header", __func__); // setup sample data if (args->sample_names) { void *hdr_samples = khash_str2int_init(); for (i=0; ihdr); i++) khash_str2int_inc(hdr_samples, bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)); void *exclude = (args->sample_names[0]=='^') ? khash_str2int_init() : NULL; int nsmpl; char **smpl = NULL; args->samples = NULL; args->n_samples = 0; smpl = hts_readlist(exclude ? &args->sample_names[1] : args->sample_names, args->sample_is_file, &nsmpl); if ( !smpl ) { error("Could not read the list: \"%s\"\n", exclude ? &args->sample_names[1] : args->sample_names); } if ( exclude ) { for (i=0; iforce_samples) { fprintf(stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]); } else { error("Error: exclude called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]); } } khash_str2int_inc(exclude, smpl[i]); } for (i=0; ihdr); i++) { if ( exclude && khash_str2int_has_key(exclude,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)) ) continue; args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*)); args->samples[args->n_samples++] = strdup(bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)); } khash_str2int_destroy(exclude); } else { for (i=0; iforce_samples) { fprintf(stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]); continue; } else { error("Error: subset called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]); } } args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*)); args->samples[args->n_samples++] = strdup(smpl[i]); } } for (i=0; in_samples == 0) { fprintf(stderr, "Warn: subsetting has removed all samples\n"); args->sites_only = 1; } } if (args->n_samples) args->imap = (int*)malloc(args->n_samples * sizeof(int)); // determine variant types to include/exclude if (args->include_types || args->exclude_types) { if (args->include_types && args->exclude_types) { fprintf(stderr, "Error: only supply one of --include-types, --exclude-types options\n"); exit(1); } char **type_list = 0; int m = 0, n = 0; const char *q, *p; for (q = p = args->include_types ? args->include_types : args->exclude_types;; ++p) { if (*p == ',' || *p == 0) { if (m == n) { m = m? m<<1 : 16; type_list = (char**)realloc(type_list, m * sizeof(char*)); } type_list[n] = (char*)calloc(p - q + 1, 1); strncpy(type_list[n++], q, p - q); q = p + 1; if (*p == 0) break; } } type_list = (char**)realloc(type_list, n * sizeof(char*)); if (args->include_types) { args->include = 0; for (i = 0; i < n; ++i) { if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP<<1; else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL<<1; else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP<<1; else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER<<1; else if (strcmp(type_list[i], "ref") == 0) args->include |= VCF_OTHER<<1; else if (strcmp(type_list[i], "bnd") == 0) args->include |= VCF_BND<<1; else { fprintf(stderr, "[E::%s] unknown type\n", type_list[i]); fprintf(stderr, "Accepted types are snps, indels, mnps, other\n"); exit(1); } } } if (args->exclude_types) { args->exclude = 0; for (i = 0; i < n; ++i) { if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP<<1; else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL<<1; else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP<<1; else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER<<1; else if (strcmp(type_list[i], "ref") == 0) args->exclude |= VCF_OTHER<<1; else if (strcmp(type_list[i], "bnd") == 0) args->exclude |= VCF_BND<<1; else { fprintf(stderr, "[E::%s] unknown type\n", type_list[i]); fprintf(stderr, "Accepted types are snps, indels, mnps, other\n"); exit(1); } } } for (i = 0; i < n; ++i) free(type_list[i]); free(type_list); } char wmode[8]; set_wmode(wmode,args->output_type,args->fn_out,args->clevel); args->out = hts_open(args->fn_out ? args->fn_out : "-", wmode); if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); if ( args->n_threads > 0) hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p); // headers: hdr=full header, hsub=subset header, hnull=sites only header if (args->sites_only){ args->hnull = bcf_hdr_subset(args->hdr, 0, 0, 0); bcf_hdr_remove(args->hnull, BCF_HL_FMT, NULL); } if (args->n_samples > 0) { args->hsub = bcf_hdr_subset(args->hdr, args->n_samples, args->samples, args->imap); if ( !args->hsub ) error("Error occurred while subsetting samples\n"); if ( args->n_samples != bcf_hdr_nsamples(args->hsub) ) { int i; for (i=0; in_samples; i++) if ( args->imap[i]<0 ) error("Error: No such sample: \"%s\"\n", args->samples[i]); } } if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); } static void destroy_data(args_t *args) { int i; if ( args->imap ) { for (i = 0; i < args->n_samples; ++i) free(args->samples[i]); free(args->samples); free(args->imap); } if (args->hnull) bcf_hdr_destroy(args->hnull); if (args->hsub) bcf_hdr_destroy(args->hsub); if ( args->filter ) filter_destroy(args->filter); free(args->ac); } // true if all samples are phased. // haploid genotypes are considered phased // ./. => not phased, .|. => phased int bcf_all_phased(const bcf_hdr_t *header, bcf1_t *line) { bcf_unpack(line, BCF_UN_FMT); bcf_fmt_t *fmt_ptr = bcf_get_fmt(header, line, "GT"); int all_phased = 1; if ( fmt_ptr ) { int i, isample; for (isample=0; isamplen_sample; isample++) { int sample_phased = 0; #define BRANCH_INT(type_t,vector_end) { \ type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \ for (i=0; in; i++) \ { \ if (fmt_ptr->n == 1 || (p[i] == vector_end && i == 1)) { sample_phased = 1; break; } /* haploid phased by definition */ \ if ( p[i] == vector_end ) { break; }; /* smaller ploidy */ \ if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \ if ((p[i])&1) { \ sample_phased = 1; \ break; \ } \ } \ } switch (fmt_ptr->type) { case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break; } #undef BRANCH_INT if (!sample_phased) { all_phased = 0; break; } } } return all_phased; } int subset_vcf(args_t *args, bcf1_t *line) { if ( args->min_alleles && line->n_allele < args->min_alleles ) return 0; // min alleles if ( args->max_alleles && line->n_allele > args->max_alleles ) return 0; // max alleles if (args->novel || args->known) { if ( args->novel && (line->d.id[0]!='.' || line->d.id[1]!=0) ) return 0; // skip sites which are known, ID != '.' if ( args->known && line->d.id[0]=='.' && line->d.id[1]==0 ) return 0; // skip sites which are novel, ID == '.' } if (args->include || args->exclude) { int line_type = bcf_get_variant_types(line); if ( args->include && !((line_type<<1) & args->include) ) return 0; // include only given variant types if ( args->exclude && (line_type<<1) & args->exclude ) return 0; // exclude given variant types } if ( args->filter ) { int ret = filter_test(args->filter, line, NULL); if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return 0; } else if ( ret ) return 0; } hts_expand(int, line->n_allele, args->mac, args->ac); int i, an = 0, non_ref_ac = 0; if (args->calc_ac) { bcf_calc_ac(args->hdr, line, args->ac, BCF_UN_INFO|BCF_UN_FMT); // get original AC and AN values from INFO field if available, otherwise calculate for (i=1; in_allele; i++) non_ref_ac += args->ac[i]; for (i=0; in_allele; i++) an += args->ac[i]; } int update_ac = args->calc_ac; if (args->n_samples) { int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int)); bcf_subset(args->hdr, line, args->n_samples, args->imap); if ( args->calc_ac && !bcf_get_fmt(args->hdr,line,"GT") ) update_ac = 0; if ( update_ac ) { bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN an = 0; for (i=0; in_allele; i++) { args->ac[i] = ac_sub[i]; an += ac_sub[i]; } for (i=1; in_allele; i++) non_ref_ac_sub += ac_sub[i]; if (args->private_vars) { if (args->private_vars == FLT_INCLUDE && !(non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub)) { free(ac_sub); return 0; } // select private sites if (args->private_vars == FLT_EXCLUDE && non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub) { free(ac_sub); return 0; } // exclude private sites } non_ref_ac = non_ref_ac_sub; } free(ac_sub); } bcf_fmt_t *gt_fmt; if ( args->gt_type && (gt_fmt=bcf_get_fmt(args->hdr,line,"GT")) ) { int nhet = 0, nhom = 0, nmiss = 0; for (i=0; ihdr); i++) { int type = bcf_gt_type(gt_fmt,i,NULL,NULL); if ( type==GT_HET_RA || type==GT_HET_AA ) { if ( args->gt_type==GT_NO_HET ) return 0; nhet = 1; } else if ( type==GT_UNKN ) { if ( args->gt_type==GT_NO_MISSING ) return 0; nmiss = 1; } else { if ( args->gt_type==GT_NO_HOM ) return 0; nhom = 1; } } if ( args->gt_type==GT_NEED_HOM && !nhom ) return 0; else if ( args->gt_type==GT_NEED_HET && !nhet ) return 0; else if ( args->gt_type==GT_NEED_MISSING && !nmiss ) return 0; } int minor_ac = 0; int major_ac = 0; if ( args->calc_ac ) { minor_ac = args->ac[0]; major_ac = args->ac[0]; for (i=1; in_allele; i++){ if (args->ac[i] < minor_ac) { minor_ac = args->ac[i]; } if (args->ac[i] > major_ac) { major_ac = args->ac[i]; } } } if (args->min_ac!=-1) { if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC else if (args->min_ac_type == ALLELE_ALT1 && args->min_ac>args->ac[1]) return 0; // min 1st alternate AC else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC } if (args->max_ac!=-1) { if (args->max_ac_type == ALLELE_NONREF && args->max_acmax_ac_type == ALLELE_MINOR && args->max_acmax_ac_type == ALLELE_ALT1 && args->max_acac[1]) return 0; // max 1st alternate AC else if (args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC } if (args->min_af!=-1) { if (an == 0) return 0; // freq not defined, skip site if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF else if (args->min_af_type == ALLELE_MINOR && args->min_af>minor_ac/(double)an) return 0; // min minor AF else if (args->min_af_type == ALLELE_ALT1 && args->min_af>args->ac[1]/(double)an) return 0; // min 1st alternate AF else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF } if (args->max_af!=-1) { if (an == 0) return 0; // freq not defined, skip site if (args->max_af_type == ALLELE_NONREF && args->max_afmax_af_type == ALLELE_MINOR && args->max_afmax_af_type == ALLELE_ALT1 && args->max_afac[1]/(double)an) return 0; // max 1st alternate AF else if (args->max_af_type == ALLELE_MAJOR && args->max_af < major_ac/(double)an) return 0; // max major AF else if (args->max_af_type == ALLELE_NONMAJOR && args->max_af < (an-major_ac)/(double)an) return 0; // max non-major AF } if (args->uncalled) { if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled } if (update_ac && args->update_info) { bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1); bcf_update_info_int32(args->hdr, line, "AN", &an, 1); } if (args->trim_alts) { int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line); if ( ret<0 ) error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), (int64_t) line->pos+1); } if (args->phased) { int phased = bcf_all_phased(args->hdr, line); if (args->phased == FLT_INCLUDE && !phased) { return 0; } // skip unphased if (args->phased == FLT_EXCLUDE && phased) { return 0; } // skip phased } if (args->sites_only) bcf_subset(args->hsub ? args->hsub : args->hdr, line, 0, 0); return 1; } void set_allele_type (int *atype, char *atype_string) { *atype = ALLELE_NONREF; if (strcmp(atype_string, "minor") == 0) { *atype = ALLELE_MINOR; } else if (strcmp(atype_string, "alt1") == 0) { *atype = ALLELE_ALT1; } else if (strcmp(atype_string, "nref") == 0) { *atype = ALLELE_NONREF; } else if (strcmp(atype_string, "major") == 0) { *atype = ALLELE_MAJOR; } else if (strcmp(atype_string, "nonmajor") == 0) { *atype = ALLELE_NONMAJOR; } else { error("Error: allele type not recognised. Expected one of nref|alt1|minor|major|nonmajor, got \"%s\".\n", atype_string); } } static void usage(args_t *args) { fprintf(stderr, "\n"); fprintf(stderr, "About: VCF/BCF conversion, view, subset and filter VCF/BCF files.\n"); fprintf(stderr, "Usage: bcftools view [options] [region1 [...]]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Output options:\n"); fprintf(stderr, " -G, --drop-genotypes Drop individual genotype information (after subsetting if -s option set)\n"); fprintf(stderr, " -h, --header-only Print only the header in VCF output (equivalent to bcftools head)\n"); fprintf(stderr, " -H, --no-header Suppress the header in VCF output\n"); fprintf(stderr, " --with-header Print both header and records in VCF output [default]\n"); fprintf(stderr, " -l, --compression-level [0-9] Compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel); fprintf(stderr, " --no-version Do not append version and command line to the header\n"); fprintf(stderr, " -o, --output FILE Output file name [stdout]\n"); fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n"); fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " -t, --targets [^]REGION Similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); fprintf(stderr, " -T, --targets-file [^]FILE Similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Subset options:\n"); fprintf(stderr, " -a, --trim-alt-alleles Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n"); fprintf(stderr, " -I, --no-update Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n"); fprintf(stderr, " -s, --samples [^]LIST Comma separated list of samples to include (or exclude with \"^\" prefix). Be careful\n"); fprintf(stderr, " when combining filtering with sample subsetting as filtering comes (usually) first.\n"); fprintf(stderr, " If unsure, split sample subsetting and filtering in two commands, using -Ou when piping.\n"); fprintf(stderr, " -S, --samples-file [^]FILE File of samples to include (or exclude with \"^\" prefix)\n"); fprintf(stderr, " --force-samples Only warn about unknown subset samples\n"); fprintf(stderr, "\n"); fprintf(stderr, "Filter options:\n"); fprintf(stderr, " -c/C, --min-ac/--max-ac INT[:TYPE] Minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n"); fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); fprintf(stderr, " -g, --genotype [^]hom|het|miss Require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude such sites\n"); fprintf(stderr, " -i/e, --include/--exclude EXPR Select/exclude sites for which the expression is true (see man page for details)\n"); fprintf(stderr, " -k/n, --known/--novel Select known/novel sites only (ID is not/is '.')\n"); fprintf(stderr, " -m/M, --min-alleles/--max-alleles INT Minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n"); fprintf(stderr, " -p/P, --phased/--exclude-phased Select/exclude sites where all samples are phased\n"); fprintf(stderr, " -q/Q, --min-af/--max-af FLOAT[:TYPE] Minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n"); fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); fprintf(stderr, " -u/U, --uncalled/--exclude-uncalled Select/exclude sites without a called genotype\n"); fprintf(stderr, " -v/V, --types/--exclude-types LIST Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n"); fprintf(stderr, " -x/X, --private/--exclude-private Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } int main_vcfview(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; args->files = bcf_sr_init(); args->clevel = -1; args->print_header = 1; args->update_info = 1; args->output_type = FT_VCF; args->n_threads = 0; args->record_cmd_line = 1; args->write_index = 0; args->min_ac = args->max_ac = args->min_af = args->max_af = -1; args->regions_overlap = 1; args->targets_overlap = 0; int targets_is_file = 0, regions_is_file = 0; static struct option loptions[] = { {"genotype",required_argument,NULL,'g'}, {"compression-level",required_argument,NULL,'l'}, {"threads",required_argument,NULL,9}, {"header-only",no_argument,NULL,'h'}, {"no-header",no_argument,NULL,'H'}, {"with-header",no_argument,NULL,4}, {"exclude",required_argument,NULL,'e'}, {"include",required_argument,NULL,'i'}, {"trim-alt-alleles",no_argument,NULL,'a'}, {"no-update",no_argument,NULL,'I'}, {"drop-genotypes",no_argument,NULL,'G'}, {"private",no_argument,NULL,'x'}, {"exclude-private",no_argument,NULL,'X'}, {"uncalled",no_argument,NULL,'u'}, {"exclude-uncalled",no_argument,NULL,'U'}, {"apply-filters",required_argument,NULL,'f'}, {"known",no_argument,NULL,'k'}, {"novel",no_argument,NULL,'n'}, {"min-alleles",required_argument,NULL,'m'}, {"max-alleles",required_argument,NULL,'M'}, {"samples",required_argument,NULL,'s'}, {"samples-file",required_argument,NULL,'S'}, {"force-samples",no_argument,NULL,1}, {"output-type",required_argument,NULL,'O'}, {"output-file",required_argument,NULL,'o'}, {"output",required_argument,NULL,'o'}, {"types",required_argument,NULL,'v'}, {"exclude-types",required_argument,NULL,'V'}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, {"targets-overlap",required_argument,NULL,2}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, {"regions-overlap",required_argument,NULL,3}, {"min-ac",required_argument,NULL,'c'}, {"max-ac",required_argument,NULL,'C'}, {"min-af",required_argument,NULL,'q'}, {"max-af",required_argument,NULL,'Q'}, {"phased",no_argument,NULL,'p'}, {"exclude-phased",no_argument,NULL,'P'}, {"no-version",no_argument,NULL,8}, {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; char *tmp; while ((c = getopt_long(argc, argv, "l:t:T:r:R:o:O:s:S:Gf:knv:V:m:M:auUhHc:C:Ii:e:xXpPq:Q:g:",loptions,NULL)) >= 0) { char allele_type[9] = "nref"; switch (c) { case 'O': switch (optarg[0]) { case 'b': args->output_type = FT_BCF_GZ; break; case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; default: { args->clevel = strtol(optarg,&tmp,10); if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); } }; if ( optarg[1] ) { args->clevel = strtol(optarg+1,&tmp,10); if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; case 'l': args->clevel = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --compression-level %s\n", optarg); args->output_type |= FT_GZ; break; case 'o': args->fn_out = optarg; break; case 'H': args->print_header = 0; break; case 'h': args->header_only = 1; break; case 4 : args->print_header = 1; args->header_only = 0; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 's': args->sample_names = optarg; break; case 'S': args->sample_names = optarg; args->sample_is_file = 1; break; case 1 : args->force_samples = 1; break; case 'a': args->trim_alts = 1; args->calc_ac = 1; break; case 'I': args->update_info = 0; break; case 'G': args->sites_only = 1; break; case 'f': args->files->apply_filters = optarg; break; case 'k': args->known = 1; break; case 'n': args->novel = 1; break; case 'm': args->min_alleles = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-alleles %s\n", optarg); break; case 'M': args->max_alleles = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --max-alleles %s\n", optarg); break; case 'v': args->include_types = optarg; break; case 'V': args->exclude_types = optarg; break; case 'e': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'c': { args->min_ac_type = ALLELE_NONREF; if ( sscanf(optarg,"%d:%8s",&args->min_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->min_ac)!=1 ) error("Error: Could not parse --min-ac %s\n", optarg); set_allele_type(&args->min_ac_type, allele_type); args->calc_ac = 1; break; } case 'C': { args->max_ac_type = ALLELE_NONREF; if ( sscanf(optarg,"%d:%8s",&args->max_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->max_ac)!=1 ) error("Error: Could not parse --max-ac %s\n", optarg); set_allele_type(&args->max_ac_type, allele_type); args->calc_ac = 1; break; } case 'q': { args->min_af_type = ALLELE_NONREF; if ( sscanf(optarg,"%f:%8s",&args->min_af, allele_type)!=2 && sscanf(optarg,"%f",&args->min_af)!=1 ) error("Error: Could not parse --min-af %s\n", optarg); set_allele_type(&args->min_af_type, allele_type); args->calc_ac = 1; break; } case 'Q': { args->max_af_type = ALLELE_NONREF; if ( sscanf(optarg,"%f:%8s",&args->max_af, allele_type)!=2 && sscanf(optarg,"%f",&args->max_af)!=1 ) error("Error: Could not parse --max-af %s\n", optarg); set_allele_type(&args->max_af_type, allele_type); args->calc_ac = 1; break; } case 'x': args->private_vars |= FLT_INCLUDE; args->calc_ac = 1; break; case 'X': args->private_vars |= FLT_EXCLUDE; args->calc_ac = 1; break; case 'u': args->uncalled |= FLT_INCLUDE; args->calc_ac = 1; break; case 'U': args->uncalled |= FLT_EXCLUDE; args->calc_ac = 1; break; case 'p': args->phased |= FLT_INCLUDE; break; // phased case 'P': args->phased |= FLT_EXCLUDE; break; // exclude-phased case 'g': { if ( !strcasecmp(optarg,"hom") ) args->gt_type = GT_NEED_HOM; else if ( !strcasecmp(optarg,"het") ) args->gt_type = GT_NEED_HET; else if ( !strcasecmp(optarg,"miss") ) args->gt_type = GT_NEED_MISSING; else if ( !strcasecmp(optarg,"^hom") ) args->gt_type = GT_NO_HOM; else if ( !strcasecmp(optarg,"^het") ) args->gt_type = GT_NO_HET; else if ( !strcasecmp(optarg,"^miss") ) args->gt_type = GT_NO_MISSING; else error("The argument to -g not recognised. Expected one of hom/het/miss/^hom/^het/^miss, got \"%s\".\n", optarg); break; } case 2 : args->targets_overlap = parse_overlap_option(optarg); if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 3 : args->regions_overlap = parse_overlap_option(optarg); if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 10 : args->write_index = 1; break; case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } } if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n"); if ( args->private_vars > FLT_EXCLUDE ) error("Only one of -x or -X can be given.\n"); if ( args->uncalled > FLT_EXCLUDE ) error("Only one of -u or -U can be given.\n"); if ( args->phased > FLT_EXCLUDE ) error("Only one of -p or -P can be given.\n"); if ( args->sample_names && args->update_info) args->calc_ac = 1; char *fname = NULL; if ( optind>=argc ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else usage(args); } else fname = argv[optind]; // read in the regions from the command line if ( args->regions_list ) { bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } else if ( optind+1 < argc ) { int i; kstring_t tmp = {0,0,0}; kputs(argv[optind+1],&tmp); for (i=optind+2; ifiles, tmp.s, 0)<0 ) error("Failed to read the regions: %s\n", tmp.s); free(tmp.s); } if ( args->targets_list ) { bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); init_data(args); bcf_hdr_t *out_hdr = args->hnull ? args->hnull : (args->hsub ? args->hsub : args->hdr); if (args->print_header) { if ( bcf_hdr_write(args->out, out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out); } else if ( args->output_type & FT_BCF ) error("BCF output requires header, cannot proceed with -H\n"); if ( args->write_index && init_index(args->out,out_hdr,args->fn_out,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->fn_out); int ret = 0; if (!args->header_only) { while ( bcf_sr_next_line(args->files) ) { bcf1_t *line = args->files->readers[0].buffer[0]; if ( line->errcode && out_hdr!=args->hdr ) error("Undefined tags in the header, cannot proceed in the sample subset mode.\n"); if ( subset_vcf(args, line) && bcf_write1(args->out, out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out); } ret = args->files->errnum; if ( ret ) fprintf(stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum)); } if (args->write_index) { if (bcf_idx_save(args->out) < 0) { if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout"); error("Error: cannot write to index %s\n", args->index_fn); } free(args->index_fn); } if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout"); destroy_data(args); bcf_sr_destroy(args->files); free(args); return ret; }