# Copyright 2013 Graham McVicker and Bryce van de Geijn
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import os
import math
import time
import gzip
import argparse
import random

from scipy.optimize import minimize
from scipy.special import gammaln, betaln
import scipy.stats
import numpy as np

import util

# OPTIMIZER = "BFGS"
OPTIMIZER = "Nelder-Mead"


class TestSNP:
    def __init__(self, name, geno_hap1, geno_hap2, AS_target_ref,
                 AS_target_alt, hetps, totals, counts):
        self.name = name
        self.geno_hap1 = geno_hap1
        self.geno_hap2 = geno_hap2
        self.AS_target_ref = AS_target_ref
        self.AS_target_alt = AS_target_alt
        self.hetps = hetps
        self.totals = totals
        self.counts = counts

    def is_het(self):
        """Returns True if the test SNP is heterozygous"""
        return self.geno_hap1 != self.geno_hap2

    def is_homo_ref(self):
        """Returns True if test SNP is homozygous for reference allele"""
        return self.geno_hap1 == 0 and self.geno_hap2 == 0

    def is_homo_alt(self):
        """Returns True if test SNP is homozygous for non-reference allele"""
        return self.geno_hap1 == 1 and self.geno_hap2 == 1


def open_input_files(in_filename):
    if not os.path.exists(in_filename) or not os.path.isfile(in_filename):
        sys.stderr.write("input file %s does not exist or is not a "
                         "regular file\n" % in_filename)
        exit(2)

    # read file that contains list of input files
    in_file = open(in_filename, "rt")

    infiles = []
    for line in in_file:
        # open each input file and read first line
        filename = line.rstrip()
        if not filename or not os.path.exists(filename) or \
           not os.path.isfile(filename):
            sys.stderr.write("input file '%s' does not exist or is not a "
                             "regular file\n" % line)
            exit(2)
        if util.is_gzipped(filename):
            f = gzip.open(filename, "rt")
        else:
            f = open(filename, "rt")

        # skip header
        f.readline()

        infiles.append(f)
    in_file.close()

    if len(infiles) == 0:
        sys.stderr.write("no input files specified in file '%s'\n" %
                         in_filename)
        exit(2)

    return infiles


def write_header(outfile):
    outfile.write("\t".join(["TEST.SNP.CHROM",
                             "TEST.SNP.POS",
                             "TEST.SNP.ID",
                             "TEST.SNP.REF.ALLELE",
                             "TEST.SNP.ALT.ALLELE",
                             "REGION.START",
                             "REGION.END",
                             "LOGLIKE.NULL",
                             "LOGLIKE.ALT",
                             "CHISQ",
                             "P.VALUE",
                             "ALPHA",
                             "BETA",
                             "PHI",
                             "TOTAL.AS.READ.COUNT",
                             "REF.AS.READ.COUNT",
                             "ALT.AS.READ.COUNT",
                             "TOTAL.READ.COUNT"]) + "\n")


def read_bnb_sigmas(options, infiles):
    """Read overdispersion parameters for beta-negative binomial.
    Expect one for each individual."""
    if options.bnb_disp:
        disp_file = open(options.bnb_disp, "rt")
        line = disp_file.readline()
        bnb_sigmas = []
        while line:
            bnb_sigmas.append(np.float64(line.strip()))
            line = disp_file.readline()
        disp_file.close()

        if len(bnb_sigmas) != len(infiles):
            raise ValueError("expected %d values in bnb_disp file "
                             "(one for each input file) but got %d" %
                             (len(infiles), len(bnb_sigmas)))
    else:
        bnb_sigmas = [0.001] * len(infiles)

    return bnb_sigmas


def read_as_sigmas(options, infiles):
    """Read overdispersion parameters for allele-specific test
    (Beta-Binomial). Expect one for each individual."""
    if options.as_disp:
        disp_file = open(options.as_disp, "rt")
        line = disp_file.readline()
        as_sigmas = []
        while line:
            val = np.float64(line.strip())
            if val < 0.0 or val > 1.0:
                raise ValueError("expected as_sigma values to be "
                                 "in range 0.0-1.0, but got %g" % val)
            as_sigmas.append(val)
            line = disp_file.readline()
        disp_file.close()

        if len(as_sigmas) != len(infiles):
            raise ValueError("expected %d values in as_disp file "
                             "(one for each input file) but got %d" %
                             (len(infiles), len(as_sigmas)))
    else:
        as_sigmas = [0.001] * len(infiles)

    return as_sigmas
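
# The dispersion files read by read_bnb_sigmas / read_as_sigmas above are
# expected to contain one floating-point value per line, one line per
# individual, in the same order as the input files listed in infile_list
# (this format is inferred from the parsing code, not documented
# elsewhere). A minimal sketch of such a file for three individuals:
#
#   0.011
#   0.007
#   0.013
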
def write_results(outfile, snpinfo, loglike1par, loglike2par, best2par,
                  tot_as_counts, ref_as_counts, alt_as_counts, all_counts):
    """Write result to output file. The first columns mirror those in
    the input files, but no genotypes or haplotypes are given.
    Tab-delimited columns are:
      1. Chromosome,
      2. Test SNP position,
      3. Test SNP identifier,
      4. Test SNP Ref Allele,
      5. Test SNP Alt Allele,
      6. Target Region Start,
      7. Target Region End,
      8. Log likelihood 1 parameter model (Null),
      9. Log likelihood 2 parameter model (Alternative),
      10. Chi-squared statistic,
      11. P-value,
      12. Alpha parameter estimate (expression level of reference allele),
      13. Beta parameter estimate (expression level of alternative allele),
      14. Phi parameter estimate (beta-negative-binomial overdispersion
          parameter for target region),
      15. Total number of allele-specific read counts for this region
          summed across individuals,
      16. Total number of reference haplotype allele-specific read counts,
      17. Total number of alt haplotype allele-specific read counts,
      18. Total number of mapped reads for the target region, summed
          across individuals"""

    # compute likelihood ratio test statistic; loglike1par and loglike2par
    # are negative log likelihoods, so null minus alternative gives twice
    # the log likelihood ratio
    chisq = 2 * (loglike1par - loglike2par)
    pval = 1 - scipy.stats.chi2.cdf(chisq, 1)

    outfile.write("\t".join([snpinfo[0][0], snpinfo[0][1], snpinfo[0][2],
                             snpinfo[0][3], snpinfo[0][4],
                             snpinfo[0][7], snpinfo[0][8],
                             "%.2f" % -loglike1par,
                             "%.2f" % -loglike2par,
                             "%.3f" % chisq,
                             "%g" % pval,
                             "%g" % best2par[0],
                             "%g" % best2par[1],
                             "%g" % best2par[2],
                             "%d" % tot_as_counts,
                             "%d" % ref_as_counts,
                             "%d" % alt_as_counts,
                             "%d" % all_counts]) + '\n')
    outfile.flush()


def write_empty_result(outfile, snpinfo):
    """Write all zeros in the event that the test failed"""
    # 11 placeholder values, matching the 11 statistic columns
    # written by write_results
    outfile.write("\t".join([snpinfo[0][0], snpinfo[0][1], snpinfo[0][2],
                             snpinfo[0][3], snpinfo[0][4],
                             snpinfo[0][7], snpinfo[0][8],
                             "0", "0", "0", "NA", "0", "0", "0",
                             "0", "0", "0", "0"]) + '\n')


def rescale_totals(test_snps):
    """Rescale read count totals by dividing by the minimum total across
    individuals, putting the values into a reasonable range for
    estimating the alpha and beta parameters."""
    min_tot = min([s.totals for s in test_snps])

    if min_tot > 0:
        for s in test_snps:
            s.totals = s.totals / min_tot
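
# For example, totals of [100.0, 250.0, 50.0] across three individuals
# become [2.0, 5.0, 1.0] after rescale_totals (each divided by the
# minimum, 50.0).
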
def main():
    options = parse_options()

    if options.pc_file:
        pc_matrix = load_covariates(options.pc_file)
        num_pcs = options.num_pcs
    else:
        pc_matrix = []
        num_pcs = 0

    if options.out_file.endswith(".gz"):
        outfile = gzip.open(options.out_file, "wt")
    else:
        outfile = open(options.out_file, "wt")

    if options.benchmark:
        if options.benchmark == "-":
            bench_file = sys.stderr
        else:
            bench_file = open(options.benchmark, "wt")
        bench_file.write("TEST.TYPE TIME\n")

    write_header(outfile)

    # read list of input files (one for each individual)
    infiles = open_input_files(options.infile_list)

    # read dispersion parameters for each individual
    bnb_sigmas = read_bnb_sigmas(options, infiles)
    as_sigmas = read_as_sigmas(options, infiles)

    # add first row of each input file to snpinfo list
    snpinfo = []
    for f in infiles:
        snpinfo.append(f.readline().strip().split())

    row_count = 0
    finished = False
    options.dup_snp_warn = True

    while not finished:
        try:
            test_snps = []
            # parse test SNP and associated info from input file row
            for i in range(len(infiles)):
                test_snps.append(parse_test_snp(snpinfo[i], options))

            # rescale totals to put values into reasonable range for
            # alpha and beta parameter estimation
            rescale_totals(test_snps)

            # how many allele-specific reads are there across all
            # linked SNPs and individuals?
            ref_as_counts = sum([np.sum(x.AS_target_ref)
                                 for x in test_snps])
            alt_as_counts = sum([np.sum(x.AS_target_alt)
                                 for x in test_snps])
            tot_as_counts = ref_as_counts + alt_as_counts
            all_counts = sum([test_snps[i].counts
                              for i in range(len(test_snps))])

            if tot_as_counts < options.min_as_counts:
                if options.verbose:
                    sys.stderr.write("-----\nskipping SNP %s because "
                                     "total AS counts %d < %d\n" %
                                     (test_snps[0].name, tot_as_counts,
                                      options.min_as_counts))

                # skip, not enough allele-specific counts
                for i in range(len(infiles)):
                    line = infiles[i].readline().strip()
                    if line:
                        snpinfo[i] = line.split()
                    else:
                        # out of lines from at least one file,
                        # assume we are finished
                        finished = True
                continue

            if options.verbose:
                sys.stderr.write("-----\ntesting SNP %s\n" %
                                 test_snps[0].name)
            row_count += 1

            # (not currently used)
            old_genos = [test_snps[y].geno_hap1 + test_snps[y].geno_hap2
                         for y in range(len(test_snps))]

            if options.shuffle:
                # permute genotypes
                perm = list(range(len(test_snps)))
                random.shuffle(perm)
                geno1temp = [test_snps[y].geno_hap1 for y in perm]
                geno2temp = [test_snps[y].geno_hap2 for y in perm]
                for i in range(len(test_snps)):
                    test_snps[i].geno_hap1 = geno1temp[i]
                    test_snps[i].geno_hap2 = geno2temp[i]

            if options.benchmark:
                # start timing test for NULL model
                bench_file.write("null %s %s %d %d " %
                                 (snpinfo[0][0], snpinfo[0][1],
                                  tot_as_counts, all_counts))
                bench_file.flush()
            t1 = time.time()

            starting_gene = [np.float64(x) for x in [0.1, 0.001]]
            maxlike = 10000000000
            for start in starting_gene:
                starts = [np.float64(0.5), np.float64(start)]

                # regress against the covariates and get residuals
                # fit_cov(test_snps, cov_table)

                # maximize likelihood with alpha = beta
                # (no difference between genotypes)
                res = minimize(ll_one, starts,
                               args=(test_snps,
                                     True,  # options.is_bnb_only,
                                     options.is_as_only,
                                     bnb_sigmas,
                                     as_sigmas,
                                     options.read_error_rate,
                                     [],
                                     pc_matrix),
                               options={"maxiter": 50000,
                                        "disp": options.verbose},
                               method=OPTIMIZER)
                new_par = res.x

                new_loglike = ll_one(new_par, test_snps,
                                     options.is_bnb_only,
                                     options.is_as_only,
                                     bnb_sigmas, as_sigmas,
                                     options.read_error_rate,
                                     [], pc_matrix)
                if new_loglike < maxlike:
                    # keep the best (lowest negative log likelihood) fit
                    # found so far across the starting points
                    maxlike = new_loglike
                    starting_par = new_par

            pc_coefs = []
            for pc in range(num_pcs):
                res = minimize(ll_pc, [np.float64(0)],
                               args=(starting_par,
                                     test_snps,
                                     True,  # options.is_bnb_only,
                                     options.is_as_only,
                                     bnb_sigmas,
                                     as_sigmas,
                                     options.read_error_rate,
                                     pc_coefs,
                                     pc_matrix),
                               options={"maxiter": 50000,
                                        "disp": options.verbose},
                               method=OPTIMIZER)
                new_coef = res.x
                pc_coefs = np.concatenate([pc_coefs, new_coef])

            res = minimize(ll_one, starting_par,
                           args=(test_snps,
                                 options.is_bnb_only,
                                 options.is_as_only,
                                 bnb_sigmas,
                                 as_sigmas,
                                 options.read_error_rate,
                                 pc_coefs,
                                 pc_matrix),
                           options={"maxiter": 50000,
                                    "disp": options.verbose},
                           method=OPTIMIZER)
            best1par = res.x

            time_taken = time.time() - t1
            if options.verbose:
                sys.stderr.write("null model optimization took %.3fs\n" %
                                 time_taken)
            if options.benchmark:
                bench_file.write("%.3f\n" % time_taken)
                bench_file.flush()

            loglike1par = ll_one(best1par, test_snps,
                                 options.is_bnb_only,
                                 options.is_as_only,
                                 bnb_sigmas, as_sigmas,
                                 options.read_error_rate,
                                 pc_coefs, pc_matrix)

            start = [best1par[0], best1par[0], best1par[1]]
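            # (the two-parameter fit below is seeded with the null-model
            # estimates: alpha and beta both start at the null model's
            # shared expression parameter, and the overdispersion
            # parameter starts at the null model's estimate)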
            if options.benchmark:
                # start timing test for ALT model
                bench_file.write("alt %s %s %d %d " %
                                 (snpinfo[0][0], snpinfo[0][1],
                                  tot_as_counts, all_counts))
                bench_file.flush()
            t1 = time.time()

            # maximize likelihood with alpha and beta as separate
            # parameters
            res = minimize(ll_two, start,
                           args=(test_snps,
                                 options.is_bnb_only,
                                 options.is_as_only,
                                 bnb_sigmas,
                                 as_sigmas,
                                 options.read_error_rate,
                                 pc_coefs,
                                 pc_matrix),
                           options={"maxiter": 50000,
                                    "disp": options.verbose},
                           method=OPTIMIZER)
            best2par = res.x

            time_taken = time.time() - t1
            if options.verbose:
                sys.stderr.write("alternative model optimization took "
                                 "%.3fs\n" % time_taken)
            if options.benchmark:
                bench_file.write("%.3f\n" % time_taken)
                bench_file.flush()

            loglike2par = ll_two(best2par, test_snps,
                                 options.is_bnb_only,
                                 options.is_as_only,
                                 bnb_sigmas, as_sigmas,
                                 options.read_error_rate,
                                 pc_coefs, pc_matrix)

            write_results(outfile, snpinfo, loglike1par, loglike2par,
                          best2par, tot_as_counts, ref_as_counts,
                          alt_as_counts, all_counts)

        except Exception as e:
            # an error occurred; write a line to the output file, but
            # put 0s for all parameters
            write_empty_result(outfile, snpinfo)
            sys.stderr.write("An error occurred, writing line with 0s "
                             "for SNP:\n%s\n" % str(e))
            # continue
            raise

        # read next set of lines from input file
        for i in range(len(infiles)):
            line = infiles[i].readline().strip()
            if line:
                snpinfo[i] = line.split()
            else:
                # out of lines from at least one file,
                # assume we are finished
                finished = True
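
# The test above compares a one-parameter null model (alpha == beta, i.e.
# no genotype effect) against a two-parameter alternative (alpha != beta),
# so twice the difference in negative log likelihood is referred to a
# chi-squared distribution with one degree of freedom. As a worked
# example: a chi-squared statistic of about 6.63 corresponds to a p-value
# of about 0.01 at 1 df.
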
def parse_options():
    parser = argparse.ArgumentParser(
        description="Runs the combined haplotype test on a set of "
        "input files.\nOutput is written to the specified file "
        "with the following tab-delimited fields:\n"
        "  1. Chromosome,\n"
        "  2. Test SNP position,\n"
        "  3. Test SNP identifier,\n"
        "  4. Test SNP Ref Allele,\n"
        "  5. Test SNP Alt Allele,\n"
        "  6. Target Region Start,\n"
        "  7. Target Region End,\n"
        "  8. Log likelihood 1 parameter model (Null),\n"
        "  9. Log likelihood 2 parameter model (Alternative),\n"
        "  10. Chi-squared statistic,\n"
        "  11. P-value,\n"
        "  12. Alpha parameter estimate (expression level "
        "of reference allele),\n"
        "  13. Beta parameter estimate (expression level of "
        "alternative allele),\n"
        "  14. Phi parameter estimate (beta-negative-binomial "
        "overdispersion parameter for target region),\n"
        "  15. Total number of allele-specific read counts for this "
        "region summed across individuals,\n"
        "  16. Total number of reference haplotype allele-specific "
        "read counts,\n"
        "  17. Total number of alt haplotype allele-specific read "
        "counts,\n"
        "  18. Total number of mapped reads for the target region, "
        "summed across individuals\n",
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-a", "--as_only",
                        action='store_true',
                        dest='is_as_only', default=False,
                        help="only perform the allele-specific "
                        "(Beta Binomial) part of the test")

    parser.add_argument("-d", "--bnb_only",
                        action='store_true',
                        dest='is_bnb_only', default=False,
                        help="only perform the association "
                        "(Beta Negative Binomial) part of the test")

    parser.add_argument("--pc_file", action='store',
                        dest='pc_file',
                        help="file containing PC covariates to include "
                        "in the model",
                        default=None)

    parser.add_argument("-b", "--bnb_disp",
                        action='store', dest='bnb_disp',
                        help="file containing depth (Beta Negative "
                        "Binomial) dispersion parameters",
                        default=None)

    parser.add_argument("-o", "--as_disp",
                        action='store', dest='as_disp',
                        help="file containing allele-specific "
                        "(Beta Binomial) dispersion parameters",
                        default=None)

    parser.add_argument("-s", "--shuffle", action='store_true',
                        dest='shuffle', default=False,
                        help="permute genotypes")

    parser.add_argument("-e", "--read_error_rate", action='store',
                        dest='read_error_rate',
                        help="estimate of error rate, used to update "
                        "heterozygous genotype probabilities "
                        "(currently this option is disabled / not used)",
                        type=float, default=0.005)

    parser.add_argument("-m", "--min_as_counts", action='store',
                        dest='min_as_counts', type=int, default=0,
                        help="only perform the test when the total number "
                        "of allele-specific read counts across individuals "
                        "is >= MIN_AS_COUNTS")

    parser.add_argument("--num_pcs", action='store',
                        dest='num_pcs', type=int, default=0,
                        help="designates the number of PCs to use as "
                        "covariates")

    parser.add_argument("-v", "--verbose", action='store_true',
                        dest='verbose', default=False,
                        help="print extra information")

    parser.add_argument("--benchmark", dest="benchmark",
                        help="write information about the time each test "
                        "takes, number of optimization function calls, "
                        "etc. to the specified filename, or to stderr if "
                        "'-' is specified")

    parser.add_argument("infile_list", action='store', default=None)
    parser.add_argument("out_file", action='store', default=None)

    return parser.parse_args()


def addlogs(loga, logb):
    """Helper function: perform numerically-stable addition in log space,
    returning log(exp(loga) + exp(logb))"""
    return max(loga, logb) + math.log(1 + math.exp(-abs(loga - logb)))
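
# For example, addlogs(math.log(2.0), math.log(3.0)) returns
# math.log(5.0) (up to floating point error), without ever
# exponentiating large values that could overflow.
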
def AS_betabinom_loglike(logps, sigma, AS1, AS2, hetp, error):
    """Given parameters, returns the log likelihood of the
    allele-specific part of the test. Note that some parts of the
    equation have been canceled out."""
    # convert from the (mean, overdispersion) parameterization to the
    # standard beta-binomial (a, b) parameters:
    #   a = p * (1/sigma^2 - 1),  b = (1-p) * (1/sigma^2 - 1)
    a = math.exp(logps[0] + math.log(1/sigma**2 - 1))
    b = math.exp(logps[1] + math.log(1/sigma**2 - 1))

    part1 = 0
    part1 += betaln(AS1 + a, AS2 + b)
    part1 -= betaln(a, b)

    if hetp == 1:
        return part1

    e1 = math.log(error) * AS1 + math.log(1 - error) * AS2
    e2 = math.log(error) * AS2 + math.log(1 - error) * AS1
    if hetp == 0:
        return addlogs(e1, e2)

    return addlogs(math.log(hetp) + part1,
                   math.log(1 - hetp) + addlogs(e1, e2))


def betaln_asym(a, b):
    """Compute betaln(a, b), using an asymptotic expansion when a is
    large relative to b to avoid loss of precision."""
    if b > a:
        a, b = b, a

    if a < 1e6:
        return betaln(a, b)

    l = gammaln(b)

    l -= b * math.log(a)
    l += b * (1 - b) / (2 * a)
    l += b * (1 - b) * (1 - 2*b) / (12 * a * a)
    l += -((b * (1 - b))**2) / (12 * a**3)

    return l


def BNB_loglike(k, mean, sigma, n):
    # put variables in beta-negative-binomial form (n, a, b)
    mean = max(mean, 0.00001)
    logps = [math.log(n) - math.log(n + mean),
             math.log(mean) - math.log(n + mean)]

    p = np.float64(n / (n + mean))

    if sigma < 0.00001:
        # overdispersion is negligible: use the negative binomial
        # log likelihood directly
        loglike = -betaln(n, k+1) - math.log(n+k) + \
                  n*logps[0] + k*logps[1]
        return loglike

    sigma = (1 / sigma)**2

    a = p * sigma + 1
    b = (1 - p) * sigma

    # rising Pochhammer = gamma(k+n)/gamma(n)
    if k > 0:
        loglike = -betaln_asym(n, k) - math.log(k)
    else:
        loglike = 0

    # add log(beta(a+n, b+k))
    loglike += betaln_asym(a+n, b+k)

    # subtract log(beta(a, b))
    loglike -= betaln_asym(a, b)

    return loglike


def ll_one(x, test_snps, is_bnb_only, is_as_only, bnb_sigmas,
           as_sigmas, error, pc_coefs, pc_matrix):
    alpha = x[0]
    beta = x[0]
    r = x[1]

    return loglikelihood(alpha, beta, r, test_snps, is_bnb_only,
                         is_as_only, bnb_sigmas, as_sigmas, error,
                         pc_coefs, pc_matrix)


def ll_pc(x, params, test_snps, is_bnb_only, is_as_only, bnb_sigmas,
          as_sigmas, error, other_pc_coefs, pc_matrix):
    alpha = params[0]
    beta = params[0]
    r = params[1]
    pc_coefs = np.concatenate([other_pc_coefs, x])

    return loglikelihood(alpha, beta, r, test_snps, is_bnb_only,
                         is_as_only, bnb_sigmas, as_sigmas, error,
                         pc_coefs, pc_matrix)


def ll_two(x, test_snps, is_bnb_only, is_as_only, bnb_sigmas,
           as_sigmas, error, pc_coefs, pc_matrix):
    alpha = x[0]
    beta = x[1]
    r = x[2]

    return loglikelihood(alpha, beta, r, test_snps, is_bnb_only,
                         is_as_only, bnb_sigmas, as_sigmas, error,
                         pc_coefs, pc_matrix)


def calc_pc_factor(pc_fits, pcs, i):
    if len(pc_fits) > 0:
        return 1 + sum(pc_fits * pcs[i, :len(pc_fits)])
    else:
        return 1


def loglikelihood(alpha, beta, r, test_snps, is_bnb_only, is_as_only,
                  bnb_sigmas, as_sigmas, error, pc_coefs, pc_matrix):
    loglike = 0

    # if input values are outside of reasonable range, return a
    # very high negative log likelihood
    if alpha <= 0 or beta <= 0 or r <= 0 or r > 1:
        return 10000000

    ratio = alpha / (alpha + beta)

    for i in range(len(test_snps)):
        if test_snps[i].is_homo_ref():
            m = 2*alpha * test_snps[i].totals * \
                calc_pc_factor(pc_coefs, pc_matrix, i)
        elif test_snps[i].is_homo_alt():
            m = 2*beta * test_snps[i].totals * \
                calc_pc_factor(pc_coefs, pc_matrix, i)
        else:
            m = (alpha + beta) * test_snps[i].totals * \
                calc_pc_factor(pc_coefs, pc_matrix, i)

        if m < 0:
            m = 0.000001

        if not is_bnb_only:
            for j in range(len(test_snps[i].AS_target_ref)):
                if test_snps[i].hetps[j] > .9:
                    hetp = min(0.99, test_snps[i].hetps[j])

                    logps = [math.log(alpha) - math.log(alpha + beta),
                             math.log(beta) - math.log(alpha + beta)]

                    loglike += AS_betabinom_loglike(
                        logps, as_sigmas[i],
                        test_snps[i].AS_target_ref[j],
                        test_snps[i].AS_target_alt[j],
                        hetp, error)

        if not is_as_only:
            l = BNB_loglike(test_snps[i].counts, m, r, bnb_sigmas[i])
            loglike += l

    return -loglike
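
# As a concrete check of the beta-binomial parameterization used in
# AS_betabinom_loglike: with sigma = 0.5, 1/sigma^2 - 1 = 3, so
# logps = [log(0.5), log(0.5)] gives a = b = 1.5, i.e. a beta-binomial
# with mean a/(a+b) = 0.5 and total concentration a + b = 3.
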
def parse_test_snp(snpinfo, options):
    snp_id = snpinfo[2]
    if snpinfo[16] == "NA":
        # SNP is missing data
        tot = 0
    else:
        # these totals are later rescaled by dividing by the minimum
        # total across individuals to put them into a reasonable range
        # for estimating alpha and beta
        tot = float(snpinfo[16])

    if snpinfo[6] == "NA":
        geno_hap1 = 0
        geno_hap2 = 0
    else:
        geno_hap1 = int(snpinfo[6].strip().split("|")[0])
        geno_hap2 = int(snpinfo[6].strip().split("|")[1])

    if snpinfo[15] == "NA":
        count = 0
    else:
        count = int(snpinfo[15])

    if snpinfo[9].strip() == "NA" or geno_hap1 == geno_hap2:
        # SNP is homozygous, so there is no AS info
        return TestSNP(snp_id, geno_hap1, geno_hap2, [], [], [],
                       tot, count)
    else:
        # positions of target SNPs
        snp_locs = np.array([int(y.strip())
                             for y in snpinfo[9].split(';')])

        # counts of reads matching the reference allele that overlap
        # linked 'target' SNPs
        snp_as_ref = np.array([int(y) for y in snpinfo[12].split(';')])

        # counts of reads matching the alternate allele
        snp_as_alt = np.array([int(y) for y in snpinfo[13].split(';')])

        # heterozygote probabilities
        snp_hetps = np.array([np.float64(y.strip())
                              for y in snpinfo[10].split(';')])

        # linkage probabilities, not currently used
        snp_linkageps = np.array([np.float64(y.strip())
                                  for y in snpinfo[11].split(';')])

        # the same SNP should not be provided multiple times; this can
        # create problems for the combined test. Warn about and filter
        # out duplicate SNPs
        uniq_loc, uniq_idx = np.unique(snp_locs, return_index=True)
        if options.dup_snp_warn and uniq_loc.shape[0] != snp_locs.shape[0]:
            sys.stderr.write("WARNING: discarding SNPs that are repeated "
                             "multiple times in same line\n")
            options.dup_snp_warn = False

        snp_as_ref = snp_as_ref[uniq_idx]
        snp_as_alt = snp_as_alt[uniq_idx]
        snp_hetps = snp_hetps[uniq_idx]
        snp_linkageps = snp_linkageps[uniq_idx]

        if options.shuffle:
            # permute allele-specific read counts by flipping them
            # randomly at each SNP
            for y in range(len(snp_as_ref)):
                if random.randint(0, 1) == 1:
                    temp = snp_as_ref[y]
                    snp_as_ref[y] = snp_as_alt[y]
                    snp_as_alt[y] = temp

        return TestSNP(snp_id, geno_hap1, geno_hap2, snp_as_ref,
                       snp_as_alt, snp_hetps, tot, count)


def load_covariates(cov_file):
    """Read a whitespace-delimited covariate table, one row per
    individual, into a numpy array."""
    infile = open(cov_file, "rt")
    cov_table = []

    for line in infile:
        cov_table.append([np.float64(x) for x in line.strip().split()])
    infile.close()

    return np.array(cov_table, dtype=np.float64)


if __name__ == "__main__":
    main()
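
# Example invocation (hypothetical file names), based on the arguments
# defined in parse_options():
#
#   python combined_test.py --bnb_disp bnb_sigmas.txt \
#       --as_disp as_sigmas.txt --min_as_counts 10 \
#       input_file_list.txt cht_results.txt
#
# where input_file_list.txt lists one per-individual input file per line,
# and the two dispersion files contain one value per individual.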