# Copyright 2013 Graham McVicker and Bryce van de Geijn
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import os
import math
import gzip
import argparse
import time

import numpy as np
import scipy.stats
# explicit import instead of the original wildcard `from scipy.optimize import *`;
# minimize_scalar is the only name this module uses from scipy.optimize
from scipy.optimize import minimize_scalar
from scipy.special import gammaln
from scipy.special import betaln


# bounds for the per-gene overdispersion parameter
MIN_GENE_FIT = 0.0
MAX_GENE_FIT = 1e8

# bounds for the genome-wide (per-individual) overdispersion parameter
MIN_GW_FIT = 1.0
MAX_GW_FIT = 10e3

# bounds for the per-gene mean scaling parameter
MIN_MEAN_FIT = MIN_GENE_FIT
MAX_MEAN_FIT = MAX_GENE_FIT

# absolute x-tolerances passed to the bounded scalar minimizer
GW_XTOL = 1e-2
GENE_XTOL = 1e-7
MEAN_XTOL = 1e-7


def parse_options():
    """Parse and return command-line options for the BNB dispersion
    parameter estimation program."""
    default_sample = 2000

    parser = argparse.ArgumentParser(description="This program estimates the "
                                     "genome-wide dispersion parameters for "
                                     "the read depth (beta-negative binomial, BNB) part "
                                     "of the combined haplotype test. These "
                                     "parameter estimates are not required "
                                     "for the allele-specific (beta-binomial) part "
                                     "of the test."
                                     "This program uses an iterative "
                                     "maximum likelihood approach to estimate "
                                     "parameters. To speed up parameter estimation "
                                     "a subset of test regions are randomly "
                                     "selected from the input file. By default %d"
                                     " regions are selected. At the end of each "
                                     "iteration, the current parameter estimates "
                                     "are written to stdout and to the "
                                     "specified output file. "
                                     "If you are tired of waiting for the "
                                     "program to converge, these intermediate "
                                     "estimates can be used before the program "
                                     "has finished running." % default_sample)

    parser.add_argument("infile_list", action='store', default=None)

    parser.add_argument("outfile", action='store', default=None)

    parser.add_argument("--min_as_counts", action='store', dest='min_as_counts',
                        type=int, default=0, metavar="MIN_COUNTS",
                        help="only use regions where total allele-specific "
                        "read counts across individuals > MIN_COUNTS")

    parser.add_argument("--min_counts", action='store', dest='min_counts',
                        type=int, default=0, metavar="MIN_COUNTS",
                        help="only use regions where total number of "
                        "read counts across individuals > MIN_COUNTS")

    parser.add_argument("--skip", action='store', dest='skip',
                        type=int, default=0,
                        help="skip n test region between each one used for fitting")

    parser.add_argument("--sample", type=int, default=default_sample,
                        help="Randomly sample a specified number of test "
                        "regions from the total set to speed up the "
                        "estimation procedure. This is set to %d "
                        "by default. To "
                        "disable sampling, set this to 0 (--sample 0), or use "
                        "the --no_sample option" % default_sample)

    parser.add_argument("--no_sample", action="store_true", dest="no_sample",
                        help="Disable random sampling of test regions",
                        default=False)

    parser.add_argument("--seed", help="Random seed which affects random "
                        "sampling of test regions. Useful for testing.",
                        type=int, default=-1)

    parser.add_argument("--fix_gene", type=float, default=-1.0,
                        help="fix the per-gene dispersion parameter estimates to "
                        "specified value, and only fit the genome-wide dispersion "
                        "parameter estimates")

    parser.add_argument("--fix_mean", type=float, default=-1.0,
                        help="fix the per-gene mean dispersion parameter estimates to "
                        "specified value, and only fit the genome-wide dispersion "
                        "parameter estimates")

    options = parser.parse_args()

    if options.no_sample or options.sample < 0:
        # do not perform sampling
        options.sample = 0

    return options


def main():
    """Read count matrices, then alternate between fitting per-region and
    genome-wide overdispersion parameters until the likelihood stops
    improving. Current genome-wide estimates are written to the output
    file after every iteration."""
    options = parse_options()

    # set random seed (affects random sampling of test regions):
    if options.seed >= 0:
        np.random.seed(seed=options.seed)

    # local import of the project-specific data reader so that this module
    # can be imported (e.g. to reuse the likelihood functions) without
    # requiring cht_data to be installed
    import cht_data

    # read input data
    sys.stderr.write("reading input data\n")
    count_matrix, expected_matrix = \
        cht_data.read_count_matrices(options.infile_list, shuffle=False,
                                     skip=options.skip,
                                     min_counts=options.min_counts,
                                     min_as_counts=options.min_as_counts,
                                     sample=options.sample)

    # starting parameter values: one gene/mean fit per region (row),
    # one genome-wide fit per individual (column)
    gene_fits = [np.float64(0.005)] * count_matrix.shape[0]
    mean_fits = [np.float64(1)] * count_matrix.shape[0]
    gw_fits = [np.float64(100)] * count_matrix.shape[1]
    # sentinel "previous" likelihood, large enough that the first
    # iteration never triggers the convergence test
    old_ll = np.float64(1e24)

    fix_gene = options.fix_gene > 0.0
    fix_mean = options.fix_mean > 0.0

    if fix_gene:
        gene_fits = [np.float64(options.fix_gene)] * count_matrix.shape[0]
    if fix_mean:
        mean_fits = [np.float64(options.fix_mean)] * count_matrix.shape[0]

    # iteratively search for maximum likelihood parameter estimates
    iteration = 1
    while True:
        sys.stderr.write("\niteration %d\n" % iteration)

        # first fit overdispersion params for each region.
        # NOTE: a single dispersion parameter shared by all regions is fit
        # here; get_gene_overdisp fits one parameter per region instead.
        # NOTE(review): the --fix_gene/--fix_mean flags are not forwarded
        # to get_single_param_gene_overdisp (matching original behavior).
        sys.stderr.write("fitting per-region overdispersion params\n")
        t1 = time.time()
        gene_fit = gene_fits[0]
        gene_fit, mean_fits, fit_ll = \
            get_single_param_gene_overdisp(count_matrix, expected_matrix,
                                           gw_fits, gene_fit, mean_fits)
        gene_fits[:] = [gene_fit] * len(gene_fits)
        time_taken = time.time() - t1
        sys.stderr.write("time: %.2fs\n" % time_taken)

        sys.stderr.write("min(gene_fits): %g\n" % np.min(gene_fits))
        sys.stderr.write("max(gene_fits): %g\n" % np.max(gene_fits))
        sys.stderr.write("mean(gene_fits): %g\n" % np.mean(gene_fits))
        sys.stderr.write("min(mean_fits): %g\n" % np.min(mean_fits))
        sys.stderr.write("max(mean_fits): %g\n" % np.max(mean_fits))
        sys.stderr.write("mean(mean_fits): %g\n" % np.mean(mean_fits))

        # then fit genome-wide overdispersion params for each individual
        sys.stderr.write("fitting genome-wide overdispersion params\n")
        t1 = time.time()
        gw_fits, fit_ll = get_gw_overdisp(count_matrix, expected_matrix,
                                          gw_fits, gene_fits, mean_fits)
        time_taken = time.time() - t1
        sys.stderr.write("time: %.2fs\n" % time_taken)
        sys.stderr.write("LL=-%f\n" % fit_ll)

        gw_str = " ".join("%.2f" % x for x in gw_fits)
        sys.stderr.write("current genome-wide overdispersion param estimates:\n"
                         " %s\n" % gw_str)

        # write estimates to outfile each iteration
        # (this way outfile is written even if optimization terminated by user)
        with open(options.outfile, "wt") as outfile:
            for i in gw_fits:
                outfile.write("%f\n" % i)

        iteration += 1

        if old_ll - fit_ll < 0.0001:
            # improvement is small, stop iterations
            break
        old_ll = fit_ll

    sys.stderr.write("done!\n")


def get_gene_overdisp(count_matrix, expected_matrix, gw_fits,
                      gene_fits, mean_fits, iteration=0,
                      fix_gene=False, fix_mean=False):
    """Fit a separate mean parameter and overdispersion parameter for each
    test region (row of count_matrix), holding the genome-wide per-individual
    parameters fixed.

    New parameter values from the bounded minimizer are only accepted when
    they do not make the likelihood worse (a worse value indicates the
    optimizer failed to converge).

    Returns (gene_fits, mean_fits, fit_ll) where fit_ll is the summed
    negative log-likelihood under the (possibly updated) parameters.
    """
    fit_ll = 0
    gene_fit_improved = 0
    mean_fit_improved = 0

    for gene_indx in range(count_matrix.shape[0]):
        if not fix_mean:
            cur_like = mean_like(mean_fits[gene_indx],
                                 count_matrix[gene_indx,:],
                                 expected_matrix[gene_indx,:],
                                 gw_fits, gene_fits[gene_indx])

            # tighten the tolerance for small parameter values
            xtol = min(mean_fits[gene_indx] * 1e-4, MEAN_XTOL)

            res = minimize_scalar(mean_like,
                                  bounds=(MIN_MEAN_FIT, MAX_MEAN_FIT),
                                  args=(count_matrix[gene_indx,:],
                                        expected_matrix[gene_indx,:],
                                        gw_fits, gene_fits[gene_indx]),
                                  options={"xatol" : xtol},
                                  method="Bounded")

            like_diff = cur_like - res.fun
            if like_diff >= 0.0:
                # update parameter
                mean_fits[gene_indx] = res.x
                mean_fit_improved += 1
            else:
                # likelihood got worse indicating failed to converge,
                # do not accept new param value
                pass

        if not fix_gene:
            cur_like = gene_like(gene_fits[gene_indx],
                                 count_matrix[gene_indx,:],
                                 expected_matrix[gene_indx,:],
                                 gw_fits, mean_fits[gene_indx])

            xtol = min(gene_fits[gene_indx] * 1e-4, GENE_XTOL)

            res = minimize_scalar(gene_like,
                                  bounds=(MIN_GENE_FIT, MAX_GENE_FIT),
                                  args=(count_matrix[gene_indx,:],
                                        expected_matrix[gene_indx,:],
                                        gw_fits, mean_fits[gene_indx]),
                                  options={"xatol" : xtol},
                                  method="Bounded")

            like_diff = cur_like - res.fun
            if like_diff >= 0.0:
                # update parameter
                gene_fits[gene_indx] = res.x
                gene_fit_improved += 1
            else:
                # likelihood got worse indicating failed to converge,
                # do not accept new param value
                pass

        fit_ll += gene_like(gene_fits[gene_indx],
                            count_matrix[gene_indx,:],
                            expected_matrix[gene_indx,:],
                            gw_fits, mean_fits[gene_indx])

    return gene_fits, mean_fits, fit_ll


def get_single_param_gene_overdisp(count_matrix, expected_matrix, gw_fits,
                                   gene_fit, mean_fits, iteration=0,
                                   fix_gene=False, fix_mean=False):
    """Fit a separate mean parameter for each test region, then fit a single
    overdispersion parameter shared by all regions, holding the genome-wide
    per-individual parameters fixed.

    Returns (gene_fit, mean_fits, fit_ll) where fit_ll is the total negative
    log-likelihood under the accepted parameters.
    """
    fit_ll = 0
    gene_fit_improved = 0
    mean_fit_improved = 0

    # update per-region mean fits
    for gene_indx in range(count_matrix.shape[0]):
        if not fix_mean:
            cur_like = mean_like(mean_fits[gene_indx],
                                 count_matrix[gene_indx,:],
                                 expected_matrix[gene_indx,:],
                                 gw_fits, gene_fit)

            xtol = min(mean_fits[gene_indx] * 1e-4, MEAN_XTOL)

            res = minimize_scalar(mean_like,
                                  bounds=(MIN_MEAN_FIT, MAX_MEAN_FIT),
                                  args=(count_matrix[gene_indx,:],
                                        expected_matrix[gene_indx,:],
                                        gw_fits, gene_fit),
                                  options={"xatol" : xtol},
                                  method="Bounded")

            like_diff = cur_like - res.fun
            if like_diff >= 0.0:
                # update parameter
                mean_fits[gene_indx] = res.x
                mean_fit_improved += 1
            else:
                # likelihood got worse indicating failed to converge,
                # do not accept new param value
                pass

    # update the single shared overdispersion parameter
    cur_like = single_param_gene_like(gene_fit, count_matrix,
                                      expected_matrix, gw_fits, mean_fits)

    xtol = min(gene_fit * 1e-4, GENE_XTOL)

    res = minimize_scalar(single_param_gene_like,
                          bounds=(MIN_GENE_FIT, MAX_GENE_FIT),
                          args=(count_matrix, expected_matrix,
                                gw_fits, mean_fits),
                          options={"xatol" : xtol},
                          method="Bounded")

    like_diff = cur_like - res.fun
    if like_diff >= 0.0:
        # update parameter
        gene_fit = res.x
        fit_ll = res.fun
    else:
        # likelihood got worse indicating failed to converge,
        # do not accept new param value
        fit_ll = cur_like

    return gene_fit, mean_fits, fit_ll


def get_gw_overdisp(count_matrix, expected_matrix, gw_fits, gene_fits,
                    mean_fits):
    """Fit the genome-wide overdispersion parameter for each individual
    (column of count_matrix), holding the per-region parameters fixed.

    Returns (gw_fits, fit_ll) where fit_ll is the summed negative
    log-likelihood across individuals under the accepted parameters.
    """
    fit_ll = 0

    for indx in range(count_matrix.shape[1]):
        cur_like = gw_like(gw_fits[indx], count_matrix[:, indx],
                           expected_matrix[:, indx], gene_fits, mean_fits)

        xtol = min(gw_fits[indx] * 1e-2, GW_XTOL)

        res = minimize_scalar(gw_like,
                              bounds=(MIN_GW_FIT, MAX_GW_FIT),
                              args=(count_matrix[:, indx],
                                    expected_matrix[:, indx],
                                    gene_fits, mean_fits),
                              options={'xatol' : xtol},
                              method="Bounded")

        # only accept the new value when the likelihood did not get worse
        # (a worse value means the optimizer failed to converge)
        if cur_like - res.fun >= 0.0:
            gw_fits[indx] = res.x

        # evaluate the likelihood at the accepted parameter value
        fit_ll += gw_like(gw_fits[indx], count_matrix[:, indx],
                          expected_matrix[:, indx], gene_fits, mean_fits)

    return gw_fits, fit_ll


def mean_like(mean_fit, counts, expecteds, gw_fits, gene_fit):
    """Negative BNB log-likelihood of one region's counts as a function of
    the region's mean scaling parameter. Returns a large penalty (1e8)
    outside the allowed parameter bounds."""
    if mean_fit < MIN_MEAN_FIT or mean_fit > MAX_MEAN_FIT:
        return 1e8
    loglike = 0
    for i in range(len(counts)):
        loglike += BNB_loglike(counts[i], expecteds[i]*mean_fit,
                               gw_fits[i], gene_fit)
    return -loglike


def gene_like(gene_fit, counts, expecteds, gw_fits, mean_fit):
    """Negative BNB log-likelihood of one region's counts as a function of
    the region's overdispersion parameter. Returns a large penalty (1e8)
    outside the allowed parameter bounds."""
    if gene_fit < MIN_GENE_FIT or gene_fit > MAX_GENE_FIT:
        return 1e8
    loglike = 0
    for i in range(len(counts)):
        loglike += BNB_loglike(counts[i], expecteds[i]*mean_fit,
                               gw_fits[i], gene_fit)
    return -loglike


def single_param_gene_like(gene_fit, count_matrix, expected_matrix,
                           gw_fits, mean_fits):
    """Total negative BNB log-likelihood over all regions when a single
    overdispersion parameter is shared by every region."""
    ll = 0
    for gene_indx in range(count_matrix.shape[0]):
        ll += gene_like(gene_fit,
                        count_matrix[gene_indx,:],
                        expected_matrix[gene_indx,:],
                        gw_fits, mean_fits[gene_indx])
    return ll


def gw_like(gw_fit, counts, expecteds, gene_fits, mean_fits):
    """Negative BNB log-likelihood of one individual's counts as a function
    of that individual's genome-wide overdispersion parameter. Returns a
    large penalty (1e8) outside the allowed parameter bounds."""
    if gw_fit > MAX_GW_FIT or gw_fit < MIN_GW_FIT:
        return 1e8
    loglike = 0
    for i in range(len(counts)):
        loglike += BNB_loglike(counts[i], expecteds[i]*mean_fits[i],
                               gw_fit, gene_fits[i])
    return -loglike


def addlogs(loga, logb):
    """Helper function: perform numerically-stable addition in log space"""
    return max(loga, logb) + math.log(1 + math.exp(-abs(loga - logb)))


def lbeta_asymp(a, b):
    """log(Beta(a, b)). Uses scipy's betaln directly for moderate arguments;
    for very large max(a, b) (>= 1e6) an asymptotic expansion in 1/a is used
    instead, to avoid loss of precision from differencing huge gammaln
    values."""
    if b > a:
        a, b = b, a

    if a < 1e6:
        return betaln(a, b)

    # asymptotic expansion: ln B(a,b) ~ ln Gamma(b) - b ln a + O(1/a)
    l = gammaln(b)

    l -= b*math.log(a)
    l += b*(1-b)/(2*a)
    l += b*(1-b)*(1-2*b)/(12*a*a)
    l += -((b*(1-b))**2)/(12*a**3)

    return l


def BNB_loglike(k, mean, n, sigma):
    """Log-likelihood of observing k reads under a beta-negative binomial
    distribution parameterized by its mean, the number-of-failures
    parameter n, and the overdispersion sigma.

    When sigma is (near) zero the distribution reduces to an ordinary
    negative binomial, which is evaluated directly.
    """
    # put variables in beta-NB form (n, a, b)
    mean = max(mean, 0.00001)
    p = np.float64(n)/(n+mean)

    logps = [math.log(n) - math.log(n + mean),
             math.log(mean) - math.log(n + mean)]

    if sigma < 0.00001:
        # sigma ~ 0: plain negative binomial log-likelihood
        loglike = -betaln(n, k+1) - math.log(n+k) + n*logps[0] + k*logps[1]
        return loglike

    sigma = (1/sigma)**2

    a = p * sigma + 1
    b = (1-p) * sigma

    # rising Pochhammer = gamma(k+n)/gamma(n), expressed via log-beta
    if k > 0:
        loglike = -lbeta_asymp(n, k) - math.log(k)
    else:
        loglike = 0

    # add log(beta(a+n, b+k)), subtract log(beta(a, b))
    loglike += lbeta_asymp(a+n, b+k)
    loglike -= lbeta_asymp(a, b)

    return loglike


if __name__ == "__main__":
    main()