#!/usr/bin/env python import sys import time import string import os import operator from itertools import imap import re def main(): # forward primer ACGCAGAGTCCGCTC # reverse complement of this GAGCGGACTCTGCGT adapter_seq1_reverse = "GAGCGGACTCTGCGT" p1_reverse = re.compile(adapter_seq1_reverse) ##### this is the sequence that will be seen on the end of read1 and read 2 for readthrough. This is the fixed primer in [4:19] in read2 and read1. adapter_seq1_forward = "ACGCAGAGTCCGCTC" p1_forward = re.compile(adapter_seq1_forward) ##### this is the sequence that will be seen on the start of read1 and read 2. This is the fixed primer in [4:19] in read2 and read1. # adapter_seq2 = "CCAAACACACCCAA" # p2 = re.compile(adapter_seq2) ##### this is the sequence that will be seen on the end of read2 for readthrough. This is the p7 extension that will not be in read1, only seen at end of read2 on readthrough. logfilename = sys.argv[1] statfilename = sys.argv[2] infilename1 = sys.argv[3] infilename2 = sys.argv[4] outfilename1 = sys.argv[5] outfilename2 = sys.argv[6] barcode_filename = sys.argv[7] LOG = open(logfilename, "w") STAT = open(statfilename, "w") INFILE1 = open(infilename1, "r") INFILE2 = open(infilename2, "r") OUTFILE1 = open(outfilename1, "w") OUTFILE2 = open(outfilename2, "w") BARCODE = open(barcode_filename, "r") bc = dict() for x in BARCODE: arow = x.rstrip().split("\t") bc[arow[0]] = arow[1] BARCODE.close() thisRead1 = None thisRead1 = [] thisRead2 = None thisRead2 = [] counter = 0 forward = True thisBeadTag = "NONE" thisVarietalTag = "NONE" tags = dict() reject_len_aline1 = 0 reject_len_aline2 = 0 reject_missing_up = 0 reject_different_bc = 0 reject_bad_bc = 0 reject_short_insert_1 = 0 reject_primer_chain_1 = 0 reject_short_insert_2 = 0 reject_primer_chain_2 = 0 reject_up1_up1 = 0 reject_up2_up2 = 0 reject_missing_up1 = 0 reject_missing_up2 = 0 reject_missing_alu = 0 good_reads = 0 read1_up1 = 0 read1_up2 = 0 read2_up1 = 0 read2_up2 = 0 alu_pos = 0 readLength1 = 0 readLength2 = 0 useRead = False for aline1 in INFILE1: aline1 = aline1.rstrip() aline2 = INFILE2.readline().rstrip() if counter % 4 == 0: ## WRITE PREVIOUS READ (in this case, only Read2) ## if useRead: if len(thisRead1[1]) > 29 and len(thisRead2[1]) > 29: good_reads += 1 for j in range(len(thisRead2)): #print thisRead2[j] OUTFILE1.write(thisRead1[j] + "\n") OUTFILE2.write(thisRead2[j] + "\n") else: if len(thisRead1[1]) < 30: reject_len_aline1 += 1 elif len(thisRead2[1]) < 30: reject_len_aline2 += 1 ## START NEW READ ## thisBeadTag = "NONE" thisVarietalTag = "NONE" thisRead1 = None thisRead1 = [] thisRead1.append(aline1.split()[0]) thisRead2 = None thisRead2 = [] thisRead2.append(aline2.split()[0]) forward = True useRead = False elif counter % 4 == 1: thisBeadTag = "NONE" thisVarietalTag = "NONE" read1_up1 = 0 read1_up2 = 0 read2_up1 = 0 read2_up2 = 0 alu_pos = 0 read12_up = 0 if len(aline1) < 60: reject_len_aline1 += 1 counter += 1 continue if len(aline2) < 60: reject_len_aline2 += 1 counter += 1 continue ##### read 1 and 2 the same ##### [4:19] = fixed primer ##### [19:24] = 5 base sample tag ##### [24:28] = 4 base varietal tag ##### [28:32] = CATG if hamming(aline1[28:32],"CATG") <= 1 and hamming(aline2[28:32],"CATG") <= 1: read12_up = 1 if read12_up == 0: print aline1 print aline2 print aline1[28:32], aline2[28:32] print reject_missing_up += 1 counter += 1 continue ##### check bead barcode information and varietal tag information thisBeadTag1 = aline1[19:24] thisBeadTag2 = aline2[19:24] if thisBeadTag1 == thisBeadTag2: thisBeadTag = thisBeadTag1 else: reject_different_bc += 1 counter += 1 continue if bc.has_key(thisBeadTag): pass else: reject_bad_bc += 1 counter += 1 continue thisVarietalTag = aline1[24:28] + aline2[24:28] readLength1 = len(aline1) m = p1_reverse.search(aline1) if m > 0: readLength1 = m.start() - 13 # Must trim an extra 13 for CATG (4), VT (4), and barcode (5). else: for i in range(4, 14): if aline1[-i:] == adapter_seq1_reverse[0:i]: readLength1 = readLength1 - (i + 13) if readLength1 < 60: reject_short_insert_1 += 1 counter += 1 continue m = p1_forward.search(aline1[32:readLength1]) if m > 0: reject_primer_chain_1 += 1 counter += 1 continue if readLength1 < 1: readLength1 = 1 if readLength1 > 32: thisRead1.append(aline1[32:readLength1]) else: thisRead1.append(aline1[0:readLength1]) readLength2 = len(aline2) m = p1_reverse.search(aline2) if m > 0: readLength2 = m.start() - 13 # Must trim an extra 13 for CATG (4), VT (4), and barcode (5). else: for i in range(4, 14): if aline2[-i:] == adapter_seq1_reverse[0:i]: readLength2 = readLength2 - (i + 13) if readLength2 < 60: reject_short_insert_2 += 1 counter += 1 continue m = p1_forward.search(aline2[32:readLength2]) if m > 0: reject_primer_chain_2 += 1 counter += 1 continue if readLength2 < 1: readLength2 = 1 if readLength2 > 32: thisRead2.append(aline2[32:readLength2]) else: thisRead2.append(aline2[0:readLength2]) thisRead1[0] = thisRead1[0] + "/1_" + thisBeadTag1 + "_" + thisVarietalTag thisRead2[0] = thisRead2[0] + "/1_" + thisBeadTag1 + "_" + thisVarietalTag if readLength1 > 59 and readLength2 > 59: if tags.has_key(thisBeadTag): tags[thisBeadTag] += 1 else: tags[thisBeadTag] = 1 useRead = True elif counter % 4 == 2: thisRead1.append(aline1) thisRead2.append(aline2) else: if readLength1 > 32: thisRead1.append(aline1[32:readLength1]) else: thisRead1.append(aline1[0:readLength1]) if readLength2 > 32: thisRead2.append(aline2[32:readLength2]) else: thisRead2.append(aline2[0:readLength2]) counter += 1 for k, v in tags.iteritems(): LOG.write(k) LOG.write("\t") LOG.write(str(v)) LOG.write("\n") #STAT.write("total.reads\tgood.reads\tlen.read1\tlen.read2\tup1.missing\tup2.missing\tup1.up1\tup2.up2\talu.missing\n") STAT.write("total.reads\tgood.reads\tlen.read1\tlen.read2\tup.missing\tbc.mismatch\tbc.bad\tshort.insert1\tprimer.chain1\tshort.insert2\yprimer.chain2\n") STAT.write(str(counter / 4)) STAT.write("\t") STAT.write(str(good_reads)) STAT.write("\t") STAT.write(str(reject_len_aline1)) STAT.write("\t") STAT.write(str(reject_len_aline2)) STAT.write("\t") STAT.write(str(reject_missing_up)) STAT.write("\t") STAT.write(str(reject_different_bc)) STAT.write("\t") STAT.write(str(reject_bad_bc)) STAT.write("\t") STAT.write(str(reject_short_insert_1)) STAT.write("\t") STAT.write(str(reject_primer_chain_1)) STAT.write("\t") STAT.write(str(reject_short_insert_2)) STAT.write("\t") STAT.write(str(reject_primer_chain_2)) STAT.write("\n") INFILE1.close() INFILE2.close() OUTFILE1.close() OUTFILE2.close() LOG.close() STAT.close() def hamming(str1, str2): ne = operator.ne return sum(imap(ne, str1, str2)) if __name__=="__main__": main()