# Version 2
# Pandas/DataFrame/TSV-oriented, rather than FASTA-oriented.
# Used in HTNA 2024
#
# Layout rules for this file: one section header, key, or comment per line;
# comments are full-line only (no trailing comments after a value).
# NOTE(review): presumably read with Python configparser -- confirm against the loader.

[project]
project_id = M295
project_leader = Polina Kosillo
project_institution = Allen Institute

[barcodes]
# Alternative barcode/SSI files, kept for reference; only one ssifile may be active.
#ssifile = ~/git/mapseq-processing/etc/barcode_v2.txt
#ssifile = ~/git/mapseq-processing/etc/ssi_v2.txt
ssifile = ~/git/mapseq-processing/etc/novaseq_all.txt

[fastq]
r1start = 0
r1end = 32
# include 4nt nextseq rtag
#r1end = 36
# include 5nt novaseq rtag
#r1end = 37
r2start = 0
r2end = 20
# include 4nt nextseq rrtag
#r2end = 24
# include 5nt nextseq/novaseq rrtag
#r2end = 25

# to remove suspect sequences with long homopolymer runs. Should be < 1%?
max_repeats = 7

# remove sequences with ambiguous bases
max_n_bases = 0

# reporting intervals for verbose output, defines how often to print.
seqhandled_interval = 1000000
chunksize = 50000000

# fast storage large enough for all data, e.g. ~500GB+
dask_temp = ~/scratch
min_reads = 2

# applied to isolated filename (NOTE: deal with optional .gz extension) for source column.
# match everything up to first underscore.
source_regex = (.+?)_

[mapseq]
# locations of fields in standard MAPseq protocol.
# standard python slice semantics
vbc_st = 0
vbc_end = 30
spike_st = 24
spike_end = 32
libtag_st = 30
libtag_end = 32
umi_st = 32
umi_end = 44
ssi_st = 44
ssi_end = 52

[split]
# arbitrary column creation from string.
# standard python slice semantics

# Standard fields 52nt, no read tags
vbc_read_st = 0
vbc_read_end = 30
spikeseq_st = 24
spikeseq_end = 32
libtag_st = 30
libtag_end = 32

# standard nextseq/novaseq fields
umi_st = 32
umi_end = 44
ssi_st = 44
ssi_end = 52

# include nextseq 4nt field GTAC, 4nt R2 CACG
#rtag_st = 32
#rtag_end = 36
#umi_st = 36
#umi_end = 48
#ssi_st = 48
#ssi_end = 56
#rrtag_st = 56
#rrtag_end = 60

# include novaseq 5nt field GTACT, 5nt R2 CACGA
#rtag_st = 32
#rtag_end = 37
#umi_st = 37
#umi_end = 49
#ssi_st = 49
#ssi_end = 57
#rrtag_st = 57
#rrtag_end = 62

[readfilter]
# filter read fields that don't match requirement
#
# novaseq 5nt
#rtag_seq = GTACT
#rrtag_seq = CACGA
# nextseq 4nt rtag
#rtag_seq = GTAC
# nextseq 4nt rrtag
#rrtag_seq = CACG
drop_mismatch = False

[collapse]
tool = bowtie2
n_bases = 30
max_mismatch = 3
seq_length = 30
max_recursion = 30000

[bowtie2]
threads = 10

[readtable]
# T or C = YY
# last 2nt of spike-ins AND reals
# A or G = RR
# last 2nt of L1 controls
spikeseq = CGTCAGTC
# NOTE(review): the trailing '$' is a regex end-of-string anchor. If the loader uses
# configparser ExtendedInterpolation, a literal '$' must be written '$$' -- confirm.
realregex = [TC][TC]$
loneregex = [AG][AG]$
use_lones = True
use_libtag = True

[vbctable]
inj_min_reads = 2
target_min_reads = 2

[vbcfilter]
# Require that target VBCs be present in the injection area to count.
# Typically True, unless the dataset doesn't have any injection areas analyzed.
# Must be false if there are no injection samples.
require_injection = True

# Include injection area(s) in matrix. Only VBCs also in targets, though.
include_injection = True
include_controls = False

# Per-brain, threshold UMI count in target areas to only those
# more than the largest target-negative total in that brain.
use_target_negative = False
use_target_water_control = False

# minimum unique molecules (UMIs) in injection to be kept: 30
# NOTE(review): the comment above cites 30 but the configured value is 10 -- confirm which is intended.
inj_min_umi = 10

# Alternatively explicit minimum unique molecules (UMIs)
# in *any* target area to retain VBC in all areas (even if less than value).
target_min_umi = 4

# Minimum unique molecules (UMI)
target_min_umi_absolute = 1

[matrices]
#clustermap_logscale = log10