# Version 2
# Pandas/DataFrame/TSV-oriented, rather than FASTA-oriented. 
# Used in HTNA 2024

[project]
# Project identification metadata.
project_id = M295
project_leader = Polina Kosillo
project_institution = Allen Institute


[barcodes]
# Barcode file to use (ssifile). Only the last line is active; the
# commented-out lines are alternative barcode files.
#ssifile = ~/git/mapseq-processing/etc/barcode_v2.txt
#ssifile = ~/git/mapseq-processing/etc/ssi_v2.txt
ssifile = ~/git/mapseq-processing/etc/novaseq_all.txt


[fastq]
# Start/end bounds for read 1 (r1) and read 2 (r2).
# The active values span 32 nt of r1 and 20 nt of r2 (52 nt combined).
# NOTE(review): presumably standard python slice semantics, as stated in
# [mapseq] and [split] - confirm.
r1start = 0
r1end = 32
# to include the 4nt nextseq rtag:
#r1end = 36
# to include the 5nt novaseq rtag:
#r1end = 37
r2start = 0
r2end = 20

# to include the 4nt nextseq rrtag:
#r2end = 24

# to include the 5nt nextseq/novaseq rrtag:
#r2end = 25


# Removes suspect sequences with long homopolymer runs. Should be < 1%?
max_repeats = 7
# Removes sequences with ambiguous bases.
max_n_bases = 0
# Reporting interval for verbose output; defines how often to print.
seqhandled_interval = 1000000
chunksize = 50000000
# Fast storage large enough for all data, e.g. ~500GB+
dask_temp = ~/scratch
# NOTE(review): undocumented; by name a minimum read-count threshold - confirm.
min_reads = 2
# Regex applied to the isolated filename to produce the source column.
# (NOTE: deal with optional .gz extension)
# Matches everything up to the first underscore.
source_regex = (.+?)_

[mapseq]
# Locations of fields in the standard MAPseq protocol.
# Each *_st/*_end pair follows standard python slice semantics.
# Note that the windows overlap: spike (24-32) spans the last 6 nt of
# vbc (0-30) and all of libtag (30-32).
vbc_st = 0
vbc_end = 30
spike_st = 24
spike_end = 32
libtag_st = 30
libtag_end = 32
umi_st = 32
umi_end = 44
ssi_st = 44
ssi_end = 52

[split]
# Arbitrary column creation from a string.
# Each *_st/*_end pair follows standard python slice semantics.
# Standard fields: 52nt, no read tags.
vbc_read_st = 0
vbc_read_end = 30
spikeseq_st = 24
spikeseq_end = 32
libtag_st = 30
libtag_end = 32

# Standard nextseq/novaseq fields.
umi_st = 32
umi_end = 44
ssi_st = 44
ssi_end = 52

# Alternative layout: include nextseq 4nt field GTAC, 4nt R2 CACG.
# These lines redefine umi_*/ssi_*, so they replace the active values above
# when uncommented.
#rtag_st = 32
#rtag_end = 36
#umi_st = 36
#umi_end = 48
#ssi_st = 48
#ssi_end = 56
#rrtag_st = 56
#rrtag_end = 60


# Alternative layout: include novaseq 5nt field GTACT, 5nt R2 CACGA.
# These lines redefine umi_*/ssi_*, so they replace the active values above
# when uncommented.
#rtag_st = 32
#rtag_end = 37
#umi_st = 37
#umi_end = 49
#ssi_st = 49
#ssi_end = 57
#rrtag_st = 57
#rrtag_end = 62

[readfilter]
# Filter read fields that don't match the required sequence.
# All rtag_seq/rrtag_seq entries below are currently commented out;
# uncomment the ones that apply.
#
# novaseq 5nt rtag and rrtag
#rtag_seq = GTACT
#rrtag_seq = CACGA

# nextseq 4nt rtag
#rtag_seq = GTAC

# nextseq 4nt rrtag
#rrtag_seq = CACG

# NOTE(review): presumably controls whether reads that fail the tag match
# are dropped - confirm against the consuming code.
drop_mismatch = False


[collapse]
# Settings for the collapse step. The tool named here has its own section
# ([bowtie2]) in this file.
# NOTE(review): the meaning of the individual values is not documented here.
# n_bases and seq_length both equal the 30nt vbc window (0-30) defined in
# [mapseq]/[split] - confirm whether they must be kept in sync with it.
tool = bowtie2
n_bases = 30
max_mismatch = 3
seq_length = 30
max_recursion = 30000

[bowtie2]
# Options for bowtie2, the tool selected by `tool` in [collapse].
# NOTE(review): presumably the thread count handed to bowtie2 - confirm.
threads = 10

[readtable]
# Spike-in sequence (8nt).
spikeseq = CGTCAGTC
# Y = T or C: last 2nt (YY) of spike-ins AND reals.
realregex = [TC][TC]$
# R = A or G: last 2nt (RR) of L1 controls.
loneregex = [AG][AG]$
use_lones = True
use_libtag = True

[vbctable]
# Minimum read counts, set separately for injection (inj) and target.
# NOTE(review): how these thresholds are applied is not shown in this file - confirm.
inj_min_reads = 2
target_min_reads = 2

[vbcfilter]

# Require that target VBCs be present in the injection area to count.
# Typically True, unless the dataset doesn't have any injection areas analyzed.
# Must be False if there are no injection samples.
require_injection = True

# Include injection area(s) in the matrix. Only VBCs also in targets, though.
include_injection = True
include_controls = False

# Per-brain, threshold UMI count in target areas to only those
# more than the largest target-negative total in that brain.
use_target_negative = False
use_target_water_control = False

# Minimum unique molecules (UMIs) in injection for a VBC to be kept.
# NOTE(review): this comment previously cited 30, but the value set here is 10 -
# confirm which is intended.
inj_min_umi = 10

# Alternatively, an explicit minimum number of unique molecules (UMIs)
# in *any* target area to retain a VBC in all areas (even those where it has
# less than this value).
target_min_umi = 4

# Minimum unique molecules (UMIs).
# NOTE(review): the original comment was incomplete; by name this looks like an
# absolute floor applied regardless of target_min_umi - confirm.
target_min_umi_absolute = 1


[matrices]
# No active settings; the only option in this section is commented out.
#clustermap_logscale=log10