# Version 2
# Pandas/DataFrame/TSV-oriented, rather than FASTA-oriented. 
# Used in HTNA 2024


# Fallback values.
# NOTE(review): if this file is read with Python's configparser, keys in
# [DEFAULT] are visible from every other section unless that section sets
# its own value -- confirm the consuming parser.
# [fastq] sets its own dask_temp, which differs from the value here.
[DEFAULT]
dask_temp = ./temp

# Project identification metadata.
[project]
project_id = M292
project_leader = Euiseok Kim
project_institution = UCSC


# Barcode (SSI) definition file.
# The commented-out entries are alternative files from the same etc/
# directory. Keep only one ssifile line active: how duplicate keys are
# handled depends on the parser.
# NOTE(review): the path starts with "~" -- presumably expanded by the
# application; confirm.
[barcodes]
#ssifile = ~/git/mapseq-processing/etc/barcode_v2.txt
#ssifile = ~/git/mapseq-processing/etc/ssi_v2.txt
ssifile = ~/git/mapseq-processing/etc/novaseq_all.txt


# Read handling and filtering settings for the FASTQ input.
[fastq]
# Start/end positions of the portion used from read 1 and read 2.
# NOTE(review): presumably half-open, Python-slice style like the
# positions in [mapseq] -- confirm.
r1start = 0
r1end = 32
r2start = 0
r2end = 20

# Remove suspect sequences with long homopolymer runs.
# Should be < 1%?
max_repeats = 7

# Remove sequences with ambiguous bases.
max_n_bases = 0

# Reporting interval for verbose output; defines how often to print.
seqhandled_interval = 1000000
chunksize = 50000000

use_dask = False
# Fast storage large enough for all data, e.g. ~500GB+.
dask_temp = ~/scratch

# Unconditional threshold applied at the aggregation step.
# Shouldn't be needed.
min_reads = 1

# Applied to the isolated filename to produce the source column.
# Matches everything up to the first underscore.
source_regex = (.+?)_


# Locations of fields in the standard MAPseq protocol.
# Each *_st / *_end pair uses standard Python slice semantics
# (start inclusive, end exclusive).
# Note that the ranges overlap: spike (24-32) covers the tail of the
# VBC (0-30) and all of the libtag (30-32).
[mapseq]
vbc_st = 0
vbc_end = 30
spike_st = 24
spike_end = 32
libtag_st = 30
libtag_end = 32
umi_st = 32
umi_end = 44
ssi_st = 44
ssi_end = 52


# Settings for the collapse step.
# NOTE(review): tool presumably selects the aligner, whose own options
# live in the section of the same name ([bowtie2]) -- confirm.
# n_bases and seq_length both equal the 30-nt VBC span configured in
# [mapseq] (vbc_st = 0, vbc_end = 30); presumably they need to stay in
# sync with it -- confirm.
[collapse]
tool = bowtie2
n_bases = 30
max_mismatch = 3
seq_length = 30
max_recursion = 40000


# Options for bowtie2, the value of tool in [collapse].
# NOTE(review): threads is presumably the number of worker threads
# passed to bowtie2 -- confirm against the caller.
[bowtie2]
threads = 10


[readtable]
# Readtable creation is where sample metadata is mapped/added.
#
# Sampleinfo.xlsx/tsv site-type definitions:
#
#   LABEL                      SOURCE          MEANING
#   target                     user-provided   treated sample
#   target-negative            user-provided   treated but expected to be low
#     (both are handled exactly the same and included in matrices)
#   injection                  user-provided   treated sample
#
#   Controls:
#   target-negative-control    user-provided   untreated sample
#   target-wt-control          core added      untreated biological sample
#   target-water-control       core added      empty sample
#   injection-water-control    core added      empty sample
#
# Spike-in sequence; 8 nt, the same length as the spike range (24-32)
# in [mapseq].
spikeseq = CGTCAGTC
# Last 2 nt are T or C ("YY"): spike-ins AND reals.
realregex = [TC][TC]$
# Last 2 nt are A or G ("RR"): L1 controls.
loneregex = [AG][AG]$


# NOTE(review): presumably the minimum read counts applied to injection
# and target samples when the VBC table is built -- confirm against the
# consuming code.
[vbctable]
inj_min_reads = 2
target_min_reads = 2


[vbcfilter]
# Require that target VBCs be present in the injection area to count.
# Typically True, unless the dataset doesn't have any injection areas
# analyzed. Must be False if there are no injection samples.
require_injection = False

# Include injection area(s) in the matrix. Only VBCs also in targets, though.
include_injection = False

# Include non-brain-specific controls in the filtered table, so that they
# will be included in matrices.
include_controls = True

# Per brain, threshold the UMI count in target areas to only those
# greater than the largest target-negative total in that brain.
use_target_negative = False
# NOTE(review): undocumented; presumably the same kind of thresholding
# based on target-water-control samples -- confirm.
use_target_water_control = False

# Minimum unique molecules (UMIs) in injection for a VBC to be kept.
# NOTE(review): a previous version of this comment cited 30, but the
# configured value is 10 -- confirm which is intended.
inj_min_umi = 10

# Alternatively, an explicit minimum number of unique molecules (UMIs)
# in *any* target area for the VBC to be retained in all areas
# (even where its count is less than this value).
target_min_umi = 2

# Minimum unique molecules (UMIs).
# NOTE(review): scope is not documented; presumably an absolute floor,
# in contrast to the "any target area" rule of target_min_umi -- confirm.
target_min_umi_absolute = 1


[matrices]
# Scale used for clustermaps.
# NOTE(review): the commented-out line uses a different key name
# (clustermap_logscale) and value (log10); presumably a superseded
# option kept for reference -- confirm or remove.
#clustermap_logscale=log10
clustermap_scale = log2

# Include non-brain-specific controls in every brain matrix.
include_controls = True