# Version 2
# Pandas/DataFrame/TSV-oriented, rather than FASTA-oriented.
# Used in HTNA 2024
#
# NOTE(review): the '#' comments and the [DEFAULT] section suggest this file is
# read by Python configparser -- confirm. Every section header and key must be
# on its own line: anything that follows a leading '#' on the same physical
# line is part of that comment and is never parsed.

[DEFAULT]
# NOTE(review): under configparser, keys in [DEFAULT] are fallbacks for every
# other section. [fastq] sets its own dask_temp, which would take precedence there.
dask_temp = ./temp

[project]
project_id = M292
project_leader = Euiseok Kim
project_institution = UCSC

[barcodes]
# Alternative barcode/SSI files, currently disabled:
#ssifile = ~/git/mapseq-processing/etc/barcode_v2.txt
#ssifile = ~/git/mapseq-processing/etc/ssi_v2.txt
ssifile = ~/git/mapseq-processing/etc/novaseq_all.txt

[fastq]
# NOTE(review): presumably slice bounds into read 1 and read 2 (32 + 20 = 52 nt,
# which matches the 0..52 field layout in [mapseq]) -- confirm.
r1start = 0
r1end = 32
r2start = 0
r2end = 20

# to remove suspect sequences with long homopolymer runs. Should be < 1%?
max_repeats = 7

# remove sequences with ambiguous bases
max_n_bases = 0

# reporting intervals for verbose output, defines how often to print.
seqhandled_interval = 1000000
chunksize = 50000000

# dask_temp: fast storage large enough for all data, e.g. ~500GB+
use_dask = False
dask_temp = ~/scratch

# unconditional threshold applied at aggregation step.
# shouldn't be needed.
min_reads = 1

# applied to isolated filename for source column.
# match everything up to first underscore.
source_regex = (.+?)_

[mapseq]
# locations of fields in standard MAPseq protocol.
# standard python slice semantics
# Note: the spike window (24:32) overlaps the end of the VBC (0:30) and
# fully contains the libtag (30:32).
vbc_st = 0
vbc_end = 30
spike_st = 24
spike_end = 32
libtag_st = 30
libtag_end = 32
umi_st = 32
umi_end = 44
ssi_st = 44
ssi_end = 52

[collapse]
tool = bowtie2
n_bases = 30
max_mismatch = 3
seq_length = 30
max_recursion = 40000

[bowtie2]
threads = 10

[readtable]
# Readtable creation is where sample metadata is mapped/added:
#
# Sampleinfo.xlsx/tsv site-type definitions:
#
# LABEL                     SOURCE          MEANING
# target                    user-provided   treated sample
# target-negative           user-provided   treated but expected to be low
#       Both handled exactly the same and included in matrices.
# injection                 user-provided   treated sample
#
# CONTROLS
# target-negative-control   user-provided   untreated sample
# target-wt-control         core added      untreated biological sample
# target-water-control      core added      empty sample
# injection-water-control   core added      empty sample
#
# T or C = YY
#   last 2nt of spike-ins AND reals
# A or G = RR
#   last 2nt of L1 controls
#
# spikeseq is 8 nt long, the same width as the spike_st:spike_end window
# (24:32) in [mapseq].
spikeseq = CGTCAGTC
realregex = [TC][TC]$
loneregex = [AG][AG]$

[vbctable]
inj_min_reads = 2
target_min_reads = 2

[vbcfilter]
# Require that target VBCs be present in the injection area to count.
# Typically True, unless the dataset doesn't have any injection areas analyzed.
# Must be False if there are no injection samples.
require_injection = False

# Include injection area(s) in matrix. Only VBCs also in targets, though.
include_injection = False

# Include non-brain-specific controls in filtered table, so they will
# be included in matrices.
include_controls = True

# Per-brain, threshold UMI count in target areas to only those
# more than the largest target-negative total in that brain.
use_target_negative = False
use_target_water_control = False

# Minimum unique molecules (UMIs) in injection to be kept.
# NOTE(review): the previous comment cited 30 here, but the configured value
# is 10 -- confirm which is intended.
inj_min_umi = 10

# Alternatively, explicit minimum unique molecules (UMIs)
# in *any* target area to retain VBC in all areas (even if less than value).
target_min_umi = 2

# Minimum unique molecules (UMIs)
target_min_umi_absolute = 1

[matrices]
#clustermap_logscale=log10
clustermap_scale = log2

# Include non-brain-specific controls in every brain matrix.
include_controls = True