
function conn=match_bc2neurons(filt_neurons,bc,varargin)
% match bc barcodes to filt_neurons.
%
% Barcodes in bc.seq are aligned against the soma barcodes of the neurons
% flagged by filt_neurons.is_barcoded using bowtie2 (requires the bowtie2
% toolbox extension; see the note in the bowtie section below). The result
% is returned and also saved to a .mat file.
%
% Fields read from the inputs:
%           filt_neurons: uid, soma_bc, soma_bc_hd, is_barcoded, and slice
%               (slice only when filterslice is non-empty).
%           bc: uid, seq, and slice (slice only when filterslice is
%               non-empty).
% Optional inputs (name/value pairs, names are case-insensitive):
%           mismatch: number of mismatches allowed, default 1.
%           filterslice: slices that will be filtered out, relative to soma
%               slice number. For example, [-1 0] means filtering out the
%               previous slice and the soma slice. Default is [] (no
%               filtering).
%           allowduplicates: whether to allow bc rolonies to match to
%               multiple cells. Default is 0. This affects filt_mat, not
%               mat.
%           output: filename to which conn is saved. Should be a .mat
%               filename-compatible string/char array. Default is conn.mat.
% Output conn with the following fields:
%           neuron_uid: corresponds to filt_neurons.uid and rows of
%               matrices.
%           bc_uid: corresponds to bc.uid and columns of matrices.
%           match: number of matches for valid matching barcodes. Equals
%               barcode length - NM tag in bowtie output. This includes
%               matching barcodes that are non-best matches, but within the
%               mismatch tolerance.
%           mat: sparse logical matrix that indicates which neuron each
%               barcode matches to. Allows a barcode to match to multiple
%               neurons if they are equally best matches.
%           filt_mat: sparse logical matrix that indicates barcode matching
%               after filtering by slice numbers and by duplicates. If
%               allowduplicates is nonzero and filterslice == [], then this
%               is the same as mat.
% Side effects:
%           Creates the folders 'fasta', 'bowtie_ref' and 'bowtie_output'
%           in the current folder (if missing), writes the fasta, index and
%           alignment files into them, and saves conn to the output file.

% check if uid is present for both neurons and bc
if ~isfield(filt_neurons,'uid')||~isfield(bc,'uid')
    % no '\n' here: error() does not convert escape sequences in this
    % message, so a trailing '\n' would be shown literally.
    error('match_bc2neurons:missingUid', ...
        'uid is absent for filt_neurons or bc. Add uid first before matching barcodes.');
end

% default thresholds
mismatch_tolerance=1;
filterslice=[];
allow_duplicates=0;
fname='conn.mat';

% parse thresholds
if ~isempty(varargin)
    if mod(numel(varargin),2)~=0
        error('match_bc2neurons:badOptions', ...
            'Optional inputs must be given as name/value pairs.');
    end
    varargin=reshape(varargin,2,[]);
    for i=1:size(varargin,2)
        switch lower(varargin{1,i})
            case 'mismatch'
                mismatch_tolerance=varargin{2,i};
                assert(isscalar(mismatch_tolerance) ...
                    &&mismatch_tolerance==round(mismatch_tolerance) ...
                    &&mismatch_tolerance>=0, ...
                    'mismatch should be non-negative integer. Abort.');
            case 'filterslice'
                filterslice=varargin{2,i};
                % (:) so that matrices are checked element-wise as well
                assert(all(round(filterslice(:))==filterslice(:)), ...
                    'filterslice should be integer array or scalar. Abort.');
            case 'allowduplicates'
                allow_duplicates=varargin{2,i};
                assert(isnumeric(allow_duplicates)||islogical(allow_duplicates), ...
                    'allow_duplicates should be 0 or 1. Abort.');
            case 'output'
                fname=varargin{2,i};
            otherwise
                % previously ignored silently; keep going but tell the user
                warning('match_bc2neurons:unknownOption', ...
                    'Ignoring unrecognized option name at argument position %d.',2*i-1);
        end
    end
end
%% make unified soma sequences
% only match to neurons that are is_barcoded
% if soma_bc_hd exists (not all 5s), substitute soma_bc with soma_bc_hd
% then convert sequences to 'GTAC'

somabc=filt_neurons.soma_bc;
hd=filt_neurons.soma_bc_hd(:,1)~=5; % 5 is N which is used when soma_bc_hd is not called.
somabc(hd,:)=filt_neurons.soma_bc_hd(hd,:); %use hd if available

dic='GTACN'; % base codes 1..5 index into this lookup
somabcC=dic(somabc);
bcC=dic(bc.seq);

%% write the sequences to fasta files and run bowtie2. Need wsl and bowtie2 toolbox extension installed
% write fasta files
tic
fasta_folder='fasta';
if ~isfolder(fasta_folder) % avoid the "directory already exists" warning on reruns
    mkdir(fasta_folder);
end
myfastawrite(fullfile(fasta_folder,'neurons.fa'), ...
    filt_neurons.uid(filt_neurons.is_barcoded), ...
    somabcC(filt_neurons.is_barcoded,:));
myfastawrite(fullfile(fasta_folder,'bc.fa'), ...
    bc.uid, ...
    bcC);
t1=toc;
fprintf('Writing fasta files took %.2g secs.\n',t1);


% make bowtie index
tic
ref_folder='bowtie_ref';
if ~isfolder(ref_folder)
    mkdir(ref_folder);
end
bowtie2build(fullfile(fasta_folder,'neurons.fa'),fullfile(ref_folder,'ref'));
t2=toc;
fprintf('Building bowtie indices took %.2g secs.\n',t2);

% run bowtie

numcores=feature('numcores');
[~,mem]=memory;
% one thread per 2 GB of physical memory, capped at 2x the core count, and
% never fewer than one thread
maxcore=max(1,min(round(numcores*2), floor(mem.PhysicalMemory.Total/2000000000)));

alignOptions=Bowtie2AlignOptions('AlignReverseComplementStrand',0, ...
    'NoGapPosition',1, ...
    'ReadGapCosts',[10 3], ... % high cost to discourage gaps
    'RefGapCosts',[10,3], ... % high cost to discourage gaps
    'NumAlignments','All', ...
    'NumThreads',maxcore, ...
    'NumReseeding',3, ...
    'MismatchPenalty',[4 2], ... % this may need to change if more tolerance is needed than 2
    'SeedLength',8, ...
    'ExcludeUnaligned',1, ...
    'NumSeedMismatches',0);
tic

output_dir='bowtie_output';
if ~isfolder(output_dir)
    mkdir(output_dir);
end
bowtie2(fullfile(ref_folder,'ref'), ...
    fullfile(fasta_folder,'bc.fa'),'', ...
    fullfile(output_dir,'results_bowtie.txt'), ...
    alignOptions);
t4=toc;
fprintf('bowtie matching took %.2g secs.\n',t4);
% read bowtie results
fprintf('Reading bowtie results ...');
tic
alignment=samread(fullfile(output_dir,'results_bowtie.txt'));
t5=toc;
fprintf('Done, took %.2g secs.\n', t5);

%% filter by mismatch tolerance
fprintf('Filtering out barcodes with more than %u mismatches ...',mismatch_tolerance);

tags={alignment.Tags};
nm=double(cellfun(@(x) x.NM,tags)); % number of edits
%xm=cellfun(@(x) x.XM,tags); % number of substitutions. With large gap penalty, this should be the same as nm
% apply mismatch threshold
keep=nm<=mismatch_tolerance;
alignment=alignment(keep);
nm=nm(keep);

fprintf('Done.\n');

%% build connectivity matrix without other filtering (sparse logical)

% number of matching positions for every remaining alignment (column vector)
match=size(filt_neurons.soma_bc,2)-nm(:);

[~,bc_idx]=ismember({alignment.QueryName},bc.uid);
[~,neuron_idx]=ismember({alignment.ReferenceName},filt_neurons.uid);
bc_idx=bc_idx(:);
neuron_idx=neuron_idx(:);
% don't do any other filtering, but only keep the best (possibly tied)
% matches

conn=struct;
conn.neuron_uid=filt_neurons.uid;
conn.bc_uid=bc.uid;

conn.match=sparse(neuron_idx, ...
    bc_idx, ...
    match, ...
    numel(filt_neurons.uid), ...
    numel(bc.uid));


% remove non-best matches
fprintf('Filtering out non-best matches ...');
% For every barcode keep only the alignments whose match count equals the
% highest match count of that barcode (ties are all kept). This does not
% rely on the primary alignment having survived the mismatch filter above;
% looking up the primary record by name fails with an invalid index when
% the primary was removed but a secondary alignment was kept.
if isempty(match)
    is_best=false(0,1);
else
    best_match=accumarray(bc_idx,match,[numel(bc.uid) 1],@max);
    is_best=match==best_match(bc_idx);
end
conn.mat=sparse(neuron_idx(is_best), ...
    bc_idx(is_best), ...
    true(nnz(is_best),1), ...
    numel(filt_neurons.uid), ...
    numel(bc.uid)); % connectivity matrix with allowed mismatches, and only best matches

fprintf('Done.\n');

% % equivalent but slow reference implementation:
% tic
% conn.mat(conn.match>0&conn.match~=max(conn.match,[],1))=0;% remove all connections that are not highest scored for that barcode
% toc


%% filter out barcodes by slice and by duplicates.
% note that filtering by duplicates should be done after duplicates are
% removed in filt_neurons, especially those due to technical reasons (i.e.,
% a neuron imaged twice in overlapping FOVs, or neurons sliced in half).

% check slices
if ~isempty(filterslice)
    fprintf('Filtering barcodes based on slice numbers.\n');
    [neuron_idx1, bc_idx1]=find(conn.mat);
    neuron_idx1=neuron_idx1(:);
    bc_idx1=bc_idx1(:);
    % force columns so that the comparison below is element-wise even if
    % the two slice vectors are stored with different orientations
    neuron_slice=reshape(filt_neurons.slice(neuron_idx1),[],1);
    bc_slice=reshape(bc.slice(bc_idx1),[],1);
    remove_slice=false(numel(neuron_idx1),1);
    for m=1:numel(filterslice)
        remove_slice(bc_slice==neuron_slice+filterslice(m))=true;
    end
    conn.filt_mat=sparse(neuron_idx1(~remove_slice), ...
        bc_idx1(~remove_slice), ...
        true(nnz(~remove_slice),1), ...
        numel(filt_neurons.uid), ...
        numel(bc.uid)); % this is the connectivity matrix with slice filtering applied
else
    fprintf('Skipping filtering based on slice numbers.\n');
    conn.filt_mat=conn.mat;
end

% filter out duplicates if applicable
% note: duplicates are counted in conn.mat, i.e. before slice filtering, so
% a barcode that matched several neurons is removed even if the slice
% filter left only one of those matches.
if allow_duplicates==0
    conn.filt_mat(:,sum(conn.mat,1)>1)=0;
end

save(fname,'conn');
fprintf('Saved connectivity to %s.\n', fname);



% from bowtie2 manual:
% 
% AS:i:<N>
% Alignment score. Can be negative. Can be greater than 0 in --local mode 
% (but not in --end-to-end mode). Only present if SAM record is for an 
% aligned read.
% 
% XS:i:<N>
% Alignment score for the best-scoring alignment found other than the 
% alignment reported. Can be negative. Can be greater than 0 in --local 
% mode (but not in --end-to-end mode). Only present if the SAM record is 
% for an aligned read and more than one alignment was found for the read. 
% Note that, when the read is part of a concordantly-aligned pair, this 
% score could be greater than AS:i.
% 
% YS:i:<N>
% Alignment score for opposite mate in the paired-end alignment. Only 
% present if the SAM record is for a read that aligned as part of a 
% paired-end alignment.
% 
% XN:i:<N>
% The number of ambiguous bases in the reference covering this alignment. 
% Only present if SAM record is for an aligned read.
% 
% XM:i:<N>
% The number of mismatches in the alignment. Only present if SAM record 
% is for an aligned read.
% 
% XO:i:<N>
% The number of gap opens, for both read and reference gaps, in the 
% alignment. Only present if SAM record is for an aligned read.
% 
% XG:i:<N>
% The number of gap extensions, for both read and reference gaps, in the 
% alignment. Only present if SAM record is for an aligned read.
% 
% NM:i:<N>
% The edit distance; that is, the minimal number of one-nucleotide edits 
% (substitutions, insertions and deletions) needed to transform the read 
% string into the reference string. Only present if SAM record is for an 
% aligned read.
% 
% YF:Z:<S>
% String indicating reason why the read was filtered out. See also: 
% Filtering. Only appears for reads that were filtered out.
% 
% YT:Z:<S>
% Value of UU indicates the read was not part of a pair. Value of CP 
% indicates the read was part of a pair and the pair aligned concordantly. 
% Value of DP indicates the read was part of a pair and the pair aligned 
% discordantly. Value of UP indicates the read was part of a pair but the 
% pair failed to aligned either concordantly or discordantly.
% 
% MD:Z:<S>
% A string representation of the mismatched reference bases in the 
% alignment. See SAM Tags format specification for details. Only present 
% if SAM record is for an aligned read.

end

function myfastawrite(fname,bcid,bcC)
%% custom way of writing fasta, 10x faster than built in for large arrays.
% Inputs:
%   fname: path of the file to write. An existing file is overwritten after
%          a warning.
%   bcid:  record identifiers, one per sequence; any input accepted by
%          string() (e.g. cell array of char vectors, string array, numeric
%          vector, or char matrix with one id per row). Row or column.
%   bcC:   sequences; a char matrix with one sequence per row, or a
%          string/cell array with one sequence per element.
% Each record is written as '>' + id, a newline, the sequence, and two
% newlines (i.e. records are separated by one blank line).

% normalise both inputs to string columns so that row-oriented ids do not
% get implicitly expanded against the column of sequences
ids=string(bcid);
ids=ids(:);
if isempty(ids)
    seqs=strings(0,1);
else
    seqs=string(bcC);
    seqs=seqs(:);
end
assert(numel(ids)==numel(seqs), ...
    'myfastawrite:sizeMismatch', ...
    'Number of ids (%d) does not match number of sequences (%d).', ...
    numel(ids),numel(seqs));

% build one string per record, then concatenate them directly. Converting
% the string array to a char matrix instead would right-pad shorter records
% with spaces, which end up in front of the next '>' header when ids differ
% in length.
if isempty(ids)
    txt='';
else
    records=">"+ids+newline+seqs+newline+newline;
    txt=char(join(records,"",1));
end

%write to file
if isfile(fname)
    warning('%s exists, overwriting.\n',fname)
end

fid=fopen(fname,'w');
if fid==-1
    error('myfastawrite:cannotOpen','Cannot open %s for writing.',fname);
end
closer=onCleanup(@() fclose(fid)); % close the file even if writing fails
fprintf(fid,'%s',txt);

end
