
function T=myread_bowtie_block(fname,block_size)
% MYREAD_BOWTIE_BLOCK  Read selected columns of a tab-separated alignment
% text file (SAM-like output, e.g. from bowtie) in fixed-size blocks.
%
%   T = myread_bowtie_block(fname)
%   T = myread_bowtie_block(fname,block_size)
%
% Inputs
%   fname      - path of the text file to read.
%   block_size - number of characters read per block (default 1000000,
%                which the original author found fastest on their machine).
%
% Output
%   T - N-by-1 struct array with one element per parsed record:
%         QueryName     - first tab-separated field  (char)
%         Flag          - second tab-separated field (uint8)
%         ReferenceName - third tab-separated field  (char)
%         NM            - value following the 'NM:i:' tag (uint8)
%       An empty (0-by-1) struct array is returned when nothing matches.
%
% Only lines whose first three fields consist solely of word characters
% ([A-Za-z0-9_]) are picked up by the record pattern, so header lines that
% start with '@' are skipped.
%
% If, within one block, the number of parsed records differs from the
% number of 'NM:i:' tags, a warning is issued and the offending block plus
% the carried-over partial line are written to errorblock<k>.txt and
% error_remainder<k>.txt for inspection.
%
% NOTE(review): Flag and NM are stored as uint8, which saturates at 255.
% Inputs with larger values would need a wider type - confirm with callers
% before changing the output type.
%
% Original author's note: leaner than samread, 20x faster and ~10x less ram?

if nargin<2 || isempty(block_size)
    block_size=1000000;
end

fid=fopen(fname);
if fid<0
    error('myread_bowtie_block:fileOpen','Could not open file "%s".',fname);
end
% Make sure the file is closed even if parsing throws.
closer=onCleanup(@() fclose(fid)); %#ok<NASGU>

% Determine the number of blocks from the file size.
fseek(fid,0,'eof');
file_size=ftell(fid);
fseek(fid,0,'bof');
batch_num=ceil(file_size/block_size);

% One slot per block plus one for a final line lacking a trailing newline.
q1all=cell(batch_num+1,1);
nm1all=cell(batch_num+1,1);
remainder='';   % partial last line carried over to the next block
err_count=0;    % number of diagnostic dumps written so far

for n=1:batch_num
    c=fread(fid,block_size,'*char')';
    last_idx=find(c==newline,1,'last');
    if isempty(last_idx)
        % No complete line ends in this block: keep accumulating instead
        % of dropping the data.
        remainder=[remainder,c]; %#ok<AGROW>
        continue
    end
    old_remainder=remainder;
    % Leading newline lets the look-behind anchor match the first line.
    block=[newline,remainder,c(1:last_idx)];
    remainder=c(last_idx+1:end);
    [q1all{n},nm1all{n},err_count]=parse_block(block,old_remainder,n,err_count);
end

% A file that does not end with a newline leaves one last line in remainder.
if ~isempty(remainder)
    [q1all{end},nm1all{end},err_count]=parse_block( ...
        [newline,remainder,newline],remainder,batch_num+1,err_count); %#ok<ASGLU>
end

% Drop blocks that produced nothing, then stack the per-block results.
q1all=q1all(~cellfun('isempty',q1all));
nm1all=nm1all(~cellfun('isempty',nm1all));
q1all=vertcat(q1all{:});
nm1all=vertcat(nm1all{:});

if size(q1all,1)~=numel(nm1all)
    error('myread_bowtie_block:countMismatch', ...
        'Parsed %d records but %d NM tags; see the warnings above.', ...
        size(q1all,1),numel(nm1all));
end

if isempty(q1all)
    T=struct('QueryName',cell(0,1),'Flag',cell(0,1), ...
        'ReferenceName',cell(0,1),'NM',cell(0,1));
    return
end

T=struct('QueryName',q1all(:,1), ...
    'Flag',num2cell(uint8(double(string(q1all(:,2))))), ...
    'ReferenceName',q1all(:,3), ...
    'NM',num2cell(nm1all));
end

function [q,nm,err_count]=parse_block(block,old_remainder,n,err_count)
% Parse one newline-delimited block into an N-by-3 cell of the first three
% fields (q) and a uint8 column of NM values (nm). Writes diagnostic dumps
% and bumps err_count when the two counts disagree.
rec=regexp(block,'(?<=\n)[\w]*\t[\w]*\t[\w]*','match');
fields=regexp(rec,'\t','split')';
q=vertcat(fields{:});
nm_txt=regexp(block,'(?<=NM:i:)\w*','match')';
nm=uint8(double(string(nm_txt)));

if size(q,1)~=numel(nm)
    warning('split outputs did not match in block %u',n);
    err_count=err_count+1;
    dump_text("errorblock"+err_count+".txt",block);
    dump_text("error_remainder"+err_count+".txt",old_remainder);
end
end

function dump_text(out_name,txt)
% Write txt verbatim to out_name. The explicit '%s' keeps any '%' or '\'
% in the data from being interpreted as format directives.
fid_err=fopen(out_name,'w');
if fid_err<0
    warning('could not open %s for writing',out_name);
    return
end
fprintf(fid_err,'%s',txt);
fclose(fid_err);
end

