/*===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 */

/*==========================================================================
 * NCBI PacBio Fastq Sequence Read Archive schema
 */
// version declaration for this schema file
// NOTE(review): presumably the schema-language version, not a table version - confirm
version 1;

// names used below that are not declared in this file ( INSDC:* and NCBI:*
// types, parent tables, functions, encodings ) are expected to resolve
// through these includes, directly or transitively
include 'insdc/sra.vschema';
include 'ncbi/sra.vschema';


/*--------------------------------------------------------------------------
 * NCBI:SRA:PacBio
 *  Pacific Biosciences SRA Platform
 *
 * history:
 *  1.0.2 - updated ancestry
 *  1.0.3 - updated ancestry
 *  1.0.4 - updated ancestry
 */
table NCBI:SRA:PacBio:common #1.0.4 = NCBI:SRA:tbl:sra #2.1.4
{
    // no members are declared here: the table consists solely of what it
    // inherits from NCBI:SRA:tbl:sra #2.1.4. Within this file it is used
    // as a parent of NCBI:SRA:PacBio:smrt:fastq.
}

/* history:
 *  1.0.2 - updated ancestry
 *  1.0.3 - updated ancestry
 *  1.0.4 - updated ancestry
 *  2.0.0 - updated ancestry
 */
table NCBI:SRA:PacBio:smrt:fastq #2
    = NCBI:SRA:PacBio:common #1.0.4
    , NCBI:tbl:base_space #3
    , NCBI:tbl:phred_quality #2.0
{
    // Parents: the PacBio common table declared in this file, plus the
    // base_space and phred_quality tables, which are declared elsewhere.
    // Names used below without a declaration in this table
    // ( out_read_start, out_read_type, spot_len ) are expected to be
    // supplied by those parents.

    /* PLATFORM
     *  platform name is always "PACBIO_SMRT"
     */
    ascii platform_name
        = < ascii > echo < "PACBIO_SMRT" > ();

    /* TRIMMED SEQUENCE
     *  need to find the 0-based trim_start and trim_len
     */
    // bio_start is computed by NCBI:SRA:bio_start from out_read_start
    // and out_read_type
    INSDC:coord:zero bio_start
        = NCBI:SRA:bio_start ( out_read_start, out_read_type );

    // no clip columns are consulted in this table ( compare
    // NCBI:SRA:PacBio:smrt:sequence ): trim_start is simply bio_start
    INSDC:coord:zero trim_start = bio_start;

    // trim_left is trim_start cast to U32 so it can be passed to the
    // U32 diff below
    U32 trim_left = ( U32 ) trim_start;
    // trim_len is diff ( spot_len, trim_left ) cast to INSDC:coord:len
    // NOTE(review): presumably spot_len - trim_left, i.e. everything from
    // trim_start to the end of the spot - confirm diff's operand order
    INSDC:coord:len trim_len = ( INSDC:coord:len )
        < U32 > diff ( spot_len, trim_left );
}

/*--------------------------------------------------------------------------
 * NCBI:SRA:PacBio:smrt:db
 *  Pacific Biosciences SRA Platform
 */
table NCBI:SRA:PacBio:smrt:indelsubst #1
{
    // This table declares no parent. It only contributes the five columns
    // below and is listed as a parent of NCBI:SRA:PacBio:smrt:basecalls.
    // The *_QV columns are U8 and the *_TAG columns are INSDC:dna:text;
    // all five are stored with zip_encoding.

    // probability that the current base is an insertion
    column < U8 > zip_encoding INSERTION_QV;

    // probability of a deletion error following current base
    // and identity of deleted base, if it exists
    column < U8 > zip_encoding DELETION_QV;
    column < INSDC:dna:text > zip_encoding DELETION_TAG;

    // probability of a substitution error
    // and most likely alternative base call
    column < U8 > zip_encoding SUBSTITUTION_QV;
    column < INSDC:dna:text > zip_encoding SUBSTITUTION_TAG;
};

// Hole status code type ( a U8 ) and its named values.
// PacBio:hole:status is the element type of the HOLE_STATUS column in
// NCBI:SRA:PacBio:smrt:basecalls; the named values are consecutive,
// 0 through 8. Statuses that do not match any of these can be described
// with the HOLE_STATUS_VALUE / HOLE_STATUS_VALUE_LEN columns of that table.
typedef U8 PacBio:hole:status;
const PacBio:hole:status PacBio:hole:SEQUENCING  = 0;
const PacBio:hole:status PacBio:hole:ANTIHOLE    = 1;
const PacBio:hole:status PacBio:hole:FIDUCIAL    = 2;
const PacBio:hole:status PacBio:hole:SUSPECT     = 3;
const PacBio:hole:status PacBio:hole:ANTIMIRROR  = 4;
const PacBio:hole:status PacBio:hole:FDZMW       = 5;
const PacBio:hole:status PacBio:hole:FBZMW       = 6;
const PacBio:hole:status PacBio:hole:ANTIBEAMLET = 7;
const PacBio:hole:status PacBio:hole:OUTSIDEFOV  = 8;

/* history:
 *  1.0.1 - updated ancestry
 *  1.0.2 - updated ancestry
 *  2.0.0 - updated ancestry
 */
table NCBI:SRA:PacBio:smrt:basecalls #2
    = INSDC:SRA:tbl:spotcoord #1
    , NCBI:tbl:base_space #3
    , NCBI:tbl:phred_quality #2.0.3
    , NCBI:SRA:PacBio:smrt:indelsubst #1
{
    // Shared parent of NCBI:SRA:PacBio:smrt:sequence and
    // NCBI:SRA:PacBio:smrt:cons. Names used below without a declaration
    // in this table ( out_dna_text, out_qual_phred, out_x_coord,
    // out_y_coord, base_space_spot_len ) are expected to be supplied by
    // the parents listed above.

    /* PLATFORM
     *  platform name is always "PACBIO_SMRT"
     */
    ascii platform_name
        = < ascii > echo < "PACBIO_SMRT" > ();

    // basecalls will be routed to READ column
    // BASECALL is declared readonly and takes its value from out_dna_text
    readonly column INSDC:dna:text BASECALL
        = out_dna_text;

    // quality value for each base
    // QUALITY_VALUE is declared readonly and takes its value from
    // out_qual_phred
    readonly column INSDC:quality:phred QUALITY_VALUE
        = out_qual_phred;

    // zero-based hole number
    column < U32 > izip_encoding HOLE_NUMBER;

    // hole status
    // element type is the PacBio:hole:status typedef ( U8 ) declared
    // before this table
    column < PacBio:hole:status > zip_encoding HOLE_STATUS;

    // optional column pair to describe hole status
    // when/if it does not line up with our constants above
    column < ascii > zip_encoding HOLE_STATUS_VALUE;
    column < INSDC:coord:len > izip_encoding HOLE_STATUS_VALUE_LEN;

    // hole ( X,Y ) pair will be split and sent to X and Y columns
    // HOLE_XY is a two-element I16 vector; its value is produced by
    // pasting x_clip_I16 and y_clip_I16 together
    column I16 [ 2 ] HOLE_XY
        = < I16 > paste ( x_clip_I16, y_clip_I16 );
    // out_x_coord / out_y_coord cast down to I16
    // NOTE(review): the "clip" in the names suggests out-of-range values
    // are clamped by this cast - confirm
    I16 x_clip_I16 = cast ( out_x_coord );
    I16 y_clip_I16 = cast ( out_y_coord );

    // element 0 of HOLE_XY is taken as X, element 1 as Y
    I16 in_x16_coord = < I16 > cut < 0 > ( HOLE_XY );
    I16 in_y16_coord = < I16 > cut < 1 > ( HOLE_XY );

    // the I16 halves cast to INSDC:coord:val
    // NOTE(review): in_x_coord / in_y_coord are presumably consumed by
    // INSDC:SRA:tbl:spotcoord as the inputs of its X and Y columns - confirm
    INSDC:coord:val in_x_coord = cast ( in_x16_coord );
    INSDC:coord:val in_y_coord = cast ( in_y16_coord );

    // the number of bases in ZMW
    // NUM_EVENT is declared readonly and takes its value from
    // base_space_spot_len
    readonly column INSDC:coord:len NUM_EVENT
        = base_space_spot_len;
};

/* history:
 *  1.0.1 - updated ancestry
 *  1.0.2 - updated ancestry
 *  1.0.3 - updated ancestry
 *  2.0.0 - updated ancestry
 */
table NCBI:SRA:PacBio:smrt:sequence #2
    = NCBI:SRA:PacBio:smrt:basecalls #2
    , NCBI:SRA:tbl:sra_nopos #2.1.4
{
    // This table type backs the SEQUENCE member of
    // NCBI:SRA:PacBio:smrt:db. It adds pulse information and quality
    // clipping on top of what NCBI:SRA:PacBio:smrt:basecalls declares.

    // pulse information
    column < U16 > izip_encoding PRE_BASE_FRAMES;
    column < U16 > izip_encoding WIDTH_IN_FRAMES;

    // spot to pulse map
    // PULSE_INDEX is declared three times with different element types,
    // all reading the single physical column .PULSE_INDEX:
    //   default  INSDC:position:zero - the stored value as is
    //   readonly INSDC:position:one  - sum < 1 > of the stored value,
    //                                  i.e. zero-based turned one-based
    //   NCBI:SRA:pos16               - the stored value cast to pos16
    default column INSDC:position:zero PULSE_INDEX
        = .PULSE_INDEX;
    readonly column INSDC:position:one PULSE_INDEX
        = out_position;
    INSDC:position:one out_position
        = ( INSDC:position:one ) < INSDC:position:zero > sum < 1 > ( .PULSE_INDEX );

    column NCBI:SRA:pos16 PULSE_INDEX
        = cast ( .PULSE_INDEX );
    // the pos16-typed PULSE_INDEX, feeding the second alternative below
    NCBI:SRA:pos16 in_pulse_index16
        = PULSE_INDEX;

    // value handed to the physical column: PULSE_INDEX as
    // INSDC:position:zero, or else the pos16 value cast to that type
    // NOTE(review): '|' separates alternatives, presumably tried in the
    // order written with the first resolvable one used - confirm
    INSDC:position:zero in_pulse_index32
        = PULSE_INDEX
        | cast ( in_pulse_index16 );

    // the one stored form of the pulse index: zero-based, izip_encoding
    physical column < INSDC:position:zero > izip_encoding .PULSE_INDEX
        = in_pulse_index32;

    /* clip quality */
    // note the differing element types: LEFT is INSDC:coord:zero,
    // RIGHT is INSDC:coord:one
    extern column < INSDC:coord:zero > izip_encoding CLIP_QUALITY_LEFT;
    extern column < INSDC:coord:one > izip_encoding CLIP_QUALITY_RIGHT;

    /* TRIMMED SEQUENCE
     *  need to find the 0-based trim_start and trim_len
     */
    // stored CLIP_QUALITY_LEFT first, NCBI:SRA:bio_start as the
    // alternative
    INSDC:coord:zero trim_start
        = .CLIP_QUALITY_LEFT
        | NCBI:SRA:bio_start ( out_read_start, out_read_type );

    // stored CLIP_QUALITY_RIGHT cast to U32 first, spot_len as the
    // alternative
    U32 trim_right
        = ( U32 ) .CLIP_QUALITY_RIGHT
        | spot_len;

    // trim_left is trim_start cast to U32 so it can be passed to the
    // U32 diff below
    U32 trim_left = ( U32 ) trim_start;
    // trim_len is diff ( trim_right, trim_left ) cast to INSDC:coord:len
    // NOTE(review): presumably trim_right - trim_left; with a one-based
    // inclusive right edge and a zero-based left edge that difference is
    // the trimmed length without any +/-1 adjustment - confirm
    INSDC:coord:len trim_len = ( INSDC:coord:len )
        < U32 > diff ( trim_right, trim_left );
};

/* history:
 *  1.0.1 - updated ancestry
 *  1.0.2 - updated ancestry
 *  1.0.3 - updated ancestry
 *  2.0.0 - updated ancestry
 */
table NCBI:SRA:PacBio:smrt:cons #2
    = NCBI:SRA:PacBio:smrt:basecalls #2
    , NCBI:SRA:tbl:sra #2.1.4
{
    // This table type backs the CONSENSUS member of
    // NCBI:SRA:PacBio:smrt:db. Unlike NCBI:SRA:PacBio:smrt:sequence it
    // declares no pulse or clip columns and inherits from
    // NCBI:SRA:tbl:sra rather than NCBI:SRA:tbl:sra_nopos.

    // documented in both hdf5 and xsd as signed...
    column < I32 > izip_encoding NUM_PASSES;

    /* TRIMMED SEQUENCE
     *  need to find the 0-based trim_start and trim_len
     */
    // no clip columns here: trim_start comes straight from
    // NCBI:SRA:bio_start
    INSDC:coord:zero trim_start
        = NCBI:SRA:bio_start ( out_read_start, out_read_type );

    // trim_left is trim_start cast to U32 so it can be passed to the
    // U32 diff below
    U32 trim_left = ( U32 ) trim_start;
    // trim_len is diff ( spot_len, trim_left ) cast to INSDC:coord:len
    // NOTE(review): presumably spot_len - trim_left - confirm diff's
    // operand order
    INSDC:coord:len trim_len = ( INSDC:coord:len )
        < U32 > diff ( spot_len, trim_left );
};

/* these encoding rules attempt to compress the channels individually,
   although they may compress fine interleaved as they are... */
physical
F32 [ 4 ] NCBI:SRA:PacBio:smrt:F32_4ch_encoding #1.0 < U32 mantissa >
{
    /* DECODE
     *  each of the four channels is taken out of the stored data with
     *  split, expanded with funzip, and the four F32 results are pasted
     *  back together
     */
    decode
    {
        // channel 0
        fzip_fmt zipped0 = split < 0 > ( @ );
        F32 chan0 = funzip ( zipped0 );

        // channel 1
        fzip_fmt zipped1 = split < 1 > ( @ );
        F32 chan1 = funzip ( zipped1 );

        // channel 2
        fzip_fmt zipped2 = split < 2 > ( @ );
        F32 chan2 = funzip ( zipped2 );

        // channel 3
        fzip_fmt zipped3 = split < 3 > ( @ );
        F32 chan3 = funzip ( zipped3 );

        return < F32 > paste ( chan0, chan1, chan2, chan3 );
    }

    /* ENCODE
     *  the mirror image of decode: each channel is cut out of the input,
     *  compressed with fzip using the "mantissa" schema parameter, and
     *  the four compressed channels are merged
     */
    encode
    {
        // channel 0
        F32 chan0 = < F32 > cut < 0 > ( @ );
        fzip_fmt zipped0 = fzip < mantissa > ( chan0 );

        // channel 1
        F32 chan1 = < F32 > cut < 1 > ( @ );
        fzip_fmt zipped1 = fzip < mantissa > ( chan1 );

        // channel 2
        F32 chan2 = < F32 > cut < 2 > ( @ );
        fzip_fmt zipped2 = fzip < mantissa > ( chan2 );

        // channel 3
        F32 chan3 = < F32 > cut < 3 > ( @ );
        fzip_fmt zipped3 = fzip < mantissa > ( chan3 );

        return merge ( zipped0, zipped1, zipped2, zipped3 );
    }
}

table NCBI:SRA:PacBio:smrt:zmw_metrics #1
{
    // This table type backs the ZMW_METRICS member of
    // NCBI:SRA:PacBio:smrt:db and declares no parent.
    //
    // Two storage forms are used for the floating point columns:
    //  - NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > : four F32 values
    //    per element ( F32 [ 4 ] ), each channel compressed separately
    //  - < F32 > fzip_encoding < 24 > : a single F32 per element
    // In both cases 24 is the mantissa argument.
    // NOTE(review): presumably the number of mantissa bits retained; F32
    // has a 24-bit significand, so 24 would mean no precision is dropped
    // - confirm

    column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > BASE_FRACTION;
    column < F32 > fzip_encoding < 24 > BASE_IPD;
    column < F32 > fzip_encoding < 24 > BASE_RATE;
    column < F32 > fzip_encoding < 24 > BASE_WIDTH;
    column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_BASE_QV;
    column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_DEL_QV;
    column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_INS_QV;
    column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_SUB_QV;
    column < F32 > fzip_encoding < 24 > LOCAL_BASE_RATE;
    column < F32 > fzip_encoding < 24 > DARK_BASE_RATE;
    column < F32 > fzip_encoding < 24 > HQ_RGN_START_TIME;
    column < F32 > fzip_encoding < 24 > HQ_RGN_END_TIME;
    column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > HQ_RGN_SNR;
    // the only non-float column in this table: a signed byte
    column < I8 > zip_encoding PRODUCTIVITY;
    column < F32 > fzip_encoding < 24 > READ_SCORE;
    column < F32 > fzip_encoding < 24 > READ_BASE_QV;
    column < F32 > fzip_encoding < 24 > READ_DEL_QV;
    column < F32 > fzip_encoding < 24 > READ_INS_QV;
    column < F32 > fzip_encoding < 24 > READ_SUB_QV;
};

table NCBI:SRA:PacBio:smrt:passes #1
{
    // This table type backs the PASSES member of NCBI:SRA:PacBio:smrt:db
    // and declares no parent.

    // adapter hit flags and pass direction, one U8 each
    column < U8 > zip_encoding ADAPTER_HIT_BEFORE;
    column < U8 > zip_encoding ADAPTER_HIT_AFTER;
    column < U8 > zip_encoding PASS_DIRECTION;
    // both declared as signed 32-bit ( I32 )
    // NOTE(review): presumably the length and first base of each pass
    // - confirm units and whether PASS_START_BASE is zero-based
    column < I32 > izip_encoding PASS_NUM_BASES;
    column < I32 > izip_encoding PASS_START_BASE;
};

database NCBI:SRA:PacBio:smrt:db #2
{
    // Four member tables, each given as: table type, version, member name.
    // All four table types are declared earlier in this file.
    table NCBI:SRA:PacBio:smrt:sequence #2.0 SEQUENCE;
    table NCBI:SRA:PacBio:smrt:cons #2.0 CONSENSUS;
    table NCBI:SRA:PacBio:smrt:passes #1.0 PASSES;
    table NCBI:SRA:PacBio:smrt:zmw_metrics #1.0 ZMW_METRICS;
};