/*===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
*/

/*==========================================================================
 * Sequence schema implementation tables
 */
version 1;

include 'vdb/vdb.vschema';
include 'ncbi/ncbi.vschema';
include 'insdc/sra.vschema';


/* spot_filter_read_set
 *  the binary read representations accepted by make_spot_filter
 */
typeset NCBI:spot_filter_read_set
{
    INSDC:4na:bin,
    INSDC:x2na:bin,
    INSDC:2na:bin
};

/* make_spot_filter
 *  function to analyze qualities and bases
 *  and potentially set SRA_SPOT_FILTER_REJECT bit
 */
function INSDC:SRA:spot_filter NCBI:SRA:make_spot_filter #1
    < * U32 min_length, U8 min_quality, U8 no_quality >
    (
        NCBI:spot_filter_read_set bin_read,
        INSDC:quality:phred quality,
        INSDC:coord:zero read_start,
        INSDC:coord:len read_len,
        INSDC:SRA:xread_type read_type,
        INSDC:SRA:spot_filter spot_filter
    );

/* syn_quality
 *  produces phred quality from read lengths and the spot filter,
 *  parameterized by a "good" and a "bad" quality value
 */
function INSDC:quality:phred NCBI:SRA:syn_quality #1
    < INSDC:quality:phred good_quality, INSDC:quality:phred bad_quality >
    (
        INSDC:coord:len read_len,
        INSDC:SRA:spot_filter spot_filter
    );

/* syn_quality_read
 *  per-read variant taking read start, length, type and read filter
 */
function INSDC:quality:phred NCBI:SRA:syn_quality_read #1
    < INSDC:quality:phred good_quality, INSDC:quality:phred bad_quality >
    (
        INSDC:coord:zero read_start,
        INSDC:coord:len read_len,
        INSDC:SRA:xread_type read_type,
        INSDC:SRA:read_filter read_filter
    );


/*--------------------------------------------------------------------------
 * n_encoding - implementation
 *  introduces common virtual productions
 */
table NCBI:tbl:n_encoding #1
{
    U8 n_encoding_dummy
        = read_unpack
        | read_ndecode
        ;
};


/*--------------------------------------------------------------------------
 * seqloc
 *  NCBI sequence locator table
 */
table NCBI:tbl:seqloc #1.0
{
    /* SEQ_ID
     *  a FASTA-style SeqId
     */
    extern column < ascii > zip_encoding SEQ_ID;

    /* SEQ_START
     *  provided in both 1 ( default ) and 0-based coordinates
     */
    extern default column < INSDC:coord:one > izip_encoding SEQ_START;
    readonly column INSDC:coord:zero SEQ_START
        = ( INSDC:coord:zero ) < INSDC:coord:one > diff < 1 > ( .SEQ_START );

    /* SEQ_LEN
     */
    extern column < INSDC:coord:len > izip_encoding SEQ_LEN;
};


/*--------------------------------------------------------------------------
 * base_space - implementation
 *  READ column rules
 */

/* color_from_dna
 *  use starting keys and color matrix to convert individual reads
 *  from base space ( x2na ) to color space ( x2cs ).
 */
extern function INSDC:x2cs:bin NCBI:color_from_dna #1.1
    (
        INSDC:x2na:bin bin_x2na,
        INSDC:coord:zero read_start,
        INSDC:coord:len read_len,
        INSDC:dna:text cs_key,
        U8 color_matrix
    );

/* dcmp_base_space
 *  table to introduce common virtual productions
 */
table NCBI:tbl:dcmp_base_space #1
{
    // rules to introduce purely virtual productions
    // never expected to resolve...
    INSDC:dna:text dcmp_virtual_productions
        = out_dcmp_4na_bin
        | out_dcmp_x2na_bin
        | out_dcmp_2na_bin
        | out_dcmp_2na_packed
        ;
}

/* history:
 *  1.0.1 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1
 *  1.0.2 - spotdesc #1.0.2
 *  1.0.3 - base upon dcmp_base_space for "out_dcmp_2na_bin"
 */
table NCBI:tbl:base_space_common #1.0.3
    = INSDC:tbl:sequence #1.0.1
    , INSDC:SRA:tbl:spotdesc #1.0.2
    , INSDC:SRA:tbl:stats #1.1.0
    , NCBI:tbl:dcmp_base_space #1.0.0
{
    /* INSDC:tbl:sequence inherited virtual productions */

    // cs_native - tells user color space is not native
    bool cs_native
        = < bool > echo < false > ();

    // in_cs_key is not writable in base_space

    // color-space key is completely artificial
    INSDC:dna:text out_cs_key
        = .CS_KEY
        | < INSDC:dna:text > echo < 'T' > ( out_read_type )
        | < INSDC:dna:text > echo < 'T' > ( out_read_len )
        | < INSDC:dna:text > echo < 'T' > ()
        ;

    // unambiguous synthesized 2cs
    INSDC:2cs:bin out_2cs_bin
        = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2cs_bin )
        ;

    // unambiguous unpacked 2na
    INSDC:2na:bin out_2na_bin
        = out_dcmp_2na_bin
        | ( INSDC:2na:bin ) unpack ( out_2na_packed )
        ;

    // synthesized color sequence
    INSDC:x2cs:bin out_x2cs_bin
        = NCBI:color_from_dna ( out_x2na_bin, out_read_start, out_read_len, out_cs_key, out_color_matrix );

    // synthesized packed 2cs
    INSDC:2cs:packed out_2cs_packed
        = ( INSDC:2cs:packed ) pack ( out_2cs_bin );

    // synthesized packed 4na
    INSDC:4na:packed out_4na_packed
        = ( INSDC:4na:packed ) pack ( out_4na_bin );

    // synthesized color text
    INSDC:color:text out_color_text
        = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin );

    // published color matrix
    U8 out_color_matrix
        = < U8 > echo < INSDC:color:default_matrix > ();

    // spot_len and fixed_spot_len
    INSDC:coord:len base_space_spot_len
        = ( INSDC:coord:len ) row_len ( out_2na_packed );
    INSDC:coord:len base_space_fixed_spot_len
        = ( INSDC:coord:len ) fixed_row_len ( out_2na_packed );


    /* INSDC:tbl:sequence inherited productions
     *  out_signal
     *  in_dna_text
     *  out_4na_bin
     *  out_dna_text
     *  out_x2na_bin
     *  out_2na_packed
     */

    /* INSDC:SRA:tbl:stats inherited productions
     *  in_stats_bin
     */

    /* NCBI:tbl:dcmp_base_space inherited productions
     *  out_dcmp_2na_bin
     *  out_dcmp_4na_bin
     *  out_dcmp_x2na_bin
     *  out_dcmp_2na_packed
     */
};


/* base_space_nocol
 *  this table describes viewing rules
 *  but omits writing rules and physical column description
 *  in order to support older tables
 *
 * history:
 *  1.0.1 - base explicitly upon base_space_common #1.0.1
 *  1.0.2 - base explicitly upon base_space_common #1.0.2
 *  1.0.3 - base explicitly upon base_space_common #1.0.3
 */
table NCBI:tbl:base_space_nocol #1.0.3
    = NCBI:tbl:base_space_common #1.0.3
    , NCBI:tbl:n_encoding #1
{
    // incoming is disabled

    // synthesized dna text
    INSDC:dna:text out_dna_text
        = < INSDC:x2na:bin, INSDC:dna:text > map < INSDC:x2na:map:BINSET, INSDC:x2na:map:CHARSET > ( out_x2na_bin );

    // synthesized 4na
    INSDC:4na:bin out_4na_bin
        = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin );

    // unpacked 2na with ambiguities
    INSDC:x2na:bin out_x2na_bin
        = ( INSDC:x2na:bin ) read_ndecode;

    // interface with n-encoded qualities
    U8 read_unpack
        = out_2na_bin;


    /* INSDC:tbl:sequence inherited productions
     *  out_signal
     *  out_2na_packed
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_ndecode
     */
};


/* base_space #1
 *  this schema brings in standard .READ column for v1 tables
 *
 * history:
 *  1.0.1 - base explicitly upon base_space_nocol #1.0.1
 *  1.0.2 - base explicitly upon base_space_nocol #1.0.2
 *  1.0.3 - base explicitly upon base_space_nocol #1.0.3
 */
table NCBI:tbl:base_space #1.0.3
    = NCBI:tbl:base_space_nocol #1.0.3
{
    // 2-bit 2na representation (0..3)
    INSDC:2na:packed out_2na_packed
        = .READ;

    // no rules for writing to .READ


    /* INSDC:tbl:sequence inherited productions
     *  out_signal
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_ndecode
     */
};


/* setRnaFlag
 *  if metadata(RNA) is not set and in_read contains u/U, set metadata(RNA) to true
 *  if metadata(RNA) is not set and in_read contains t/T, set metadata(RNA) to false
 *  otherwise, do nothing
 *  return in_read
 */
extern function INSDC:dna:text NCBI:SRA:setRnaFlag #1
    (
        INSDC:dna:text in_read
    );

/* useRnaFlag
 *  if metadata(RNA) is set translate T to U
 *  otherwise, do nothing
 */
extern function INSDC:dna:text NCBI:SRA:useRnaFlag #1
    (
        INSDC:dna:text in_read
    );


/* base_space #3
 *  standard current base-space table
 *
 * history:
 *  2.0.2 - base_space_common #1.0.2
 *  2.0.3 - base_space_common #1.0.3 now has dcmp_base_space as well
 *  3.0.0 - uses metadata RNA_Flag to support RNA reads
 */
table NCBI:tbl:base_space #3
    = NCBI:tbl:base_space_common #1.0.3
    , NCBI:tbl:dcmp_base_space #1
{
    /* input rules */

    // input text
    INSDC:dna:text in_dnarna_text_upper
        = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbnu','NACMGRSVTWYHKDBNU' > ( READ );

    INSDC:dna:text in_dna_text
        = NCBI:SRA:setRnaFlag ( in_dnarna_text_upper ); // change U to T

    // input 4na bin
    INSDC:4na:bin in_4na_bin
        = < INSDC:4na:bin > range_validate < 0, 15 > ( READ )
        | ( INSDC:4na:bin ) unpack ( in_4na_packed )
        | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_dna_text )
        | < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( in_x2na_bin );

    // input 4na packed
    INSDC:4na:packed in_4na_packed
        = READ;

    // input x2na bin
    INSDC:x2na:bin in_x2na_bin
        = < INSDC:x2na:bin > range_validate < 0, 4 > ( READ )
        | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( in_4na_bin );

    // input 2na bin
    INSDC:2na:bin in_2na_bin
        = < INSDC:2na:bin > range_validate < 0, 3 > ( READ )
        | ( INSDC:2na:bin ) unpack ( in_2na_packed )
        | INSDC:SEQ:rand_4na_2na ( in_4na_bin );

    // input 2na packed
    INSDC:2na:packed in_2na_packed
        = READ;

    // input 4na alt-read ( ambiguities )
    INSDC:4na:bin in_alt_4na_bin
        = ( INSDC:4na:bin ) < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_4na_bin );

    // preparing a feed into stats column
    U8 in_stats_bin
        = in_2na_bin;


    /* physical columns */

    physical column INSDC:2na:packed .READ
        = in_2na_packed
        | ( INSDC:2na:packed ) pack ( in_2na_bin )
        ;

    physical column < INSDC:4na:bin > zip_encoding .ALTREAD
        = < INSDC:4na:bin > trim < 0, 0 > ( in_alt_4na_bin )
        ;


    /* output rules */

    // output 2na packed
    INSDC:2na:packed out_2na_packed
        = .READ
        | out_dcmp_2na_packed;

    // output x2na bin
    INSDC:x2na:bin out_x2na_bin
        = out_dcmp_x2na_bin
        | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_4na_bin );

    // output 2na->4na bin
    INSDC:4na:bin out_2na_4na_bin
        = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_2na_bin );

    // output 4na bin
    INSDC:4na:bin out_4na_bin
        = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_2na_4na_bin, .ALTREAD )
        | out_dcmp_4na_bin
        | out_2na_4na_bin;

    // output text
    INSDC:dna:text out_dnarna_text
        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin );

    INSDC:dna:text out_dna_text
        = NCBI:SRA:useRnaFlag ( out_dnarna_text );


    /* INSDC:tbl:sequence inherited productions
     *  out_signal
     */

    /* NCBI:tbl:dcmp_base_space inherited productions
     *  out_dcmp_2na_bin
     *  out_dcmp_4na_bin
     *  out_dcmp_x2na_bin
     *  out_dcmp_2na_packed
     */
};


/*--------------------------------------------------------------------------
 * color_space - implementation
 *  nucleotide sequences in color space
 */

/* dna_from_color
 *  takes color-space calls ( x2cs ), read layout, the color-space key
 *  and the color matrix, and yields base-space calls ( x2na )
 */
extern function INSDC:x2na:bin NCBI:dna_from_color #1
    (
        INSDC:x2cs:bin color_bin,
        INSDC:coord:zero read_start,
        INSDC:coord:len read_len,
        INSDC:dna:text cs_key,
        U8 color_matrix
    );

/* dcmp_color_space
 *  declares common virtual productions
 */
table NCBI:tbl:dcmp_color_space #1
{
    // rules to introduce purely virtual productions
    // never expected to resolve...
    INSDC:dna:text dcmp_virtual_productions
        = out_dcmp_x2cs_bin
        | out_dcmp_2cs_bin
        | out_dcmp_2cs_packed;
}

/* history:
 *  1.0.1 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1
 *  1.0.2 - spotdesc #1.0.2
 *  1.0.3 - base upon dcmp_color_space for "out_dcmp_2cs_bin"
 */
table NCBI:tbl:color_space_common #1.0.3
    = INSDC:tbl:sequence #1.0.1
    , INSDC:SRA:tbl:spotdesc #1.0.2
    , INSDC:SRA:tbl:stats #1.1.0
    , NCBI:tbl:dcmp_color_space #1.0.0
{
    // cs_native - tells user color space is native
    bool cs_native
        = < bool > echo < true > ();

    // unambiguous unpacked 2cs
    INSDC:2cs:bin out_2cs_bin
        = out_dcmp_2cs_bin
        | ( INSDC:2cs:bin ) unpack ( out_2cs_packed );

    // unambiguous synthesized 2na
    INSDC:2na:bin out_2na_bin
        = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2na_bin );

    // synthesized unpacked 4na
    INSDC:4na:bin out_4na_bin
        = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin );

    // synthesized dna text
    INSDC:dna:text out_dna_text
        = < INSDC:x2na:bin, INSDC:dna:text > map < INSDC:x2na:map:BINSET, INSDC:x2na:map:CHARSET > ( out_x2na_bin );

    // synthesized dna sequence
    INSDC:x2na:bin out_x2na_bin
        = NCBI:dna_from_color ( out_x2cs_bin, out_read_start, out_read_len, out_cs_key, out_color_matrix );

    // synthesized packed 2na
    INSDC:2na:packed out_2na_packed
        = ( INSDC:2na:packed ) pack ( out_2na_bin );

    // synthesized packed 4na
    INSDC:4na:packed out_4na_packed
        = ( INSDC:4na:packed ) pack ( out_4na_bin );

    // synthesized color text
    INSDC:color:text out_color_text
        = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin );

    // spot_len and fixed_spot_len
    INSDC:coord:len color_space_spot_len
        = ( INSDC:coord:len ) row_len ( out_2cs_packed );
    INSDC:coord:len color_space_fixed_spot_len
        = ( INSDC:coord:len ) fixed_row_len ( out_2cs_packed );


    /* INSDC:tbl:sequence inherited productions
     *  in_cs_key
     *  out_cs_key
     *  out_signal
     *  out_x2cs_bin
     *  in_color_text
     *  out_2cs_packed
     *  out_color_matrix
     */

    /* INSDC:SRA:tbl:stats inherited productions
     *  in_stats_bin
     */

    /* NCBI:tbl:dcmp_color_space inherited productions
     *  out_dcmp_2cs_bin
     *  out_dcmp_x2cs_bin
     *  out_dcmp_2cs_packed
     */
};


/* color_space_nocol
 *  this table describes viewing rules
 *  but omits writing rules and physical column description
 *  in order to support older tables
 *
 * history:
 *  1.0.1 - base explicitly upon color_space_common #1.0.1
 *  1.0.2 - color_space_common #1.0.2
 *  1.0.3 - color_space_common #1.0.3
 */
table NCBI:tbl:color_space_nocol #1.0.3
    = NCBI:tbl:color_space_common #1.0.3
    , NCBI:tbl:n_encoding #1
{
    // incoming is disabled

    // v1 color matrix was stored in metadata
    U8 out_color_matrix
        = < U8 > meta:read < "COLOR_MATRIX" > ()
        | < U8 > echo < INSDC:color:default_matrix > ();

    // unpacked 2cs with ambiguities
    INSDC:x2cs:bin out_x2cs_bin
        = ( INSDC:x2cs:bin ) read_ndecode;

    // interface with n-encoded qualities
    U8 read_unpack
        = out_2cs_bin;


    /* INSDC:tbl:sequence inherited productions
     *  out_cs_key
     *  out_signal
     *  out_2cs_packed
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_ndecode
     */
};


/* color_space #1
 *  this schema brings in .CSREAD and .CS_KEY columns for v1 tables
 *
 * history:
 *  1.0.1 - base explicitly upon color_space_nocol #1.0.1
 *  1.0.2 - color_space_nocol #1.0.2
 *  1.0.3 - color_space_nocol #1.0.3
 */
table NCBI:tbl:color_space #1.0.3
    = NCBI:tbl:color_space_nocol #1.0.3
{
    // stored as text
    INSDC:dna:text out_cs_key
        = .CS_KEY;

    // stored color sequence
    INSDC:2cs:packed out_2cs_packed
        = .CSREAD;


    /* INSDC:tbl:sequence inherited productions
     *  out_signal
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_ndecode
     */
};


/* color_space #2.1
 *  standard current color-space table
 *
 * history:
 *  2.0.1 - base explicitly upon color_space_common #1.0.1
 *  2.0.2 - base explicitly upon color_space_common #1.0.2
 *  2.1.0 - introduce hooks for compressed color space
 */
table NCBI:tbl:color_space #2.1
    = NCBI:tbl:color_space_common #1.0.3
    , NCBI:tbl:dcmp_color_space #1.0.0
{
    /* input rules */

    // input text is not modified
    // illegal values are not detected here
    INSDC:color:text in_color_text
        = CSREAD;

    // input x2cs bin
    // illegal values will be caught here
    INSDC:x2cs:bin in_x2cs_bin
        = < INSDC:x2cs:bin > range_validate < 0, 4 > ( CSREAD )
        | < INSDC:color:text, INSDC:x2cs:bin > map < INSDC:x2cs:map:CHARSET, INSDC:x2cs:map:BINSET > ( in_color_text );

    // input 2cs bin
    INSDC:2cs:bin in_2cs_bin
        = < INSDC:2cs:bin > range_validate < 0, 3 > ( CSREAD )
        | ( INSDC:2cs:bin ) unpack ( in_2cs_packed )
        | < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( in_x2cs_bin );

    // input 2cs packed
    INSDC:2cs:packed in_2cs_packed
        = CSREAD;

    // input x2cs alt-csread ( ambiguity )
    INSDC:x2cs:bin in_alt_x2cs_bin
        = < INSDC:x2cs:bin, INSDC:x2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 0, 0, 0, 4 ] > ( in_x2cs_bin );

    // color-space keys ARE modified on input
    INSDC:dna:text in_cs_key
        = < INSDC:dna:text, INSDC:dna:text > map < 'acgt', 'ACGT' > ( CS_KEY );

    // color matrix
    U8 in_color_matrix
        = < U8 > range_validate < 0, 4 > ( COLOR_MATRIX );

    // preparing a feed into stats column
    U8 in_stats_bin
        = in_2cs_bin;


    /* physical columns */

    physical column INSDC:2cs:packed .CSREAD
        = in_2cs_packed
        | ( INSDC:2cs:packed ) pack ( in_2cs_bin );

    physical column < INSDC:x2cs:bin > zip_encoding .ALTCSREAD
        = < INSDC:x2cs:bin > trim < 0, 0 > ( in_alt_x2cs_bin );

    physical column < INSDC:dna:text > zip_encoding .CS_KEY
        = in_cs_key;

    physical column < U8 > zip_encoding .COLOR_MATRIX
        = in_color_matrix;


    /* output rules */

    // output 2cs packed
    INSDC:2cs:packed out_2cs_packed
        = .CSREAD
        | out_dcmp_2cs_packed;

    // unpacked 2cs with ambiguity
    INSDC:x2cs:bin out_x2cs_bin
        = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_2cs_bin, .ALTCSREAD )
        | out_dcmp_x2cs_bin
        | ( INSDC:x2cs:bin ) out_2cs_bin;

    // read directly from physical column
    INSDC:dna:text out_cs_key
        = .CS_KEY;

    // color matrix may be synthesized
    U8 out_color_matrix
        = .COLOR_MATRIX
        | < U8 > echo < INSDC:color:default_matrix > ();


    /* INSDC:tbl:sequence inherited productions
     *  out_signal
     */

    /* NCBI:tbl:dcmp_color_space inherited productions
     *  out_dcmp_2cs_bin
     *  out_dcmp_x2cs_bin
     *  out_dcmp_2cs_packed
     */
};


/*--------------------------------------------------------------------------
 * protein
 */
table NCBI:tbl:protein #1
    = INSDC:tbl:protein #1
{
    /* upper-case letters */
    INSDC:protein:text in_protein_text
        = < INSDC:protein:text, INSDC:protein:text > map < 'abcdefghijklmnopqrstvwxyzu','ABCDEFGHIJKLMNOPQRSTVWXYZU' > ( PROTEIN );

    /* std aa */
    INSDC:aa:bin in_aa_bin
        = < INSDC:aa:bin > range_validate < 1, 27 > ( PROTEIN )
        | < INSDC:protein:text, INSDC:aa:bin > map < INSDC:aa:map:CHARSET, INSDC:aa:map:BINSET > ( in_protein_text );

    /* physical column */
    physical column < INSDC:aa:bin > zip_encoding .PROTEIN
        = in_aa_bin;

    /* output rules */
    INSDC:aa:bin out_aa_bin
        = .PROTEIN;

    INSDC:protein:text out_protein_text
        = < INSDC:aa:bin, INSDC:protein:text > map < INSDC:aa:map:BINSET, INSDC:aa:map:CHARSET > ( out_aa_bin );
};


/*--------------------------------------------------------------------------
 * phred
 *  standard phred quality representation
 *  limits values on input to 1..63
 *  reserves value 0 as ambiguity symbol for reads
 */

/* history:
 *  1.0.1 - base explicitly upon sequence #1.0.1
 */
table NCBI:tbl:phred_quality_nocol #1.0.1
    = INSDC:tbl:sequence #1.0.1
    , NCBI:tbl:n_encoding #1
{
    /* [CS]READ - decoding */
    U8 read_ndecode
        = < INSDC:quality:phred, U8 > map < 0, 4 > ( out_qual_phred, read_unpack );


    /* INSDC:tbl:sequence inherited productions
     *  out_qual_phred
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_unpack
     */
};

/* history:
 *  1.0.1 - base explicitly upon phred_quality_nocol #1.0.1
 */
table NCBI:tbl:phred_quality #1.0.1
    = NCBI:tbl:phred_quality_nocol #1.0.1
{
    // read directly as n-encoded phred is compatible with phred
    NCBI:quality:n_encoded:phred out_qual_phred
        = .QUALITY;


    /* INSDC:tbl:sequence inherited productions
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_unpack
     */
};

/* history:
 *  2.0.1 - added feed of in_stats_qual
 *  2.0.2 - added input of text encodings
 *  2.0.3 - base explicitly upon sequence #1.0.1
 *  2.0.4 - change compression from izip to zip
 *  2.0.5 - change quality synthesizer to work per read
 *  2.0.6 - remove config check
 */
table NCBI:tbl:phred_quality #2.0.6
    = INSDC:tbl:sequence #1.0.1
{
    // read directly quality as phred
    INSDC:quality:phred phys_qual_phred
        = .ORIGINAL_QUALITY
        | .QUALITY
        ;

    // constant quality of 30, sized by whichever read representation resolves
    INSDC:quality:phred const_qual_phred
        = < INSDC:quality:phred > echo < 30 > ( out_2na_bin )
        | < INSDC:quality:phred > echo < 30 > ( out_4na_bin )
        ;

    // synthesized quality: per-read first, then per-spot, then constant
    INSDC:quality:phred syn_qual_phred
        = NCBI:SRA:syn_quality_read < 30, 3 > ( out_read_start, out_read_len, out_read_type, out_rd_filter )
        | NCBI:SRA:syn_quality < 30, 3 > ( out_read_len, out_spot_filter )
        | const_qual_phred
        ;

    // stored quality is preferred over synthesized quality
    INSDC:quality:phred out_qual_phred
        = phys_qual_phred
        | syn_qual_phred
        ;

    INSDC:SRA:spot_filter in_spot_filter
        = NCBI:SRA:make_spot_filter #1 ( in_4na_bin, in_qual_phred, in_read_start, in_read_len, in_read_type, in_spot_filter_0 )
        ;

    // input rules
    INSDC:quality:text:phred_33 in_qual_text_phred_33
        = QUALITY;
    INSDC:quality:text:phred_64 in_qual_text_phred_64
        = QUALITY;

    INSDC:quality:phred in_qual_phred
        = QUALITY
        | ( INSDC:quality:phred ) < B8 > diff < 33 > ( in_qual_text_phred_33 )
        | ( INSDC:quality:phred ) < B8 > diff < 64 > ( in_qual_text_phred_64 )
        ;

    // physical storage
    physical column < INSDC:quality:phred > zip_encoding .ORIGINAL_QUALITY
        = in_qual_phred
        ;

    trigger pull_spot_filter
        = < INSDC:SRA:spot_filter > compare ( in_spot_filter, out_spot_filter );

    // feed to compressed statistics
    INSDC:quality:phred in_stats_qual
        = in_qual_phred;


    /* INSDC:tbl:sequence inherited productions
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */
};


/*--------------------------------------------------------------------------
 * log_odds
 *  log-odds quality score support
 *
 *  conversion from log-odds to phred is via formula
 *    10 * log ( 1 + pow ( 10, x / 10 ) ) / log ( 10 ) + 0.499
 *  for x = -4..40 : when x = -5, phred = 0
 */

// the map function requires two lookup tables:
// the first table detects every legal value...
const INSDC:quality:log_odds NCBI:quality:from:log_odds =
[
                -6,-5,-4,-3,-2,-1, 0,
     1, 2, 3, 4, 5, 6, 7, 8, 9,10,
    11,12,13,14,15,16,17,18,19,20,
    21,22,23,24,25,26,27,28,29,30,
    31,32,33,34,35,36,37,38,39,40
];

// ...the second table gives positional translations
const INSDC:quality:phred NCBI:quality:to:phred =
[
                 0, 1, 1, 2, 2, 3, 3,
     4, 4, 5, 5, 6, 7, 8, 9,10,10,
    11,12,13,14,15,16,17,18,19,20,
    21,22,23,24,25,26,27,28,29,30,
    31,32,33,34,35,36,37,38,39,40
];

/* log_odds_to_phred
 *  clips the incoming log-odds values to -6..40 and translates them
 *  to phred through the two lookup tables above
 */
function INSDC:quality:phred NCBI:log_odds_to_phred #1
    (
        INSDC:quality:log_odds qual_log_odds
    )
{
    // this range enforcement may not be required
    INSDC:quality:log_odds log_odds_clip
        = < INSDC:quality:log_odds > clip < -6, 40 > ( qual_log_odds );

    // use the tables above to map from log-odds to phred
    return < INSDC:quality:log_odds, INSDC:quality:phred > map < NCBI:quality:from:log_odds, NCBI:quality:to:phred > ( log_odds_clip );
}


/* history:
 *  1.0.1 - base explicitly upon sequence #1.0.1
 */
table NCBI:tbl:log_odds_quality_nocol #1.0.1
    = INSDC:tbl:sequence #1.0.1
    , NCBI:tbl:n_encoding #1
{
    /* READ - decoding */
    U8 read_ndecode
        = < INSDC:quality:log_odds, U8 > map < -6, 4 > ( out_qual_log_odds, read_unpack );

    /* QUALITY
     *  declared in INSDC:tbl:sequence as phred
     *  introduce here as log-odds
     */
    extern column INSDC:quality:log_odds QUALITY
        = out_qual_log_odds;

    // resolve for phred
    INSDC:quality:phred out_qual_phred
        = out_qual2_phred
        | NCBI:log_odds_to_phred ( out_qual_log_odds );


    /* INSDC:tbl:sequence inherited productions
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_unpack
     */

    /* NCBI:tbl:log_odds_quality_nocol productions
     *  out_qual2_phred
     *  out_qual_log_odds
     */
};

/* history:
 *  1.0.1 - base explicitly upon log_odds_quality_nocol #1.0.1
 */
table NCBI:tbl:log_odds_quality #1.0.1
    = NCBI:tbl:log_odds_quality_nocol #1.0.1
{
    // read directly as n-encoded log_odds is compatible with log_odds
    NCBI:quality:n_encoded:log_odds out_qual_log_odds
        = .QUALITY;


    /* INSDC:tbl:sequence inherited productions
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_unpack
     */

    /* NCBI:tbl:log_odds_quality_nocol inherited productions
     *  out_qual2_phred
     */
};

/* history:
 *  2.0.1 - base explicitly upon sequence #1.0.1
 *  2.1.0 - added production of in_qual_phred
 */
table NCBI:tbl:log_odds_quality_nocol #2.1.0
    = INSDC:tbl:sequence #1.0.1
{
    /* QUALITY
     *  declared in INSDC:tbl:sequence as phred
     *  introduce here as log-odds
     */
    extern column INSDC:quality:log_odds QUALITY
        = out_qual_log_odds;

    // resolve for phred
    INSDC:quality:phred in_qual_phred
        = NCBI:log_odds_to_phred ( in_qual_log_odds );
    INSDC:quality:phred out_qual_phred
        = NCBI:log_odds_to_phred ( out_qual_log_odds );


    /* INSDC:tbl:sequence inherited productions
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */

    /* NCBI:tbl:log_odds_quality_nocol productions
     *  out_qual_log_odds
     */
};

/* history:
 *  2.0.1 - added feed of in_stats_qual
 *  2.0.2 - added input of text encodings
 *  2.0.3 - base explicitly upon log_odds_quality_nocol #2.0.1
 *  2.0.4 - changed compression from izip to zip
 *  2.1.0 - base explicitly upon log_odds_quality_nocol #2.1.0
 */
table NCBI:tbl:log_odds_quality #2.1.0
    = NCBI:tbl:log_odds_quality_nocol #2.1.0
{
    // read directly from the physical column
    INSDC:quality:log_odds out_qual_log_odds
        = .QUALITY;

    // QUALITY is also readable as log-odds text with offset 64
    extern column INSDC:quality:text:log_odds_64 QUALITY
        = out_qual_text_log_odds_64
        | ( INSDC:quality:text:log_odds_64 ) < B8 > sum < 64 > ( out_qual_log_odds );

    // input rules
    INSDC:quality:text:log_odds_64 in_qual_text_log_odds_64
        = QUALITY;

    INSDC:quality:log_odds in_qual_log_odds
        = QUALITY
        | ( INSDC:quality:log_odds ) < B8 > diff < 64 > ( in_qual_text_log_odds_64 );

    // physical storage
    physical column < INSDC:quality:log_odds > zip_encoding .QUALITY
        = in_qual_log_odds;

    // feed to compressed statistics
    INSDC:quality:log_odds in_stats_qual
        = in_qual_log_odds;


    /* INSDC:tbl:sequence inherited productions
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */

    /* NCBI:tbl:log_odds_quality productions
     *  out_qual_text_log_odds_64
     */
};