/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * */ /*========================================================================== * NCBI 454 Sequence Read Archive schema */ version 1; include 'ncbi/sra.vschema'; include 'ncbi/spotname.vschema'; include 'ncbi/clip.vschema'; /*-------------------------------------------------------------------------- * functions */ /* dynamic_read_desc * uses inputs to determine read type and segmentation * * "edit_distance" [ CONST, OPTIONAL ] - a tolerance figure for * linker matching, where 0 requires exact match, 5 is default. * * "spot" [ DATA ] - bases for entire spot * * "key" [ DATA, CONTROL ] - bases for key sequence. for version 1, * the first base following key is taken as biological start * * "linker" [ DATA, CONTROL, OPTIONAL ] - if present, is used to separate * all bases following "key" into mate pair biological reads * * returns a trio for each identified read, with read type, start and length */ typeset NCBI:SRA:_454_:drdparam_set { ascii, U8, INSDC:2na:packed }; extern function U32 [ 3 ] NCBI:SRA:_454_:dynamic_read_desc #1 < * U32 edit_distance > ( NCBI:SRA:_454_:drdparam_set spot, NCBI:SRA:_454_:drdparam_set key * NCBI:SRA:_454_:drdparam_set linker ); const U32 NCBI:SRA:_454_:dyn_read_type = 0; const U32 NCBI:SRA:_454_:dyn_read_start = 1; const U32 NCBI:SRA:_454_:dyn_read_len = 2; /* tokenize_spot_name * scans name on input * tokenizes into parts */ extern function NCBI:SRA:spot_name_token NCBI:SRA:_454_:tokenize_spot_name #1 ( ascii name ); /*-------------------------------------------------------------------------- * NCBI:SRA:_454_:common * Roche 454 SRA Platform * * history: * 1.0.1 - explictly base upon sra #1.0.1 * 1.0.2 - bring in clip processing from external table * 1.0.3 - base explicitly upon sra #1.0.2, clip #1.0.1 * 1.0.4 - base explicitly upon sra #1.0.3, clip #1.0.2 * 1.0.5 - base explicitly upon sra #1.0.4 */ table NCBI:SRA:_454_:common #1.0.5 = INSDC:SRA:tbl:sra #1.0.4, NCBI:SRA:tbl:clip #1.0.2 { /* PLATFORM * platform name is always 454 */ ascii platform_name = < ascii > echo < "454" > (); /* 454 TECHNICAL SEQUENCES */ column INSDC:dna:text FLOW_CHARS = out_flow_chars; INSDC:dna:text in_flow_chars = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn.', 'ACGTNN' > ( FLOW_CHARS ); column INSDC:dna:text KEY_SEQUENCE = out_key_sequence; INSDC:dna:text in_key_sequence = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn.', 'ACGTNN' > ( KEY_SEQUENCE ); column INSDC:dna:text LINKER_SEQUENCE = out_linker_sequence; INSDC:dna:text in_linker_sequence = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn.', 'ACGTNN' > ( LINKER_SEQUENCE ); // binary technical sequences INSDC:x2na:bin out_flow_bin = < INSDC:dna:text, INSDC:x2na:bin > map < INSDC:x2na:map:CHARSET, INSDC:x2na:map:BINSET > ( out_flow_chars ); INSDC:x2na:bin out_key_bin = < INSDC:dna:text, INSDC:x2na:bin > map < INSDC:x2na:map:CHARSET, INSDC:x2na:map:BINSET > ( out_key_sequence ); INSDC:x2na:bin out_linker_bin = < INSDC:dna:text, INSDC:x2na:bin > map < INSDC:x2na:map:CHARSET, INSDC:x2na:map:BINSET > ( out_linker_sequence ); /* SIGNAL * single channel integer */ column NCBI:isamp1 SIGNAL = out_signal; NCBI:isamp1 out_signal = .SIGNAL; /* INSDC:tbl:sequence inherited productions * cs_native * out_cs_key * in_dna_text * out_2cs_bin * out_2na_bin * out_4na_bin * out_dna_text * out_x2cs_bin * out_x2na_bin * out_2cs_packed * out_2na_packed * out_4na_packed * out_color_text * out_qual_phred * out_color_matrix */ /* INSDC:SRA:tbl:spotname inherited productions * out_x_coord * out_y_coord * out_name_fmt * out_spot_name * spot_ids_found */ /* INSDC:SRA:tbl:spotdesc inherited productions * trim_len * out_label * out_nreads * trim_start * out_read_len * out_label_len * out_rd_filter * out_read_type * out_read_start * out_label_start * static_fixed_spot_len */ /* INSDC:SRA:tbl:stats inherited productions * base_count * spot_count * max_spot_id * min_spot_id * in_stats_bin * bio_base_count */ /* NCBI:tbl:n_encoding inherited productions * read_unpack */ /* NCBI:SRA:_454_:common productions * .SIGNAL * .CLIP_ADAPTER_LEFT * .CLIP_QUALITY_LEFT * .CLIP_ADAPTER_RIGHT * .CLIP_QUALITY_RIGHT * out_flow_chars * out_key_sequence * out_linker_sequence */ }; /*-------------------------------------------------------------------------- * NCBI:SRA:_454_:tbl:v2 * Roche 454 SRA Platform * * history: * 1.0.1 - explictly base upon sra #1.0.1 and related changes * 1.0.2 - respond to change to 454:common base table #1.0.2 */ // encodings are declared to have their own version // so that they may be changed over time independently physical INSDC:coord:one NCBI:SRA:_454_:encoding:CLIP #2 { decode { return ( INSDC:coord:one ) iunzip ( @ ); } encode { return izip ( @ ); } } physical NCBI:isamp1 NCBI:SRA:_454_:encoding:SIGNAL #2 { decode { return ( NCBI:isamp1 ) iunzip ( @ ); } encode { return izip ( @ ); } } physical INSDC:position:one NCBI:SRA:_454_:encoding:POSITION #2 { decode { I32 pos_1st_deriv = iunzip ( @ ); return ( INSDC:position:one ) < I32 > integral ( pos_1st_deriv ); } encode { I32 pos_1st_deriv = < I32 > deriv ( @ ); return izip ( pos_1st_deriv ); } } /* normalized v2 table * * history: * 1.0.6 - base upon updated ancestry * 1.0.7 - base upon updated ancestry * 1.0.8 - base upon sra_nopos #2.1.4, common #1.0.5 * 2.0.0 - NCBI:tbl:base_space uses metadata RNA_Flag to support RNA reads */ table NCBI:SRA:_454_:tbl:v2 #2 = NCBI:SRA:tbl:sra_nopos #2.1.4 , NCBI:tbl:base_space #3 , NCBI:tbl:phred_quality #2.0.3 , NCBI:SRA:_454_:common #1.0.5 { /* NAME tokenizing and coordinates * most work happens in skeyname table * we still obtain REGION from name */ readonly column INSDC:coord:val REGION = ( INSDC:coord:val ) NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:T > ( _out_name, out_spot_name_tok ); NCBI:SRA:spot_name_token out_spot_name_tok = NCBI:SRA:_454_:tokenize_spot_name ( _out_name ); NCBI:SRA:spot_name_token in_spot_name_tok = NCBI:SRA:_454_:tokenize_spot_name ( NAME ); // special sequences INSDC:dna:text out_flow_chars = .FLOW_CHARS | < INSDC:dna:text > echo < 'TACG' > ( .SIGNAL ) | < INSDC:dna:text > echo < 'TACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG' > (); physical column < INSDC:dna:text > zip_encoding .FLOW_CHARS = in_flow_chars; INSDC:dna:text out_key_sequence = .KEY_SEQUENCE | < INSDC:dna:text > echo < 'TCAG' > (); physical column < INSDC:dna:text > zip_encoding .KEY_SEQUENCE = in_key_sequence; INSDC:dna:text out_linker_sequence = .LINKER_SEQUENCE; physical column < INSDC:dna:text > zip_encoding .LINKER_SEQUENCE = in_linker_sequence; // linker needs to be representable by its own table // either in metadata or somewhere else // position stored as normal 1-based coordinate INSDC:position:one out_position = .POSITION; physical column NCBI:SRA:_454_:encoding:POSITION #2 .POSITION = POSITION; // clips physical column NCBI:SRA:_454_:encoding:CLIP #2 .CLIP_ADAPTER_LEFT = CLIP_ADAPTER_LEFT; physical column NCBI:SRA:_454_:encoding:CLIP #2 .CLIP_ADAPTER_RIGHT = CLIP_ADAPTER_RIGHT; physical column NCBI:SRA:_454_:encoding:CLIP #2 .CLIP_QUALITY_LEFT = CLIP_QUALITY_LEFT; physical column NCBI:SRA:_454_:encoding:CLIP #2 .CLIP_QUALITY_RIGHT = CLIP_QUALITY_RIGHT; // signal physical column NCBI:SRA:_454_:encoding:SIGNAL #2 .SIGNAL = SIGNAL; };