#!/home/passala/miniconda3/envs/Genomealign/bin/perl =head1 NAME bp_search2gff - turn a SearchIO report into GFF =head1 SYNOPSIS Usage: bp_search2gff [-o outputfile] [-f reportformat] [-i inputfilename] OR file1 file2 .. =head1 DESCRIPTION This script will turn a SearchIO report (BLAST, FASTP, SSEARCH, AXT, WABA) into GFF. The options are: -i infilename - (optional) inputfilename, will read either ARGV files or from STDIN -o filename - the output filename [default STDOUT] -f format - search result format (blast, fasta,waba,axt) (ssearch is fasta format). default is blast. -t/--type seqtype - if you want to see query or hit information in the GFF report -s/--source - specify the source (will be algorithm name otherwise like BLASTN) --method - the method tag (primary_tag) of the features (default is similarity) --scorefunc - a string or a file that when parsed evaluates to a closure which will be passed a feature object and that returns the score to be printed --locfunc - a string or a file that when parsed evaluates to a closure which will be passed two features, query and hit, and returns the location (Bio::LocationI compliant) for the GFF3 feature created for each HSP; the closure may use the clone_loc() and create_loc() functions for convenience, see their PODs --onehsp - only print the first HSP feature for each hit -p/--parent - the parent to which HSP features should refer if not the name of the hit or query (depending on --type) --target/--notarget - whether to always add the Target tag or not -h - this help menu --version - GFF version to use (put a 3 here to use gff 3) --component - generate GFF component fields (chromosome) -m/--match - generate a 'match' line which is a container of all the similarity HSPs --addid - add ID tag in the absence of --match -c/--cutoff - specify an evalue cutoff Additionally specify the filenames you want to process on the command-line. If no files are specified then STDIN input is assumed. You specify this by doing: bp_search2gff E file1 file2 file3 =head1 AUTHOR Jason Stajich, jason-at-bioperl-dot-org =head1 Contributors Hilmar Lapp, hlapp-at-gmx-dot-net =cut use strict; use warnings; use Bio::Tools::GFF; use Getopt::Long; use Bio::SearchIO; use Bio::Location::Simple; # pre-declare to simplify $locfunc implementations use Bio::Location::Atomic; # pre-declare to simplify $locfunc implementations use Storable qw(dclone); # for cloning location objects use Bio::Factory::FTLocationFactory; my ( $output, # output file (if not stdout) $input, # name of the input file $format, # format of the input file, defauly is blast $type, # 'query' or 'hit' $cutoff, # cut-off value for e-value filter $sourcetag, # explicit source tag (will be taken from program # otherwise $methodtag, # primary tag (a.k.a. method), default 'similarity' $gffver, # GFF version (dialect) to write $scorefunc, # closure returning the score for a passed feature $locfunc, # closure returning a location object for a passed # query and hit feature $addid, # flag: whether to always add the ID for $match == 0 $parent, # the name of the parent to use; if set and $match == 0 # will always add the target $comp, # flag: whether to print a component feature $addtarget, # flag: whether to always add the Target tag, default # is true $match, # flag: whether to print match lines as containers $onehsp, # flag: whether to consider only the first HSP for a hit $quiet, # flag: run quietly $help # flag: show help screen ); # set defaults: $format = 'blast'; $type = 'query'; $gffver = 2; $methodtag = "similarity"; $addtarget = 1; GetOptions( 'i|input:s' => \$input, 'component' => \$comp, 'm|match' => \$match, 'o|output:s' => \$output, 'f|format:s' => \$format, 's|source:s' => \$sourcetag, 'method=s' => \$methodtag, 'addid' => \$addid, 'scorefunc=s' => \$scorefunc, 'locfunc=s' => \$locfunc, 'p|parent=s' => \$parent, 'target!' => \$addtarget, 'onehsp' => \$onehsp, 't|type:s' => \$type, 'c|cutoff:s' => \$cutoff, 'v|version:i' => \$gffver, 'q|quiet' => \$quiet, 'h|help' => sub { exec( 'perldoc', $0 ); exit(0); }, ); $type = lc($type); if ( $type =~ /target/ ) { $type = 'hit' } elsif ( $type ne 'query' && $type ne 'hit' ) { die("seqtype must be either 'query' or 'hit'"); } # custom or default function returning the score $scorefunc = defined($scorefunc) ? parse_code($scorefunc) : sub { shift->score }; # custom or default function returning the location $locfunc = defined($locfunc) ? parse_code($locfunc) : sub { shift->location }; # if --match is given then $addid needs to be disabled $addid = undef if $addid && $match; # if no input is provided STDIN will be used my $parser = new Bio::SearchIO( -format => $format, -verbose => $quiet ? -1 : 0, -file => $input ); my $out; if ( defined $output ) { $out = new Bio::Tools::GFF( -gff_version => $gffver, -file => ">$output" ); } else { $out = new Bio::Tools::GFF( -gff_version => $gffver ); # STDOUT } my ( %seen_hit, %seen ); my $other = $type eq 'query' ? 'hit' : 'query'; while ( my $result = $parser->next_result ) { my $qname = $result->query_name; if ( $comp && $type eq 'query' && $result->query_length ) { $out->write_feature( Bio::SeqFeature::Generic->new( -start => 1, -end => $result->query_length, -seq_id => $qname, -source_tag => 'chromosome', -primary_tag => 'Component', -tag => { 'Sequence' => $qname } ) ); } while ( my $hit = $result->next_hit ) { next if ( defined $cutoff && $hit->significance > $cutoff ); my $acc = $qname; if ( $seen{ $qname . "-" . $hit->name }++ ) { $acc = $qname . "-" . $seen{ $qname . '-' . $hit->name }; } if ( $comp && $type eq 'hit' && $hit->length && !$seen_hit{ $hit->name }++ ) { $out->write_feature( Bio::SeqFeature::Generic->new( -start => 1, -end => $hit->length, -seq_id => $hit->name, -source_tag => 'chromosome', -primary_tag => 'Component', -tag => { 'Sequence' => $hit->name } ) ); } my ( %min, %max, $seqid, $name, $st ); while ( my $hsp = $hit->next_hsp ) { my $feature = new Bio::SeqFeature::Generic; my ( $proxyfor, $otherf ); if ( $type eq 'query' ) { ( $proxyfor, $otherf ) = ( $hsp->query, $hsp->hit ); $name ||= $hit->name; } else { ( $otherf, $proxyfor ) = ( $hsp->query, $hsp->hit ); $name ||= $acc; } $proxyfor->score( $hit->bits ) unless ( $proxyfor->score ); if ( ( $gffver == 3 ) && ( $match || $parent ) ) { $feature->add_tag_value( 'Parent', $parent || $name ); } $min{$type} = $proxyfor->start unless defined $min{$type} && $min{$type} < $proxyfor->start; $max{$type} = $proxyfor->end unless defined $max{$type} && $max{$type} > $proxyfor->end; $min{$other} = $otherf->start unless defined $min{$other} && $min{$other} < $otherf->start; $max{$other} = $otherf->end unless defined $max{$other} && $max{$other} > $otherf->end; if ( $addtarget || $match ) { $feature->add_tag_value( 'Target', 'Sequence:' . $name ); $feature->add_tag_value( 'Target', $otherf->start ); $feature->add_tag_value( 'Target', $otherf->end ); } if ($addid) { $feature->add_tag_value( 'ID', $name ); } $feature->location( &$locfunc( $proxyfor, $otherf ) ); # strand for feature is always going to be product of # query & hit strands so that target can always be just # '+' $feature->strand( $proxyfor->strand * $otherf->strand ); if ($sourcetag) { $feature->source_tag($sourcetag); } else { $feature->source_tag( $proxyfor->source_tag ); } $feature->score( &$scorefunc($proxyfor) ); $feature->frame( $proxyfor->frame ); $feature->seq_id( $proxyfor->seq_id ); $feature->primary_tag($methodtag); # add annotation if encoded in the query description my $desc = $result->query_description; while ( $desc =~ /\/([^=]+)=(\S+)/g ) { $feature->add_tag_value( $1, $2 ); } $seqid ||= $proxyfor->seq_id; $out->write_feature($feature); $st ||= $sourcetag || $proxyfor->source_tag; last if $onehsp; } if ($match) { my $matchf = Bio::SeqFeature::Generic->new( -start => $min{$type}, -end => $max{$type}, -strand => $hit->strand($type) * $hit->strand($other), -primary_tag => 'match', -source_tag => $st, -score => $hit->bits, -seq_id => $seqid ); if ( $gffver == 3 ) { $matchf->add_tag_value( 'ID', $name ); } $matchf->add_tag_value( 'Target', "Sequence:$name" ); $out->write_feature($matchf); } } } sub parse_code { my $src = shift; my $code; # file or subroutine? if ( -r $src ) { if ( !( ( $code = do $src ) && ( ref($code) eq "CODE" ) ) ) { die "error in parsing code block $src: $@" if $@; die "unable to read file $src: $!" if $!; die "failed to run $src, or it failed to return a closure"; } } else { $code = eval $src; die "error in parsing code block \"$src\": $@" if $@; die "\"$src\" fails to return a closure" unless ref($code) eq "CODE"; } return $code; } =head2 clone_loc Title : clone_loc Usage : my $l = clone_loc($feature->location); Function: Helper function to simplify the task of cloning locations for --locfunc closures. Presently simply implemented using Storable::dclone(). Example : Returns : A L object of the same type and with the same properties as the argument, but physically different. All structured properties will be cloned as well. Args : A L compliant object =cut sub clone_loc { return dclone(shift); } =head2 create_loc Title : create_loc Usage : my $l = create_loc("10..12"); Function: Helper function to simplify the task of creating locations for --locfunc closures. Creates a location from a feature- table formatted string. Example : Returns : A L object representing the location given as formatted string. Args : A GenBank feature-table formatted string. =cut sub create_loc { return Bio::Factory::FTLocationFactory->from_string(shift); }