[Bioperl-l] Strangeness in parsing blast file

Nabil Hafez nabil at broad.mit.edu
Sun Jul 30 04:28:00 UTC 2006


Hi,
    I am having a somewhat similar problem to what was posted in
http://bioperl.org/pipermail/bioperl-l/2006-May/021416.html
however, I have read through all of that thread and I don't believe what 
I am
experiencing is the exact same problem.  I also realize that the Bioperl 
version 1.5.1
fixes a problem with blast parsing. 

My problem: 
    My blastresults file parses fine and everything works swimmingly if 
I pass
the blast output file  by name   such as
$blast_result = 'test.blastout'; 

however when I do
$blast_result = &do_blast($sample_fasta);

even though in both cases $blast_result evaluate to "test.blastout", the 
parsing doesn't work, more specifically
it gets an undefined variable for $result in while( my $result = 
$report_obj->next_result ) {

Sorr y for the long email - any help would be appreciated,
Thanks - Nabil


The code...non releavant parts trimmed for size constraints....debugging 
from working and non-working
versions after the code.

use strict;
use Bio::SearchIO;
use Getopt::Std;
use List::Util qw(shuffle);
use Benchmark;

my ($inputfile, $samplefile, $blastfile, $blast_result, $blast_report, 
$blast_verbose); #files generated


#------------------#
# Subroutine Calls #
#------------------#

my $test = &create_sample_file($inputfile); #inputfile being a fasta 
file containing nucleotide sequence
$blast_result = &do_blast($test);
##$blast_result = 'test.blastout';  #when this is uncommented and 
replace the previous two lines with test.blastout being normal blast 
output - the script works fine. 
&parse_blast($blast_result);


#######################
# create_sample_file
#
#      Input: Original Fasta File
#
#      Output: Fasta file containing randomly sampled reads
#
#
sub create_sample_file {
     my $in = shift;
     my $linecount = 0;
     my @lines;

     $samplefile = $in . "_sample";

    #Determine total # of reads in input fasta
    $totalreads = `$grep -c '>' $inputfile`;
    $totalreads =~ s/\s+//;
    chomp $totalreads;

   if ($totalreads > 1000) { #sample if more than 1000 reads
       $sample_reads = sprintf("%.0f", $totalreads * 
($per_to_sample/100)); #number of reads to sample
   }
   else { #otherwise use all reads
       $sample_reads = $totalreads;
   }

   $/ = '>'; #define fasta record input seperator

   open (IN, "<$in") or die "Cannot open $in $!\n";
   open (OUT, ">$samplefile") or die "Cannot open $samplefile $!\n";


     while (<IN>) { #read lines into an array
        chomp;
        push (@lines, $_);
       }

     @lines = shuffle(@lines); #shuffle array
     foreach (@lines)  {
           print OUT ">$_" if $linecount <= $sample_reads; #output to 
file sampled number of reads
           $linecount++;
     }

    close IN;
    close OUT;

    return $samplefile;

}#end create_sample_file


#######################
# do_blast
#
#      Input: Fasta File containing SCREENED sampled reads
#
#      Output: Blast File
#
#

sub do_blast {
   my $bf = shift;
   my $blastoutput = $bf . ".blastout";

    print "Blasting against $db ...\n";

   `blast/blast-2.2.10/bin/megablast -d /prodinfo/proddata_ntblastdb/nt 
-e 1e-50 -p 99 -D 2 -i test -o test.blastout`;

    return $blastoutput;

}#end do_blast



#######################
# parse_blast
#
#      Input: Blast file
#
#      Output: Creates hash containing best hit for each read
#
#

sub parse_blast {
  my $blastoutfile = shift;

  if (! -e $blastoutfile) {
    die "$blastoutfile does not exist $!\n";
  }

  print "Parsing blast hits ...\n";


   my $report_obj = new Bio::SearchIO(-verbose => 1,
                                      -format => 'blast',
                                      -file => $blastoutfile);


    die "no valid $report_obj" unless defined $report_obj;


   while( my $result = $report_obj->next_result ) {
              die "no valid $result" unless defined $result;
     while( my $hit = $result->next_hit ) {
       while( my $hsp = $hit->next_hsp ) {
             my $name =  $result->query_name;
             my $hitDesc = $hit->description;
             my $length = $hsp->length('total');
             my $per_id = sprintf("%.2f", $hsp->percent_identity);
             my $eval = $hsp->evalue;
             next if (defined $blast_results{$name} && 
$blast_results{$name}->[0] > $length); #only keep best hit for any read
            $blast_results{$name} = [$length, $per_id, $eval, $hitDesc]; 
#store in hash of arrays
        }
      }
    }

} #end parse_blast





Debug of NON-working blast-parse:

main::(454/scripts/fasta_blasta_mb.pl:151):
151:    my $sample_fasta = &create_sample_file($inputfile);
  DB<2> n
main::(454/scripts/fasta_blasta_mb.pl:152):
152:    $blast_result = &do_blast($sample_fasta);
  DB<2> x $sample_fasta
0  'G782.2005-08-16-16-48.fasta_sample'
  DB<3> n
Blasting against NT ...
main::(454/scripts/fasta_blasta_mb.pl:154):
154:    &parse_blast($blast_result);
  DB<3> s
main::parse_blast(454/scripts/fasta_blasta_mb.pl:293):
293:      my $blastoutfile = shift;
  DB<3> s
main::parse_blast(454/scripts/fasta_blasta_mb.pl:295):
295:      if (! -e $blastoutfile) {
  DB<3> x $blastoutfile
0  'G782.2005-08-16-16-48.fasta_sample.blastout'
  DB<4> s
main::parse_blast(454/scripts/fasta_blasta_mb.pl:299):
299:      print "Parsing blast hits ...\n";
  DB<4> s
Parsing blast hits ...
main::parse_blast(454/scripts/fasta_blasta_mb.pl:302):
302:       my $report_obj = new Bio::SearchIO(-verbose => 1,
303:                                          -format => 'blast',
304:                                          -file => 
$blastoutfile);#or die "Could not open blast report $!";
  DB<4> s
Bio::SearchIO::new(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO.pm:129):
129:      my($caller, at args) = @_;
  DB<4> r
scalar context return from Bio::SearchIO::new: '_file' => 
'G782.2005-08-16-16-48.fasta_sample.blastout'
'_filehandle' => GLOB(0x8cef40c)
   -> *Symbol::GEN1
         FileHandle({*Symbol::GEN1}) => fileno(3)
'_flush_on_write' => 1
'_handler' => 
Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x9505724)
   '_factories' => HASH(0x95054c0)
      'hit' => Bio::Factory::ObjectFactory=HASH(0x95017b8)
         '_loaded_types' => HASH(0x9506c0c)
            'Bio::Search::Hit::BlastHit' => 1
         '_root_verbose' => 0
         'interface' => 'Bio::Search::Hit::HitI'
         'type' => 'Bio::Search::Hit::BlastHit'
      'hsp' => Bio::Factory::ObjectFactory=HASH(0x9500e10)
         '_loaded_types' => HASH(0x9506c18)
            'Bio::Search::HSP::GenericHSP' => 1
         '_root_verbose' => 0
         'interface' => 'Bio::Search::HSP::HSPI'
         'type' => 'Bio::Search::HSP::GenericHSP'
      'iteration' => Bio::Factory::ObjectFactory=HASH(0x9506c60)
         '_loaded_types' => HASH(0x9506af8)
            'Bio::Search::Iteration::GenericIteration' => 1
         '_root_verbose' => 0
         'interface' => 'Bio::Search::Iteration::IterationI'
         'type' => 'Bio::Search::Iteration::GenericIteration'
      'result' => Bio::Factory::ObjectFactory=HASH(0x9504c80)
         '_loaded_types' => HASH(0x9501f74)
            'Bio::Search::Result::BlastResult' => 1
         '_root_verbose' => 0
         'interface' => 'Bio::Search::Result::ResultI'
         'type' => 'Bio::Search::Result::BlastResult'
   '_inclusion_threshold' => 0.001
   '_root_verbose' => 1
'_handler_cache' => 
Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x9505724)
   -> REUSED_ADDRESS
'_notfirsttime' => 0
'_reporttype' => ''
'_root_cleanup_methods' => ARRAY(0x8cde434)
   0  CODE(0x82a9aec)
      -> &Bio::Root::IO::_io_cleanup in 
/util/lib/perl5/site_perl/5.8.0/Bio/Root/IO.pm:544-570
   1  CODE(0x82a9aec)
      -> REUSED_ADDRESS
'_root_verbose' => 1
main::parse_blast(454/scripts/fasta_blasta_mb.pl:307):
307:        die "no valid $report_obj" unless defined $report_obj;
  DB<4> s
main::parse_blast(454/scripts/fasta_blasta_mb.pl:310):
310:       while( my $result = $report_obj->next_result ) {
  DB<4> s
Bio::SearchIO::blast::next_result(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO/blast.pm:389):
389:       my ($self) = @_;
  DB<4> r
scalar context return from Bio::SearchIO::blast::next_result: undef
Bio::SearchIO::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO.pm:438):
438:        my $self = shift;
  DB<4> r
scalar context return from Bio::SearchIO::DESTROY: ''
Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
404:        my $self = shift;
  DB<4> r
scalar context return from Bio::Root::Root::DESTROY: undef
Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
404:        my $self = shift;
  DB<4> r
scalar context return from Bio::Root::Root::DESTROY: undef
Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
404:        my $self = shift;
  DB<4> r
scalar context return from Bio::Root::Root::DESTROY: undef
Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
404:        my $self = shift;
  DB<4> r
scalar context return from Bio::Root::Root::DESTROY: undef
Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
404:        my $self = shift;
  DB<4> r
scalar context return from Bio::Root::Root::DESTROY: undef
main::(454/scripts/fasta_blasta_mb.pl:155):
155:    &output_results();
  DB<4> x $result
0  undef



Debug of WORKING blast-parse:
Parsing blast hits ...
main::parse_blast(454/scripts/fasta_blasta_mb.pl:302):
302:       my $report_obj = new Bio::SearchIO(-verbose => 1,
303:                                          -format => 'blast',
304:                                          -file => 
$blastoutfile);#or die "Could not open blast report $!";
  DB<3> s
Bio::SearchIO::new(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO.pm:129):
129:      my($caller, at args) = @_;
  DB<3> r
scalar context return from Bio::SearchIO::new: '_file' => 
'G782.2005-08-16-16-48.fasta_sample.blastout'
'_filehandle' => GLOB(0x8763100)
   -> *Symbol::GEN1
         FileHandle({*Symbol::GEN1}) => fileno(3)
'_flush_on_write' => 1
'_handler' => 
Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x8ab3be4)
   '_factories' => HASH(0x8ab1594)
      'hit' => Bio::Factory::ObjectFactory=HASH(0x8a7b7c0)
         '_loaded_types' => HASH(0x8abee10)
            'Bio::Search::Hit::BlastHit' => 1
         '_root_verbose' => 0
         'interface' => 'Bio::Search::Hit::HitI'
         'type' => 'Bio::Search::Hit::BlastHit'
      'hsp' => Bio::Factory::ObjectFactory=HASH(0x8a87200)
         '_loaded_types' => HASH(0x8abee1c)
            'Bio::Search::HSP::GenericHSP' => 1
         '_root_verbose' => 0
         'interface' => 'Bio::Search::HSP::HSPI'
         'type' => 'Bio::Search::HSP::GenericHSP'
      'iteration' => Bio::Factory::ObjectFactory=HASH(0x8abee64)
         '_loaded_types' => HASH(0x8abecfc)
            'Bio::Search::Iteration::GenericIteration' => 1
         '_root_verbose' => 0
         'interface' => 'Bio::Search::Iteration::IterationI'
         'type' => 'Bio::Search::Iteration::GenericIteration'
      'result' => Bio::Factory::ObjectFactory=HASH(0x8a81a84)
         '_loaded_types' => HASH(0x8a96ce8)
            'Bio::Search::Result::BlastResult' => 1
         '_root_verbose' => 0
         'interface' => 'Bio::Search::Result::ResultI'
         'type' => 'Bio::Search::Result::BlastResult'
   '_inclusion_threshold' => 0.001
   '_root_verbose' => 1
'_handler_cache' => 
Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x8ab3be4)
   -> REUSED_ADDRESS
'_notfirsttime' => 0
'_reporttype' => ''
'_root_cleanup_methods' => ARRAY(0x8762efc)
   0  CODE(0x82a9aec)
      -> &Bio::Root::IO::_io_cleanup in 
/util/lib/perl5/site_perl/5.8.0/Bio/Root/IO.pm:544-570
   1  CODE(0x82a9aec)
      -> REUSED_ADDRESS
'_root_verbose' => 1
main::parse_blast(454/scripts/fasta_blasta_mb.pl:307):
307:        die "no valid $report_obj" unless defined $report_obj;
  DB<3> s
main::parse_blast(454/scripts/fasta_blasta_mb.pl:310):
310:       while( my $result = $report_obj->next_result ) {
  DB<3> s
Bio::SearchIO::blast::next_result(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO/blast.pm:389):
389:       my ($self) = @_;
  DB<3> r
blast.pm: unrecognized line Reference: Zheng Zhang, Scott Schwartz, 
Lukas Wagner, and Webb Miller (2000),
blast.pm: unrecognized line "A greedy algorithm for aligning DNA 
sequences",
blast.pm: unrecognized line J Comput Biol 2000; 7(1-2):203-14.
blast.pm: unrecognized 
line                                                                  
Score    E
Got NCBI HSP score=354, evalue 0.0
scalar context return from Bio::SearchIO::blast::next_result: 
'_algorithm' => 'MEGABLAST'
'_algorithm_version' => '2.2.10 [Oct-19-2004]'
'_dbentries' => 4249067
'_dbletters' => 17735149364
'_dbname' => 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, 
STS,GSS,environmental samples or phase 0, 1 or 2 HTGS sequences) '
'_hitindex' => 0
'_hits' => ARRAY(0x8b2acd0)
     empty array
'_inclusion_threshold' => 0.001
'_iteration_count' => 1
'_iteration_index' => 0
'_iterations' => ARRAY(0x8b2ac4c)
   0  Bio::Search::Iteration::GenericIteration=HASH(0x8b1cacc)
      '_newhits_below_threshold' => ARRAY(0x8b1ca84)
         0  Bio::Search::Hit::BlastHit=HASH(0x8b1cf64)
            '_accession' => 'AE004091'
            '_algorithm' => 'MEGABLAST'
            '_description' => 'Pseudomonas aeruginosa PAO1, complete genome'
            '_hsps' => ARRAY(0x8b1ceb0)
               0  Bio::Search::HSP::GenericHSP=HASH(0x8b2098c)
                  '_algorithm' => 'MEGABLAST'
                  '_frac_conserved' => HASH(0x8b266a0)
                     'hit' => 0.991803278688525
                     'query' => 0.991803278688525
                     'total' => 0.991803278688525
                  '_frac_identical' => HASH(0x8b2658c)
                     'hit' => 0.991803278688525
                     'query' => 0.991803278688525
                     'total' => 0.991803278688525
                  '_gaps' => HASH(0x8b24d94)
                     'hit' => 0
                     'query' => 0
                     'total' => 0
                  '_gsf_tag_hash' => HASH(0x8b20998)
                       empty hash
                  '_hit_string' => 
'cctgacctccgctcaactgcgcaaatacgccagcgccggtcggccgttccccgaagggcgcctgctggccgcctcctgccacgacgcggaggaactggccctggctgcctcgatgggagtggagttcgtcaccctttcgccggtacagccgaccgagagccatcccggcgagccggcgctgggttgggacaaggccgccgaactgatcgccggcttcaaccagccggtctacctgctgggtggcctcggtccgcagcaagccgagcaggcttgggagcatggagcccagggcgtggcgggtatccgtgcgttctggccgggcggcctttgacggtggaatgaagaaaaaaggaggcttcggcctcc'
                  '_homology_string' => 
'|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 
||||||||||||||||||||||||||||||||||||||||| 
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||'  
etc......



More information about the Bioperl-l mailing list