[Bioperl-l] Strangeness in parsing blast file
Nabil Hafez
nabil at broad.mit.edu
Sun Jul 30 04:28:00 UTC 2006
Hi,
I am having a somewhat similar problem to what was posted in
http://bioperl.org/pipermail/bioperl-l/2006-May/021416.html
however, I have read through all of that thread and I don't believe what
I am
experiencing is the exact same problem. I also realize that the Bioperl
version 1.5.1
fixes a problem with blast parsing.
My problem:
My blastresults file parses fine and everything works swimmingly if
I pass
the blast output file by name such as
$blast_result = 'test.blastout';
however when I do
$blast_result = &do_blast($sample_fasta);
even though in both cases $blast_result evaluate to "test.blastout", the
parsing doesn't work, more specifically
it gets an undefined variable for $result in while( my $result =
$report_obj->next_result ) {
Sorr y for the long email - any help would be appreciated,
Thanks - Nabil
The code...non releavant parts trimmed for size constraints....debugging
from working and non-working
versions after the code.
use strict;
use Bio::SearchIO;
use Getopt::Std;
use List::Util qw(shuffle);
use Benchmark;
my ($inputfile, $samplefile, $blastfile, $blast_result, $blast_report,
$blast_verbose); #files generated
#------------------#
# Subroutine Calls #
#------------------#
my $test = &create_sample_file($inputfile); #inputfile being a fasta
file containing nucleotide sequence
$blast_result = &do_blast($test);
##$blast_result = 'test.blastout'; #when this is uncommented and
replace the previous two lines with test.blastout being normal blast
output - the script works fine.
&parse_blast($blast_result);
#######################
# create_sample_file
#
# Input: Original Fasta File
#
# Output: Fasta file containing randomly sampled reads
#
#
sub create_sample_file {
my $in = shift;
my $linecount = 0;
my @lines;
$samplefile = $in . "_sample";
#Determine total # of reads in input fasta
$totalreads = `$grep -c '>' $inputfile`;
$totalreads =~ s/\s+//;
chomp $totalreads;
if ($totalreads > 1000) { #sample if more than 1000 reads
$sample_reads = sprintf("%.0f", $totalreads *
($per_to_sample/100)); #number of reads to sample
}
else { #otherwise use all reads
$sample_reads = $totalreads;
}
$/ = '>'; #define fasta record input seperator
open (IN, "<$in") or die "Cannot open $in $!\n";
open (OUT, ">$samplefile") or die "Cannot open $samplefile $!\n";
while (<IN>) { #read lines into an array
chomp;
push (@lines, $_);
}
@lines = shuffle(@lines); #shuffle array
foreach (@lines) {
print OUT ">$_" if $linecount <= $sample_reads; #output to
file sampled number of reads
$linecount++;
}
close IN;
close OUT;
return $samplefile;
}#end create_sample_file
#######################
# do_blast
#
# Input: Fasta File containing SCREENED sampled reads
#
# Output: Blast File
#
#
sub do_blast {
my $bf = shift;
my $blastoutput = $bf . ".blastout";
print "Blasting against $db ...\n";
`blast/blast-2.2.10/bin/megablast -d /prodinfo/proddata_ntblastdb/nt
-e 1e-50 -p 99 -D 2 -i test -o test.blastout`;
return $blastoutput;
}#end do_blast
#######################
# parse_blast
#
# Input: Blast file
#
# Output: Creates hash containing best hit for each read
#
#
sub parse_blast {
my $blastoutfile = shift;
if (! -e $blastoutfile) {
die "$blastoutfile does not exist $!\n";
}
print "Parsing blast hits ...\n";
my $report_obj = new Bio::SearchIO(-verbose => 1,
-format => 'blast',
-file => $blastoutfile);
die "no valid $report_obj" unless defined $report_obj;
while( my $result = $report_obj->next_result ) {
die "no valid $result" unless defined $result;
while( my $hit = $result->next_hit ) {
while( my $hsp = $hit->next_hsp ) {
my $name = $result->query_name;
my $hitDesc = $hit->description;
my $length = $hsp->length('total');
my $per_id = sprintf("%.2f", $hsp->percent_identity);
my $eval = $hsp->evalue;
next if (defined $blast_results{$name} &&
$blast_results{$name}->[0] > $length); #only keep best hit for any read
$blast_results{$name} = [$length, $per_id, $eval, $hitDesc];
#store in hash of arrays
}
}
}
} #end parse_blast
Debug of NON-working blast-parse:
main::(454/scripts/fasta_blasta_mb.pl:151):
151: my $sample_fasta = &create_sample_file($inputfile);
DB<2> n
main::(454/scripts/fasta_blasta_mb.pl:152):
152: $blast_result = &do_blast($sample_fasta);
DB<2> x $sample_fasta
0 'G782.2005-08-16-16-48.fasta_sample'
DB<3> n
Blasting against NT ...
main::(454/scripts/fasta_blasta_mb.pl:154):
154: &parse_blast($blast_result);
DB<3> s
main::parse_blast(454/scripts/fasta_blasta_mb.pl:293):
293: my $blastoutfile = shift;
DB<3> s
main::parse_blast(454/scripts/fasta_blasta_mb.pl:295):
295: if (! -e $blastoutfile) {
DB<3> x $blastoutfile
0 'G782.2005-08-16-16-48.fasta_sample.blastout'
DB<4> s
main::parse_blast(454/scripts/fasta_blasta_mb.pl:299):
299: print "Parsing blast hits ...\n";
DB<4> s
Parsing blast hits ...
main::parse_blast(454/scripts/fasta_blasta_mb.pl:302):
302: my $report_obj = new Bio::SearchIO(-verbose => 1,
303: -format => 'blast',
304: -file =>
$blastoutfile);#or die "Could not open blast report $!";
DB<4> s
Bio::SearchIO::new(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO.pm:129):
129: my($caller, at args) = @_;
DB<4> r
scalar context return from Bio::SearchIO::new: '_file' =>
'G782.2005-08-16-16-48.fasta_sample.blastout'
'_filehandle' => GLOB(0x8cef40c)
-> *Symbol::GEN1
FileHandle({*Symbol::GEN1}) => fileno(3)
'_flush_on_write' => 1
'_handler' =>
Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x9505724)
'_factories' => HASH(0x95054c0)
'hit' => Bio::Factory::ObjectFactory=HASH(0x95017b8)
'_loaded_types' => HASH(0x9506c0c)
'Bio::Search::Hit::BlastHit' => 1
'_root_verbose' => 0
'interface' => 'Bio::Search::Hit::HitI'
'type' => 'Bio::Search::Hit::BlastHit'
'hsp' => Bio::Factory::ObjectFactory=HASH(0x9500e10)
'_loaded_types' => HASH(0x9506c18)
'Bio::Search::HSP::GenericHSP' => 1
'_root_verbose' => 0
'interface' => 'Bio::Search::HSP::HSPI'
'type' => 'Bio::Search::HSP::GenericHSP'
'iteration' => Bio::Factory::ObjectFactory=HASH(0x9506c60)
'_loaded_types' => HASH(0x9506af8)
'Bio::Search::Iteration::GenericIteration' => 1
'_root_verbose' => 0
'interface' => 'Bio::Search::Iteration::IterationI'
'type' => 'Bio::Search::Iteration::GenericIteration'
'result' => Bio::Factory::ObjectFactory=HASH(0x9504c80)
'_loaded_types' => HASH(0x9501f74)
'Bio::Search::Result::BlastResult' => 1
'_root_verbose' => 0
'interface' => 'Bio::Search::Result::ResultI'
'type' => 'Bio::Search::Result::BlastResult'
'_inclusion_threshold' => 0.001
'_root_verbose' => 1
'_handler_cache' =>
Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x9505724)
-> REUSED_ADDRESS
'_notfirsttime' => 0
'_reporttype' => ''
'_root_cleanup_methods' => ARRAY(0x8cde434)
0 CODE(0x82a9aec)
-> &Bio::Root::IO::_io_cleanup in
/util/lib/perl5/site_perl/5.8.0/Bio/Root/IO.pm:544-570
1 CODE(0x82a9aec)
-> REUSED_ADDRESS
'_root_verbose' => 1
main::parse_blast(454/scripts/fasta_blasta_mb.pl:307):
307: die "no valid $report_obj" unless defined $report_obj;
DB<4> s
main::parse_blast(454/scripts/fasta_blasta_mb.pl:310):
310: while( my $result = $report_obj->next_result ) {
DB<4> s
Bio::SearchIO::blast::next_result(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO/blast.pm:389):
389: my ($self) = @_;
DB<4> r
scalar context return from Bio::SearchIO::blast::next_result: undef
Bio::SearchIO::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO.pm:438):
438: my $self = shift;
DB<4> r
scalar context return from Bio::SearchIO::DESTROY: ''
Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
404: my $self = shift;
DB<4> r
scalar context return from Bio::Root::Root::DESTROY: undef
Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
404: my $self = shift;
DB<4> r
scalar context return from Bio::Root::Root::DESTROY: undef
Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
404: my $self = shift;
DB<4> r
scalar context return from Bio::Root::Root::DESTROY: undef
Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
404: my $self = shift;
DB<4> r
scalar context return from Bio::Root::Root::DESTROY: undef
Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
404: my $self = shift;
DB<4> r
scalar context return from Bio::Root::Root::DESTROY: undef
main::(454/scripts/fasta_blasta_mb.pl:155):
155: &output_results();
DB<4> x $result
0 undef
Debug of WORKING blast-parse:
Parsing blast hits ...
main::parse_blast(454/scripts/fasta_blasta_mb.pl:302):
302: my $report_obj = new Bio::SearchIO(-verbose => 1,
303: -format => 'blast',
304: -file =>
$blastoutfile);#or die "Could not open blast report $!";
DB<3> s
Bio::SearchIO::new(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO.pm:129):
129: my($caller, at args) = @_;
DB<3> r
scalar context return from Bio::SearchIO::new: '_file' =>
'G782.2005-08-16-16-48.fasta_sample.blastout'
'_filehandle' => GLOB(0x8763100)
-> *Symbol::GEN1
FileHandle({*Symbol::GEN1}) => fileno(3)
'_flush_on_write' => 1
'_handler' =>
Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x8ab3be4)
'_factories' => HASH(0x8ab1594)
'hit' => Bio::Factory::ObjectFactory=HASH(0x8a7b7c0)
'_loaded_types' => HASH(0x8abee10)
'Bio::Search::Hit::BlastHit' => 1
'_root_verbose' => 0
'interface' => 'Bio::Search::Hit::HitI'
'type' => 'Bio::Search::Hit::BlastHit'
'hsp' => Bio::Factory::ObjectFactory=HASH(0x8a87200)
'_loaded_types' => HASH(0x8abee1c)
'Bio::Search::HSP::GenericHSP' => 1
'_root_verbose' => 0
'interface' => 'Bio::Search::HSP::HSPI'
'type' => 'Bio::Search::HSP::GenericHSP'
'iteration' => Bio::Factory::ObjectFactory=HASH(0x8abee64)
'_loaded_types' => HASH(0x8abecfc)
'Bio::Search::Iteration::GenericIteration' => 1
'_root_verbose' => 0
'interface' => 'Bio::Search::Iteration::IterationI'
'type' => 'Bio::Search::Iteration::GenericIteration'
'result' => Bio::Factory::ObjectFactory=HASH(0x8a81a84)
'_loaded_types' => HASH(0x8a96ce8)
'Bio::Search::Result::BlastResult' => 1
'_root_verbose' => 0
'interface' => 'Bio::Search::Result::ResultI'
'type' => 'Bio::Search::Result::BlastResult'
'_inclusion_threshold' => 0.001
'_root_verbose' => 1
'_handler_cache' =>
Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x8ab3be4)
-> REUSED_ADDRESS
'_notfirsttime' => 0
'_reporttype' => ''
'_root_cleanup_methods' => ARRAY(0x8762efc)
0 CODE(0x82a9aec)
-> &Bio::Root::IO::_io_cleanup in
/util/lib/perl5/site_perl/5.8.0/Bio/Root/IO.pm:544-570
1 CODE(0x82a9aec)
-> REUSED_ADDRESS
'_root_verbose' => 1
main::parse_blast(454/scripts/fasta_blasta_mb.pl:307):
307: die "no valid $report_obj" unless defined $report_obj;
DB<3> s
main::parse_blast(454/scripts/fasta_blasta_mb.pl:310):
310: while( my $result = $report_obj->next_result ) {
DB<3> s
Bio::SearchIO::blast::next_result(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO/blast.pm:389):
389: my ($self) = @_;
DB<3> r
blast.pm: unrecognized line Reference: Zheng Zhang, Scott Schwartz,
Lukas Wagner, and Webb Miller (2000),
blast.pm: unrecognized line "A greedy algorithm for aligning DNA
sequences",
blast.pm: unrecognized line J Comput Biol 2000; 7(1-2):203-14.
blast.pm: unrecognized
line
Score E
Got NCBI HSP score=354, evalue 0.0
scalar context return from Bio::SearchIO::blast::next_result:
'_algorithm' => 'MEGABLAST'
'_algorithm_version' => '2.2.10 [Oct-19-2004]'
'_dbentries' => 4249067
'_dbletters' => 17735149364
'_dbname' => 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST,
STS,GSS,environmental samples or phase 0, 1 or 2 HTGS sequences) '
'_hitindex' => 0
'_hits' => ARRAY(0x8b2acd0)
empty array
'_inclusion_threshold' => 0.001
'_iteration_count' => 1
'_iteration_index' => 0
'_iterations' => ARRAY(0x8b2ac4c)
0 Bio::Search::Iteration::GenericIteration=HASH(0x8b1cacc)
'_newhits_below_threshold' => ARRAY(0x8b1ca84)
0 Bio::Search::Hit::BlastHit=HASH(0x8b1cf64)
'_accession' => 'AE004091'
'_algorithm' => 'MEGABLAST'
'_description' => 'Pseudomonas aeruginosa PAO1, complete genome'
'_hsps' => ARRAY(0x8b1ceb0)
0 Bio::Search::HSP::GenericHSP=HASH(0x8b2098c)
'_algorithm' => 'MEGABLAST'
'_frac_conserved' => HASH(0x8b266a0)
'hit' => 0.991803278688525
'query' => 0.991803278688525
'total' => 0.991803278688525
'_frac_identical' => HASH(0x8b2658c)
'hit' => 0.991803278688525
'query' => 0.991803278688525
'total' => 0.991803278688525
'_gaps' => HASH(0x8b24d94)
'hit' => 0
'query' => 0
'total' => 0
'_gsf_tag_hash' => HASH(0x8b20998)
empty hash
'_hit_string' =>
'cctgacctccgctcaactgcgcaaatacgccagcgccggtcggccgttccccgaagggcgcctgctggccgcctcctgccacgacgcggaggaactggccctggctgcctcgatgggagtggagttcgtcaccctttcgccggtacagccgaccgagagccatcccggcgagccggcgctgggttgggacaaggccgccgaactgatcgccggcttcaaccagccggtctacctgctgggtggcctcggtccgcagcaagccgagcaggcttgggagcatggagcccagggcgtggcgggtatccgtgcgttctggccgggcggcctttgacggtggaatgaagaaaaaaggaggcttcggcctcc'
'_homology_string' =>
'||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|||||||||||||||||||||||||||||||||||||||||
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||'
etc......
More information about the Bioperl-l
mailing list