[Bioperl-l] Strangeness in parsing blast file
Nabil Hafez
nabil at broad.mit.edu
Mon Jul 31 18:57:48 UTC 2006
I have figured out the problem - not a problem with Bioperl.
In my create_sample_file() subroutine I defined
$/ = '>'; #define fasta record input seperator
when it should have been this
local $/ = "\n>";
the use of local made a big difference. Thanks to all for your help.
Nabil Hafez
Nabil Hafez wrote:
> Hi,
> I am having a somewhat similar problem to what was posted in
> http://bioperl.org/pipermail/bioperl-l/2006-May/021416.html
> however, I have read through all of that thread and I don't believe what
> I am
> experiencing is the exact same problem. I also realize that the Bioperl
> version 1.5.1
> fixes a problem with blast parsing.
>
> My problem:
> My blastresults file parses fine and everything works swimmingly if
> I pass
> the blast output file by name such as
> $blast_result = 'test.blastout';
>
> however when I do
> $blast_result = &do_blast($sample_fasta);
>
> even though in both cases $blast_result evaluate to "test.blastout", the
> parsing doesn't work, more specifically
> it gets an undefined variable for $result in while( my $result =
> $report_obj->next_result ) {
>
> Sorr y for the long email - any help would be appreciated,
> Thanks - Nabil
>
>
> The code...non releavant parts trimmed for size constraints....debugging
> from working and non-working
> versions after the code.
>
> use strict;
> use Bio::SearchIO;
> use Getopt::Std;
> use List::Util qw(shuffle);
> use Benchmark;
>
> my ($inputfile, $samplefile, $blastfile, $blast_result, $blast_report,
> $blast_verbose); #files generated
>
>
> #------------------#
> # Subroutine Calls #
> #------------------#
>
> my $test = &create_sample_file($inputfile); #inputfile being a fasta
> file containing nucleotide sequence
> $blast_result = &do_blast($test);
> ##$blast_result = 'test.blastout'; #when this is uncommented and
> replace the previous two lines with test.blastout being normal blast
> output - the script works fine.
> &parse_blast($blast_result);
>
>
> #######################
> # create_sample_file
> #
> # Input: Original Fasta File
> #
> # Output: Fasta file containing randomly sampled reads
> #
> #
> sub create_sample_file {
> my $in = shift;
> my $linecount = 0;
> my @lines;
>
> $samplefile = $in . "_sample";
>
> #Determine total # of reads in input fasta
> $totalreads = `$grep -c '>' $inputfile`;
> $totalreads =~ s/\s+//;
> chomp $totalreads;
>
> if ($totalreads > 1000) { #sample if more than 1000 reads
> $sample_reads = sprintf("%.0f", $totalreads *
> ($per_to_sample/100)); #number of reads to sample
> }
> else { #otherwise use all reads
> $sample_reads = $totalreads;
> }
>
> $/ = '>'; #define fasta record input seperator
>
> open (IN, "<$in") or die "Cannot open $in $!\n";
> open (OUT, ">$samplefile") or die "Cannot open $samplefile $!\n";
>
>
> while (<IN>) { #read lines into an array
> chomp;
> push (@lines, $_);
> }
>
> @lines = shuffle(@lines); #shuffle array
> foreach (@lines) {
> print OUT ">$_" if $linecount <= $sample_reads; #output to
> file sampled number of reads
> $linecount++;
> }
>
> close IN;
> close OUT;
>
> return $samplefile;
>
> }#end create_sample_file
>
>
> #######################
> # do_blast
> #
> # Input: Fasta File containing SCREENED sampled reads
> #
> # Output: Blast File
> #
> #
>
> sub do_blast {
> my $bf = shift;
> my $blastoutput = $bf . ".blastout";
>
> print "Blasting against $db ...\n";
>
> `blast/blast-2.2.10/bin/megablast -d /prodinfo/proddata_ntblastdb/nt
> -e 1e-50 -p 99 -D 2 -i test -o test.blastout`;
>
> return $blastoutput;
>
> }#end do_blast
>
>
>
> #######################
> # parse_blast
> #
> # Input: Blast file
> #
> # Output: Creates hash containing best hit for each read
> #
> #
>
> sub parse_blast {
> my $blastoutfile = shift;
>
> if (! -e $blastoutfile) {
> die "$blastoutfile does not exist $!\n";
> }
>
> print "Parsing blast hits ...\n";
>
>
> my $report_obj = new Bio::SearchIO(-verbose => 1,
> -format => 'blast',
> -file => $blastoutfile);
>
>
> die "no valid $report_obj" unless defined $report_obj;
>
>
> while( my $result = $report_obj->next_result ) {
> die "no valid $result" unless defined $result;
> while( my $hit = $result->next_hit ) {
> while( my $hsp = $hit->next_hsp ) {
> my $name = $result->query_name;
> my $hitDesc = $hit->description;
> my $length = $hsp->length('total');
> my $per_id = sprintf("%.2f", $hsp->percent_identity);
> my $eval = $hsp->evalue;
> next if (defined $blast_results{$name} &&
> $blast_results{$name}->[0] > $length); #only keep best hit for any read
> $blast_results{$name} = [$length, $per_id, $eval, $hitDesc];
> #store in hash of arrays
> }
> }
> }
>
> } #end parse_blast
>
>
>
>
>
> Debug of NON-working blast-parse:
>
> main::(454/scripts/fasta_blasta_mb.pl:151):
> 151: my $sample_fasta = &create_sample_file($inputfile);
> DB<2> n
> main::(454/scripts/fasta_blasta_mb.pl:152):
> 152: $blast_result = &do_blast($sample_fasta);
> DB<2> x $sample_fasta
> 0 'G782.2005-08-16-16-48.fasta_sample'
> DB<3> n
> Blasting against NT ...
> main::(454/scripts/fasta_blasta_mb.pl:154):
> 154: &parse_blast($blast_result);
> DB<3> s
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:293):
> 293: my $blastoutfile = shift;
> DB<3> s
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:295):
> 295: if (! -e $blastoutfile) {
> DB<3> x $blastoutfile
> 0 'G782.2005-08-16-16-48.fasta_sample.blastout'
> DB<4> s
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:299):
> 299: print "Parsing blast hits ...\n";
> DB<4> s
> Parsing blast hits ...
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:302):
> 302: my $report_obj = new Bio::SearchIO(-verbose => 1,
> 303: -format => 'blast',
> 304: -file =>
> $blastoutfile);#or die "Could not open blast report $!";
> DB<4> s
> Bio::SearchIO::new(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO.pm:129):
> 129: my($caller, at args) = @_;
> DB<4> r
> scalar context return from Bio::SearchIO::new: '_file' =>
> 'G782.2005-08-16-16-48.fasta_sample.blastout'
> '_filehandle' => GLOB(0x8cef40c)
> -> *Symbol::GEN1
> FileHandle({*Symbol::GEN1}) => fileno(3)
> '_flush_on_write' => 1
> '_handler' =>
> Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x9505724)
> '_factories' => HASH(0x95054c0)
> 'hit' => Bio::Factory::ObjectFactory=HASH(0x95017b8)
> '_loaded_types' => HASH(0x9506c0c)
> 'Bio::Search::Hit::BlastHit' => 1
> '_root_verbose' => 0
> 'interface' => 'Bio::Search::Hit::HitI'
> 'type' => 'Bio::Search::Hit::BlastHit'
> 'hsp' => Bio::Factory::ObjectFactory=HASH(0x9500e10)
> '_loaded_types' => HASH(0x9506c18)
> 'Bio::Search::HSP::GenericHSP' => 1
> '_root_verbose' => 0
> 'interface' => 'Bio::Search::HSP::HSPI'
> 'type' => 'Bio::Search::HSP::GenericHSP'
> 'iteration' => Bio::Factory::ObjectFactory=HASH(0x9506c60)
> '_loaded_types' => HASH(0x9506af8)
> 'Bio::Search::Iteration::GenericIteration' => 1
> '_root_verbose' => 0
> 'interface' => 'Bio::Search::Iteration::IterationI'
> 'type' => 'Bio::Search::Iteration::GenericIteration'
> 'result' => Bio::Factory::ObjectFactory=HASH(0x9504c80)
> '_loaded_types' => HASH(0x9501f74)
> 'Bio::Search::Result::BlastResult' => 1
> '_root_verbose' => 0
> 'interface' => 'Bio::Search::Result::ResultI'
> 'type' => 'Bio::Search::Result::BlastResult'
> '_inclusion_threshold' => 0.001
> '_root_verbose' => 1
> '_handler_cache' =>
> Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x9505724)
> -> REUSED_ADDRESS
> '_notfirsttime' => 0
> '_reporttype' => ''
> '_root_cleanup_methods' => ARRAY(0x8cde434)
> 0 CODE(0x82a9aec)
> -> &Bio::Root::IO::_io_cleanup in
> /util/lib/perl5/site_perl/5.8.0/Bio/Root/IO.pm:544-570
> 1 CODE(0x82a9aec)
> -> REUSED_ADDRESS
> '_root_verbose' => 1
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:307):
> 307: die "no valid $report_obj" unless defined $report_obj;
> DB<4> s
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:310):
> 310: while( my $result = $report_obj->next_result ) {
> DB<4> s
> Bio::SearchIO::blast::next_result(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO/blast.pm:389):
> 389: my ($self) = @_;
> DB<4> r
> scalar context return from Bio::SearchIO::blast::next_result: undef
> Bio::SearchIO::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO.pm:438):
> 438: my $self = shift;
> DB<4> r
> scalar context return from Bio::SearchIO::DESTROY: ''
> Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
> 404: my $self = shift;
> DB<4> r
> scalar context return from Bio::Root::Root::DESTROY: undef
> Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
> 404: my $self = shift;
> DB<4> r
> scalar context return from Bio::Root::Root::DESTROY: undef
> Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
> 404: my $self = shift;
> DB<4> r
> scalar context return from Bio::Root::Root::DESTROY: undef
> Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
> 404: my $self = shift;
> DB<4> r
> scalar context return from Bio::Root::Root::DESTROY: undef
> Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
> 404: my $self = shift;
> DB<4> r
> scalar context return from Bio::Root::Root::DESTROY: undef
> main::(454/scripts/fasta_blasta_mb.pl:155):
> 155: &output_results();
> DB<4> x $result
> 0 undef
>
>
>
> Debug of WORKING blast-parse:
> Parsing blast hits ...
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:302):
> 302: my $report_obj = new Bio::SearchIO(-verbose => 1,
> 303: -format => 'blast',
> 304: -file =>
> $blastoutfile);#or die "Could not open blast report $!";
> DB<3> s
> Bio::SearchIO::new(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO.pm:129):
> 129: my($caller, at args) = @_;
> DB<3> r
> scalar context return from Bio::SearchIO::new: '_file' =>
> 'G782.2005-08-16-16-48.fasta_sample.blastout'
> '_filehandle' => GLOB(0x8763100)
> -> *Symbol::GEN1
> FileHandle({*Symbol::GEN1}) => fileno(3)
> '_flush_on_write' => 1
> '_handler' =>
> Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x8ab3be4)
> '_factories' => HASH(0x8ab1594)
> 'hit' => Bio::Factory::ObjectFactory=HASH(0x8a7b7c0)
> '_loaded_types' => HASH(0x8abee10)
> 'Bio::Search::Hit::BlastHit' => 1
> '_root_verbose' => 0
> 'interface' => 'Bio::Search::Hit::HitI'
> 'type' => 'Bio::Search::Hit::BlastHit'
> 'hsp' => Bio::Factory::ObjectFactory=HASH(0x8a87200)
> '_loaded_types' => HASH(0x8abee1c)
> 'Bio::Search::HSP::GenericHSP' => 1
> '_root_verbose' => 0
> 'interface' => 'Bio::Search::HSP::HSPI'
> 'type' => 'Bio::Search::HSP::GenericHSP'
> 'iteration' => Bio::Factory::ObjectFactory=HASH(0x8abee64)
> '_loaded_types' => HASH(0x8abecfc)
> 'Bio::Search::Iteration::GenericIteration' => 1
> '_root_verbose' => 0
> 'interface' => 'Bio::Search::Iteration::IterationI'
> 'type' => 'Bio::Search::Iteration::GenericIteration'
> 'result' => Bio::Factory::ObjectFactory=HASH(0x8a81a84)
> '_loaded_types' => HASH(0x8a96ce8)
> 'Bio::Search::Result::BlastResult' => 1
> '_root_verbose' => 0
> 'interface' => 'Bio::Search::Result::ResultI'
> 'type' => 'Bio::Search::Result::BlastResult'
> '_inclusion_threshold' => 0.001
> '_root_verbose' => 1
> '_handler_cache' =>
> Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x8ab3be4)
> -> REUSED_ADDRESS
> '_notfirsttime' => 0
> '_reporttype' => ''
> '_root_cleanup_methods' => ARRAY(0x8762efc)
> 0 CODE(0x82a9aec)
> -> &Bio::Root::IO::_io_cleanup in
> /util/lib/perl5/site_perl/5.8.0/Bio/Root/IO.pm:544-570
> 1 CODE(0x82a9aec)
> -> REUSED_ADDRESS
> '_root_verbose' => 1
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:307):
> 307: die "no valid $report_obj" unless defined $report_obj;
> DB<3> s
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:310):
> 310: while( my $result = $report_obj->next_result ) {
> DB<3> s
> Bio::SearchIO::blast::next_result(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO/blast.pm:389):
> 389: my ($self) = @_;
> DB<3> r
> blast.pm: unrecognized line Reference: Zheng Zhang, Scott Schwartz,
> Lukas Wagner, and Webb Miller (2000),
> blast.pm: unrecognized line "A greedy algorithm for aligning DNA
> sequences",
> blast.pm: unrecognized line J Comput Biol 2000; 7(1-2):203-14.
> blast.pm: unrecognized
> line
> Score E
> Got NCBI HSP score=354, evalue 0.0
> scalar context return from Bio::SearchIO::blast::next_result:
> '_algorithm' => 'MEGABLAST'
> '_algorithm_version' => '2.2.10 [Oct-19-2004]'
> '_dbentries' => 4249067
> '_dbletters' => 17735149364
> '_dbname' => 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST,
> STS,GSS,environmental samples or phase 0, 1 or 2 HTGS sequences) '
> '_hitindex' => 0
> '_hits' => ARRAY(0x8b2acd0)
> empty array
> '_inclusion_threshold' => 0.001
> '_iteration_count' => 1
> '_iteration_index' => 0
> '_iterations' => ARRAY(0x8b2ac4c)
> 0 Bio::Search::Iteration::GenericIteration=HASH(0x8b1cacc)
> '_newhits_below_threshold' => ARRAY(0x8b1ca84)
> 0 Bio::Search::Hit::BlastHit=HASH(0x8b1cf64)
> '_accession' => 'AE004091'
> '_algorithm' => 'MEGABLAST'
> '_description' => 'Pseudomonas aeruginosa PAO1, complete genome'
> '_hsps' => ARRAY(0x8b1ceb0)
> 0 Bio::Search::HSP::GenericHSP=HASH(0x8b2098c)
> '_algorithm' => 'MEGABLAST'
> '_frac_conserved' => HASH(0x8b266a0)
> 'hit' => 0.991803278688525
> 'query' => 0.991803278688525
> 'total' => 0.991803278688525
> '_frac_identical' => HASH(0x8b2658c)
> 'hit' => 0.991803278688525
> 'query' => 0.991803278688525
> 'total' => 0.991803278688525
> '_gaps' => HASH(0x8b24d94)
> 'hit' => 0
> 'query' => 0
> 'total' => 0
> '_gsf_tag_hash' => HASH(0x8b20998)
> empty hash
> '_hit_string' =>
> 'cctgacctccgctcaactgcgcaaatacgccagcgccggtcggccgttccccgaagggcgcctgctggccgcctcctgccacgacgcggaggaactggccctggctgcctcgatgggagtggagttcgtcaccctttcgccggtacagccgaccgagagccatcccggcgagccggcgctgggttgggacaaggccgccgaactgatcgccggcttcaaccagccggtctacctgctgggtggcctcggtccgcagcaagccgagcaggcttgggagcatggagcccagggcgtggcgggtatccgtgcgttctggccgggcggcctttgacggtggaatgaagaaaaaaggaggcttcggcctcc'
> '_homology_string' =>
> '||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
> |||||||||||||||||||||||||||||||||||||||||
> |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
> |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||'
> etc......
> _______________________________________________
> Bioperl-l mailing list
> Bioperl-l at lists.open-bio.org
> http://lists.open-bio.org/mailman/listinfo/bioperl-l
More information about the Bioperl-l
mailing list