Bioperl: Blast.pm as BLAST filter?
Ian Korf
ikorf@sapiens.wustl.edu
Fri, 10 Dec 1999 13:38:23 -0600 (CST)
----------
X-Sun-Data-Type: text
X-Sun-Data-Description: text
X-Sun-Data-Name: text
X-Sun-Charset: us-ascii
X-Sun-Content-Lines: 3
Doesn't look like the attachment came though, trying again.
-Ian
----------
X-Sun-Data-Type: default
X-Sun-Data-Description: default
X-Sun-Data-Name: BPlite
X-Sun-Charset: us-ascii
X-Sun-Content-Lines: 132
#!/usr/local/bin/perl -w
use strict;
use Getopt::Long;
use vars qw($MIN_PERCENT $MIN_SCORE $MAX_P $NAME_LIKE $NAME_UNLIKE);
#########
# usage #
#########
my $usage = "
usage: BPlite [options] <blastfile>
options:
-min_percent <int> [filter individual HSPs of a Sbjct]
-min_score <int> [filter individual HSPs of a Sbjct]
-max_P <float> [filter individual HSPs of a Sbjct]
-name_like <string> [filter Sbjct]
-name_unlike <string> [filter Sbjct]
eg. -min_percent 90 -min_score 200 -max_P 1e-20 -name_like sapiens
";
###################################
# options and argument processing #
###################################
GetOptions(
"min_percent=i" => \$MIN_PERCENT,
"min_score=i" => \$MIN_SCORE,
"max_P=f" => \$MAX_P,
"name_like=s" => \$NAME_LIKE,
"name_unlike=s" => \$NAME_UNLIKE);
die $usage unless @ARGV == 1;
my ($blastfile) = @ARGV;
my $DONE = 0;
my $LASTLINE;
open(BLAST, $blastfile) or die "ERROR: could not open $blastfile\n";
my ($line, $query_name, $query_length) = printHeader();
while ($line = getNextSbjct($line)) {}
print $LASTLINE;
while(<BLAST>) {print}
exit(0);
################################################################################
# subroutines
################################################################################
sub printHeader {
my $query;
my $length;
while(<BLAST>) {
return $_ if $_ =~ /^>/;
print;
if ($_ =~ /^Parameters/) {return 0}
}
}
sub getNextSbjct {
my ($line) = @_;
#return 0 if $DONE; # can't get another Sbjct if already done
my $sbjct = $line;
my @sbjct_lines = ($line);
GET_SBJCT: while(<BLAST>) {
push @sbjct_lines, $_;
if ($_ !~ /\w/) {next} # skip blank lines
elsif ($_ =~ /Strand HSP/) {next} # not data
elsif ($_ =~ /^\s*Score/) {$line = $_; last} # done with SBJCT
else {$sbjct .= $_} # add SBJCT info
}
$sbjct =~ s/\s+/ /g; # remove multiple spaces
my @hsp_data; # each HSP gets its own array
my @hsp_lines;
push @{$hsp_data[0]}, $line; # saved from above
my $i = 0;
GET_HSPs: while(<BLAST>) {
push @{$hsp_lines[$i]}, $_ unless $_ =~ /^>/;
if ($_ !~ /\S/) {next} # skip blank lines
elsif ($_ =~ /Strand HSP/) {next} # not data
elsif ($_ =~ /^>/) {$line = $_; last} # end of these HSPs
elsif ($_ =~ /^Parameters/) {$DONE = 1; last} # end of WU-BLAST
elsif ($_ =~ /^\s+Database:/) {$DONE = 1; last} # end of NCBI-BLAST
$i++ if $_ =~ /^\s*Score/;
push @{$hsp_data[$i]}, $_ ;
}
$line = $_;
if ($DONE) {
$line = 0;
$LASTLINE = pop @{$hsp_lines[$i]};
}
my @output;
$i = 0;
my $hsp = 0;
foreach my $hsp_data (@hsp_data) {
# parse the score lines
my $scoreline = $hsp_data->[0] . $hsp_data->[1];
my ($score) = $scoreline =~ /Score =\s+(\S+)/;
my ($percent) = $scoreline =~ /Identities = \d+\/\d+ \((\d+)%\)/;
my ($p) = $scoreline =~ /[Sum ]*P[\(\d+\)]* = (\S+)/;
if (not defined $p) {($p) = $line =~ /Expect = (\S+)/}
# parse the alignment lines
my ($ql, $al, $sl) = ("", "", ""); # hold the alignment strings
my ($qb, $qe, $sb, $se) = (0,0,0,0); # beginnings and ends
my ($QL, $SL) = ("", ""); # the whole line
for(my $i=2;$i<@$hsp_data;$i+=3) {
$hsp_data->[$i] =~ /^Query:\s+(\d+)\s+(\S+)\s+(\d+)/;
$ql = $2; $qb = $1 unless $qb; $qe = $3;
$hsp_data->[$i+2] =~ /^Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)/;
$sl = $2; $sb = $1 unless $sb; $se = $3;
$QL .= $ql;
$SL .= $sl;
}
# querries on HSP
next unless (not defined $MIN_PERCENT or $percent > $MIN_PERCENT);
next unless (not defined $MIN_SCORE or $score > $MIN_SCORE);
next unless (not defined $MAX_P or $p < $MAX_P);
push @output, @{$hsp_lines[$i]};
$i++;
$hsp++;
}
# query the whole thing
return $line unless (not defined $NAME_LIKE or $sbjct =~ /$NAME_LIKE/);
return $line unless (not defined $NAME_UNLIKE or $sbjct !~ /$NAME_UNLIKE/);
return $line unless $hsp;
print @sbjct_lines;
print @output;
return $line;
}
# Copyright (C) 1999 Ian Korf. All Rights Reserved.
=========== Bioperl Project Mailing List Message Footer =======
Project URL: http://bio.perl.org/
For info about how to (un)subscribe, where messages are archived, etc:
http://www.techfak.uni-bielefeld.de/bcd/Perl/Bio/vsns-bcd-perl.html
====================================================================