Bioperl: Blast.pm as BLAST filter?

Ian Korf ikorf@sapiens.wustl.edu
Fri, 10 Dec 1999 13:38:23 -0600 (CST)


----------
X-Sun-Data-Type: text
X-Sun-Data-Description: text
X-Sun-Data-Name: text
X-Sun-Charset: us-ascii
X-Sun-Content-Lines: 3

Doesn't look like the attachment came though, trying again.

-Ian
----------
X-Sun-Data-Type: default
X-Sun-Data-Description: default
X-Sun-Data-Name: BPlite
X-Sun-Charset: us-ascii
X-Sun-Content-Lines: 132

#!/usr/local/bin/perl -w
use strict;
use Getopt::Long;
use vars qw($MIN_PERCENT $MIN_SCORE $MAX_P $NAME_LIKE $NAME_UNLIKE);

#########
# usage #
#########
my $usage = "
usage: BPlite [options] <blastfile>
options:
  -min_percent <int>     [filter individual HSPs of a Sbjct]
  -min_score   <int>     [filter individual HSPs of a Sbjct]
  -max_P       <float>   [filter individual HSPs of a Sbjct]
  -name_like   <string>  [filter Sbjct]
  -name_unlike <string>  [filter Sbjct]
  
  eg. -min_percent 90 -min_score 200 -max_P 1e-20 -name_like sapiens
";

###################################
# options and argument processing #
###################################
GetOptions(
	"min_percent=i" => \$MIN_PERCENT,
	"min_score=i"   => \$MIN_SCORE,
	"max_P=f"       => \$MAX_P,
	"name_like=s"   => \$NAME_LIKE,
	"name_unlike=s" => \$NAME_UNLIKE);
die $usage unless @ARGV == 1;
my ($blastfile) = @ARGV;

my $DONE = 0;
my $LASTLINE;
open(BLAST, $blastfile) or die "ERROR: could not open $blastfile\n";
my ($line, $query_name, $query_length) = printHeader();
while ($line = getNextSbjct($line)) {}
print $LASTLINE;
while(<BLAST>) {print}
exit(0);

################################################################################
# subroutines
################################################################################
sub printHeader {
	my $query;
	my $length;
	while(<BLAST>) {
		return $_ if $_ =~ /^>/;
		print;
		if ($_ =~ /^Parameters/) {return 0}
	}
}
sub getNextSbjct {
	my ($line) = @_;
	#return 0 if $DONE; # can't get another Sbjct if already done
	
	my $sbjct = $line;
	my @sbjct_lines = ($line);
	GET_SBJCT: while(<BLAST>) {
		push @sbjct_lines, $_;
		if    ($_ !~ /\w/)         {next} # skip blank lines
		elsif ($_ =~ /Strand HSP/) {next} # not data
		elsif ($_ =~ /^\s*Score/)  {$line = $_; last} # done with SBJCT
		else                       {$sbjct .= $_} # add SBJCT info
	}
	$sbjct =~ s/\s+/ /g; # remove multiple spaces
	
	my @hsp_data; # each HSP gets its own array
	my @hsp_lines;
	push @{$hsp_data[0]}, $line; # saved from above
	my $i = 0;
	GET_HSPs: while(<BLAST>) {
		push @{$hsp_lines[$i]}, $_ unless $_ =~ /^>/;
		if    ($_ !~ /\S/)            {next} # skip blank lines
		elsif ($_ =~ /Strand HSP/)    {next} # not data
		elsif ($_ =~ /^>/)            {$line = $_; last} # end of these HSPs
		elsif ($_ =~ /^Parameters/)   {$DONE = 1; last}  # end of WU-BLAST
		elsif ($_ =~ /^\s+Database:/) {$DONE = 1; last}  # end of NCBI-BLAST
		
		$i++ if $_ =~ /^\s*Score/;
		push @{$hsp_data[$i]}, $_ ;
	}
	$line = $_;
	if ($DONE) {
		$line = 0;
		$LASTLINE = pop @{$hsp_lines[$i]};
	}
	my @output;
	$i = 0;
	my $hsp = 0;
	foreach my $hsp_data (@hsp_data) {
		# parse the score lines
		my $scoreline = $hsp_data->[0] . $hsp_data->[1];
		my ($score)   = $scoreline =~ /Score =\s+(\S+)/;
		my ($percent) = $scoreline =~ /Identities = \d+\/\d+ \((\d+)%\)/;
		my ($p)       = $scoreline =~ /[Sum ]*P[\(\d+\)]* = (\S+)/;
		if (not defined $p) {($p) = $line =~ /Expect = (\S+)/}
		
		# parse the alignment lines
		my ($ql, $al, $sl) = ("", "", "");   	# hold the alignment strings
		my ($qb, $qe, $sb, $se) = (0,0,0,0); 	# beginnings and ends
		my ($QL, $SL) = ("", "");               # the whole line
		
		for(my $i=2;$i<@$hsp_data;$i+=3) {
			$hsp_data->[$i]   =~ /^Query:\s+(\d+)\s+(\S+)\s+(\d+)/;
			$ql = $2; $qb = $1 unless $qb; $qe = $3;
			$hsp_data->[$i+2] =~ /^Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)/;
			$sl = $2; $sb = $1 unless $sb; $se = $3;
			$QL .= $ql;
			$SL .= $sl;
		}

		# querries on HSP
		next unless (not defined $MIN_PERCENT or $percent > $MIN_PERCENT);
		next unless (not defined $MIN_SCORE or $score > $MIN_SCORE);
		next unless (not defined $MAX_P or $p < $MAX_P);
		push @output, @{$hsp_lines[$i]};
		$i++;
		$hsp++;
	}
	
	# query the whole thing
	return $line unless (not defined $NAME_LIKE or $sbjct =~ /$NAME_LIKE/);
	return $line unless (not defined $NAME_UNLIKE or $sbjct !~ /$NAME_UNLIKE/);
	return $line unless $hsp;
	print @sbjct_lines;
	print @output;
	return $line;
}

# Copyright (C) 1999 Ian Korf. All Rights Reserved.
=========== Bioperl Project Mailing List Message Footer =======
Project URL: http://bio.perl.org/
For info about how to (un)subscribe, where messages are archived, etc:
http://www.techfak.uni-bielefeld.de/bcd/Perl/Bio/vsns-bcd-perl.html
====================================================================