[Bioperl-l] simplealign, prediction parselist

Richard Adams Richard.Adams@ed.ac.uk
Mon, 23 Sep 2002 12:57:38 +0100


> Hi Heikki,

Here is code for a netphos parsing/analysis module. I'm posting it in an unfinished
state to get some feedback on
what other functionality might be useful. Having said that, all the code should work.

Netphos  predicts phosphorylation sites in proteins and is available over the web.
I want to do this programmatically for multiple proteins, and also to
analyse the data returned.
The code below sends a request to the netphos server and writes the data to file
#!/usr/bin/perl -w
use strict;
use LWP::UserAgent;
use HTTP::Request::Common qw (POST GET);
use HTTP::Response;


#input sequence must be a single sequence in raw format (can expand to include more
formats)
#
# sends request , gets waiting message, when report comes back saying request is done

# can then access it.

my $infile = shift; #aafile (plaintext)
my $outfile = shift; #file for netphos output.

#open and format sequence for insertion into query form.
my $sequence = readfile ($infile);
my $ua = LWP::UserAgent->new();
my $request = POST 'http://www.cbs.dtu.dk/cgi-bin/nph-webface',
    Content_Type => 'form-data',
    Content    => [configfile =>
'/usr/opt/www/pub/CBS/services/NetPhos-2.0/NetPhos.cf',
      SEQPASTE => $sequence];
my $content = $ua->request($request);
my $text = $content->content;

my ($result_url) = $text =~ /follow <a href="(.*?)"/;
print "url is $result_url\n\n";

my $ua2 = LWP::UserAgent->new();
my $content2 = $ua2->request(POST $result_url);

my $ua3 = LWP::UserAgent->new();
$result_url =~ s/&.*//;
print "final result url is $result_url\n";
my $content3 = $ua3->request(POST $result_url);

my $response = $content3->content;
$response =~ s/.*<pre>(.*)<\/pre>.*/$1/s;

writefile ($outfile, $response);




sub readfile {

my $file  = shift;
open (IN, "<$file") or die "cannot open $file for reading";
my $sequence = <IN>;
close IN;
return $sequence;

}

sub writefile {
my $file  = shift;
my $text = shift;
open (OUT, ">$file") or die "cannot open $file for writing";
print OUT $text;
close OUT;
}

The following code :
Takes netphos report anda msf alignment file as input parameters.

1. Reads and parses the data out of the saved report (sub netphosparse)
2. Sorts by result (with optional significance threshold)  (sub sort STY by score)
3. Compares predictions with an alignment file containing related sequences to the
query protein. This would highlight
        predicted phosphorylated sites conservedin evolution and may therefore be of
greater significance. (sub is_conserved).


Other functions I need to add are basic get methods for data e.g.,
$report->highest_score,  $report->no_predictions ,
$report->get_tyrosine_predictionsetc but these will be straightforward
additions. Also a way of submitting multiple sequences will need to be added. But I'd
certainly appreciate any feedback now. I'm alsowriting similar code
for other prediction servers e.g., signal peptide, NLS predictors etc so would
appreciate any advice on how to integrate all this into BioPerl.
Cheers,

Richard


#!/usr/bin/perl -w
use strict;

use Bio::AlignIO;
use Bio::Seq;
#declare subs

sub netphosparse($);
sub _get_query_seq($$);
sub sortSTYbyscore ($$);
sub is_conserved ($$);

#useage: netphosparse [phosphositefile][alignmentfile]
#opens an MSA file and a netphos prediction,
# finds if predicted phospho sites are conserved through alignment at 50%
# or 30% level.
##############main#########
my ($phospho_site_file, $aln_file) = @ARGV;
my $threshold = 0.7;  #or could be input parameter
my @report = netphosparse ($phospho_site_file);
my @sortedreport = sortSTYbyscore (\@report, $threshold);
is_conserved ($aln_file, \@sortedreport );

#######end of main###########
sub is_conserved ($$){
 my ($align_file, $ref)  = @_;
 my @results = @$ref;


 my $in = Bio::AlignIO->new (-file => $align_file);
 my $Aln = $in->next_aln;
 my %concensuses;
 #get queryseq from alignment for finding column number of STY residues.
 my $query_seq = _get_query_seq($Aln, '26102'); #currently hardwired but can change
 foreach my $percent (100, 70, 50, 30) {
  my $concensus = $Aln->consensus_string($percent);
  $concensuses{$percent} = $concensus;
  }
 foreach my $element(@results) {
  my ($aatype) = $element->[3] =~ /(\w)/;
  my $sty_position = $element->[0];
  my $aln_column = $Aln->column_from_residue_number($query_seq->display_id,
$sty_position);

 $aln_column -= 1;
  if ($element->[2] >= 0.5 ) {  #if above minimum threshold
   for my $test (sort {$b <=> $a} keys %concensuses) {
    if ((substr $concensuses{$test}, $aln_column, 1)  eq $aatype) {
     print "$element->[1] score $element->[2] at position $element->[0] at consensus
$test\n";
     last;
     }

   }
  }
 }
}
sub _get_query_seq ($$) { # internal sub for is_conserved
 my ($aln, $query_id)  = @_;
 foreach my $seq ($aln->each_seq){
  if ($seq->display_id =~ /$query_id/) {
   my $query = $seq;
   return $query;
   }
 }
}

sub netphosparse ($) {
#####gets raw data and puts into an array of references to data
#####will be made into a "netphos object".
 my $infile = shift;
 my @phospho_data;
 open (IN, "<$infile") or die "cannot open  $infile for reading. $!";
 while (my $result = <IN>) {
  next unless  $result =~ /^Sequence/;
  push @phospho_data,  [(split /\s+/, $result)[1..4] ] ;
  }
 return @phospho_data;

}

sub sortSTYbyscore ($$) {
####this sub sorts results by score and imposes threshold if defined#####
 my ($ref, $threshold) = @_;

 my @_report = @$ref; #localcopy of netphos report

 my @sorted_by_allscores = sort {$b->[2] <=> $a->[2]} @_report;
 my @truncated;
 if (defined $threshold) {
  foreach (@sorted_by_allscores) {
   if ($_->[2]>$threshold ) {
    push @truncated, $_;
    }
  }
  print "\nsorted report by score \nXXXXXXXXXXXXXX\n";
  foreach my $result2(@truncated) {
   print join "\t", @$result2;
   print "\n";
  }
  return @truncated;
 }
 else
   {
   foreach my $result(@sorted_by_allscores) {
    print join "\t", @$result;
    print "\n";
    }
   return @sorted_by_allscores;
   }
 }




>
> > > > > >
> > > > > > Richard
> > > > > >
> > > > > > Dr Richard Adams
> > > > > > Molecular Medicine Centre
> > > > > > University of Edinburgh UK
> > > > > >
> > > > > >
> > > >