[Bioperl-pipeline] Fwd: [Bioperl-l] predicting secondary structure

Tue Mar 4 10:06:27 EST 2003

Below the mail from Richard Adams regarind modules for secondary 
structure prediction, we should look into this and try to make it all 
proper in bioperl, bioperl-run, and biopipe....

Elia

Begin forwarded message:

> From: Richard Adams <Richard.Adams at ed.ac.uk>
> Date: Mon Mar 3, 2003  11:22:01 PM Asia/Singapore
> To: Elia Stupka <elia at fugu-sg.org>
> Subject: Re: [Bioperl-l] predicting secondary structure
>
>> Hi Elia,
>
> Here it is below. Or should I enter it in Bugzilla as well?
>
> As you can see, to Bioperlize it will require complete rewriting and
> redesign. Heikki Lehvaslaiho had
> some proposals for this sort of thing (see Sept 02 archive) but I never
> got round to doing anything about it.
>     There are several things lacking in this script - dealing robustly
> with failed web connections for example - which would need dealing
> with.  I'm very happy to develop the parsers for more predecition
> programs if someone will help with class design , not having much
> experience of that side of things.
>
> Richard
>
> #!/bin/perl -w
> use strict;
> #############    Sec_struc_predict.pl        #############
> ########################### INTRO STUFF         #####################
> #### CPAN modules ##
> use LWP::UserAgent;
> use HTTP::Request::Common qw (POST);
> use HTTP::Response;
> use LWP::Simple;
> use Getopt::Std;
>
> ###BioPerl modules ###
> use Bio::SeqIO;
> use Bio::SeqFeature::Generic;
>
> use constant MIN_STRUC_LEN => 3; #constant for min no. of consecutive
> residues
>
> # for prediction
>
> ##### declare subs #####
>
> ### subs for retrieving analysis over web :
> sub get_sopma ;
> sub get_gor4;
> sub get_hnn;
>
> ####control sub for parser ####
> sub secondary_struc_parse;
>
> #### parsing raw data from prediction #####
> sub parse_gor4;
> sub parse_hnn_sopma;
>
> ### making hash of results ####
> sub _get_2ary_type_coords;
>
> ### making seq feature ####
> sub add_as_gb_feature;
>
>
> #####     automated secondary structure prediction #######
>
> our ($opt_d) ; ### prints progress report
> getopts('d');
>
>  my ($infile, $dir, $outfile) = @ARGV;
>  if (!-d $dir) {
>   mkdir $dir;
>   }
>  my $seq_IN = Bio::SeqIO->new (-file => $infile, -format => 'fasta');
>
>  ##  go through sequences in genbank file and get raw data saved to 
> $dir
> directory
>  while (my $seqobj = $seq_IN->next_seq() ) {
>  $opt_d &&  print STDERR "Looking at ", $seqobj->display_id, "\n";
>  $opt_d &&  print STDERR "getting sopma report\n";
>    get_sopma ($seqobj, $dir);
>  $opt_d &&  print STDERR "getting gor4 report\n";
>    get_gor4 ($seqobj, $dir);
>  $opt_d &&  print STDERR "getting hnn report\n";
>    get_hnn ($seqobj, $dir);
>   }
>
>
>   $seq_IN = Bio::SeqIO->new (-file => $infile, -format => 'genbank');
>  my $seq_OUT = Bio::SeqIO->new (-file => ">$outfile", -format =>
> 'genbank');
>
>  # now go through sequence file again aand annotate with prediction 
> data
>
>  while (my $seqobj = $seq_IN->next_seq() ) {
>
>   ## find prediction data for the sequence
>   my $seq_id = $seqobj->display_id;
>   opendir(RAW, "$dir") or die "can't access result files -$!";
>   my @files = grep {$_ =~ /$seq_id/ && $_ =~ /sopma|gor4|hnn/} readdir
> RAW;
>   close RAW;
>
>   #### add annotations
>   for my $file (@files) {
>    my %sec_predictions = secondary_struc_parse($file);
>    ($sec_predictions{'method'}) = $file =~ /.*\.(.+)/;
>    add_as_gb_feature ($seqobj, \%sec_predictions);
>    }
>
>   ### write out annotated sequence to new file##
>   $seq_OUT->write_seq($seqobj);
>
>   }
>
> ######################  end of main  ########################
>
>
> ########### adding feature to Genbank file ##########
> sub add_as_gb_feature {
>  my ($seqobj,$sec_prdn_ref) =@_;
>  for my $type (keys %$sec_prdn_ref) {
>   next if $type =~  /\w{2,}/;
>   foreach my $location(@{$sec_prdn_ref->{$type}} ) {
>    my $struc_feat = Bio::SeqFeature::Generic->new (-start =>
> $location->{'start'},
>             -end => $location->{'end'},
>             -source => 'Conc',
>             -primary => '2ary',
>             -tag => {
>              type => $type,
>              method => $sec_prdn_ref->{'method'},
>               });
>   $seqobj->add_SeqFeature($struc_feat);
>   }
>  }
> }
>
>
> ###################### subs for parsing raw data ######################
> sub secondary_struc_parse {
>  my ($report)  = @_;
>  my @Protein;
>  #call parsing routine depending on input###
>  if ( $report =~ /gor4/i) {
>    @Protein = parse_gor4($report);
>  $opt_d && print STDERR "Protein has  ", scalar @Protein, " elements
> in\n";
>   }
>  elsif ($report =~ /hnn/i  || $report =~ /sopma/i ){
>    @Protein = parse_hnn_sopma($report);
>   }
>
>  my %secondary_data = _get_2ary_type_coords (\@Protein);
>  return %secondary_data;
>
> }
>
> sub parse_hnn_sopma {
>   # just gets predictiion, not numbers, can change..
>   my $report = shift;
>   my @Protein;
>   open (IN, "$dir/$report") or die "can't open  $report";
>   while (<IN>) {
>    next unless /^[A-Z]\s/;# or for sopma/hnn  /^[A-Z]\s/
>    my ($type) = /^([A-Z])/;# or for sopma/hnn is my $type = /^([A-Z])/;
>    push @Protein, $type;
>   }
>  close IN;
>  return @Protein;
> }
>
>
>
> sub parse_gor4 {
>   # just gets predictiion, not numbers, can change..
>   my $report = shift;
>   my @Protein;
>   open (IN, "$dir/$report") or die "can't open  $report";
>
>  while (<IN>) {
>   next unless /^\s[A-Z]\s/;# or for sopma/hnn  /^[A-Z]\s/
>   my ($type) = (split)[1];# or for sopma/hnn is my $type = /^([A-Z])/;
>   push @Protein, $type;
>   }
>  close IN;
>  return @Protein;
> }
>
>
>
> sub _get_2ary_type_coords {
> ##extracts runs of structure > MIN_STRUC_LENresidues or less if Turn:
> #i.e., helical prediction for 1 residue isn't very meaningful...
> ## and poulates array of hashes with start/end values.
> ##keys of $Result are 'H' 'T' etc.
>  my $ref = shift;
>  my @prot = @$ref;
>  my %Result;
>  for (my $index = 0; $index <= $#prot; $index++) {
>
>   my $type  = $prot[$index];
>   next unless $type =~ /[HTCE]/;
>   my $length = 1;
>   for (my $j = $index + 1; $j <= $#prot; $j++) {
>     my $test = $prot[$j];
>    if ($test eq $type) {
>     $length++;
>     }
>    elsif (  $length > MIN_STRUC_LEN  || ($length <= MIN_STRUC_LEN &&
> $type eq 'T') )  {
>     push @{$Result{$type}}, {start => $index + 1 ,  end => $j};
>     $index += $length -1;
>     last;
>     }
>    else {
>     $index += $length - 1;
>     last;
>    }
>
>
>   }
>  }
>  return %Result;
> }
>
>
> ################# retrieval subs for getting raw data
> #######################
>
> sub get_hnn {
>
>   my ($seqobj, $dir)  = @_;
>   my $aa = $seqobj -> seq();
>   my $ua = LWP::UserAgent->new();
>   my $request = POST 'http://npsa-pbil.ibcp.fr/cgi-bin/secpred_hnn.pl',
>                          Content_Type => 'form-data',
>                         Content  => [title => "",
>          notice => $aa,
>          ali_width => 70,
>          ];
>  my $content = $ua->request($request);
>  my $text = $content->content;
>
>  #### get text only version of results ##
>  my ($next) = $text =~ /Prediction.*?=(.*?)>/;
>  my $ua2 = LWP::UserAgent->new();
>  my $out = "http://npsa-pbil.ibcp.fr/". "$next";
>  my $req2 = HTTP::Request->new(GET=>$out);
>  my $resp2 = $ua->request ($req2);
>  my $outfile =  "$dir/" .$seqobj->display_id . ".hnn";
>  $opt_d &&  print STDERR " gor outfile is  $outfile\n";
>  open (OUT, ">$outfile") or die "can't write hnn report to outfile";
>  print OUT $resp2->content;
>
>  close OUT;
>
>
>
> }
>
>
> sub get_gor4 {
>  my ($seqobj, $dir)  = @_;
>   my $aa = $seqobj -> seq();
>   my $ua = LWP::UserAgent->new();
>   my $request = POST 
> 'http://npsa-pbil.ibcp.fr/cgi-bin/secpred_gor4.pl',
>
>                          Content_Type => 'form-data',
>                         Content  => [title => "",
>          notice => $aa,
>          ali_width => 70,
>          ];
>  my $content = $ua->request($request);
>  my $text = $content->content;
>
>  #### get text only version of results ##
>  my ($next) = $text =~ /Prediction.*?=(.*?)>/;
>  my $ua2 = LWP::UserAgent->new();
>  my $out = "http://npsa-pbil.ibcp.fr/". "$next";
>  my $req2 = HTTP::Request->new(GET=>$out);
>  my $resp2 = $ua->request ($req2);
>  my $outfile =  "$dir/" .$seqobj->display_id . ".gor4";
>  $opt_d && print STDERR " gor outfile is  $outfile\n";
>  open (OUT, ">$outfile") or die "can't write gor4 report to outfile";
>  print OUT $resp2->content;
>
>  close OUT;
>
> }
>
>
>
>
> sub get_sopma {
>   my ($seqobj, $dir)  = @_;
>   my $aa = $seqobj -> seq();
>   my $ua = LWP::UserAgent->new();
>   my $request = POST
> 'http://npsa-pbil.ibcp.fr/cgi-bin/secpred_sopma.pl',
>                          Content_Type => 'form-data',
>                         Content  => [title => "",
>          notice => $aa,
>          ali_width => 70,
>          states  => 4,
>          threshold =>8,
>          width =>17,
>          ];
>
>  my $content = $ua->request($request);
>  my $text = $content->content;
>
>
>  #### get text only version of results ##
>  my ($next) = $text =~ /Prediction.*?=(.*?)>/;
>  my $ua2 = LWP::UserAgent->new();
>  my $out = "http://npsa-pbil.ibcp.fr/". "$next";
>  my $req2 = HTTP::Request->new(GET=>$out);
>  my $resp2 = $ua->request ($req2);
>  my $ext = $seqobj->display_id;
>  if (length $ext > 10) {
>   $ext = substr ($ext, 0 , 10);
>   }
>  my $outfile =  "$dir/" .$seqobj->display_id . ".sopma";
>  $opt_d && print STDERR "outfile is  $outfile\n";
>  open (OUT, ">$outfile") or die "can't write sopma report to outfile";
>  print OUT $resp2->content;
>
>  close OUT;
>
> }
>
>
>
> __END__
>
>
> =head1 TITLE
>
>  autoget_sec_struc2.pl
>
> =head1 SYNOPSIS
>
>  takes 3 required parameters
>
>   autoget_sec_struc2 [-d] infile directory outfile
>
>  where infile is genbank format protein sequence file
>     directory stores raw output
>   outfile is annotated sequence file...
>
> =head1  OPTIONS
>
>  -d prints progress report to std error.
>
>
>
> =head1 DESCRIPTION
>
>   Get protein secondary structural  predictions over web (via pbil in
> Lyons, France)
>   q using Sopma, Gor4 and
>   HNN algorithms.
>   Dumps raw report.
>   Then reads in report and makes new seq feature for each sec. 
> structure
> element.
>
> =head1 To DO
>
>   LOTS!!!!!
>   1. Improve interface e.g., support other sequence formats
>   2. other prediction methods
>   3. concensus predictions
>   4. abstract out in to classes for general re-use in
>           protein prediction programs
>   5. Deal with numerical values of predictions
>   6. Treat HNN/Sopma/Gor4 as subfeatures rather as tag values?
>   7. Supporting options in running program (e.g., window width,
>    similarity threshold etc)
>
>
> =head1 BUGS
>
>   This script works fine using BioPerl 1.1 or 1.2 and perl 5.6 run on
> Solaris
>
>
> =head1 AUTHOR
>
>   Richard Adams Jan-Feb 2003 ÿ
>