[Bioperl-l] predicting secondary structure
Richard Adams
Richard.Adams at ed.ac.uk
Tue Mar 4 15:41:28 EST 2003
Hi Elia,
Apologies if you're getting this twice.
Here is script "Sec_struc_predict.pl" for your perusal,
as you can see it would have to be completely rewritten
and reorganised for Bioperlisation. But am willing to write better
parsers
for other prediction methods if somoen can help with working out the
class structure.
Richard
#!/bin/perl -w
use strict;
########################### INTRO STUFF #####################
#### CPAN modules ##
use LWP::UserAgent;
use HTTP::Request::Common qw (POST);
use HTTP::Response;
use LWP::Simple;
use Getopt::Std;
###BioPerl modules ###
use Bio::SeqIO;
use Bio::SeqFeature::Generic;
use constant MIN_STRUC_LEN => 3; #constant for min no. of coonsecutive
residues
# for prediction
##### declare subs #####
### subs for retrieving analysis over web :
sub get_sopma ;
sub get_gor4;
sub get_hnn;
####control sub for parser ####
sub secondary_struc_parse;
#### parsing raw data from prediction #####
sub parse_gor4;
sub parse_hnn_sopma;
### making hash of results ####
sub _get_2ary_type_coords;
### making seq feature ####
sub add_as_gb_feature;
##### automated secondary structure function/domain prediction for
#######
our ($opt_d) ; ### prints progress report
getopts('d');
my ($infile, $dir, $outfile) = @ARGV;
if (!-d $dir) {
mkdir $dir;
}
my $seq_IN = Bio::SeqIO->new (-file => $infile, -format => 'fasta');
## go through sequences in genbank file and get raw data saved to $dir
directory
while (my $seqobj = $seq_IN->next_seq() ) {
$opt_d && print STDERR "Looking at ", $seqobj->display_id, "\n";
$opt_d && print STDERR "getting sopma report\n";
get_sopma ($seqobj, $dir);
$opt_d && print STDERR "getting gor4 report\n";
get_gor4 ($seqobj, $dir);
$opt_d && print STDERR "getting hnn report\n";
get_hnn ($seqobj, $dir);
}
$seq_IN = Bio::SeqIO->new (-file => $infile, -format => 'genbank');
my $seq_OUT = Bio::SeqIO->new (-file => ">$outfile", -format =>
'genbank');
# now go through sequence file again aand annotate with prediction data
while (my $seqobj = $seq_IN->next_seq() ) {
## find prediction data for the sequence
my $seq_id = $seqobj->display_id;
opendir(RAW, "$dir") or die "can't access result files -$!";
my @files = grep {$_ =~ /$seq_id/ && $_ =~ /sopma|gor4|hnn/} readdir
RAW;
close RAW;
#### add annotations
for my $file (@files) {
my %sec_predictions = secondary_struc_parse($file);
($sec_predictions{'method'}) = $file =~ /.*\.(.+)/;
add_as_gb_feature ($seqobj, \%sec_predictions);
}
### write out annotated sequence to new file##
$seq_OUT->write_seq($seqobj);
}
###################### end of main ########################
########### adding feature to Genbank file ##########
sub add_as_gb_feature {
my ($seqobj,$sec_prdn_ref) =@_;
for my $type (keys %$sec_prdn_ref) {
next if $type =~ /\w{2,}/;
foreach my $location(@{$sec_prdn_ref->{$type}} ) {
my $struc_feat = Bio::SeqFeature::Generic->new (-start =>
$location->{'start'},
-end => $location->{'end'},
-source => 'Conc',
-primary => '2ary',
-tag => {
type => $type,
method => $sec_prdn_ref->{'method'},
});
$seqobj->add_SeqFeature($struc_feat);
}
}
}
###################### subs for parsing raw data ######################
sub secondary_struc_parse {
my ($report) = @_;
my @Protein;
#call parsing routine depending on input###
if ( $report =~ /gor4/i) {
@Protein = parse_gor4($report);
$opt_d && print STDERR "Protein has ", scalar @Protein, " elements
in\n";
}
elsif ($report =~ /hnn/i || $report =~ /sopma/i ){
@Protein = parse_hnn_sopma($report);
}
my %secondary_data = _get_2ary_type_coords (\@Protein);
return %secondary_data;
}
sub parse_hnn_sopma {
# just gets predictiion, not numbers, can change..
my $report = shift;
my @Protein;
open (IN, "$dir/$report") or die "can't open $report";
while (<IN>) {
next unless /^[A-Z]\s/;# or for sopma/hnn /^[A-Z]\s/
my ($type) = /^([A-Z])/;# or for sopma/hnn is my $type = /^([A-Z])/;
push @Protein, $type;
}
close IN;
return @Protein;
}
sub parse_gor4 {
# just gets predictiion, not numbers, can change..
my $report = shift;
my @Protein;
open (IN, "$dir/$report") or die "can't open $report";
while (<IN>) {
next unless /^\s[A-Z]\s/;# or for sopma/hnn /^[A-Z]\s/
my ($type) = (split)[1];# or for sopma/hnn is my $type = /^([A-Z])/;
push @Protein, $type;
}
close IN;
return @Protein;
}
sub _get_2ary_type_coords {
##extracts runs of structure > MIN_STRUC_LENresidues or less if Turn:
#i.e., helical prediction for 1 residue isn't very meaningful...
## and poulates array of hashes with start/end values.
##keys of $Result are 'H' 'T' etc.
my $ref = shift;
my @prot = @$ref;
my %Result;
for (my $index = 0; $index <= $#prot; $index++) {
my $type = $prot[$index];
next unless $type =~ /[HTCE]/;
my $length = 1;
for (my $j = $index + 1; $j <= $#prot; $j++) {
my $test = $prot[$j];
if ($test eq $type) {
$length++;
}
elsif ( $length > MIN_STRUC_LEN || ($length <= MIN_STRUC_LEN &&
$type eq 'T') ) {
push @{$Result{$type}}, {start => $index + 1 , end => $j};
$index += $length -1;
last;
}
else {
$index += $length - 1;
last;
}
}
}
return %Result;
}
################# retrieval subs for getting raw data
#######################
sub get_hnn {
my ($seqobj, $dir) = @_;
my $aa = $seqobj -> seq();
my $ua = LWP::UserAgent->new();
my $request = POST 'http://npsa-pbil.ibcp.fr/cgi-bin/secpred_hnn.pl',
Content_Type => 'form-data',
Content => [title => "",
notice => $aa,
ali_width => 70,
];
my $content = $ua->request($request);
my $text = $content->content;
#### get text only version of results ##
my ($next) = $text =~ /Prediction.*?=(.*?)>/;
my $ua2 = LWP::UserAgent->new();
my $out = "http://npsa-pbil.ibcp.fr/". "$next";
my $req2 = HTTP::Request->new(GET=>$out);
my $resp2 = $ua->request ($req2);
my $outfile = "$dir/" .$seqobj->display_id . ".hnn";
$opt_d && print STDERR " gor outfile is $outfile\n";
open (OUT, ">$outfile") or die "can't write hnn report to outfile";
print OUT $resp2->content;
close OUT;
}
sub get_gor4 {
my ($seqobj, $dir) = @_;
my $aa = $seqobj -> seq();
my $ua = LWP::UserAgent->new();
my $request = POST 'http://npsa-pbil.ibcp.fr/cgi-bin/secpred_gor4.pl',
Content_Type => 'form-data',
Content => [title => "",
notice => $aa,
ali_width => 70,
];
my $content = $ua->request($request);
my $text = $content->content;
#### get text only version of results ##
my ($next) = $text =~ /Prediction.*?=(.*?)>/;
my $ua2 = LWP::UserAgent->new();
my $out = "http://npsa-pbil.ibcp.fr/". "$next";
my $req2 = HTTP::Request->new(GET=>$out);
my $resp2 = $ua->request ($req2);
my $outfile = "$dir/" .$seqobj->display_id . ".gor4";
$opt_d && print STDERR " gor outfile is $outfile\n";
open (OUT, ">$outfile") or die "can't write gor4 report to outfile";
print OUT $resp2->content;
close OUT;
}
sub get_sopma {
my ($seqobj, $dir) = @_;
my $aa = $seqobj -> seq();
my $ua = LWP::UserAgent->new();
my $request = POST
'http://npsa-pbil.ibcp.fr/cgi-bin/secpred_sopma.pl',
Content_Type => 'form-data',
Content => [title => "",
notice => $aa,
ali_width => 70,
states => 4,
threshold =>8,
width =>17,
];
my $content = $ua->request($request);
my $text = $content->content;
#### get text only version of results ##
my ($next) = $text =~ /Prediction.*?=(.*?)>/;
my $ua2 = LWP::UserAgent->new();
my $out = "http://npsa-pbil.ibcp.fr/". "$next";
my $req2 = HTTP::Request->new(GET=>$out);
my $resp2 = $ua->request ($req2);
my $ext = $seqobj->display_id;
if (length $ext > 10) {
$ext = substr ($ext, 0 , 10);
}
my $outfile = "$dir/" .$seqobj->display_id . ".sopma";
$opt_d && print STDERR "outfile is $outfile\n";
open (OUT, ">$outfile") or die "can't write sopma report to outfile";
print OUT $resp2->content;
close OUT;
}
__END__
=head1 TITLE
autoget_sec_struc2.pl
=head1 SYNOPSIS
takes 3 required parameters
autoget_sec_struc2 [-d] infile directory outfile
where infile is genbank format protein sequence file
directory stores raw output
outfile is annotated sequence file...
=head1 OPTIONS
-d prints progress report to std error.
=head1 DESCRIPTION
Get protein secondary structural predictions over web (via pbil in
Lyons, France)
q using Sopma, Gor4 and
HNN algorithms.
Dumps raw report.
Then reads in report and makes new seq feature for each sec. structure
element.
=head1 To DO
LOTS!!!!!
1. Improve interface e.g., support other sequence formats
2. other prediction methods
3. concensus predictions
4. abstract out in to classes for general re-use in
protein prediction programs
5. Deal with numerical values of predictions
6. Treat HNN/Sopma/Gor4 as subfeatures rather as tag values?
7. Supporting options in running program (e.g., window width,
similarity threshold etc)
=head1 BUGS
This script works fine using BioPerl 1.1 or 1.2 and perl 5.6 run on
Solaris
=head1 AUTHOR
Richard Adams Jan-Feb 2003
More information about the Bioperl-l
mailing list