[Bioperl-l] SeqFeatureI->spliced_translation
Matthew Betts
Matthew.Betts at ii.uib.no
Mon Nov 3 13:06:24 EST 2003
Hi,
I've written a method 'spliced_translation' for Bio::SeqFeatureI that
translates a spliced sequence and deals with any codon exceptions.
It is really just glue between the existing SeqFeatureI->spliced_seq and
PrimarySeqI->translate, but can deal with codons that are non-standard
across the whole sequence ('/codon' in GenBank feature tables) and codons
that are non-standard at specific locations ('/transl_except').
I mainly use it to check the conceptual translation against that given in
the feature tables. I could do a bit (a lot...) of polishing
(suggestions welcome) if it's useful to anyone else?
Matt
--
Matthew Betts, mailto:matthew.betts at ii.uib.no
Phone: (+47) 55 58 40 22, Fax: (+47) 55 58 42 95
CBU, BCCS, UNIFOB / Universitetet i Bergen
Thormøhlensgt. 55, N-5008 Bergen, Norway
--------------------
sub spliced_translation {
my $self = shift;
my $db = shift;
my $not_concept = shift;
# if $not_concept is defined, will return the sequence
# given by the /translation qualifier rather than the
# conceptual translation. All the checks are still done.
my $complete5;
my $complete3;
my $frame;
my $table;
my $loc_factory;
my @exceptions;
my $except;
my $cds;
my $trans;
my @locs;
my $loc;
my $fstrand;
my $mixed;
my $ft_trans;
my $trans_aa;
my $cdna_start;
my $na_e_start;
my $aa_e_pos;
# FIXME - improve the warnings. Also allow 'throw' if requested
if($self->primary_tag ne 'CDS') {
$self->warn("Calling spliced_seq on a feature which is not a CDS");
}
# is the whole sequence of the CDS known?
if(defined($self->location->strand) and ($self->location->strand == -1)) {
$complete5 = ($self->location->to_FTstring =~ />/) ? 0 : 1;
$complete3 = ($self->location->to_FTstring =~ /</) ? 0 : 1;
}
else {
$complete5 = ($self->location->to_FTstring =~ /</) ? 0 : 1;
$complete3 = ($self->location->to_FTstring =~ />/) ? 0 : 1;
}
# find the reading frame before translating...
$frame = 0;
if($self->has_tag('codon_start')) {
$frame = join '', $self->get_tag_values('codon_start');
# '/codon_start' tags are 1, 2, or 3, but bioperl
# uses 0, 1, or 2 to indicate reading frame, so...
$frame--;
}
# ...and the codon table too
$table = 1;
if($self->has_tag('transl_table')) {
$table = join '', $self->get_tag_values('transl_table');
}
$cds = $self->spliced_seq($db);
$trans = $cds->translate(undef, undef, $frame, $table, undef, undef, $complete5, $complete3);
# the following exceptions should ideally be dealt with
# by translate, except that the single codon exceptions
# need to know about locations on the genomic sequence
$trans_aa = $trans->seq;
# deal with codons that differ from the reference genetic
# code ('/codon' qualifiers in gb feature table)
@exceptions = ();
if($self->has_tag('codon')) {
foreach $except ($self->get_tag_values('codon')) {
$except =~ s/\s+//g; # spaces are meaningless here
$except =~ s/["']//g; # don't need quotes either
if($except =~ /seq:(...),aa:(.*)\)/) {
my $codon = $1;
my $aa_temp = substr($2, 0, 3);
$aa_temp =~ s/(.)(..)/\u$1\L$2/; # seq3in() expects first letter as capital, rest lower-case
my $aa = Bio::Seq->new('-alphabet' => 'protein');
Bio::SeqUtils->seq3in($aa, $aa_temp);
push @exceptions, {
'codon' => $codon,
'aa' => $aa->seq,
};
}
}
}
my @codons = grep(!/\A\Z/, split(/(...)/, substr($cds->seq, $frame)));
$aa_e_pos = 0;
foreach my $codon (@codons) {
foreach $except (@exceptions) {
($except->{'codon'} =~ /$codon/i) and (substr($trans_aa, $aa_e_pos, 1) = $except->{'aa'})
}
$aa_e_pos++;
}
# deal with single non standard codons
# ('/transl_except' qualifiers in gb feature table)
$loc_factory = Bio::Factory::FTLocationFactory->new();
@exceptions = ();
if($self->has_tag('transl_except')) {
foreach $except ($self->get_tag_values('transl_except')) {
$except =~ s/\s+//g; # spaces are meaningless here
if($except =~ /\(pos:(.*?),aa:(.*)\)/) {
my $loc_str = $1;
my $aa_temp = substr($2, 0, 3);
$aa_temp =~ s/(.)(..)/\u$1\L$2/; # seq3in() expects first letter as capital, rest lower-case
my $aa = Bio::Seq->new('-alphabet' => 'protein');
Bio::SeqUtils->seq3in($aa, $aa_temp);
push @exceptions, {
'loc' => $loc_factory->from_string($loc_str),
'aa' => $aa->seq,
};
}
else {
$self->warn("Cannot parse translation exception '$except'");
}
}
}
# order the locations in the same way that spliced_seq does
@locs = $self->location->each_Location;
foreach $loc (@locs) {
defined($fstrand) or ($fstrand = $loc->strand);
if(defined($loc->strand) and ($fstrand != $loc->strand)) {
$mixed = 1;
last;
}
}
if($mixed) {
$self->warn("Mixed strand locations, spliced seq using the input order rather than trying to sort");
}
elsif($fstrand == -1) {
@locs = reverse $self->location->each_Location;
}
# pair up any translation exceptions with
# their corresponding sub locations, and
# calculate the position in the amino acid
# sequence that is exceptional
$cdna_start = 1 - $frame; # start position of the current segment in the na seq of the cds
$na_e_start = 0; # position of the exception on the na seq of the cds
$aa_e_pos = 0; # position of the exception on the aa seq of the cds
# there might be a clever way to avoid the following if-else...
if(!$mixed and ($fstrand == -1)) {
foreach $loc (@locs) {
foreach $except (@exceptions) {
if($loc->overlaps($except->{'loc'})) {
$na_e_start = $loc->end - $except->{'loc'}->end + $cdna_start;
$aa_e_pos = ($na_e_start + 2) / 3;
# Ignore this position if it is off the end of
# the sequence. This can happen when the
# exception is for a non-standard stop codon.
($aa_e_pos > $trans->length) and next;
# Otherwise, replace the aa in the translation
$aa_e_pos--; # positions above start at one
substr($trans_aa, $aa_e_pos, 1) = $except->{'aa'};
}
}
$cdna_start += ($loc->end - $loc->start + 1);
}
$trans->seq($trans_aa);
}
else {
foreach $loc (@locs) {
foreach $except (@exceptions) {
if($loc->overlaps($except->{'loc'})) {
$na_e_start = $except->{'loc'}->start - $loc->start + $cdna_start;
$aa_e_pos = ($na_e_start + 2) / 3;
# Ignore this position if it is off the end of
# the sequence. This can happen when the
# exception is for a non-standard stop codon.
($aa_e_pos > $trans->length) and next;
# Otherwise, replace the aa in the translation
$aa_e_pos--; # positions above start at one
substr($trans_aa, $aa_e_pos, 1) = $except->{'aa'};
}
}
$cdna_start += ($loc->end - $loc->start + 1);
}
$trans->seq($trans_aa);
}
# check that the translation matches that given
# by the feature table. This is better than just
# using the feature table translation, since it
# is also an indirect check that the DNA has been
# spliced together correctly
$ft_trans = undef;
if($self->has_tag('translation')) {
$ft_trans = join '', $self->get_tag_values('translation');
}
if(defined($ft_trans)) {
if($trans->seq !~ /^$ft_trans/) {
my $display_id = $self->seq->display_id;
$self->warn("Translated sequence '$display_id' does not match '/translation' tag");
}
}
else {
$self->warn("Warning: no translation tag so can't check");
}
$not_concept and $ft_trans and $trans->seq($ft_trans);
return $trans, $complete5, $complete3, $frame, $table;
}
More information about the Bioperl-l
mailing list