some additions for Seq.pm
Chris Dagdigian
cdagdigian@genetics.com
Fri, 6 Dec 1996 15:29:06 -0400
Hi All,
I've been playing around with the Seq.pm module as found at:
http://www.techfak.uni-bielefeld.de/bcd/Tec/Bioperl/Code/Bio/Seq.pm
and have been experimenting with adding my own methods to it.
I'm self-taught at perl and new to OO-style programming so any comments on
the enclosed code would be welcome, particularly in making it more
productive/functional/elegant.
Enclosed are functions for:
o outputing GCG format sequences
o translating RNA/DNA -> protein sequence
o reversing a sequence
o complementing a sequence
o reverse-complementing a sequence
o translating DNA->RNA
Most of the guts came from jong's molbio routines, mistakes are all mine :)
Regards,
Chris Dagdigian
#--------------------------------------------------------------------
=pod
=head1 revcom()
#_______________________________________________________________________
# Title : revcom
# Usage : $reverse_complemented_seq = $mySeq->revcom;
# Function : Returns a char string containing the reverse
# : complement of a nucleotide object sequence
# Example : $reverse_complemented_seq = $mySeq->revcom;
# Source : Guts from Jong's <jong@mrc-lmb.cam.ac.uk>
# : library of molbio perl routines
# Note :
# : The letter codes and compliment translations
# : are those proposed by IUB (Nomenclature Committee,
# : 1985, Eur. J. Biochem. 150; 1-5) and are also
# : used by the GCG package. The IUB/GCG letter codes
# : for nucleotide ambiguity are compatible with
# : EMBL, GenBank and PIR database formats but are
# : *NOT* compatible with Stadem/Sanger ambiguity
# : symbols. Staden/Sanger use different symbols to
# : represent uncertainty and frame abiguity.
# :
# : Currently Staden/Sanger are not recognized
# : sequence types.
# :
# : GCG Documentation on sequence symbols:
# URL : http://www.neb.com/gcgdoc/GCGdoc/Appendices
# : /appendix_iii.html
# :
# Translation :
# : GCG/IUB Meaning Complement
# : ------------------------------------
# : A A T
# : C C G
# : G G C
# : T T A
# : U U A
# : M A or C K
# : R A or G Y
# : W A or T W
# : S C or G S
# : Y C or T R
# : K G or T M
# : V A or C or G B
# : H A or C or T D
# : D A or G or T H
# : B C or G or T V
# : X G or A or T or C X
# : N G or A or T or C N
# :--------------------------------------
# Revision : 0.01 / 6 Dec 1996
# Returns : char string
# Argument : n/a
#-----------------------------------------------------------------------
=cut
sub revcom {
my($self)= @_;
my($seq,$revseq);
# CD: Some type of check be made here to make
# CD: sure the sequence is nucleotide.
$seq = $self->{"seq"};
# CD: If a sequence format uses different
# CD: symbols for nucleotide ambiguity than
# CD: GCG/IUB than this will certainly break.
$seq =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
$revseq = reverse $seq;
}
=pod
=head1 complement
#_______________________________________________________________________
# Title : complement
# Usage : $complemented_seq = $mySeq->compliment;
# Function : Returns a char string containing
# : the complementary sequence (eg; other strand)
# : of the original sequence. The translation method
# : is identical to revcom() but the nucleotide order
# : is not reversed.
# :
# Example : $complemented_seq = $mySeq->complement;
# :
# Source : Guts from Jong's <jong@mrc-lmb.cam.ac.uk>
# : library of molbio perl routines
# Note :
# : The letter codes and complement translations
# : are those proposed by IUB (Nomenclature Committee,
# : 1985, Eur. J. Biochem. 150; 1-5) and are also
# : used by the GCG package. The IUB/GCG letter codes
# : for nucleotide ambiguity are compatible with
# : EMBL, GenBank and PIR database formats but are
# : *NOT* compatible with Stadem/Sanger ambiguity
# : symbols. Staden/Sanger use different symbols to
# : represent uncertainty and frame abiguity.
# :
# : Currently Staden/Sanger are not recognized
# : sequence types.
# :
# : GCG Documentation on sequence symbols:
# URL : http://www.neb.com/gcgdoc/GCGdoc/Appendices
# : /appendix_iii.html
# :
# Translation :
# : GCG/IUB Meaning Complement
# : ------------------------------------
# : A A T
# : C C G
# : G G C
# : T T A
# : U U A
# : M A or C K
# : R A or G Y
# : W A or T W
# : S C or G S
# : Y C or T R
# : K G or T M
# : V A or C or G B
# : H A or C or T D
# : D A or G or T H
# : B C or G or T V
# : X G or A or T or C X
# : N G or A or T or C N
# :--------------------------------------
# :
# Revision : 0.01 / 6 Dec 1996
# Returns : char string
# Argument : n/a
#-----------------------------------------------------------------------
=cut
sub complement {
my($self)= @_;
my($seq);
# CD: See notes in revcom() about checking for nucleotide sequence
# CD: and dealing with non-GCG/IUB ambiguity symbols.
$seq = $self->{"seq"};
$seq =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
return $seq;
}
=pod
=head1 reverse
#_______________________________________________________________________
# Title : reverse
# Usage : $reversed_seq = $mySeq->reverse;
# Function : Returns a char string containing the
# : reverse of the object sequence
# :
# Example : $reversed_seq = $mySeq->reverse;
# :
# Revision : 0.01 / 6 Dec 1996
# Returns : char string
# Argument : n/a
#-----------------------------------------------------------------------
=cut
sub reverse {
my($self)= @_;
my($seq);
$seq = $self->{"seq"};
scalar reverse $seq;
}
=pod
=head1 outGCG()
#_______________________________________________________________________
# Title : outGCG
# Usage : @formatted_seq = $mySeq->layout("GCG");
# Function : Returns a GCG formatted sequence array
# : when invoked via the layout method
# :
# Example :
# : @gcg_seq = $mySeq->layout("GCG");
# : foreach(@gcg_seq){print;}
# :
# : The above code would produce the following output:
# :
# : Sample Bio::Seq sequence
# : sample Length: 240 Wed Nov 27 13:24:28 EST 1996 Type: N
Check: 5371
..
# :
# : 1 aaaacctatg gggtgggctc tcaagctgag accctgtgtg cacagccctc
# : 51 tggctggtgg cagtggagac gggatnnnat gacaagcctg ggggacatga
# : 101 ccccagagaa ggaacgggaa caggatgagt gagaggaggt tctaaattat
# : 151 ccattagcac aggctgccag tggtccttgc ataaatgtat agagcacaca
# : 201 ggtgggggga aagggagaga gagaagaagc cagggtataa
# :
# :
# Note : GCG formatted sequences contain a "Type:" field
# : that must be "N" or "P". Thus, the bio::seq object
# : needs to have a SeqType that can be classified as
# : such.
# :
# Warning : Unconventional numbering offsets may not
# : be robustly handled
# :
# Revision : 0.01 / 6 Dec 1996
# Source : Found guts of this code on bionet.gcg, unknown author
# Returns : Array
# Argument : n/a
#-----------------------------------------------------------------------
=cut
sub outGCG {
my($self) = @_;
my($id,$comment,$len,$type,$seq);
my($cnt,$sum,$i,$j,$tmp,$offset);
my($c_t,@out);
## CD: The user should also be able to pass in an arbitrary date
## CD: value to be used when formatting the output.
## CD: The user should also be able to pass in a header string and
## CD: maybe switches to control capitalization and line-spacing.
## CD: None of these would be required, but if passed in they would
## CD: overwrite any default behaviour.
## CD: Not sure how appropriate this is. `date` is not very portable
chomp($c_t = `date`);
$seq = $self->{"seq"};
$id = $self->{"id"};
$comment = $self->{"desc"};
$len = length($seq);
$type = "";
#GCG format needs to know if sequence is nucleotide or protein
if(($self->_monomer eq "0") || ($self->_monomer eq "4")) {
carp"Unusable SeqType, GCG format requires defined nucleotide or peptide
sequence";
}
# CD: The user should be able to pass in an overriding type
# CD: of "N" or "P" to be used in the formatted output.
# CD: I'm not sure how to get arguments to this function
# CD: becuase it is meant to be accessed through the
# CD: "layout" method.
#Deal with the GCG format Type field
if($self->_monomer eq "3") { $type="P";}
else { if(($self->_monomer eq "1") || ($self->_monomer eq "2")) {
$type="N"; }
}
# The default "N" or "P" type has been set if possible. After dealing with any
# overiding user arguments, we can test it and carp if undefined
# CD: [deal with user-given type here...]
# Test $type
unless($type eq "N" || $type eq "P") { carp "Unable to set Type: N or P for
GCG-format output."; }
$offset=0;
#Set the offset if we have any non-standard numbering going on
if($self->numbering < 0) { $offset = ( 0 + $self->numbering); }
if($self->numbering >= 1) { $offset = $self->numbering;}
if($self->numbering == 0) { $offset = -1;}
$sum=0; #Need this to shut strict() up
#Generate the GCG Checksum value
for($i=0; $i<$len ;$i++) {
$cnt++;
$sum += $cnt * ord(substr($seq,$i,1));
($cnt == 57) && ($cnt=0);
}
$sum %= 10000;
#Output the sequence header info
push(@out,"$comment\n");
push(@out," $id Length: $len $c_t Type: $type Check: $sum ..\n\n");
#Format the sequence
$len = length($seq);
$i = $#out + 1;
for($j = 0 ; $j < $len ; ) {
if( $j % 50 == 0) {
$out[$i] = sprintf("%8d ",($j+$offset)); #numbering
}
$out[$i] .= sprintf("%s",substr($seq,$j,10));
$j += 10;
if( $j < $len && $j % 50 != 0 ) {
$out[$i] .= " ";
}elsif($j % 50 == 0 ) {
$out[$i++] .= "\n";
#$out[$i++] = "\n"; #adds the extra blank lines in the sequence
} #(user should be able to control spacing...)
}
if($j % 50 != 0 ) {
$out[$i] .= "\n";
}
$out[$i] .= "\n";
@out;
} # end of sub
=pod
=head1 DNA_to_RNA()
#_______________________________________________________________________
# Title : DNA_to_RNA
# Usage : $translated_seq = $mySeq->DNA_to_RNA;
# Function : Returns a char string containing the
# : RNA translation of the DNA nucleotide sequence
# : (Replaces T with U)
# :
# Example : $translated_seq = $mySeq->DNA_to_RNA;
# :
# Source : modified from Jong's <jong@mrc-lmb.cam.ac.uk>
# : library of molbio perl routines
# :
# Revision : 0.01 / 6 Dec 1996
# Returns : char string
# Argument : n/a
#-----------------------------------------------------------------------
=cut
sub DNA_to_RNA {
my($self)=@_;
## CD: This is a simple substitution from T -> U
## CD: so we shouldnt have to do intricate error checking
## CD: here. Right now we only carp if the sequence is
## CD: explicitly an amino acid seq.
carp "Can't translate an amino acid sequence to RNA." if($self->_monomer
eq "3");
my($seq) = $self->{"seq"}; # get sequence
##Quietly deal with capitalization
$seq =~ s/T/U/g; # change (T to U)
$seq =~ s/t/u/g; # change (t to u)
$seq; # return value
}
=pod
=head1 translate()
#_______________________________________________________________________
# Title : translate
# Usage :
# Function : Returns a char string containing the single-letter
# : protein translation of a DNA/RNA sequence
# :
# : "*" is the default symbol for a stop codon
# : "X" is the default symbol for an unknown codon
# :
# Example : $translation = $mySeq->translate;
# : -or- with user defined stop/unknown codon symbols:
# : $translation = $mySeq->translate($stop_symbol,$unknown_symbol);
# :
# Source : modified from Jong's <jong@mrc-lmb.cam.ac.uk>
# : library of molbio perl routines
# :
# Revision : 0.01 / 6 Dec 1996
# Returns : char string
# Argument : n/a
#-----------------------------------------------------------------------
=cut
sub translate {
my($self) = shift;
my($stop,$unknown) = @_;
my($i,$len,$output) = (0,0,'');
my($codon) = "";
my($seq) = $self->{"seq"};
## User can pass in symbol for stop and unknown codons
unless(defined($stop)) { $stop = "*"; }
unless(defined($unknown)) { $unknown = "X"; }
##Error if monomer is "Amino"
carp "Can't translate an amino acid sequence." if($self->_monomer eq "3");
#CD: Is this OK?
#If sequence monomer is anything other than RNA, we should first pipe it
#through DNA_to_RNA() to change any T's to U's (just in case)
unless($self->_monomer eq "2") { $seq = $self->DNA_to_RNA;}
for($len=length($seq),$seq =~ tr/a-z/A-Z/,$i=0; $i<($len-2) ; $i+=3) {
$codon = substr($seq,$i,3);
if ($codon =~ /^UC/) {$output .= 'S'; } # Serine
elsif($codon =~ /^UU[UC]/) {$output .= 'F'; } # Phenylalanine
elsif($codon =~ /^UU[AG]/) {$output .= 'L'; } # Leucine
elsif($codon =~ /^UA[UC]/) {$output .= 'Y'; } # Tyrosine
elsif($codon =~ /^UA[AG]/) {$output .= $stop; } # Stop
elsif($codon =~ /^UG[UC]/) {$output .= 'C'; } # Cysteine
elsif($codon =~ /^UGA/) {$output .= $stop; } # Stop
elsif($codon =~ /^UGG/) {$output .= 'W'; } # Tryptophan
elsif($codon =~ /^CU/) {$output .= 'L'; } # Leucine
elsif($codon =~ /^CC/) {$output .= 'P'; } # Proline
elsif($codon =~ /^CA[UC]/) {$output .= 'H'; } # Histidine
elsif($codon =~ /^CA[AG]/) {$output .= 'Q'; } # Glutamine
elsif($codon =~ /^CG/) {$output .= 'R'; } # Arginine
elsif($codon =~ /^AU[UCA]/){$output .= 'I'; } # Isoleucine
elsif($codon =~ /^AUG/) {$output .= 'M'; } # Methionine
elsif($codon =~ /^AC/) {$output .= 'T'; } # Threonine
elsif($codon =~ /^AA[UC]/) {$output .= 'N'; } # Asparagine
elsif($codon =~ /^AA[AG]/) {$output .= 'K'; } # Lysine
elsif($codon =~ /^AG[UC]/) {$output .= 'S'; } # Serine
elsif($codon =~ /^AG[AG]/) {$output .= 'R'; } # Arginine
elsif($codon =~ /^GU/) {$output .= 'V'; } # Valine
elsif($codon =~ /^GC/) {$output .= 'A'; } # Alanine
elsif($codon =~ /^GA[UC]/) {$output .= 'D'; } # Aspartic Acid
elsif($codon =~ /^GA[AG]/) {$output .= 'E'; } # Glutamic Acid
elsif($codon =~ /^GG/) {$output .= 'G'; } # Glycine
else {$output .= $unknown; } # Unknown Codon
}
$output; # return string
}