[Bioperl-l] counting sequences

David Messina dmessina@humgen.wustl.edu
Thu, 20 Dec 2001 11:15:00 -0600


Hi all,

I've written a script to pluck the CDS out of Genbank files.  This 
doesn't handle "join" entries, where the sequence is split over two or 
more Genbank records, but it does warn if the input record has no CDS 
feature tag or contains a non-[ACTGN] letter in the sequence.  It's not 
a Bioperl module, but on the off chance it serves your needs...

David Messina
   Division of Human Genetics
   Washington Univ. in St. Louis
   dmessina@genetics.wustl.edu
   314/747-1063 fx 314/747-2489


gb2cds - parses Genbank records and dumps coding sequence in FASTA format

usage: gb2cds [options] <file of genbank record(s)>
options:
  -d     include gene description text on FASTA headers
  -s     output list of skipped records (usually no CDS tag in feature 
table)

  -t     output tab-delimited file with
         - Genbank accession number
         - cds start position (in original sequence)
         - cds end position
         - cds length
         (-d is meaningless with -t)


#!/usr/local/bin/perl -w
# David Messina (dmessina@genetics.wustl.edu) 10/6/01

use Getopt::Std;
use vars qw($opt_t $opt_d $opt_s);
getopts('tds');
my $usage = "
gb2cds - parses Genbank records and dumps coding sequence in FASTA format

usage: $0 [options] <genbank record(s)>
options:
  -d     add gene description text to FASTA headers
  -s     output list of skipped records (usually no CDS tag in feature 
table)

  -t     output tab-delimited file with
         - Genbank accession number
         - cds start position (in original sequence)
         - cds end position
         - cds length
         (-d is meaningless with -t)
";
die $usage unless @ARGV >= 1;

# initialize variables
$in_def=0;    # seen a definition line
$in_feat=0;   # within feature table
$in_seq=0;    # sequence
$seq = "";
$rec_count=0; # count how many records we process

LINE: while (<>)
{
     # parse a record
     if ($_ =~ /^LOCUS\s+\w+\s+(\d+)\sbp/) { $len = $1; }
     if ($_ =~ /^DEFINITION\s+(.+)$/) { $def = $1; $in_def=1; }
     if ($in_def and $_ =~ /^\s+(.+)$/) { $def .= " $1"; }
     if ($_ =~ /^ACCESSION\s+(\w+)/) { $acc = $1; $in_def=0; }
     if ($_ =~ /^VERSION\s+\S+\s+GI:(\d+)/) { $gi = $1; }
     if ($_ =~ /^FEATURES/) { $in_feat = 1; }
     if ($in_feat == 1 and $_ =~ /\s+CDS\s+\D*(\d+)\.\.\D*(\d+)/)
     { $cds_start = $1; $cds_end = $2; $in_feat = 0; }
     if ($_ =~ /^ORIGIN/) { $in_seq = 1; }
     if ($in_seq == 1 and $_ !~ /^ORIGIN/)
     {
         # strip spaces, numbers, and newlines from sequence
         (my $line = $_) =~ s/[^acgtnACGTN]//g;
         $seq .= uc($line);
     }

     if ($_ =~ /^\/\//) # end of record
     {
         $rec_count++;

         # checks for bad parsing
         if (!($gi)) { skip_record($acc, "no GI!"); next LINE; }
         if (!($acc)) { skip_record($gi, "no accession number!"); next 
LINE; }
         if (!($seq and $len))
         { skip_record($acc, "no sequence!"); next LINE; }
         if (check_seq($seq, $len) == 1)
         { skip_record($acc, "bad sequence length! $len"); next LINE; }
         if (!($cds_end and $cds_start))
         { skip_record($acc, "no CDS info!"); next LINE; }

         # extract cds from full seq
         my $cds_len = ($cds_end - $cds_start) + 1;
         my $cds = substr($seq, $cds_start-1, $cds_len);

         # write FASTA record
         unless ($opt_t) # no FASTA if coordinates-only option is on
         {
             my $header = "$acc/$cds_start-$cds_end";
             if ($opt_d) { $header .= " $def"; }
             write_fasta($header, $cds_len, $cds);
         }
         # write tab-delimited coordinates-only output
         if ($opt_t)
         { print STDOUT "$acc\t$cds_start\t$cds_end\t$cds_len\n"; }

         reset_vars();
     }
}

print STDERR "$rec_count record(s) processed.\n";
if ($opt_s) { show_skips(); }

sub write_fasta
{
     my($desc, $length, $sequence) = @_;
     my($pos, $line);

     print STDOUT ">$desc\n";
     for ($pos = 0; $pos < $length; $pos += 60)
     {
         $line = substr($sequence,$pos,60);
         print $line, "\n";
     }
     1;
}

sub skip_record
{
     my ($acc, $error) = @_;
     if (!defined($acc)) { $acc = "number $rec_count"; }
     push @miss, "$acc\t$error";
     reset_vars();
}

sub reset_vars
{
     undef $len;
     undef $gi;
     undef $acc;
     undef $def;
     undef $cds_start;
     undef $cds_end;
     $seq = "";
     $in_def = 0;
     $in_feat = 0;
     $in_seq = 0;
}

# compares listed seq length (from LOCUS line) with our sequence length
sub check_seq
{
     my ($seq, $exp_len) = @_;
     my $seq_len = length($seq);
     if ($seq_len != $exp_len) { return 1; }
     else { return 0; }
}

sub show_skips
{
     my $count = scalar(@miss);
     print STDOUT "$rec_count record(s) processed.\n";
     print STDOUT "$count record(s) were skipped:\n";
     foreach my $skip (@miss)
     {
         print STDOUT $skip, "\n";
     }
}