[Bioperl-l] counting sequences
David Messina
dmessina@humgen.wustl.edu
Thu, 20 Dec 2001 11:15:00 -0600
Hi all,
I've written a script to pluck the CDS out of Genbank files. This
doesn't handle "join" entries, where the sequence is split over two or
more Genbank records, but it does warn if the input record has no CDS
feature tag or contains a non-[ACTGN] letter in the sequence. It's not
a Bioperl module, but on the off chance it serves your needs...
David Messina
Division of Human Genetics
Washington Univ. in St. Louis
dmessina@genetics.wustl.edu
314/747-1063 fx 314/747-2489
gb2cds - parses Genbank records and dumps coding sequence in FASTA format
usage: gb2cds [options] <file of genbank record(s)>
options:
-d include gene description text on FASTA headers
-s output list of skipped records (usually no CDS tag in feature
table)
-t output tab-delimited file with
- Genbank accession number
- cds start position (in original sequence)
- cds end position
- cds length
(-d is meaningless with -t)
#!/usr/local/bin/perl -w
# David Messina (dmessina@genetics.wustl.edu) 10/6/01
use Getopt::Std;
use vars qw($opt_t $opt_d $opt_s);
getopts('tds');
my $usage = "
gb2cds - parses Genbank records and dumps coding sequence in FASTA format
usage: $0 [options] <genbank record(s)>
options:
-d add gene description text to FASTA headers
-s output list of skipped records (usually no CDS tag in feature
table)
-t output tab-delimited file with
- Genbank accession number
- cds start position (in original sequence)
- cds end position
- cds length
(-d is meaningless with -t)
";
die $usage unless @ARGV >= 1;
# initialize variables
$in_def=0; # seen a definition line
$in_feat=0; # within feature table
$in_seq=0; # sequence
$seq = "";
$rec_count=0; # count how many records we process
LINE: while (<>)
{
# parse a record
if ($_ =~ /^LOCUS\s+\w+\s+(\d+)\sbp/) { $len = $1; }
if ($_ =~ /^DEFINITION\s+(.+)$/) { $def = $1; $in_def=1; }
if ($in_def and $_ =~ /^\s+(.+)$/) { $def .= " $1"; }
if ($_ =~ /^ACCESSION\s+(\w+)/) { $acc = $1; $in_def=0; }
if ($_ =~ /^VERSION\s+\S+\s+GI:(\d+)/) { $gi = $1; }
if ($_ =~ /^FEATURES/) { $in_feat = 1; }
if ($in_feat == 1 and $_ =~ /\s+CDS\s+\D*(\d+)\.\.\D*(\d+)/)
{ $cds_start = $1; $cds_end = $2; $in_feat = 0; }
if ($_ =~ /^ORIGIN/) { $in_seq = 1; }
if ($in_seq == 1 and $_ !~ /^ORIGIN/)
{
# strip spaces, numbers, and newlines from sequence
(my $line = $_) =~ s/[^acgtnACGTN]//g;
$seq .= uc($line);
}
if ($_ =~ /^\/\//) # end of record
{
$rec_count++;
# checks for bad parsing
if (!($gi)) { skip_record($acc, "no GI!"); next LINE; }
if (!($acc)) { skip_record($gi, "no accession number!"); next
LINE; }
if (!($seq and $len))
{ skip_record($acc, "no sequence!"); next LINE; }
if (check_seq($seq, $len) == 1)
{ skip_record($acc, "bad sequence length! $len"); next LINE; }
if (!($cds_end and $cds_start))
{ skip_record($acc, "no CDS info!"); next LINE; }
# extract cds from full seq
my $cds_len = ($cds_end - $cds_start) + 1;
my $cds = substr($seq, $cds_start-1, $cds_len);
# write FASTA record
unless ($opt_t) # no FASTA if coordinates-only option is on
{
my $header = "$acc/$cds_start-$cds_end";
if ($opt_d) { $header .= " $def"; }
write_fasta($header, $cds_len, $cds);
}
# write tab-delimited coordinates-only output
if ($opt_t)
{ print STDOUT "$acc\t$cds_start\t$cds_end\t$cds_len\n"; }
reset_vars();
}
}
print STDERR "$rec_count record(s) processed.\n";
if ($opt_s) { show_skips(); }
sub write_fasta
{
my($desc, $length, $sequence) = @_;
my($pos, $line);
print STDOUT ">$desc\n";
for ($pos = 0; $pos < $length; $pos += 60)
{
$line = substr($sequence,$pos,60);
print $line, "\n";
}
1;
}
sub skip_record
{
my ($acc, $error) = @_;
if (!defined($acc)) { $acc = "number $rec_count"; }
push @miss, "$acc\t$error";
reset_vars();
}
sub reset_vars
{
undef $len;
undef $gi;
undef $acc;
undef $def;
undef $cds_start;
undef $cds_end;
$seq = "";
$in_def = 0;
$in_feat = 0;
$in_seq = 0;
}
# compares listed seq length (from LOCUS line) with our sequence length
sub check_seq
{
my ($seq, $exp_len) = @_;
my $seq_len = length($seq);
if ($seq_len != $exp_len) { return 1; }
else { return 0; }
}
sub show_skips
{
my $count = scalar(@miss);
print STDOUT "$rec_count record(s) processed.\n";
print STDOUT "$count record(s) were skipped:\n";
foreach my $skip (@miss)
{
print STDOUT $skip, "\n";
}
}