[Bioperl-l] Extracting subsequences from genbank files
Rolf Nimzyk
rfn at uni-bremen.de
Tue Aug 5 11:30:43 EDT 2003
Hi,
I use a sligthly modyfied example script from Jason to extract subsequences
out of a genbank files created from a sequence by GeneMachine,
http://genome.nhgri.nih.gov/genemachine/. With some genbank files it works
fine but with others not. I don't now what the problem is.
I am using Perl 5.8, BioPerl 1.2.2.
Thanks in advance
Rolf
The script
#!/usr/bin/perl -w
# Contributed by Jason Stajich <jason at bioperl.org>
# simple extract the CDS features from a genbank file and
# write out the CDS and Peptide sequences
use strict;
use Bio::SeqIO;
my $filename = shift || die("pass in a genbank filename on the cmd line");
my $in = new Bio::SeqIO(-file => $filename, -format => 'genbank');
my $out = new Bio::SeqIO(-file => ">$filename.gene.fas");
while( my $seq = $in->next_seq ) {
my @cds = grep { $_->primary_tag eq 'gene' } $seq->get_SeqFeatures();
foreach my $feature ( @cds ) {
my $featureseq = $feature->spliced_seq;
my $newid = $filename . ' possible gene ' . $feature->start . ".." .
$feature->end;
$featureseq->display_id($newid);
$out->write_seq($featureseq);
}
}
The exception
Argument "bp" isn't numeric in numeric gt (>) at
/usr/local/lib/perl5/site_perl/5.8.0/Bio/PrimarySeq.pm line 362, <GEN0> line
2376.
------------- EXCEPTION -------------
MSG: You have to have start positive
and length less than the total length of sequence [4682:4770] Total bp
STACK Bio::PrimarySeq::subseq
/usr/local/lib/perl5/site_perl/5.8.0/Bio/PrimarySeq.pm:362
STACK Bio::PrimarySeqI::trunc
/usr/local/lib/perl5/site_perl/5.8.0/Bio/PrimarySeqI.pm:456
STACK Bio::SeqFeature::Generic::seq
/usr/local/lib/perl5/site_perl/5.8.0/Bio/SeqFeature/Generic.pm:604
STACK toplevel extract_cds_exon.pl:24
Part of the genbank file
LOCUS zwischen_LOC_126015_und_LOC_34292888777 bp DNA linear
01-JAN-1900
DEFINITION zwischen LOC 126015 und LOC 342928.seq.
ACCESSION zwischen_LOC_126015_und_LOC_342928
VERSION
KEYWORDS .
SOURCE Unknown.
ORGANISM Unknown.
Unclassified.
FEATURES Location/Qualifiers
source 1..88777
/mol_type="unassigned DNA"
gene complement(join(1000..1176,2043..2243))
/gene="HMMGene"
/note="HMMGene; Prob1=0.426, Prob2=0.602, Prob3=0.301
bestparse:cds_3"
repeat_region 1108..1390
/note="RepeatMasker: AluSx"
/rpt_family="SINE/Alu"
repeat_region 1402..1512
/note="RepeatMasker: FLAM_C"
/rpt_family="SINE/Alu"
repeat_region 1541..1653
/note="RepeatMasker: MER57B"
/rpt_family="LTR/ERV1"
repeat_region 1654..1953
/note="RepeatMasker: AluSp"
/rpt_family="SINE/Alu"
repeat_region 1954..2272
/note="RepeatMasker: MER57B"
/rpt_family="LTR/ERV1"
gene complement(join(2043..2243,10387..10426))
/gene="Genscan"
/note="GenScan; P1=Prom, P2=0.526"
repeat_unit 2281..2321
/note="Sputnik"
/rpt_type=pentanucleotide
repeat_unit 2330..2398
/note="Sputnik"
/rpt_type=pentanucleotide
repeat_region complement(2385..2695)
/note="RepeatMasker: AluSx"
/rpt_family="SINE/Alu"
repeat_region 2697..2734
/note="RepeatMasker: MER57B"
/rpt_family="LTR/ERV1"
repeat_region 2735..3247
/note="RepeatMasker: MER57B-int"
/rpt_family="LTR/ERV1"
repeat_region complement(3248..3551)
/note="RepeatMasker: AluY"
/rpt_family="SINE/Alu"
repeat_region 3552..3646
/note="RepeatMasker: MER57B-int"
/rpt_family="LTR/ERV1"
repeat_region 3649..3939
/note="RepeatMasker: AluSx"
/rpt_family="SINE/Alu"
repeat_region 3941..4263
/note="RepeatMasker: AluSq"
/rpt_family="SINE/Alu"
repeat_region 4276..4450
/note="RepeatMasker: MER57B-int"
/rpt_family="LTR/ERV1"
repeat_region 4445..5174
/note="RepeatMasker: MER57B-int"
/rpt_family="LTR/ERV1"
exon 4682..4770
/note="MZEF; P=0.545"
gene join(4758..4770,8044..8208,9490..9656,22534..22662,
42550..42582,46069..46096,52948..53003)
/gene="HMMGene"
/note="HMMGene; Prob1=0.242, Prob2=0.894, Prob3=0.319,
Prob4=0.589, Prob5=0.406, Prob6=0.405, Prob7=0.006,
Prob8=0.000 bestparse:cds_1"
repeat_region 5193..5300
/note="RepeatMasker: MER57-int"
/rpt_family="LTR/ERV1"
repeat_region 5303..5630
/note="RepeatMasker: AluSx"
/rpt_family="SINE/Alu"
repeat_unit 5587..5630
/gene="HMMGene"
/note="Sputnik"
/rpt_type=pentanucleotide
repeat_region 5636..5741
/note="RepeatMasker: U6"
/rpt_family="snRNA"
repeat_region complement(5979..6276)
/note="RepeatMasker: AluSq"
/rpt_family="SINE/Alu"
repeat_region 6290..6438
/note="RepeatMasker: MER57-int"
/rpt_family="LTR/ERV1"
repeat_region 6421..6876
/note="RepeatMasker: MER57-int"
/rpt_family="LTR/ERV1"
repeat_region 7097..7586
/note="RepeatMasker: MER57-int"
/rpt_family="LTR/ERV1"
repeat_region 7623..7791
/note="RepeatMasker: AluSg/x"
/rpt_family="SINE/Alu"
repeat_region complement(8267..8896)
/note="RepeatMasker: AluSx"
/rpt_family="SINE/Alu"
repeat_region complement(8948..8998)
/note="RepeatMasker: FLAM"
/rpt_family="SINE/Alu"
repeat_region complement(8999..9323)
/note="RepeatMasker: AluSx"
/rpt_family="SINE/Alu"
gene complement(join(9914..10210,19268..19390,40859..40909))
/gene="HMMGene"
/note="HMMGene; Prob1=0.245, Prob2=0.500, Prob3=0.397,
Prob4=0.007 bestparse:cds_2"
repeat_region 10551..10860
/note="RepeatMasker: AluSx"
/rpt_family="SINE/Alu"
More information about the Bioperl-l
mailing list