[Bioperl-l] bioperl and kegg(out of memory problem )
Ambrose
aharry2001 at yahoo.com
Mon Apr 2 13:56:33 UTC 2007
Hello ALL,
I have the code below,which parses my kegg files.A host of the files are parsed and the information is inserted into my databases but unfortunate after the program runs for some hours it stops showing the message out of memory.I assume that this happens because the bioperl object is too big.Please just check the code below
best regards Ambrose
#!/usr/local/ActivePerl/bin/perl
#
#
use strict;
use Bio::SeqIO;
use Bio::FASTASequence;
use DBI;
use Benchmark qw(:all) ;
my($ko,$prosite,$ncbigi,$ncbigeneid,$pfam,$uniprot,$ecn1,$pathway_id1,$pathway_name1,$ec_num);
my(%dblink_KO,%dblink_Pfam,%dblink_PROSITE,%dblink_NCBIGI,%dblink_NCBIGENEID,%dblink_UniProt);
my(%pathway_name,%pathway_id,%ecnumbers,%crc64,%ntseq,%aaseq);
my( @kg_id);
my $db="gbdb";
my $host="localhost";
my $userid="root";
my $passwd="ubuntu";
my $connectionInfo="dbi:mysql:$db;"."mysql_socket=/var/run/mysqld/mysqld.sock";
my ($t1,$t2);
my $dbh = DBI->connect($connectionInfo,$userid,$passwd);
my $time_used;
eval { $dbh->do("DROP TABLE kegginfo") };
print "Dropping kegginfo failed: $@\n" if $@;
$dbh->do("CREATE TABLE kegginfo (kg_id BIGINT NOT NULL AUTO_INCREMENT,
up_id INT UNSIGNED REFERENCES uniprotinfo(up_id),
filename VARCHAR(50),
kegg_id VARCHAR(50),
keggaccn VARCHAR(50),
description VARCHAR(250),
ec_numbers VARCHAR(250),
pathway_id VARCHAR(250),
pathway_name VARCHAR(250),
crc64 VARCHAR(50),
ko_id VARCHAR(50),
pfam_id VARCHAR(50),
ncbigi_id VARCHAR(50),
ncbigeneid_id VARCHAR(50),
uniprot_id VARCHAR(50),
prosite_id VARCHAR(50),
PRIMARY KEY (kg_id)
)");
eval { $dbh->do("DROP TABLE keggntsequence") };
print "Dropping keggntsequence failed: $@\n" if $@;
$dbh->do("CREATE TABLE keggntsequence (kg_id BIGINT(15) UNSIGNED REFERENCES uniprotinfo(kg_id),
keggaccn VARCHAR(50),
nucleotidesequence text
)");
eval { $dbh->do("DROP TABLE keggaasequence") };
print "Dropping keggaasequence failed: $@\n" if $@;
$dbh->do("CREATE TABLE keggaasequence (kg_id BIGINT(15) UNSIGNED REFERENCES uniprotinfo(kg_id),
keggaccn VARCHAR(50),
crc64 VARCHAR(50),
aminoacidsequence text
)");
eval { $dbh->do("DROP TABLE timestable") };
print "Dropping timestable failed: $@\n" if $@;
$dbh->do("CREATE TABLE timestable (aut_id BIGINT(15) UNSIGNED NOT NULL AUTO_INCREMENT,
genome VARCHAR(100),
totaltime_seconds int(100),
PRIMARY KEY(aut_id))");
open (LIST, "genomes.list") || die "Cannot open input kegg genomes file genomes.list\n $! \n";
$t1=new Benchmark;
my @genomelist = ();
while (my $line=<LIST>) {
#ignore comment lines
if ($line !~ /^#/) {
chomp $line;
push (@genomelist, $line); #store the filename
}
}
close LIST;
my $count=0;
foreach my $genomefile (@genomelist) {
#in case the user fails to remove some strange files from
#the genomes.list file.. check for the KEGG format
my $check=checkKeggFormat($genomefile);
if ($check==0) {
#if file is not kegg, start with the next one...
print "ERROR: $genomefile doesn't look like a KEGG file to me! \n";
#<stdin>;
next;
}
#print $genomefile,"\n";
my $stream = Bio::SeqIO->new(-file => $genomefile, -format => 'KEGG');
while ( my $seq = $stream->next_seq() ) {
my $primary_id = $seq->primary_id;
my $display_id = $seq->display_id; #name
my $keggaccn = $seq->accession; #accn
my @description = $seq->annotation->get_Annotations('description');
my @dblinks = $seq->annotation->get_Annotations('dblink');
my @orthologs = $seq->annotation->get_Annotations('ortholog');
my @orthologs = grep {$_->database eq 'KO'} $seq->annotation->get_Annotations('dblink');
my @class = $seq->annotation->get_Annotations('pathway');
$ntseq{$keggaccn} = $seq->seq;
$aaseq{$keggaccn} = $seq->translate->seq;
$aaseq{$keggaccn} =~s /\*$//;
my $fasta = ">".$count."\n".$aaseq{$keggaccn};
my $newseq = Bio::FASTASequence->new($fasta);
$crc64{$keggaccn}=$newseq->getCrc64();
#print $keggaccn,"crc64:$crc64{$keggaccn}\n";
$count++;
if ($keggaccn eq "") { print "PRIMARY KEY NOT FOUND no keggaccn\n";
next;}
if(@dblinks)
{
my @dblink_KO=();
my @dblink_Pfam=();
my @dblink_PROSITE=();
my @dblink_NCBIGI=();
my @dblink_NCBIGENEID=();
my @dblink_UniProt=();
foreach my $ele (@dblinks) {
if ($ele =~ /^KO:/){
$ele=~s/KO://;
push (@dblink_KO,$ele);
$dblink_KO{$keggaccn}=$ele;
next;
}
#parse Pfam: dblink
if ($ele =~ /^Pfam:/){
$ele=~s/Pfam://;
push (@dblink_Pfam,$ele);
$dblink_Pfam{$keggaccn}=$ele;
next;
}
#parse PROSITE: dblink
if ($ele =~ /^PROSITE:/){
$ele=~s/PROSITE://;
push (@dblink_PROSITE,$ele);
$dblink_PROSITE{$keggaccn}=$ele;
next;
}
#parse NCBI-GI: dblink
if ($ele =~ /^NCBI-GI:/){
$ele=~s/NCBI-GI://;
push (@dblink_NCBIGI,$ele);
$dblink_NCBIGI{$keggaccn}=$ele;
next;
}
#parse NCBI-GeneID: dblink
if ($ele =~ /^NCBI-GeneID:/){
$ele=~s/NCBI-GeneID://;
push (@dblink_NCBIGENEID,$ele);
$dblink_NCBIGENEID{$keggaccn}=$ele;
next;
}
#parse UniProt: dblink
if ($ele =~ /^UniProt:/){
$ele=~s/UniProt://;
push (@dblink_UniProt,$ele);
$dblink_UniProt{$keggaccn}=$ele;
next;
}
}#end foreach #finished parsing all dblinks
}#end if @dblinks
if(@class)
{
foreach my $pathway (@class) {
$pathway=~s/^\s+|\s+$//;
my @arr = split (/\s+/,$pathway);
my $pathway_id = $arr[0];
shift @arr;
my $pathway_name = join(" ", at arr);
$pathway_name{$keggaccn}=$pathway_name;
$pathway_id{$keggaccn}=$pathway_id;
#print $pathway_id{$keggaccn},"\t",$pathway_name{$keggaccn},"\n";
}
}
my @ecnumbers=();
@ecnumbers = extractECnumbers(@description);
if(@ecnumbers)
{
if (@ecnumbers!=0)
{
foreach my $ecn (@ecnumbers)
{
$ecnumbers{$keggaccn}=$ecn;
}#end foreach
}
else {
#print "ECnumbers:\n";
}
}
# print $keggaccn,"\t",$dblink_UniProt{$keggaccn},"\t",$dblink_NCBIGENEID{$keggaccn},
# "\t",$dblink_NCBIGI{$keggaccn},"\t","ec:$ecnumbers{$keggaccn}","\t",
# "p1:$pathway_id{$keggaccn}","\t","p2:$pathway_name{$keggaccn}","\n";
#
$dbh->do("INSERT INTO kegginfo VALUES (?,?, ?, ?, ?, ?,?,?,?,?,?,?,?,?,?,?)",
undef,"NULL","NULL",$genomefile,$display_id,$keggaccn, at description,$ecnumbers{$keggaccn},
$pathway_id{$keggaccn},$pathway_name{$keggaccn},$crc64{$keggaccn},$dblink_KO{$keggaccn},
$dblink_Pfam{$keggaccn},$dblink_NCBIGI{$keggaccn},$dblink_NCBIGENEID{$keggaccn},
$dblink_UniProt{$keggaccn},$dblink_PROSITE{$keggaccn});
$dbh->do("INSERT INTO keggaasequence VALUES (?,?,?,?)",
undef,"",$keggaccn,$crc64{$keggaccn},$aaseq{$keggaccn});
$dbh->do("INSERT INTO keggntsequence VALUES (?,?,?)",
undef,"",$keggaccn,$ntseq{$keggaccn});
}
$t2=new Benchmark;
$time_used=timeThis($t1,$t2,"Finished parsing file $genomefile");
$dbh->do("INSERT INTO timestable VALUES (?,?,?)",
undef,"NULL",$genomefile,$time_used);
}
$dbh->do("CREATE INDEX keggIindex ON kegginfo (kg_id,keggaccn)");
print "Index created on kegginfo\n";
$dbh->do("CREATE INDEX keggaasequence1 ON keggaasequence (kg_id,keggaccn)");
print "Index created on keggaasequence\n";
$dbh->do("CREATE INDEX keggntsequence1 ON keggntsequence (kg_id,keggaccn)");
print "Index created on keggntsequence\n";
print"Updating the tables................\n";
$dbh->do("update kegginfo,keggaasequence set keggaasequence.kg_id=kegginfo.kg_id
where
kegginfo.keggaccn=keggaasequence.keggaccn");
print " keggaasequence kg_id\n";
$dbh->do("update kegginfo,keggntsequence set keggntsequence.kg_id=kegginfo.kg_id
where
kegginfo.keggaccn=keggntsequence.keggaccn");
print " keggaasequence kg_id\n";
sub extractECnumbers ($) {
#sample description lines
#riboflavin kinase / FMN adenylyltransferase [EC:2.7.1.26 2.7.7.2]
#ATP synthase F0 subunit c [EC:3.6.3.14]
my @desc=shift;
my $description = join ("", at desc);
my @ecnumbers=();
#print "parsing ec for $description..\n";
#check if EC number exists
if ($description=~/\[EC:/) {
my @array = split (/\[EC:/,$description);
$array[1]=~s/]//g;
shift @array; #remove the annotation , only EC numbers remain
foreach my $ele (@array) {
$ele=~s/^\s+|\s+$//g;
$ele= "EC:".$ele;
push (@ecnumbers,$ele);
}
return @ecnumbers;
}
else {
#return an empty value
return ;
}
}
sub checkKeggFormat ($) {
=head2
checkKeggFormat
make sure that the file is a valid KEGG file
function checks the first two lines,
1st must start with ENTRY
2nd must start with DEFINITION
returns 0 or 1
=cut
my $genomefile=shift;
open (TEST,$genomefile) || die "Cannot open file $genomefile for reading \n";
my $testline=<TEST>;
#print "$testline\n";
if ($testline=~/^ENTRY/) {
#continue
#$testline=<TEST>;#double check
#if ($testline=~/^NAME/) {
#this looks like a valid kegg file
return 1;
#}
#else {
# close TEST;
# return 0;
#}
}
else {
close TEST;
return 0;
}
}
sub timeThis ($$$)
{
my ($start,$end,$message) = @_;
my $td = timediff($end, $start);
my $t = timestr($td);
print "$message : ",$t,"\n";
my @array = split (/\s+/,$t);
#20 wallclock secs (14.23 usr + 0.84 sys = 15.07 CPU)
return $array[0]; #return the no. of seconds.
}
---------------------------------
Looking for earth-friendly autos?
Browse Top Cars by "Green Rating" at Yahoo! Autos' Green Center.
More information about the Bioperl-l
mailing list