[Bioperl-l] bioperl and kegg(out of memory problem )
Chris Fields
cjfields at uiuc.edu
Mon Apr 2 15:32:59 UTC 2007
Ambrose,
Data is persisting in your hashes (in particular DBLink objects),
which is eating away at your memory. If I take a sample KEGG gene
file and simply parse it:
while (my $seq = $io->next_seq) {
print $seq->accession,"\n";
}
there are no memory issues, but if I store the data in hashes
declared outside the loop:
my(%dblink_KO,%dblink_Pfam,%dblink_PROSITE,%dblink_NCBIGI,%
dblink_NCBIGENEID,%dblink_UniProt);
my(%pathway_name,%pathway_id,%ecnumbers,%crc64,%ntseq,%aaseq);
while (my $seq = $io->next_seq) {
# store Bio::Seq data in hashes
}
I see problems with only one genome file with KEGG records. You'll
definitely run into memory issues if you are parsing many genome
files, which you appear to be:
my(%dblink_KO,%dblink_Pfam,%dblink_PROSITE,%dblink_NCBIGI,%
dblink_NCBIGENEID,%dblink_UniProt);
my(%pathway_name,%pathway_id,%ecnumbers,%crc64,%ntseq,%aaseq);
for my $genomefile (@genomelist) {
while (my $seq = $io->next_seq) {
# store Bio::Seq data in hashes
}
}
Localizing the hashes to the genome or sequence loops should prevent
the memory problem.
Note that the DBLink Annotation objects are overloaded so they act
like a string ($ele =~ /^KO:/) but are actually
Bio::Annotation::DBLink objects, something we will likely get rid of
in the near future.
chris
On Apr 2, 2007, at 8:56 AM, Ambrose wrote:
>
>
> Hello ALL,
>
> I have the code below,which parses my kegg files.A host of the
> files are parsed and the information is inserted into my databases
> but unfortunate after the program runs for some hours it stops
> showing the message out of memory.I assume that this happens
> because the bioperl object is too big.Please just check the code below
>
> best regards Ambrose
>
>
> #!/usr/local/ActivePerl/bin/perl
> #
> #
>
> use strict;
> use Bio::SeqIO;
> use Bio::FASTASequence;
> use DBI;
> use Benchmark qw(:all) ;
>
> my($ko,$prosite,$ncbigi,$ncbigeneid,$pfam,$uniprot,$ecn1,
> $pathway_id1,$pathway_name1,$ec_num);
> my(%dblink_KO,%dblink_Pfam,%dblink_PROSITE,%dblink_NCBIGI,%
> dblink_NCBIGENEID,%dblink_UniProt);
> my(%pathway_name,%pathway_id,%ecnumbers,%crc64,%ntseq,%aaseq);
> my( @kg_id);
> my $db="gbdb";
> my $host="localhost";
> my $userid="root";
> my $passwd="ubuntu";
> my $connectionInfo="dbi:mysql:$db;"."mysql_socket=/var/run/mysqld/
> mysqld.sock";
> my ($t1,$t2);
> my $dbh = DBI->connect($connectionInfo,$userid,$passwd);
> my $time_used;
>
>
>
> eval { $dbh->do("DROP TABLE kegginfo") };
> print "Dropping kegginfo failed: $@\n" if $@;
> $dbh->do("CREATE TABLE kegginfo (kg_id BIGINT NOT NULL
> AUTO_INCREMENT,
> up_id INT UNSIGNED REFERENCES
> uniprotinfo(up_id),
>
> filename VARCHAR(50),
> kegg_id VARCHAR
> (50),
> keggaccn VARCHAR(50),
>
> description VARCHAR(250),
> ec_numbers VARCHAR(250),
> pathway_id VARCHAR(250),
> pathway_name VARCHAR
> (250),
> crc64 VARCHAR(50),
> ko_id VARCHAR(50),
> pfam_id VARCHAR(50),
> ncbigi_id VARCHAR(50),
> ncbigeneid_id VARCHAR(50),
> uniprot_id VARCHAR(50),
> prosite_id VARCHAR(50),
> PRIMARY KEY (kg_id)
> )");
>
>
> eval { $dbh->do("DROP TABLE keggntsequence") };
> print "Dropping keggntsequence failed: $@\n" if $@;
> $dbh->do("CREATE TABLE keggntsequence (kg_id BIGINT(15) UNSIGNED
> REFERENCES uniprotinfo(kg_id),
> keggaccn VARCHAR
> (50),
> nucleotidesequence text
> )");
>
> eval { $dbh->do("DROP TABLE keggaasequence") };
> print "Dropping keggaasequence failed: $@\n" if $@;
> $dbh->do("CREATE TABLE keggaasequence (kg_id BIGINT(15) UNSIGNED
> REFERENCES uniprotinfo(kg_id),
> keggaccn VARCHAR
> (50),
> crc64 VARCHAR(50),
> aminoacidsequence text
> )");
> eval { $dbh->do("DROP TABLE timestable") };
> print "Dropping timestable failed: $@\n" if $@;
> $dbh->do("CREATE TABLE timestable (aut_id BIGINT(15) UNSIGNED NOT
> NULL AUTO_INCREMENT,
> genome VARCHAR(100),
> totaltime_seconds int(100),
>
> PRIMARY KEY(aut_id))");
>
>
>
> open (LIST, "genomes.list") || die "Cannot open input kegg genomes
> file genomes.list\n $! \n";
> $t1=new Benchmark;
> my @genomelist = ();
> while (my $line=<LIST>) {
> #ignore comment lines
> if ($line !~ /^#/) {
> chomp $line;
>
> push (@genomelist, $line); #store the filename
> }
> }
>
> close LIST;
> my $count=0;
> foreach my $genomefile (@genomelist) {
>
> #in case the user fails to remove some strange files from
> #the genomes.list file.. check for the KEGG format
> my $check=checkKeggFormat($genomefile);
> if ($check==0) {
> #if file is not kegg, start with the next one...
> print "ERROR: $genomefile doesn't look like a KEGG file to
> me! \n";
> #<stdin>;
> next;
> }
> #print $genomefile,"\n";
> my $stream = Bio::SeqIO->new(-file => $genomefile, -format =>
> 'KEGG');
>
> while ( my $seq = $stream->next_seq() ) {
>
> my $primary_id = $seq->primary_id;
> my $display_id = $seq->display_id; #name
> my $keggaccn = $seq->accession; #accn
> my @description = $seq->annotation->get_Annotations
> ('description');
>
> my @dblinks = $seq->annotation->get_Annotations('dblink');
> my @orthologs = $seq->annotation->get_Annotations
> ('ortholog');
> my @orthologs = grep {$_->database eq 'KO'} $seq-
> >annotation->get_Annotations('dblink');
> my @class = $seq->annotation->get_Annotations
> ('pathway');
> $ntseq{$keggaccn} = $seq->seq;
> $aaseq{$keggaccn} = $seq->translate->seq;
> $aaseq{$keggaccn} =~s /\*$//;
> my $fasta = ">".$count."\n".$aaseq{$keggaccn};
> my $newseq = Bio::FASTASequence->new($fasta);
> $crc64{$keggaccn}=$newseq->getCrc64();
> #print $keggaccn,"crc64:$crc64{$keggaccn}\n";
>
> $count++;
> if ($keggaccn eq "") { print "PRIMARY KEY NOT FOUND no
> keggaccn\n";
> next;}
>
> if(@dblinks)
> {
> my @dblink_KO=();
> my @dblink_Pfam=();
> my @dblink_PROSITE=();
> my @dblink_NCBIGI=();
> my @dblink_NCBIGENEID=();
> my @dblink_UniProt=();
>
> foreach my $ele (@dblinks) {
> if ($ele =~ /^KO:/){
> $ele=~s/KO://;
> push (@dblink_KO,$ele);
> $dblink_KO{$keggaccn}=$ele;
> next;
> }
> #parse Pfam: dblink
> if ($ele =~ /^Pfam:/){
> $ele=~s/Pfam://;
> push (@dblink_Pfam,$ele);
> $dblink_Pfam{$keggaccn}=$ele;
> next;
> }
> #parse PROSITE: dblink
> if ($ele =~ /^PROSITE:/){
> $ele=~s/PROSITE://;
> push (@dblink_PROSITE,$ele);
> $dblink_PROSITE{$keggaccn}=$ele;
> next;
> }
> #parse NCBI-GI: dblink
> if ($ele =~ /^NCBI-GI:/){
> $ele=~s/NCBI-GI://;
> push (@dblink_NCBIGI,$ele);
> $dblink_NCBIGI{$keggaccn}=$ele;
> next;
> }
> #parse NCBI-GeneID: dblink
> if ($ele =~ /^NCBI-GeneID:/){
> $ele=~s/NCBI-GeneID://;
> push (@dblink_NCBIGENEID,$ele);
> $dblink_NCBIGENEID{$keggaccn}=$ele;
> next;
> }
> #parse UniProt: dblink
> if ($ele =~ /^UniProt:/){
> $ele=~s/UniProt://;
> push (@dblink_UniProt,$ele);
> $dblink_UniProt{$keggaccn}=$ele;
> next;
> }
>
> }#end foreach #finished parsing all dblinks
> }#end if @dblinks
> if(@class)
> {
> foreach my $pathway (@class) {
>
> $pathway=~s/^\s+|\s+$//;
> my @arr = split (/\s+/,$pathway);
> my $pathway_id = $arr[0];
> shift @arr;
> my $pathway_name = join(" ", at arr);
> $pathway_name{$keggaccn}=$pathway_name;
> $pathway_id{$keggaccn}=$pathway_id;
> #print $pathway_id{$keggaccn},"\t",$pathway_name
> {$keggaccn},"\n";
>
> }
>
> }
>
> my @ecnumbers=();
> @ecnumbers = extractECnumbers(@description);
> if(@ecnumbers)
> {
> if (@ecnumbers!=0)
> {
> foreach my $ecn (@ecnumbers)
> {
> $ecnumbers{$keggaccn}=$ecn;
> }#end foreach
> }
> else {
> #print "ECnumbers:\n";
> }
> }
>
>
> # print $keggaccn,"\t",$dblink_UniProt{$keggaccn},"\t",
> $dblink_NCBIGENEID{$keggaccn},
> # "\t",$dblink_NCBIGI{$keggaccn},"\t","ec:$ecnumbers
> {$keggaccn}","\t",
> # "p1:$pathway_id{$keggaccn}","\t","p2:
> $pathway_name{$keggaccn}","\n";
> #
> $dbh->do("INSERT INTO kegginfo VALUES
> (?,?, ?, ?, ?, ?,?,?,?,?,?,?,?,?,?,?)",
> undef,"NULL","NULL",$genomefile,$display_id,
> $keggaccn, at description,$ecnumbers{$keggaccn},
> $pathway_id{$keggaccn},$pathway_name{$keggaccn},
> $crc64{$keggaccn},$dblink_KO{$keggaccn},
> $dblink_Pfam{$keggaccn},$dblink_NCBIGI{$keggaccn},
> $dblink_NCBIGENEID{$keggaccn},
> $dblink_UniProt{$keggaccn},$dblink_PROSITE
> {$keggaccn});
>
>
> $dbh->do("INSERT INTO keggaasequence VALUES (?,?,?,?)",
> undef,"",$keggaccn,$crc64{$keggaccn},$aaseq{$keggaccn});
>
>
> $dbh->do("INSERT INTO keggntsequence VALUES (?,?,?)",
> undef,"",$keggaccn,$ntseq{$keggaccn});
>
>
> }
> $t2=new Benchmark;
> $time_used=timeThis($t1,$t2,"Finished parsing file $genomefile");
> $dbh->do("INSERT INTO timestable VALUES (?,?,?)",
> undef,"NULL",$genomefile,$time_used);
>
> }
>
>
> $dbh->do("CREATE INDEX keggIindex ON kegginfo (kg_id,keggaccn)");
> print "Index created on kegginfo\n";
>
> $dbh->do("CREATE INDEX keggaasequence1 ON keggaasequence
> (kg_id,keggaccn)");
> print "Index created on keggaasequence\n";
>
> $dbh->do("CREATE INDEX keggntsequence1 ON keggntsequence
> (kg_id,keggaccn)");
> print "Index created on keggntsequence\n";
>
>
> print"Updating the tables................\n";
>
>
> $dbh->do("update kegginfo,keggaasequence set
> keggaasequence.kg_id=kegginfo.kg_id
> where
> kegginfo.keggaccn=keggaasequence.keggaccn");
> print " keggaasequence kg_id\n";
>
> $dbh->do("update kegginfo,keggntsequence set
> keggntsequence.kg_id=kegginfo.kg_id
> where
> kegginfo.keggaccn=keggntsequence.keggaccn");
> print " keggaasequence kg_id\n";
>
>
>
> sub extractECnumbers ($) {
> #sample description lines
> #riboflavin kinase / FMN adenylyltransferase [EC:2.7.1.26
> 2.7.7.2]
> #ATP synthase F0 subunit c [EC:3.6.3.14]
>
> my @desc=shift;
> my $description = join ("", at desc);
> my @ecnumbers=();
> #print "parsing ec for $description..\n";
> #check if EC number exists
> if ($description=~/\[EC:/) {
>
> my @array = split (/\[EC:/,$description);
> $array[1]=~s/]//g;
> shift @array; #remove the annotation , only EC numbers remain
> foreach my $ele (@array) {
> $ele=~s/^\s+|\s+$//g;
> $ele= "EC:".$ele;
> push (@ecnumbers,$ele);
> }
> return @ecnumbers;
> }
> else {
> #return an empty value
> return ;
>
> }
>
> }
>
>
>
>
>
>
>
> sub checkKeggFormat ($) {
> =head2
>
> checkKeggFormat
>
> make sure that the file is a valid KEGG file
> function checks the first two lines,
> 1st must start with ENTRY
> 2nd must start with DEFINITION
>
> returns 0 or 1
>
> =cut
> my $genomefile=shift;
>
> open (TEST,$genomefile) || die "Cannot open file $genomefile
> for reading \n";
> my $testline=<TEST>;
> #print "$testline\n";
> if ($testline=~/^ENTRY/) {
> #continue
> #$testline=<TEST>;#double check
> #if ($testline=~/^NAME/) {
> #this looks like a valid kegg file
> return 1;
> #}
> #else {
> # close TEST;
> # return 0;
> #}
> }
> else {
> close TEST;
> return 0;
> }
>
> }
>
> sub timeThis ($$$)
> {
> my ($start,$end,$message) = @_;
> my $td = timediff($end, $start);
> my $t = timestr($td);
> print "$message : ",$t,"\n";
> my @array = split (/\s+/,$t);
> #20 wallclock secs (14.23 usr + 0.84 sys = 15.07 CPU)
> return $array[0]; #return the no. of seconds.
> }
>
>
>
>
> ---------------------------------
> Looking for earth-friendly autos?
> Browse Top Cars by "Green Rating" at Yahoo! Autos' Green Center.
> _______________________________________________
> Bioperl-l mailing list
> Bioperl-l at lists.open-bio.org
> http://lists.open-bio.org/mailman/listinfo/bioperl-l
Christopher Fields
Postdoctoral Researcher
Lab of Dr. Robert Switzer
Dept of Biochemistry
University of Illinois Urbana-Champaign
More information about the Bioperl-l
mailing list