[Bioperl-l] Indexing nr database

Tue Sep 7 08:28:00 UTC 2010

By the following codes, I wanna index the 4G nr database, however, the index
file is > 1T and the job has been running for weeks and still hasn't
finished. Could anybody tell me how you accomplish the goal? Thanks in
advance.

    use strict;

     use Bio::DB::Flat::BinarySearch;

     (my $baseDir, my $dbName, my $seqFile, my $testId, my $testGi) = @ARGV;

     # use single quotes so you don't have to write

     # regular expressions like "gi\\|(\\d+)"

     #my $primary_pattern = '^>(\S+)';

     #if ($fullHeader == 1) {

        my $primary_pattern = '^>(.+)';

     #}

     my $string = "gi|41353971|emb|AL123456.2| Mycobacterium tuberculosis
H37Rv complete genome";
#$string =~ s/$primary_pattern/RRR/g;

     #print "$string\n";

     # one or more patterns stored in a hash:

     my $secondary_patterns = {GI => 'gi\|(\d+)'};

     my $db = Bio::DB::Flat::BinarySearch->new(

                           -directory          => $baseDir,

                           -dbname             => $dbName,

                           -write_flag         => 1,

                           -primary_pattern    => $primary_pattern,

                           -primary_namespace  => 'ACC',

                           -secondary_patterns => $secondary_patterns,

                           -verbose            => 1,

                           -format             => 'fasta'  );

     $db->build_index($seqFile);