[Bioperl-l] Bio::DB::Fasta fails for files over 4GB
Charles Tilford
charles.tilford at bms.com
Mon Aug 7 18:30:03 UTC 2006
Chris Fields wrote:
> Dynamically determining the packing based on file size is probably the
> way to go; it would be nice to see how this affects speed.
>
> Chris
Ok, I seem to have a functioning patch. I was also concerned about
performance; I assume Lincoln's using a constant for the pack format
because it optimizes compilation of the _pack() and _unpack() methods.
So rather than make the format a variable, I made the methods themselves
variant. The code looks at the file(s) being indexed, and if any file
exceeds 4Gb it will use 64-bit packing (for both file offset and
sequence length - being paranoid on the later, in case we get Martian
genomes with chromosomes over 4Gb in length). I have not tested it for a
directory of multiple files, but it should still work. Two packing
formats are defined as constants, and the pack / unpack methods are
bifurcated to use one or the other. I do not have a good feeling for the
performance difference in calling a method directly or by de-referencing
a method reference, but I assume it is minuscule.
-s is used for the file size test - I assume that is nuclear-hard
portable across platforms?
I didn't figure out a good reason for the _pack/_unpack calls to remain
object methods, so they're not in the patch.
The patch below is against:
# $Id: Fasta.pm,v 1.44 2006/07/17 10:39:37 sendu Exp $
--- /net/thegeneral/home/tilfordc/Fasta.pm 2006-08-07
14:01:27.000000000 -0400
+++ /stf/biocgi/tilfordc/patch_lib/Bio/DB/Fasta.pm 2006-08-07
14:07:52.033844532 -0400
@@ -418,7 +418,8 @@
*ids = \&get_all_ids;
*get_seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
-use constant STRUCT =>'NNnnCa*';
+use constant STRUCT =>'NNnnCa*';
+use constant STRUCTBIG =>'QQnnCa*'; # 64-bit file offset and seq length
use constant DNA => 1;
use constant RNA => 2;
use constant PROTEIN => 3;
@@ -568,6 +569,7 @@
# get the most recent modification time of any of the contents
my $modtime = 0;
my %modtime;
+ $self->set_pack_method( @files );
foreach (@files) {
my $m = (stat($_))[9];
$modtime{$_} = $m;
@@ -612,6 +614,32 @@
return Bio::PrimarySeq::Fasta->new($self,$id);
}
+=head2 set_pack_method
+
+ Title : set_pack_method
+ Usage : $db->set_pack_method( @files )
+ Function: Determines whether data packing uses 32 or 64 bit integers
+ Returns :
+ Args : one or more file paths
+
+=cut
+
+sub set_pack_method {
+ my $self = shift;
+ # Find the maximum file size:
+ my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_;
+ my $fourGB = (2 ** 32) - 1;
+
+ if ($maxsize > $fourGB) {
+ # At least one file exceeds 4Gb - we will need to use 64 bit ints
+ $self->{packmeth} = \&_packBig;
+ $self->{unpackmeth} = \&_unpackBig;
+ } else {
+ $self->{packmeth} = \&_pack;
+ $self->{unpackmeth} = \&_unpack;
+ }
+}
+
=head2 index_file
Title : index_file
@@ -629,6 +657,7 @@
my $file = shift;
my $force_reindex = shift;
+ $self->set_pack_method( $file );
my $index = $self->index_name($file);
# if caller has requested reindexing, then unlink the index
unlink $index if $force_reindex;
@@ -716,9 +745,9 @@
if ($id) {
my $seqlength = $pos - $offset - length($_);
$seqlength -= $termination_length * $seq_lines;
- $offsets->{$id} = $self->_pack($offset,$seqlength,
- $linelength,$firstline,
- $type,$base);
+ $offsets->{$id} = &{$self->{packmeth}}($offset,$seqlength,
+ $linelength,$firstline,
+ $type,$base);
}
$id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $1;
($offset,$firstline,$linelength) = ($pos,length($_),0);
@@ -746,10 +775,10 @@
}
$seqlength -= $termination_length * $seq_lines;
};
- $offsets->{$id} = $self->_pack($offset,$seqlength,
- $linelength,$firstline,
- $type,$base);
- }
+ $offsets->{$id} = &{$self->{packmeth}}($offset,$seqlength,
+ $linelength,$firstline,
+ $type,$base);
+}
$offsets->{__termination_length} = $termination_length;
return \%offsets;
}
@@ -770,35 +799,35 @@
my $self = shift;
my $id = shift;
my $offset = $self->{offsets}{$id} or return;
- ($self->_unpack($offset))[0];
+ (&{$self->{unpackmeth}}($offset))[0];
}
sub length {
my $self = shift;
my $id = shift;
my $offset = $self->{offsets}{$id} or return;
- ($self->_unpack($offset))[1];
+ (&{$self->{unpackmeth}}($offset))[1];
}
sub linelen {
my $self = shift;
my $id = shift;
my $offset = $self->{offsets}{$id} or return;
- ($self->_unpack($offset))[2];
+ (&{$self->{unpackmeth}}($offset))[2];
}
sub headerlen {
my $self = shift;
my $id = shift;
my $offset = $self->{offsets}{$id} or return;
- ($self->_unpack($offset))[3];
+ (&{$self->{unpackmeth}}($offset))[3];
}
sub alphabet {
my $self = shift;
my $id = shift;
my $offset = $self->{offsets}{$id} or return;
- my $type = ($self->_unpack($offset))[4];
+ my $type = (&{$self->{unpackmeth}}($offset))[4];
return $type == DNA ? 'dna'
: $type == RNA ? 'rna'
: 'protein';
@@ -818,7 +847,7 @@
my $self = shift;
my $id = shift;
my $offset = $self->{offsets}{$id} or return;
- $self->fileno2path(($self->_unpack($offset))[5]);
+ $self->fileno2path((&{$self->{unpackmeth}}($offset))[5]);
}
sub fileno2path {
@@ -899,7 +928,7 @@
my $self = shift;
my $id = shift;
my ($offset,$seqlength,$linelength,$firstline,$type,$file)
- = $self->_unpack($self->{offsets}{$id}) or return;
+ = &{$self->{unpackmeth}}($self->{offsets}{$id}) or return;
$offset -= $firstline;
my $data;
my $fh = $self->fh($id) or return;
@@ -914,7 +943,7 @@
my $self = shift;
my $id = shift;
my $a = shift()-1;
- my ($offset,$seqlength,$linelength,$firstline,$type,$file) =
$self->_unpack($self->{offsets}{$id});
+ my ($offset,$seqlength,$linelength,$firstline,$type,$file) =
&{$self->{unpackmeth}}($self->{offsets}{$id});
$a = 0 if $a < 0;
$a = $seqlength-1 if $a >= $seqlength;
my $tl = $self->{offsets}{__termination_length};
@@ -940,15 +969,21 @@
}
sub _pack {
- shift;
pack STRUCT, at _;
}
+sub _packBig {
+ pack STRUCTBIG, at _;
+}
+
sub _unpack {
- shift;
unpack STRUCT,shift;
}
+sub _unpackBig {
+ unpack STRUCTBIG,shift;
+}
+
sub _type {
shift;
local $_ = shift;
>
> On Aug 7, 2006, at 11:05 AM, Charles Tilford wrote:
>
>> I just found out that Bio::DB::Fasta has an inherit 4GB file size limit
>> in it. This is due to how indexing information is stored. The module
>> pack()s information using this format:
>>
>> use constant STRUCT =>'NNnnCa*';
>>
>> ... where the first token is the file offset. N = 32-bit unsigned
>> integer, and rolls-over when the file position passes the 4GB mark,
>> resulting in garbage out for those entries. Changing the packing
>> format to:
>>
>> use constant STRUCT =>'QNnnCa*';
>>
>> ...solves the problem (Q = 64-bit unsigned int). We have several genomic
>> files (ensembl dumps) where this is an issue:
>>
>> -rw-rw-r-- 1 kirovs bioinfo 7.2G Jul 13 12:28
>> pan_troglodytes.genome.CHIMP1A.fa
>> -rw-rw-r-- 1 kirovs bioinfo 6.8G Jul 13 12:25
>> monodelphis_domestica.genome.BROADO3.fa
>> -rw-rw-r-- 1 kirovs bioinfo 5.0G Jul 13 12:26
>> mus_musculus.genome.NCBIM36.fa
>> -rw-rw-r-- 1 kirovs bioinfo 4.6G Aug 2 15:31
>> bos_taurus.genome.Btau2.fa
>> -rw-rw-r-- 1 kirovs bioinfo 4.1G Jul 13 12:22
>> danio_rerio.genome.ZFISH6.fa
>>
>> These are not really large genomes, but have a fair number of
>> unassembled (duplicitous) fragments in them, which bump up the file
>> size. Some fully assembled genomes will probably eventually top the 4GB
>> mark, anyway.
>>
>> Unfortunately, this raises a backward compatibility issue, since an
>> index packed with 'N' will fail when unpacked with 'Q'. Perhaps the
>> module could dynamically bifurcate the packing structure based on a file
>> size test?
>>
>> The second token is for the sequence length, I can't imagine a single
>> sequence exceeding 4Gb, so it's probably safe - yes? Should it also be Q
>> in the event that biology someday exceeds our current imagination?
>>
>> Thanks,
>> CAT
>>
>> --
>> Charles Tilford, Bioinformatics-Applied Genomics
>> Bristol-Myers Squibb PRI, Hopewell 3A039
>> P.O. Box 5400, Princeton, NJ 08543-5400, (609) 818-3213
>> charles.tilford at bms.com <mailto:charles.tilford at bms.com>
>>
>>
>> _______________________________________________
>> Bioperl-l mailing list
>> Bioperl-l at lists.open-bio.org <mailto:Bioperl-l at lists.open-bio.org>
>> http://lists.open-bio.org/mailman/listinfo/bioperl-l
>
> Christopher Fields
> Postdoctoral Researcher
> Lab of Dr. Robert Switzer
> Dept of Biochemistry
> University of Illinois Urbana-Champaign
>
>
>
--
Charles Tilford, Bioinformatics-Applied Genomics
Bristol-Myers Squibb PRI, Hopewell 3A039
P.O. Box 5400, Princeton, NJ 08543-5400, (609) 818-3213
charles.tilford at bms.com
More information about the Bioperl-l
mailing list