[Bioperl-l] Bio::Assembly::IO::ace with CAP3 ace files

Marc Logghe Marc.Logghe@devgen.com
Mon, 23 Dec 2002 14:53:38 +0100


Hi,
I had to hack into Bio::Assembly::IO::ace a little to make it work with ace
files generated by CAP3 and to allow handling of multiple assemblies per
file.
The diff is included in case somebody would need it also (revision 1.1 is
the initial commit on bioperl-live). I hope I did not break anything; did
not test it with phrap ace files.
Great module, I especially like the change_coord() method to change between
the 4 coordinate systems. Really cool !!! Thank you Robson F. de Souza !
Cheers,
Marc


===================================================================
RCS file: /usr/local/cvs/scripts/ace.pm,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- scripts/ace.pm	2002/12/23 06:12:19	1.1
+++ scripts/ace.pm	2002/12/23 18:52:48	1.3
@@ -1,4 +1,4 @@
-# $Id: ace.pm,v 1.1 2002/12/23 06:12:19 marcl Exp $
+# $Id: ace.pm,v 1.3 2002/12/23 18:52:48 marcl Exp $
 #
 ## BioPerl module for Bio::Assembly::IO::ace
 #
@@ -118,24 +118,26 @@
 sub next_assembly {
     my $self = shift; # Object reference
     local $/="\n";
+    my $AS_flag;
 
     # Resetting assembly data structure
     my $assembly = Bio::Assembly::Scaffold->new(-source=>'phrap');
-
     # Looping over all ACE file lines
     my ($contigOBJ,$read_name);
     my $read_data = {}; # Temporary holder for read data
+    READLINE:
     while ($_ = $self->_readline) { # By now, ACE files hold a single
assembly
 	chomp;
 
 	# Loading assembly information (ASsembly field)
-#	(/^AS (\d+) (\d+)/) && do {
+	(/^AS (\d+) (\d+)/) && do {
+            last READLINE if $AS_flag++; 
 #	    $assembly->_set_nof_contigs($1);
 #	    $assembly->_set_nof_sequences_in_contigs($2);
-#	};
+	};
 
 	# Loading contig sequence (COntig sequence field)
-	(/^CO Contig(\d+) (\d+) (\d+) (\d+) (\w+)/) && do { # New contig
found!
+	(/^CO (.*Contig\d+) (\d+) (\d+) (\d+) (\w+)/) && do { # New contig
found!
 	    my $contigID = $1;
 	    $contigOBJ = Bio::Assembly::Contig->new(-source=>'phrap',
-id=>$contigID);
 #	    $contigOBJ->set_nof_bases($2); # Contig length in base pairs
@@ -197,7 +199,7 @@
 	};
 
 	# Loading read info... (Assembled From field)
-	/^AF (\S+) (C|U) (-*\d+)/ && do {
+	/^AF (\S+).* (C|U) (-*\d+)$/ && do {
 	    $read_name = $1; my $ori = $2;
 	    $read_data->{$read_name}{'padded_start'} = $3; # aligned start
 	    $ori = ( $ori eq 'U' ? 1 : -1);
@@ -212,7 +214,7 @@
 #	};
 
 	# Loading reads... (ReaD sequence field
-	/^RD (\S+) (-*\d+) (\d+) (\d+)/ && do {
+	/^RD (\S+).* (-*\d+) (\d+) (\d+)$/ && do {
 	    $read_name = $1;
 	    $read_data->{$read_name}{'length'} = $2; #
number_of_padded_bases
 	    $read_data->{$read_name}{'contig'} = $contigOBJ;
@@ -243,7 +245,6 @@
 						      );
 	    $contigOBJ->set_seq_coord($coord,$read);
 	};
-
 	# Loading read trimming and alignment ranges...
 	/^QA (-?\d+) (-?\d+) (-?\d+) (-?\d+)/ && do {
 	    my $qual_start  = $1; my $qual_end  = $2;
@@ -357,6 +358,8 @@
 	};
 
     } # while ($_ = $self->_readline)
+    
+    return undef unless ($assembly->all_contigs); # otherwise endless loop
 
     $assembly->update_seq_list();
     return $assembly;