[bioperl] Multiple sequences & ReadSeq

Will Fischer wfischer@walrus.bio.indiana.edu
Fri, 14 Feb 1997 16:45:25 -0500


----------
X-Sun-Data-Type: text
X-Sun-Data-Description: text
X-Sun-Data-Name: text
X-Sun-Charset: us-ascii
X-Sun-Content-Lines: 36

cdagdigian@genetics.com (Chris Dagdigian) wrote:
> ...
> My preferences on solving the parsing problem are in order of attraction:
> ... 
>         3. Writing sequences to temp files and opening a one-way pipe to readseq
> 
> ... I'm willing do tackle
> this ASAP if people would give some guidance on where these temp files
> should be written ...

/tmp is cleaner for unix systems (IMHO), but using the current working
directory may be more portable (I guess).  

> Maybe there should be a global $TEMP_DIR config option, and Seq.pm can
> write temp files to $TEMP_DIR/[PID]/xxx.tmp where [PID] is the process ID
> of the running script?

Good idea -- $TEMP_DIR could default as above

It should be noted that readseq currently does NOT handle NEXUS files
well (underscores and periods in taxon names cause the subsequent part
of the names to be inserted into the sequence).  We've bugged Don about
this, but he is frying other fish at the moment :-).

I have attached my nexus-to-fasta/nexus-to-mega code (which appears to
work fine), as well as a set of format-conversion regular-expressions
written by Tongjia Yin (a former postdoc here).  The latter could
(subject to his approval) at least serve as templates.

____________________________________________________________
Will Fischer			

Biology Department             		wfischer@indiana.edu
Jordan Hall                   		http://www.bio.indiana.edu/~wfischer
Indiana University            		Lab:    812-855-2549
Bloomington, Indiana 47405 USA		FAX:    812-855-6705
----------
X-Sun-Data-Type: default-app
X-Sun-Data-Description: default
X-Sun-Data-Name: readnexus
X-Sun-Charset: us-ascii
X-Sun-Content-Lines: 170

#!/usr/local/bin/perl -I/will/DNAscripts/perl
#$Header$

eval "exec /usr/local/bin/perl -I/will/DNAscripts/perl -S $0 $*"
      if $running_under_some_shell;

use File::Basename;

INFILES:
while ($_ = shift @ARGV) {
	
	/^-/ && (
		/^-f/ && ($format = 'fasta',	next INFILES),
		/^-m/ && ($format = 'mega',	next INFILES),
		/^-o/ && ($outfile = shift,	next INFILES),
		warn qq(unrecognized option "$file" (ignoring)\n),
	);

	$file = $_;
	$format = 'fasta' unless $format;

	# set $fasta or $mega or whatever
	${$format}++;

	$last_file = $file;
	open (INFILE, "<$file") or die "couldn't open $file: $@\n";
	@input = (<INFILE>);
	close INFILE;

	OUTFILE:{
		# create new filename from input filename
		#
		# strip old suffix & directory, add new suffix
		$rootfile = fileparse($file,'.\w+');

		for ($format) {
			/fasta/	&& ($new_suffix = '.fasta');
			/mega/	&& ($new_suffix = '.meg');
		}
		$outfile = $rootfile . "$new_suffix";
    
		my($i) = 0;
		while (-e $outfile) {
			warn "$outfile exists!  Overwrite? [y/n]\n";
			(($confirm = <STDIN>) =~ /[yY]/) 
			 && last || ($outfile = $rootfile . '.' . ++$i . "$new_suffix");
		}
		open(OUTFILE,">$outfile")  &&
			(warn "writing $format format to $outfile\n") || 
			die "couldn't write $outfile",": $!\n" ;
		select OUTFILE;
	}
	$_ = join ('',@input);

	# remove all comment blocks
	s/\A(#NEXUS.*?begin data.*?matrix\s*)//si && ($header = $1);
	s/\[.*?\]//gs;
	s/\s*;\s*end(block)*;\s*.*\Z//gsi;

	$header =~ m/\[!\s*(.*?)\s*\]/ && ($title = $1);
	$header =~ /MISSING=(.)/i   	&& ($missing  = $1);
	$header =~ /GAP=(.)/i   	&& ($gap      = $1);
	$header =~ /DATATYPE=(\w+)/i	&& ($datatype = $1);
	$header =~ /MATCHCHAR=(.)/i 	&& ($match  = $1);

	# format title
	$title =~ s/\s+/ /gs;
	my(@newtitle) = ();
	while ($title =~ m/(.{0,70})\s+/gs) {
		push(@newtitle,$1);
	}
	$title = join("\n\t",@newtitle);

	$missing   = '?' unless $missing;
	$gap       = '-' unless $gap;
	if (!$match) {
		if ($gap ne '.') {
			$match = '.';
		}
		else {
			$match = '-';
		}
	}

	$megagap   = '-';
	$megamatch = '.';

	# remove blank lines
	tr/\n/\n/s;

	while (m/^\s*('([^']*?)'|([^ \']\S+))\s+(.*?)\n/mg) {
		my($seq_name) = ( $3 || $2 );
		my($seq_fragment) = $4;
		$seq_name =~ s/ /_/g;

		# change spaces in name to underscores
		$seq_name =~ s/ /_/g;

		push( @{$seq_name}, $seq_fragment );
		unless ($seqnames{$seq_name}) {
			$seqnames{$seq_name} = 'seen_it';
			push( @seqnames, $seq_name );
		}
	}
}


$progname = fileparse($0);
if ($mega) {
	# PRINT MEGA HEADER
	print "#mega\nTITLE: $last_file\n  -- converted by $progname\n\t$title\n\n";
	print qq(\tmatch = "$megamatch"; gap = "$megagap";\n\n);
}

# PRINT SEQUENCES
while ($seqname = shift @seqnames) {
	print &print_seqs($format,$seqname,@{$seqname});
}


sub print_seqs {
	my($format,$seqname,@seq_bits) = (@_);

	my($sequence) = &simple_blocks(join('',@seq_bits));
	for ($format) {
		/fasta/ && do {
			$formatted_sequence = sprintf(">$seqname\n$sequence\n");
		};
		/mega/ && do {

			# change dots to dashes, and non-ACGTU bases to "?"
			# ('cause mega won't deal w/ IUPAC codes)
			#
			my($transl) = "\$sequence =~ tr/${gap}${match}${missing}MRWSYKVHDBNmrwsykvhdbn/${megagap}${megamatch}???????????????????????/";
			# escape dash (so it won't refer to char sets)
			$transl =~ s/\-/\\-/;
			eval $transl;
			$formatted_sequence = sprintf("#$seqname\n$sequence\n");
		};
	}
	return $formatted_sequence;
}

sub simple_blocks { # format a sequence nicely into blocks, preserving gaps
	my($iupac_nt) = ".-~ACGTUMRWSYKVHDBNacgtumrwsykvhdbn" ;
	my($iupac_aa) = ".-~ACDEFGHIKLMNPQRSTVWY\$\*acdefghiklmnpqrstvwyXx" ;
	my($blocked_seq) = '';
	my($format);
	my(@blocks,@blocks_by_line,@lines) = ();
	local($_,$num_blocks,$block_size,$to_number) = @_;

	$block_size = 50	unless $block_size;
	$num_blocks = 1 	unless $num_blocks;
	
	# trim out everything that isn't a valid IUPAC abbrev.
	# (eval it so that the variable $iupac_nt is evaluated
	# before the command is run)
	#
	eval "tr/-$iupac_aa$iupac_nt//dc";

	$format = ('a' . $block_size) x int(1 + ( length($_) / $block_size ) );
	# warn "$format\n";
	@blocks = unpack($format,$_);
	while(@blocks_by_line = splice(@blocks,0,$num_blocks)) {
		push(@lines,join(' ',@blocks_by_line));
	}		
	my($blocked_seq) = join("\n", @lines);
	
	return($blocked_seq);
}
----------
X-Sun-Data-Type: default
X-Sun-Data-Description: default
X-Sun-Data-Name: tongjia-format-reading
X-Sun-Charset: us-ascii
X-Sun-Content-Lines: 115

# BEGIN code directly from tjyin - we'll need his permission

sub Format{
        # various DB formats
        local($_, $format) = @_;
        @format = split(/:/, $format);
# see format file for details
        undef @name; undef %seq; undef @piece;
        if($format[0] ne "") {
# exclude stuff before seq cluster
                ($aa1, $aa2) = split(/$format[0]/i, $_, 2);
                $_ = $aa2;
        }
        if($format[1] ne "") {
# exclude stuff after seq cluster
                ($aa3, $aa4) = split(/$format[1]/, $_, 2);
                $_ = $aa3;
        }
        if($format[7] ne "") {  s/($format[7])(\w)/$1!$2/g; } 
# insert ! for certain type of formats

        if($format[2] ne "") {
# This is for interleaved formats
                s/$format[2]/!/g;
                @block = split(/!/, $_);
# chop into blocks
                for (@block) {
                        $count1++; $count2 = 0;
# count1 for checking if it is the first
                        s/$format[3]/!/g;
# count2 for ordering of the seq
                        @seq = split(/!/, $_);
# split into seq
                        for (@seq) {
                                if(!/^\[/ && length($_) > 10) {
# get rid of non-seq entries
                                        s/$format[4]/!/;
                                        if($count1 == 1) {
                                                ($name, $seq) = split(/!/,$_, 2);   
# set the first batch
                                                push(@name, $name);
                                                push(@piece, $seq);
                                        }
                                        elsif($format[6] eq "noname") {
                                                $piece[$count2] .= $_;  }       # run other (noname)
                                        else {

# run other (with name)
                                                ($name, $seq) = split(/!/,$_, 2);
                                                $piece[$count2] .= $seq;
                                        }
                                        $count2++;
                                        $cc = $count2;
# it is necessary because there can be empty blocks
                                }
                        }
                }
                for($i = 0; $i < $cc; $i++) {
# generate the array
                        $name = $name[$i];
                        $seq = $piece[$i];
                        $seq =~ s/$format[5]//g;
# to get rid of non-seq char
                        $seq{$name} .= $seq;
                }
                undef $count1;
        }
        else {
                s/$format[3]/~/g;
                $pp= $_;
                @seq = split(/~/, $_);
# split into seq
                for (@seq) {
                        if(!/^\[/ && length($_) > 10) {
# get rid of non-seq entries
                                s/$format[4]/!/;
                                ($name, $seq) = split(/!/, $_, 2);
                                $seq =~ s/$format[5]//g;
# to get rid of non-seq char
                                $name =~ /$format[6]\s*(\w+)/;
# extract names
                                $name = $1;
                                $seq{$name} = $seq;
# generate the array
                                push(@name, $name);
# get the name array
                        }
                }
        }
}




Sequence format conversion templates:
Name:cluster start:cluster end:block divider:record divider:info/seq:seq
clean:extract name/phylip's noname:p_split
IG/Stanford::::;:\cM[^\cM]+\cM|\n[^\n]+\n|\cM\n[^\cM\n]+\cM\n:[\cM\n1!]::
GenBank/GB::::\/\/:ORIGIN[^\cM\n]+(\cM\n|\cM|\n):[\d\cM\n\s!]:LOCUS:
NBRF::::\cM\s+\\cM|\n\n|\cM\n\cM\n:\cM[^\cM]+\cM|\n[^\n]+\n|\cM\n[^\cM\n]+\c
M\n:[\s\cM\n!]:\W\w\w;:
EMBL::::\/\/:SQ[^\cM\n]+(\cM\n|\cM|\n):[\cM\n\s!]:ID:
GCG::::\cM\s+\\cM|\n\n|\cM\n\cM\n:\cM[^\cM]+\cM|\n[^\n]+\n|\cM\n[^\cM\n]+\cM
\n:[\d\cM\n\s!]::
DNAStrider::::\/\/:;\cM\n|;\cM|;\n:[\cM\n!]:DNA sequence:
Fitch::::!:\cM\n|\cM|\n:[\cM\n\s!]::\cM\n|\cM|\n
Pearson/Fasta::::>:\cM\n|\cM|\n:[\cM\n!]::
Phylip3.2::::!:\s+:[\cM\n\s!]::\cM\n|\cM|\n
Phylip:::\cM\cM|\n\n|\cM\n\cM\n:\cM\n|\cM|\n:\s+:([\cM\n\s!\[\d*\]]|\*$):noname:PIR/CODATA::::\/\/\/:SEQUENCE[^\cM\n]+(\cM\n|\cM|\n)[^\cM\n]+(\cM\n|\cM|\n):
[\d\cM\n\s!]:ENTRY:
ASN.1:seq-set \{:::seq \{:        iupac\wa ":[\cM\n\}\s,"]:title ":
NEXUS:Matrix:;:\cM\cM|\n\n|\cM\n\cM\n:\cM\n|\cM|\n:\s+:([\s!\[\d*\]]|\*\.+$|
\*$)::
NEXUS (non-interleave):Matrix:;::\cM\n|\cM|\n:\s+:([\s!\[\d*\]]|\*\.+$|\*$)::
test::::;:\cM\w*\cM|\n\w*\n|\cM\n\w*\cM\n:[\cM\n1!]::