[bioperl] Multiple sequences & ReadSeq
Will Fischer
wfischer@walrus.bio.indiana.edu
Fri, 14 Feb 1997 16:45:25 -0500
----------
X-Sun-Data-Type: text
X-Sun-Data-Description: text
X-Sun-Data-Name: text
X-Sun-Charset: us-ascii
X-Sun-Content-Lines: 36
cdagdigian@genetics.com (Chris Dagdigian) wrote:
> ...
> My preferences on solving the parsing problem are in order of attraction:
> ...
> 3. Writing sequences to temp files and opening a one-way pipe to readseq
>
> ... I'm willing do tackle
> this ASAP if people would give some guidance on where these temp files
> should be written ...
/tmp is cleaner for unix systems (IMHO), but using the current working
directory may be more portable (I guess).
> Maybe there should be a global $TEMP_DIR config option, and Seq.pm can
> write temp files to $TEMP_DIR/[PID]/xxx.tmp where [PID] is the process ID
> of the running script?
Good idea -- $TEMP_DIR could default as above
It should be noted that readseq currently does NOT handle NEXUS files
well (underscores and periods in taxon names cause the subsequent part
of the names to be inserted into the sequence). We've bugged Don about
this, but he is frying other fish at the moment :-).
I have attached my nexus-to-fasta/nexus-to-mega code (which appears to
work fine), as well as a set of format-conversion regular-expressions
written by Tongjia Yin (a former postdoc here). The latter could
(subject to his approval) at least serve as templates.
____________________________________________________________
Will Fischer
Biology Department wfischer@indiana.edu
Jordan Hall http://www.bio.indiana.edu/~wfischer
Indiana University Lab: 812-855-2549
Bloomington, Indiana 47405 USA FAX: 812-855-6705
----------
X-Sun-Data-Type: default-app
X-Sun-Data-Description: default
X-Sun-Data-Name: readnexus
X-Sun-Charset: us-ascii
X-Sun-Content-Lines: 170
#!/usr/local/bin/perl -I/will/DNAscripts/perl
#$Header$
eval "exec /usr/local/bin/perl -I/will/DNAscripts/perl -S $0 $*"
if $running_under_some_shell;
use File::Basename;
INFILES:
while ($_ = shift @ARGV) {
/^-/ && (
/^-f/ && ($format = 'fasta', next INFILES),
/^-m/ && ($format = 'mega', next INFILES),
/^-o/ && ($outfile = shift, next INFILES),
warn qq(unrecognized option "$file" (ignoring)\n),
);
$file = $_;
$format = 'fasta' unless $format;
# set $fasta or $mega or whatever
${$format}++;
$last_file = $file;
open (INFILE, "<$file") or die "couldn't open $file: $@\n";
@input = (<INFILE>);
close INFILE;
OUTFILE:{
# create new filename from input filename
#
# strip old suffix & directory, add new suffix
$rootfile = fileparse($file,'.\w+');
for ($format) {
/fasta/ && ($new_suffix = '.fasta');
/mega/ && ($new_suffix = '.meg');
}
$outfile = $rootfile . "$new_suffix";
my($i) = 0;
while (-e $outfile) {
warn "$outfile exists! Overwrite? [y/n]\n";
(($confirm = <STDIN>) =~ /[yY]/)
&& last || ($outfile = $rootfile . '.' . ++$i . "$new_suffix");
}
open(OUTFILE,">$outfile") &&
(warn "writing $format format to $outfile\n") ||
die "couldn't write $outfile",": $!\n" ;
select OUTFILE;
}
$_ = join ('',@input);
# remove all comment blocks
s/\A(#NEXUS.*?begin data.*?matrix\s*)//si && ($header = $1);
s/\[.*?\]//gs;
s/\s*;\s*end(block)*;\s*.*\Z//gsi;
$header =~ m/\[!\s*(.*?)\s*\]/ && ($title = $1);
$header =~ /MISSING=(.)/i && ($missing = $1);
$header =~ /GAP=(.)/i && ($gap = $1);
$header =~ /DATATYPE=(\w+)/i && ($datatype = $1);
$header =~ /MATCHCHAR=(.)/i && ($match = $1);
# format title
$title =~ s/\s+/ /gs;
my(@newtitle) = ();
while ($title =~ m/(.{0,70})\s+/gs) {
push(@newtitle,$1);
}
$title = join("\n\t",@newtitle);
$missing = '?' unless $missing;
$gap = '-' unless $gap;
if (!$match) {
if ($gap ne '.') {
$match = '.';
}
else {
$match = '-';
}
}
$megagap = '-';
$megamatch = '.';
# remove blank lines
tr/\n/\n/s;
while (m/^\s*('([^']*?)'|([^ \']\S+))\s+(.*?)\n/mg) {
my($seq_name) = ( $3 || $2 );
my($seq_fragment) = $4;
$seq_name =~ s/ /_/g;
# change spaces in name to underscores
$seq_name =~ s/ /_/g;
push( @{$seq_name}, $seq_fragment );
unless ($seqnames{$seq_name}) {
$seqnames{$seq_name} = 'seen_it';
push( @seqnames, $seq_name );
}
}
}
$progname = fileparse($0);
if ($mega) {
# PRINT MEGA HEADER
print "#mega\nTITLE: $last_file\n -- converted by $progname\n\t$title\n\n";
print qq(\tmatch = "$megamatch"; gap = "$megagap";\n\n);
}
# PRINT SEQUENCES
while ($seqname = shift @seqnames) {
print &print_seqs($format,$seqname,@{$seqname});
}
sub print_seqs {
my($format,$seqname,@seq_bits) = (@_);
my($sequence) = &simple_blocks(join('',@seq_bits));
for ($format) {
/fasta/ && do {
$formatted_sequence = sprintf(">$seqname\n$sequence\n");
};
/mega/ && do {
# change dots to dashes, and non-ACGTU bases to "?"
# ('cause mega won't deal w/ IUPAC codes)
#
my($transl) = "\$sequence =~ tr/${gap}${match}${missing}MRWSYKVHDBNmrwsykvhdbn/${megagap}${megamatch}???????????????????????/";
# escape dash (so it won't refer to char sets)
$transl =~ s/\-/\\-/;
eval $transl;
$formatted_sequence = sprintf("#$seqname\n$sequence\n");
};
}
return $formatted_sequence;
}
sub simple_blocks { # format a sequence nicely into blocks, preserving gaps
my($iupac_nt) = ".-~ACGTUMRWSYKVHDBNacgtumrwsykvhdbn" ;
my($iupac_aa) = ".-~ACDEFGHIKLMNPQRSTVWY\$\*acdefghiklmnpqrstvwyXx" ;
my($blocked_seq) = '';
my($format);
my(@blocks,@blocks_by_line,@lines) = ();
local($_,$num_blocks,$block_size,$to_number) = @_;
$block_size = 50 unless $block_size;
$num_blocks = 1 unless $num_blocks;
# trim out everything that isn't a valid IUPAC abbrev.
# (eval it so that the variable $iupac_nt is evaluated
# before the command is run)
#
eval "tr/-$iupac_aa$iupac_nt//dc";
$format = ('a' . $block_size) x int(1 + ( length($_) / $block_size ) );
# warn "$format\n";
@blocks = unpack($format,$_);
while(@blocks_by_line = splice(@blocks,0,$num_blocks)) {
push(@lines,join(' ',@blocks_by_line));
}
my($blocked_seq) = join("\n", @lines);
return($blocked_seq);
}
----------
X-Sun-Data-Type: default
X-Sun-Data-Description: default
X-Sun-Data-Name: tongjia-format-reading
X-Sun-Charset: us-ascii
X-Sun-Content-Lines: 115
# BEGIN code directly from tjyin - we'll need his permission
sub Format{
# various DB formats
local($_, $format) = @_;
@format = split(/:/, $format);
# see format file for details
undef @name; undef %seq; undef @piece;
if($format[0] ne "") {
# exclude stuff before seq cluster
($aa1, $aa2) = split(/$format[0]/i, $_, 2);
$_ = $aa2;
}
if($format[1] ne "") {
# exclude stuff after seq cluster
($aa3, $aa4) = split(/$format[1]/, $_, 2);
$_ = $aa3;
}
if($format[7] ne "") { s/($format[7])(\w)/$1!$2/g; }
# insert ! for certain type of formats
if($format[2] ne "") {
# This is for interleaved formats
s/$format[2]/!/g;
@block = split(/!/, $_);
# chop into blocks
for (@block) {
$count1++; $count2 = 0;
# count1 for checking if it is the first
s/$format[3]/!/g;
# count2 for ordering of the seq
@seq = split(/!/, $_);
# split into seq
for (@seq) {
if(!/^\[/ && length($_) > 10) {
# get rid of non-seq entries
s/$format[4]/!/;
if($count1 == 1) {
($name, $seq) = split(/!/,$_, 2);
# set the first batch
push(@name, $name);
push(@piece, $seq);
}
elsif($format[6] eq "noname") {
$piece[$count2] .= $_; } # run other (noname)
else {
# run other (with name)
($name, $seq) = split(/!/,$_, 2);
$piece[$count2] .= $seq;
}
$count2++;
$cc = $count2;
# it is necessary because there can be empty blocks
}
}
}
for($i = 0; $i < $cc; $i++) {
# generate the array
$name = $name[$i];
$seq = $piece[$i];
$seq =~ s/$format[5]//g;
# to get rid of non-seq char
$seq{$name} .= $seq;
}
undef $count1;
}
else {
s/$format[3]/~/g;
$pp= $_;
@seq = split(/~/, $_);
# split into seq
for (@seq) {
if(!/^\[/ && length($_) > 10) {
# get rid of non-seq entries
s/$format[4]/!/;
($name, $seq) = split(/!/, $_, 2);
$seq =~ s/$format[5]//g;
# to get rid of non-seq char
$name =~ /$format[6]\s*(\w+)/;
# extract names
$name = $1;
$seq{$name} = $seq;
# generate the array
push(@name, $name);
# get the name array
}
}
}
}
Sequence format conversion templates:
Name:cluster start:cluster end:block divider:record divider:info/seq:seq
clean:extract name/phylip's noname:p_split
IG/Stanford::::;:\cM[^\cM]+\cM|\n[^\n]+\n|\cM\n[^\cM\n]+\cM\n:[\cM\n1!]::
GenBank/GB::::\/\/:ORIGIN[^\cM\n]+(\cM\n|\cM|\n):[\d\cM\n\s!]:LOCUS:
NBRF::::\cM\s+\\cM|\n\n|\cM\n\cM\n:\cM[^\cM]+\cM|\n[^\n]+\n|\cM\n[^\cM\n]+\c
M\n:[\s\cM\n!]:\W\w\w;:
EMBL::::\/\/:SQ[^\cM\n]+(\cM\n|\cM|\n):[\cM\n\s!]:ID:
GCG::::\cM\s+\\cM|\n\n|\cM\n\cM\n:\cM[^\cM]+\cM|\n[^\n]+\n|\cM\n[^\cM\n]+\cM
\n:[\d\cM\n\s!]::
DNAStrider::::\/\/:;\cM\n|;\cM|;\n:[\cM\n!]:DNA sequence:
Fitch::::!:\cM\n|\cM|\n:[\cM\n\s!]::\cM\n|\cM|\n
Pearson/Fasta::::>:\cM\n|\cM|\n:[\cM\n!]::
Phylip3.2::::!:\s+:[\cM\n\s!]::\cM\n|\cM|\n
Phylip:::\cM\cM|\n\n|\cM\n\cM\n:\cM\n|\cM|\n:\s+:([\cM\n\s!\[\d*\]]|\*$):noname:PIR/CODATA::::\/\/\/:SEQUENCE[^\cM\n]+(\cM\n|\cM|\n)[^\cM\n]+(\cM\n|\cM|\n):
[\d\cM\n\s!]:ENTRY:
ASN.1:seq-set \{:::seq \{: iupac\wa ":[\cM\n\}\s,"]:title ":
NEXUS:Matrix:;:\cM\cM|\n\n|\cM\n\cM\n:\cM\n|\cM|\n:\s+:([\s!\[\d*\]]|\*\.+$|
\*$)::
NEXUS (non-interleave):Matrix:;::\cM\n|\cM|\n:\s+:([\s!\[\d*\]]|\*\.+$|\*$)::
test::::;:\cM\w*\cM|\n\w*\n|\cM\n\w*\cM\n:[\cM\n1!]::