[Bioperl-l] Fwd: how to parse maf file format
galeb abu-ali
abualiga2 at gmail.com
Tue Dec 18 22:08:51 UTC 2012
Hi,
I am writing a script to parse a multiple genome alignment file in maf
format, generated with mugsy alignment of e.coli genomes. So far, my
script parses SNPs from synteny blocks conserved in all aligned strains,
and it excludes gaps, which is enough for a phylogenetic analyses. I was
wondering how can I parse the remaining blocks that are not conserved in
all strains, to see what is conserved in n-1, n-2, etc. strains or unique
to each strain. I guess this is not a BioPerl question, but it's a Perl
for biologists question so I was hoping to get some insight here. If there
is a more appropriate forum, please let me know.
Below is my code.
many thanks!
galeb
#!/usr/local/bin/perl
use Modern::Perl '2013';
use autodie;
use List::MoreUtils qw/ each_arrayref /;
# gsa 18.12.2012
# parse mugsy multiple genome alignment for SNPs in synteny blocks
conserved in all aligned strains
=head
##maf version=1 scoring=mugsy
a score=7891 label=40 mult=4
s O55H7_RM12579.O55H7_RM12579 1596752 7262 + 5263980
CGGGATGCGGGGATGGGAATGCC-TGGTTGACGGGGTGGCG
s O55H7_CB9615.O55H7_CB9615 1604426 7262 + 5386352
CGGGATGCGGGGATGGGAATGCC-TGGTTGACGGGGTGGCGG-AT
s O157H7_Sakai.O157H7_Sakai 1787303 7068 + 5498450
CGGGATGCGGGGATGGGAATGCC-TGGTTGACGGGGTGGCGG-AT
s O157H7_EDL.O157H7_EDL933 1729749 7082 + 5528445
CGGGATGCGGGAATGGGAATGCCTTGGTTGACGGGGTGGCGGAAT
a score=6756 label=41 mult=4
s O55H7_RM12579.O55H7_RM12579 1986265 6749 + 5263980
CAGGAGGGGCATCAGCTCACACCGACAGCCCCTGCGTATGG
s O55H7_CB9615.O55H7_CB9615 1991733 6749 + 5386352
CAGGAGGGGCATCAGCTCACACCGACAGCCCCTGCGTATGGTTAC
s O157H7_Sakai.O157H7_Sakai 3940728 6751 - 5498450
CAGGAGGGGCATCAGCTCACACCGACAGCCCCTGCGTATGGTTAC
s O157H7_EDL.O157H7_EDL933 4260689 4042 - 5528445
---------------------------------------------
=cut
my $infile = shift or die "Usage: $0 <alignment_file.maf>\n";
my %snps;
my $strains = 0;
my @alignment;
my( $score, $blkLen, $mult );
my $total_snps;
my $syn_len;
my %lengths;
open my $fh, '<', $infile;
while( <$fh> ) {
next if /^#/;
chomp;
if( /^a/ ) {
( $score, $blkLen, $mult ) = ( split )[1,2,3];
$score =~ s/score\=(\d+)/$1/; # length of alignment block including
'-'
$blkLen =~ s/label\=(\d+)/$1/; # alignment block number; numbers
ranked on alignment length
$mult =~ s/mult\=(\d+)/$1/;# number of strains aligned in block
$strains = $mult if $mult > $strains; # total number of strains in
alignment
}
elsif( /^s/ ) { push @alignment, $_ }
elsif( /^$/ || ! length $_ ) {
my( @strNames, @starts, @strands, @dna_mtrx );
# if sequence conserved in all strains
if( $strains == @alignment ) {
$syn_len += $score; # total aligned sequence in all strains
for( @alignment ) {
# name, align start, align length (w/o '-'), direction,
align sequence w/ '-'
my( $name, $start, $len, $strand, $dna ) = ( split /\s+/ )[
1, 2, 3, 4, 6 ];
#$name =~ s/.*\.(.*)/$1/; # remove duplicated strain name
# strains are always in same order when all strains in
block.
push @strNames, $name;
push @starts, $start;
push @strands, $strand;
push @dna_mtrx, [ split '', $dna ];
# total seqeunce in each strain w/o '-' that is conserved
in all strains
$lengths{ $name } += $len;
}
my $ea = each_arrayref( @dna_mtrx );
my %gaps;
my $cnt;
while( my( @bases ) = $ea->() ) {
++$cnt;
my %temp;
for( 0 .. $#bases ) { # store gaps if any
if( $bases[$_] eq '-' ) {
$gaps{$_}++; # key is number, corresponds to index
of other arrays
}
}
# skip gaps '-'
unless( '-' ~~ @bases ) { $temp{ uc $_}++ for @bases } # if
snp then %temp will have > 1 key
if( keys %temp > 1 ) { # if SNP exists, get base and
position for all strains in alignment
++$total_snps;
my $pos;
for( 0 .. $#bases ) {
if( $strands[$_] eq '+' ) { $pos = $starts[$_] +
$cnt - ( $gaps{$_} // 0 ) } # genome positn
elsif( $strands[$_] eq '-' ) { $pos = $starts[$_] -
$cnt - ( $gaps{$_} // 0 ) }
# HoAoH
push @{ $snps{ $strNames[$_] } }, { $pos =>
$bases[$_] };
}
}
}
}
@alignment = ();
}
}
close $fh;
#print Dumper( \%snps ); use Data::Dumper;
say "Sum length of synteny blocks conserved in all strains, including gaps:
$syn_len bp";
say "Length of conserved sequence for each strain, excluding gaps:";
for my $strain ( keys %lengths ) {
say "$strain\t$lengths{ $strain } bp";
}
my $outfile = $infile;
$outfile =~ s/\.maf$/_snps.txt/;
open my $fh2, '>', $outfile;
say {$fh2} map{ $_ . "_base\t", $_ . "_pos\t" } keys %snps;
for my $snp ( 0 .. ( $total_snps - 1 ) ) {
for my $strain ( keys %snps ){
for my $href ( keys %{ $snps{ $strain }[ $snp ] } ) {
print {$fh2} "$snps{ $strain }[ $snp ]->{ $href }\t$href\t";
}
}
print {$fh2} "\n";
}
More information about the Bioperl-l
mailing list