[Bioperl-l] Fwd: how to parse maf file format

Tue Dec 18 22:08:51 UTC 2012

Hi,

I am writing a script to parse a multiple genome alignment file in maf
format, generated with mugsy alignment of e.coli genomes.  So far, my
script parses SNPs from synteny blocks conserved in all aligned strains,
and it excludes gaps, which is enough for a phylogenetic analyses.  I was
wondering how can I parse the remaining blocks that are not conserved in
all strains, to see what is conserved in n-1, n-2, etc. strains or unique
to each strain.  I guess this is not a BioPerl question, but it's a Perl
for biologists question so I was hoping to get some insight here.  If there
is a more appropriate forum, please let me know.

Below is my code.

many thanks!

galeb

#!/usr/local/bin/perl
use Modern::Perl '2013';
use autodie;
use List::MoreUtils qw/ each_arrayref /;

# gsa 18.12.2012
# parse mugsy multiple genome alignment for SNPs in synteny blocks
conserved in all aligned strains
=head
##maf version=1 scoring=mugsy
a score=7891 label=40 mult=4
s O55H7_RM12579.O55H7_RM12579        1596752 7262 + 5263980
CGGGATGCGGGGATGGGAATGCC-TGGTTGACGGGGTGGCG
s O55H7_CB9615.O55H7_CB9615        1604426 7262 + 5386352
CGGGATGCGGGGATGGGAATGCC-TGGTTGACGGGGTGGCGG-AT
s O157H7_Sakai.O157H7_Sakai        1787303 7068 + 5498450
CGGGATGCGGGGATGGGAATGCC-TGGTTGACGGGGTGGCGG-AT
s O157H7_EDL.O157H7_EDL933        1729749 7082 + 5528445
CGGGATGCGGGAATGGGAATGCCTTGGTTGACGGGGTGGCGGAAT

a score=6756 label=41 mult=4
s O55H7_RM12579.O55H7_RM12579        1986265 6749 + 5263980
CAGGAGGGGCATCAGCTCACACCGACAGCCCCTGCGTATGG
s O55H7_CB9615.O55H7_CB9615        1991733 6749 + 5386352
CAGGAGGGGCATCAGCTCACACCGACAGCCCCTGCGTATGGTTAC
s O157H7_Sakai.O157H7_Sakai        3940728 6751 - 5498450
CAGGAGGGGCATCAGCTCACACCGACAGCCCCTGCGTATGGTTAC
s O157H7_EDL.O157H7_EDL933        4260689 4042 - 5528445
---------------------------------------------
=cut

my $infile = shift or die "Usage: $0 <alignment_file.maf>\n";

my %snps;
my $strains = 0;
my @alignment;
my( $score, $blkLen, $mult );
my $total_snps;
my $syn_len;
my %lengths;

open my $fh, '<', $infile;

while( <$fh> ) {
    next if /^#/;
    chomp;

    if( /^a/ ) {
        ( $score, $blkLen, $mult ) = ( split )[1,2,3];
        $score =~ s/score\=(\d+)/$1/; # length of alignment block including
'-'
        $blkLen =~ s/label\=(\d+)/$1/; # alignment block number; numbers
ranked on alignment length
        $mult  =~ s/mult\=(\d+)/$1/;# number of strains aligned in block

        $strains = $mult if $mult > $strains; # total number of strains in
alignment
    }
    elsif( /^s/ ) { push @alignment, $_ }

    elsif( /^$/ || ! length $_ ) {
        my( @strNames, @starts, @strands, @dna_mtrx );
        # if sequence conserved in all strains
        if( $strains == @alignment ) {
            $syn_len += $score; # total aligned sequence in all strains
            for( @alignment ) {
                # name, align start, align length (w/o '-'), direction,
align sequence w/ '-'
                my( $name, $start, $len, $strand, $dna ) = ( split /\s+/ )[
1, 2, 3, 4, 6 ];
                #$name =~ s/.*\.(.*)/$1/; # remove duplicated strain name

                # strains are always in same order when all strains in
block.
                push @strNames, $name;
                push @starts, $start;
                push @strands, $strand;
                push @dna_mtrx, [ split '', $dna ];
                # total seqeunce in each strain w/o '-' that is conserved
in all strains
                $lengths{ $name } += $len;
            }

            my $ea = each_arrayref( @dna_mtrx );
            my %gaps;
            my $cnt;
            while( my( @bases ) = $ea->() ) {
                ++$cnt;
                my %temp;
                for( 0 .. $#bases ) { # store gaps if any
                    if( $bases[$_] eq '-' ) {
                        $gaps{$_}++; # key is number, corresponds to index
of other arrays
                    }
                }
                # skip gaps '-'
                unless( '-' ~~ @bases ) { $temp{ uc $_}++ for @bases } # if
snp then %temp will have > 1 key
                if( keys %temp > 1 ) { # if SNP exists, get base and
position for all strains in alignment
                    ++$total_snps;
                    my $pos;
                    for( 0 .. $#bases ) {
                        if( $strands[$_] eq '+' ) { $pos = $starts[$_] +
$cnt - ( $gaps{$_} // 0 ) } # genome positn
                        elsif( $strands[$_] eq '-' ) { $pos = $starts[$_] -
$cnt - ( $gaps{$_} // 0 ) }
                        # HoAoH
                        push @{ $snps{ $strNames[$_] } }, { $pos =>
$bases[$_] };
                    }
                }
            }
        }
        @alignment = ();
    }
}
close $fh;
#print Dumper( \%snps ); use Data::Dumper;
say "Sum length of synteny blocks conserved in all strains, including gaps:
$syn_len bp";
say "Length of conserved sequence for each strain, excluding gaps:";
for my $strain ( keys %lengths ) {
    say "$strain\t$lengths{ $strain } bp";
}

my $outfile = $infile;
$outfile =~ s/\.maf$/_snps.txt/;
open my $fh2, '>', $outfile;
say {$fh2} map{ $_ . "_base\t", $_ . "_pos\t" } keys %snps;
for my $snp ( 0 .. ( $total_snps - 1 ) ) {
    for my $strain ( keys %snps ){
        for my $href ( keys %{ $snps{ $strain }[ $snp ] } ) {
            print {$fh2} "$snps{ $strain }[ $snp ]->{ $href }\t$href\t";
            }
        }
    print {$fh2} "\n";
}