[Bioperl-l] Re: Counting gaps in sequence data revisited.

Sun Oct 17 18:57:13 EDT 2004

I cleaned up the code a little. So, here it is for anyone interested:

#!usr/bin/perl
# By Michael S. Robeson II with the help from the folks at lernperl.org 
and bioperl.org
# 10/16/2004
# Last updated: 10/17/2004
# This script was made for the purpose of searching for indels (gaps) 
in aligned
# DNA or protein sequences that are in FASTA format. It tallys up all 
of the different
# size gaps within each sequence string. While it does this it counts 
the number of
# times each gap of a given size is represented in each sequence and at 
the same time
# reports all of the positions that that particular "gap-size" or indel 
appears.
# contact: robeson at colorado.edu if you have questions or comments

use warnings;
use strict;

#########################
# Introduction
#########################

print "\n\t**Welcome to Mike Robeson's Gap-Counting Script!**\n
	A - Just be sure that your sequence alignment file
	    is in FASTA format!
	B - Make sure there are no duplicate names within an individual file!
	C - Output file will be based on the name of the input file. It is 
named
	    by appending \'indel_list_\' to the name of your input file.\n\n";

###############################
# Open Sequence Data & OUTFILE
###############################

print "Enter in the name of the DNA sequence file:\n";
chomp (my $dna_seq = <STDIN>);

open(DNA_SEQ, $dna_seq)
	or die "Can't open file: $!\n";

open(OUTFILE, ">indel_list_"."$dna_seq")
	or die "Can't open outfile: $!\n";

#################################
# Read sequence data into a hash
#################################

my %sequences;

$/ = '>'; 	

print "\n***Discovered the following DNA sequences:***\n";

while ( <DNA_SEQ> ) {
	chomp;
	next unless s/^\s*(.+)//;
	my $name = $1;				
	s/\s//g;					
	$sequences{$name} = $_;		
	print "$name found!\n";		

	print"\n";
}
close DNA_SEQ;

######################################
# Iterate over gaps and write to file
######################################

foreach (keys %sequences) {			
	print "\t\t\>\>\>\>\>\> $_ \<\<\<\<\<\<\n";		
	print OUTFILE "\>\>\>\>\>\> $_ \<\<\<\<\<\<\n";	

	my $dna = $sequences{$_};		
	my %gap_data;					
	my %position;					

	while ($dna =~ /(\-+)/g) {
		my $gap_length = length $1;
		my $gap_pos = pos ($dna) - $gap_length + 1;
		$gap_data{$gap_length}++;
		$position{$gap_length} .= "$gap_pos"." \n";
	}

	my @indels = keys (%gap_data);			
	my @keys = sort { $a <=> $b} @indels;	

	foreach my $key (@keys) {
		print "Indel size:\t$key\tTimes found:\t$gap_data{$key}\n";
		print OUTFILE "Indel size:\t$key\tTimes found:\t$gap_data{$key}\n";
		print "Positions:\n";
		print OUTFILE "Positions:\n";
		print "$position{$key}";
		print OUTFILE "$position{$key}";
		print "\n";
		print OUTFILE "\n";
		}
}