[Bioperl-l] Split genomes

Mon Jun 20 14:14:21 UTC 2011

Hey,

I want to split mitochondrial genome of certain mammals into ~250bp length
of fragments which i further use for running RepeatMasker on them. I have
posted below the script I use for the purpose, the issue is that it does not
split the genomes in the size i require. Can anybody please modify my script
so that it splits the genome in fragments of 200-275 bp. Thanks a lot !

Script (minus the shebang line) -->

# USE: file_split.fasta.pl fastafile
# USE: Splits a given fasta file into smaller files
# USE: -n splits into files of n numbered sequences
# USE: -f splits into n numbered files
# USE: -l splits into files of a total of l length

use strict;
use Getopt::Long;

my ($nseq, $nfiles, $seql);

GetOptions( "n:i" => \$nseq,
	    "f:i" => \$nfiles,
	    "l:i" => \$seql);

my $usage = "file_split.fasta.pl -n -f fasta file\n";

if (!($ARGV[0])) { die "$usage";} #checks to make sure that some form of
split variable has been given

my $infile = shift or die;

chomp $infile;

if ($infile) {
	open (INITIAL, $infile) or die "cannot open $infile\n";
		}  #take initial_input line 1

my $fnum = 0;

if ($nseq && !($nfiles) && !($seql)){
		my $seqcount = 0;	
		my $outfile = $infile.".".$fnum;
		open (OUTFILE, ">$outfile");

		while(<INITIAL>) {
				if (m/^>/) {$seqcount++;}
				if ($seqcount > $nseq)
						{	$seqcount = 1;
							$fnum++;
							close OUTFILE;
							$outfile = $infile.".".$fnum;
							open (OUTFILE, ">$outfile");
				}
				print OUTFILE "$_";
			}
		close OUTFILE;
		close INITIAL;		
} #ends split on n-sequences loop

if ($nfiles && !($nseq) && !($seql)){
	my $totalseqs = totalcount($infile);
	my $numseqs = int ($totalseqs/$nfiles);
	my $outfile = $infile.".".$fnum;
	my $seqcount = 0;	
	open (OUTFILE, ">$outfile");	

	while(<INITIAL>) {
			if (m/^>/) {$seqcount++;}
			if (($seqcount > $numseqs) && ($fnum < ($nfiles-1)))
					{	$seqcount = 1;
						$fnum++;
						close OUTFILE;
						$outfile = $infile.".".$fnum;
						open (OUTFILE, ">$outfile");
				}
				print OUTFILE "$_";
			}
		close OUTFILE;
		close INITIAL;
}  #ends split into $nfile files loop

if ($seql && !($nfiles) && !($nseq)){
	my $totallength = getlength($infile);
	my $filetotal = int ($totallength/$seql);	

	my $outfile = $infile.".".$fnum;
	open (OUTFILE, ">$outfile");	
	my $sumlength = 0;

	while(<INITIAL>) {

			if (!(m/^>/)) {$sumlength += length ($_) }
			if (($sumlength > $seql) && (m/^>/))
					{	$sumlength = 0;
						$fnum++;
						close OUTFILE;
						$outfile = $infile.".".$fnum;
						open (OUTFILE, ">$outfile");
				}
				print OUTFILE "$_";
			}
		close OUTFILE;
		close INITIAL;
}
$fnum++;
print "Split $infile into $fnum files\n";	

sub totalcount {
	open (FILE, "$_[0]") or die "cannot find $_[0]\n";
	my $seqnum = 0;
	while (<FILE>){
		if (m/^>/) {$seqnum++;}
	}
	close FILE;
	return $seqnum;
} #returns number of sequences in the file

sub getlength{
	open (FILE, "$_[0]") or die "cannot find $_[0]\n";
	my $totalbp = 0;
	while (<FILE>){
		if (!(m/^>/)){ $totalbp += length ($_);}
	}
	print "Total bp = $totalbp\n";
	close FILE;
	return $totalbp;	
} #returns the total basepairs in the file.
-- 
View this message in context: http://old.nabble.com/Split-genomes-tp31886043p31886043.html
Sent from the Perl - Bioperl-L mailing list archive at Nabble.com.