[Bioperl-l] Split genomes
anna_sh
a.gautam168 at yahoo.co.in
Mon Jun 20 14:14:21 UTC 2011
Hey,
I want to split mitochondrial genome of certain mammals into ~250bp length
of fragments which i further use for running RepeatMasker on them. I have
posted below the script I use for the purpose, the issue is that it does not
split the genomes in the size i require. Can anybody please modify my script
so that it splits the genome in fragments of 200-275 bp. Thanks a lot !
Script (minus the shebang line) -->
# USE: file_split.fasta.pl fastafile
# USE: Splits a given fasta file into smaller files
# USE: -n splits into files of n numbered sequences
# USE: -f splits into n numbered files
# USE: -l splits into files of a total of l length
use strict;
use Getopt::Long;
my ($nseq, $nfiles, $seql);
GetOptions( "n:i" => \$nseq,
"f:i" => \$nfiles,
"l:i" => \$seql);
my $usage = "file_split.fasta.pl -n -f fasta file\n";
if (!($ARGV[0])) { die "$usage";} #checks to make sure that some form of
split variable has been given
my $infile = shift or die;
chomp $infile;
if ($infile) {
open (INITIAL, $infile) or die "cannot open $infile\n";
} #take initial_input line 1
my $fnum = 0;
if ($nseq && !($nfiles) && !($seql)){
my $seqcount = 0;
my $outfile = $infile.".".$fnum;
open (OUTFILE, ">$outfile");
while(<INITIAL>) {
if (m/^>/) {$seqcount++;}
if ($seqcount > $nseq)
{ $seqcount = 1;
$fnum++;
close OUTFILE;
$outfile = $infile.".".$fnum;
open (OUTFILE, ">$outfile");
}
print OUTFILE "$_";
}
close OUTFILE;
close INITIAL;
} #ends split on n-sequences loop
if ($nfiles && !($nseq) && !($seql)){
my $totalseqs = totalcount($infile);
my $numseqs = int ($totalseqs/$nfiles);
my $outfile = $infile.".".$fnum;
my $seqcount = 0;
open (OUTFILE, ">$outfile");
while(<INITIAL>) {
if (m/^>/) {$seqcount++;}
if (($seqcount > $numseqs) && ($fnum < ($nfiles-1)))
{ $seqcount = 1;
$fnum++;
close OUTFILE;
$outfile = $infile.".".$fnum;
open (OUTFILE, ">$outfile");
}
print OUTFILE "$_";
}
close OUTFILE;
close INITIAL;
} #ends split into $nfile files loop
if ($seql && !($nfiles) && !($nseq)){
my $totallength = getlength($infile);
my $filetotal = int ($totallength/$seql);
my $outfile = $infile.".".$fnum;
open (OUTFILE, ">$outfile");
my $sumlength = 0;
while(<INITIAL>) {
if (!(m/^>/)) {$sumlength += length ($_) }
if (($sumlength > $seql) && (m/^>/))
{ $sumlength = 0;
$fnum++;
close OUTFILE;
$outfile = $infile.".".$fnum;
open (OUTFILE, ">$outfile");
}
print OUTFILE "$_";
}
close OUTFILE;
close INITIAL;
}
$fnum++;
print "Split $infile into $fnum files\n";
sub totalcount {
open (FILE, "$_[0]") or die "cannot find $_[0]\n";
my $seqnum = 0;
while (<FILE>){
if (m/^>/) {$seqnum++;}
}
close FILE;
return $seqnum;
} #returns number of sequences in the file
sub getlength{
open (FILE, "$_[0]") or die "cannot find $_[0]\n";
my $totalbp = 0;
while (<FILE>){
if (!(m/^>/)){ $totalbp += length ($_);}
}
print "Total bp = $totalbp\n";
close FILE;
return $totalbp;
} #returns the total basepairs in the file.
--
View this message in context: http://old.nabble.com/Split-genomes-tp31886043p31886043.html
Sent from the Perl - Bioperl-L mailing list archive at Nabble.com.
More information about the Bioperl-l
mailing list