[Bioperl-l] Split genomes

anna_sh a.gautam168 at yahoo.co.in
Wed Jun 22 17:39:45 UTC 2011


Thanks for the script. It was just the perfect fix.



anna_sh wrote:
> 
> Hey,
> 
> I want to split mitochondrial genome of certain mammals into ~250MB length
> of fragments which i further use for running RepeatMasker on them. I have
> posted below the script I use for the purpose, the issue is that it does
> not split the genomes in the size i require. Can anybody please modify my
> script so that it splits the genome in fragments of 200-275MB. Thanks a
> lot !
> 
> 
> 
> 
> 
> Script (minus the shebang line) -->
> 
> 
> # USE: file_split.fasta.pl fastafile
> # USE: Splits a given fasta file into smaller files
> # USE: -n splits into files of n numbered sequences
> # USE: -f splits into n numbered files
> # USE: -l splits into files of a total of l length
> 
> use strict;
> use Getopt::Long;
> 
> my ($nseq, $nfiles, $seql);
> 
> GetOptions( "n:i" => \$nseq,
> 	    "f:i" => \$nfiles,
> 	    "l:i" => \$seql);
> 
> my $usage = "file_split.fasta.pl -n -f fasta file\n";
> 	    
> if (!($ARGV[0])) { die "$usage";} #checks to make sure that some form of
> split variable has been given
> 	    
> my $infile = shift or die;
> 
> chomp $infile;
> 
> 
> if ($infile) {
> 	open (INITIAL, $infile) or die "cannot open $infile\n";
> 		}  #take initial_input line 1
> 		
> my $fnum = 0;
> 		
> if ($nseq && !($nfiles) && !($seql)){
> 		my $seqcount = 0;	
> 		my $outfile = $infile.".".$fnum;
> 		open (OUTFILE, ">$outfile");
> 		
> 		while(<INITIAL>) {
> 				if (m/^>/) {$seqcount++;}
> 				if ($seqcount > $nseq)
> 						{	$seqcount = 1;
> 							$fnum++;
> 							close OUTFILE;
> 							$outfile = $infile.".".$fnum;
> 							open (OUTFILE, ">$outfile");
> 				}
> 				print OUTFILE "$_";
> 			}
> 		close OUTFILE;
> 		close INITIAL;		
> } #ends split on n-sequences loop
> 	
> if ($nfiles && !($nseq) && !($seql)){
> 	my $totalseqs = totalcount($infile);
> 	my $numseqs = int ($totalseqs/$nfiles);
> 	my $outfile = $infile.".".$fnum;
> 	my $seqcount = 0;	
> 	open (OUTFILE, ">$outfile");	
> 	
> 	while(<INITIAL>) {
> 			if (m/^>/) {$seqcount++;}
> 			if (($seqcount > $numseqs) && ($fnum < ($nfiles-1)))
> 					{	$seqcount = 1;
> 						$fnum++;
> 						close OUTFILE;
> 						$outfile = $infile.".".$fnum;
> 						open (OUTFILE, ">$outfile");
> 				}
> 				print OUTFILE "$_";
> 			}
> 		close OUTFILE;
> 		close INITIAL;
> }  #ends split into $nfile files loop
> 
> if ($seql && !($nfiles) && !($nseq)){
> 	my $totallength = getlength($infile);
> 	my $filetotal = int ($totallength/$seql);	
> 	
> 	my $outfile = $infile.".".$fnum;
> 	open (OUTFILE, ">$outfile");	
> 	my $sumlength = 0;
> 	
> 	while(<INITIAL>) {
> 
> 			if (!(m/^>/)) {$sumlength += length ($_) }
> 			if (($sumlength > $seql) && (m/^>/))
> 					{	$sumlength = 0;
> 						$fnum++;
> 						close OUTFILE;
> 						$outfile = $infile.".".$fnum;
> 						open (OUTFILE, ">$outfile");
> 				}
> 				print OUTFILE "$_";
> 			}
> 		close OUTFILE;
> 		close INITIAL;
> }
> $fnum++;
> print "Split $infile into $fnum files\n";	
> 
> sub totalcount {
> 	open (FILE, "$_[0]") or die "cannot find $_[0]\n";
> 	my $seqnum = 0;
> 	while (<FILE>){
> 		if (m/^>/) {$seqnum++;}
> 	}
> 	close FILE;
> 	return $seqnum;
> } #returns number of sequences in the file
> 
> sub getlength{
> 	open (FILE, "$_[0]") or die "cannot find $_[0]\n";
> 	my $totalbp = 0;
> 	while (<FILE>){
> 		if (!(m/^>/)){ $totalbp += length ($_);}
> 	}
> 	print "Total bp = $totalbp\n";
> 	close FILE;
> 	return $totalbp;	
> } #returns the total basepairs in the file.
> 

-- 
View this message in context: http://old.nabble.com/Split-genomes-tp31886043p31905239.html
Sent from the Perl - Bioperl-L mailing list archive at Nabble.com.




More information about the Bioperl-l mailing list