[Biojava-l] Reading frames and amino acids

facemann off2w0rk at yahoo.com
Fri Mar 5 22:09:18 EST 2004


Here is a small contribution.  I use it to find simple motifs. Feel free to edit or scrap.
 
/**
 *MotifLister.java
 *Andy Hammer
 *08 Aug 2003
 *Lists all instances of a motif in a specified (DNA/RNA/protein) FASTA file.
 *The motif can contain Ambiguity symbols
 *Lists the ORF title and position of motif
 *Outputs each hit and the total hit count to stdout.
 */
import java.io.*;
import java.util.*;
import java.util.regex.*;
import org.biojava.bio.*;
import org.biojava.bio.seq.*;
import org.biojava.bio.seq.io.*;
import org.biojava.bio.symbol.*;
/**
 * Command-line tool that scans every sequence of a FASTA file for a motif and
 * prints one line per hit, followed by the total number of hits.
 *
 * All work is done in the constructor; {@link #main(String[])} only checks the
 * argument count and constructs an instance.
 */
public class MotifLister{
  /** Motif parsed from the command line as a DNA, RNA or protein symbol list. */
  private SymbolList motif;
  /** Requested frame: 0 reports every hit, 1 to 3 report only hits starting in that frame. */
  private int frame;
  /** Number of hits reported so far. */
  private int count;
  /** Iterator over the sequences read from the input file. */
  private SequenceIterator si;

  /**
   * Searches <code>inputFile</code> for <code>target</code> and prints the hits to stdout.
   *
   * Each hit is printed as <code>name : [start,end]</code> with 1-based, inclusive
   * coordinates. Overlapping hits are reported because the search resumes one
   * position after the start of the previous hit.
   *
   * The process is terminated via <code>System.exit(0)</code> when the motif cannot be
   * built for the given type or when the frame is not an integer from 0 to 3.
   *
   * @param type      "dna" or "rna" (case-insensitive); any other value is treated as protein
   * @param inputFile path of the FASTA file to read
   * @param target    motif to search for
   * @param placement frame as a decimal string: "0" for all frames, "1" to "3" for one frame
   * @throws Exception if the file cannot be opened or read, or for any other failure
   *                   that is not handled with a message below
   */
  public MotifLister(String type, String inputFile, String target, String placement)throws Exception{
    System.out.println("MotifLister is searching file " + inputFile + " for the motif '" + target +
                        "' in frame " + placement + ".");
    try{
      if(type.equalsIgnoreCase("dna")){
        motif = DNATools.createDNA(target);
      }else if(type.equalsIgnoreCase("rna")){
        motif = RNATools.createRNA(target);
      }else{
        motif = ProteinTools.createProtein(target);
      }
    }
    catch(BioError e){
      System.out.println("Error!!  Data type must match type of motif.");
      System.out.println("Specifically, " + target + " is not " + type);
      System.exit(0);
    }
    Pattern p = Pattern.compile( MotifTools.createRegex(motif) );
    try{
      frame = Integer.parseInt(placement);
    }
    catch(NumberFormatException e){
      // A non-numeric frame is reported exactly like an out-of-range one.
      frame = -1;
    }
    if(frame < 0 || frame > 3){
      System.out.println("Only frames 0 through 3 are allowed.");
      System.out.println("frame zero searches all frames.");
      System.exit(0);
    }
    count = 0;
    //read the file
    BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile)));
    try{
      if(type.equalsIgnoreCase("dna")){
        si = SeqIOTools.readFastaDNA(input);
      }else if(type.equalsIgnoreCase("rna")){
        si = SeqIOTools.readFastaRNA(input);
      }else{
        si = SeqIOTools.readFastaProtein(input);
      }
      while (si.hasNext()){
        Sequence seq = si.nextSequence();
        String residues = seq.seqString();
        Matcher matcher = p.matcher(residues);
        int start = 0;
        // Matcher.find(int) throws IndexOutOfBoundsException for an index beyond the
        // input length, which a zero-length match at the very end would produce.
        while(start <= residues.length() && matcher.find(start)) {
          start = matcher.start();
          int end = matcher.end();
          // Zero-based offsets 0,3,6,... are frame 1; 1,4,7,... frame 2; 2,5,8,... frame 3.
          int result = (start % 3) + 1;
          if(result == frame || frame == 0){
            System.out.println(seq.getName() + " : " + "[" + (start + 1) + "," + (end) + "]");
            count++;
          }
          // Resume one position further so overlapping hits are found too.
          start++;
        }
      }
      System.out.println("Total Hits = " + count);
    }
    catch(BioException e){
      System.out.println(inputFile + " is not a " + type + " file.");
      System.out.println(e);
    }
    finally{
      input.close(); //close the file on success and on failure alike
    }
  }

  /**
   * Entry point. Expects four arguments: type, fastaFile, motif and frame.
   * Prints a usage message to stderr when fewer than four are given.
   *
   * @param args command-line arguments
   * @throws Exception whatever the constructor propagates
   */
  public static void main(String[] args)throws Exception{
    if (args.length < 4) {
      System.err.println(" Usage: >java -jar MotifLister.jar type fastaFile motif frame" +
                         "\n Ex: >java -jar MotifLister.jar dna eColi.fasta AAAAAAG 3 > output.txt" +
                         "\n would search for A AAA AAG in the third frame in dna file eColi.fasta" +
                         "\n and print the results to file output.txt." +
                         "\n 'type' can be dna, rna, or protein." +
                         "\n 'frame' can be integers 0 through 3." +
                         "\n 0 counts any instance of the motif." +
                         "\n 1, 2, 3 counts only instances of the motif in the specified frame." +
                         "\n Capture output with redirection operator '>'.");
    }else{
      // The constructor performs the whole search; the instance itself is not needed.
      new MotifLister(args[0], args[1], args[2], args[3]);
    }
  }
}


---------------------------------
Do you Yahoo!?
Yahoo! Search - Find what you’re looking for faster.


More information about the Biojava-l mailing list