[Biojava-l] Genbank file parser error

gang wu gwu at molbio.mgh.harvard.edu
Thu Jan 29 04:51:28 UTC 2009


Hi Everyone,

I have a piece of code that parses a GenBank file and retrieves gene 
sequences and related information. It works well with sequences from 
Arabidopsis thaliana, C. elegans, and Bos taurus, but it fails with Mus 
musculus chromosome 2. The contig that the code fails on is the largest 
one in my test. Contig NT_039207 has 116366104 bp, but the code reports 
that it is cut to 100000020 bp. That causes some gene coordinates to be 
out of range. The code is attached. Can anyone offer a suggestion?

The Mus musculus Genbank file can be downloaded at :
ftp://ftp.ncbi.nih.gov/genomes/M_musculus/CHR_02/mm_alt_chr2.gbk.gz

Thanks in advance

Gang
==========================================
/**
 * Small diagnostic program that reads a GenBank flat file with BioJava's
 * {@code RichSequence.IOTools.readGenbankDNA} and prints, for every sequence
 * in the file, a short summary followed by one tab-separated line per
 * feature whose type is "gene" (rank, identifier, alternative identifier,
 * start, end, length, strand).
 */
public class TestMus {
    /** File read when no path is supplied by the caller. */
    private static final String DEFAULT_GENBANK_PATH = "/tmp/mm_alt_chr2.gbk";

    /**
     * Parses the default GenBank file ({@code /tmp/mm_alt_chr2.gbk}) and
     * prints the summary and gene table to standard output.
     *
     * @throws FileNotFoundException  if the default file does not exist
     * @throws NoSuchElementException propagated from the sequence iterator
     * @throws BioException           propagated from the sequence iterator
     */
    public void testMusChr2() throws FileNotFoundException,
            NoSuchElementException, BioException {
        testMusChr2(DEFAULT_GENBANK_PATH);
    }

    /**
     * Parses the GenBank file at {@code fp} and prints the summary and gene
     * table to standard output. The reader opened on the file is always
     * closed before this method returns, whether or not parsing succeeded.
     *
     * @param fp path of the (uncompressed) GenBank file to read
     * @throws FileNotFoundException     if {@code fp} cannot be opened
     * @throws NoSuchElementException    propagated from the sequence iterator
     * @throws BioException              propagated from the sequence iterator
     * @throws IndexOutOfBoundsException if a gene feature's coordinates lie
     *                                   outside the length reported for the
     *                                   sequence that carries it
     */
    public void testMusChr2(String fp) throws FileNotFoundException,
            NoSuchElementException, BioException {
        System.out.println("File: " + fp);
        BufferedReader gReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(fp))));
        try {
            Namespace ns = (Namespace) RichObjectFactory.getDefaultNamespace();
            RichSequenceIterator seqI =
                    RichSequence.IOTools.readGenbankDNA(gReader, ns);
            while (seqI.hasNext()) {
                RichSequence seq = seqI.nextRichSequence();
                String organism = seq.getTaxon().getDisplayName();
                String accession = seq.getAccession();
                String identifier = seq.getIdentifier();
                int taxonID = seq.getTaxon().getNCBITaxID();
                String division = seq.getDivision();
                String seqVersion = "" + seq.getSeqVersion();
                int seqLength = seq.length();
                System.out.println("Organism: " + organism
                        + "\nAccession: " + accession
                        + "\nIdentifier: " + identifier
                        + "\nTaxonID: " + taxonID
                        + "\nDivision: " + division
                        + "\nSeqVersion: " + seqVersion
                        + "\nLength: " + seqLength);
                System.out.println("2041-2101: " + seq.subStr(2041, 2101));
                for (Iterator i = seq.features(); i.hasNext();) {
                    RichFeature f = (RichFeature) i.next();
                    // equalsIgnoreCase is locale-independent and null-safe,
                    // unlike toLowerCase().equals(...).
                    if (!"gene".equalsIgnoreCase(f.getType())) {
                        continue;
                    }
                    int rank = f.getRank();
                    int startPos = f.getLocation().getMin();
                    int endPos = f.getLocation().getMax();
                    int geneLen = endPos - startPos + 1;

                    // The original code extracted the gene's sequence with
                    // seq.subStr(startPos, endPos) and then discarded it; the
                    // only visible effect was a failure when the coordinates
                    // fell outside the parsed sequence. Check that condition
                    // directly so the failure names the offending feature and
                    // no multi-megabase string is built for nothing.
                    // NOTE(review): assumes subStr uses 1-based inclusive
                    // coordinates and throws IndexOutOfBoundsException when
                    // they are out of range - confirm against the BioJava API.
                    if (startPos < 1 || endPos > seqLength) {
                        throw new IndexOutOfBoundsException(
                                "Gene feature (rank " + rank + ") at "
                                + startPos + ".." + endPos
                                + " lies outside sequence " + accession
                                + ", whose parsed length is " + seqLength);
                    }

                    String strand = f.getStrand().getToken() + "";
                    Annotation ann = (Annotation) f.getAnnotation();

                    // Prefer the locus tag as primary identifier and fall
                    // back to the gene name; the gene name is also reported
                    // separately as the alternative identifier.
                    String alternativeIdentifiers = propertyOrEmpty(ann, "gene");
                    String geneIdentifier = ann.containsProperty("locus_tag")
                            ? propertyOrEmpty(ann, "locus_tag")
                            : alternativeIdentifiers;

                    System.out.println(rank + "\t" + geneIdentifier
                            + "\t" + alternativeIdentifiers + "\t"
                            + startPos + "\t" + endPos + "\t" + geneLen
                            + "\t" + strand);
                }
            }
        } finally {
            closeQuietly(gReader);
        }
    }

    /**
     * Returns the string form of the annotation property {@code key}, or the
     * empty string when the annotation does not contain that property.
     * Checking {@code containsProperty} first replaces the original pattern
     * of calling {@code getProperty} and swallowing NoSuchElementException.
     */
    private static String propertyOrEmpty(Annotation ann, String key) {
        if (!ann.containsProperty(key)) {
            return "";
        }
        return String.valueOf(ann.getProperty(key));
    }

    /**
     * Closes {@code reader}, reporting (rather than propagating) a failure so
     * that it cannot mask an exception already in flight from parsing and so
     * that the public method signatures need no additional checked exception.
     */
    private static void closeQuietly(BufferedReader reader) {
        try {
            reader.close();
        } catch (java.io.IOException e) {
            // Fully qualified so this class needs no additional import.
            System.err.println("Warning: could not close input: " + e);
        }
    }

    /**
     * Entry point. Uses the first command-line argument as the GenBank file
     * path when one is given, otherwise reads the default file.
     */
    public static void main(String [] args) throws Exception {
        TestMus tm = new TestMus();
        if (args.length > 0) {
            tm.testMusChr2(args[0]);
        } else {
            tm.testMusChr2();
        }
    }
}



More information about the Biojava-l mailing list