[Biopython-dev] "Online" tests, was [Bug 1972]

Mon May 15 16:22:30 UTC 2006

On Fri, 2006-03-31 at 21:41 +0100, Peter (BioPython Dev) wrote:
> Bill Barnard wrote:
> > I've made a first cut unit test, tentatively named
> > test_Parsers_for_newest_formats, ...
> 
> Sounds good to me.  But not a very snappy name - how about something 
> shorter like test_OnlineFormats.py instead?

Works for me.

> I think the Blast test should actually submit a short protein/nucleotide 
> sequence known to be in the online database.  Maybe do some basic sanity 
> testing like check it returns at least N results and the best hit is at 
> least a certain score.

I have one such test working.

> 
> >>In some cases (e.g. GenBank, Fasta) once the sample file is downloaded 
> >>there are multiple parsers to be checked (e.g. record and feature parsers).

I have tests using EUtils that check the record and feature parsers for
GenBank and Fasta files.

> I'll volunteer to add cases for GenBank, Fasta and GEO files.

I've not yet looked at GEO files, so there's that yet to do. I expect to
have some free time soon so I may look at the GEO and other parsers at
that time. I'd certainly be interested in any feedback you have on these
tests, or to see additional test cases. Finding the parsers to test and
implementing the tests seems pretty straightforward.

I've attached the test code to this email, and added a .txt file
extension just in case the listserv won't allow attaching a code file.

Bill

p.s. I'm still looking for a more interesting project I can contribute
to Biopython. Doing some of the tests and such is useful to me in
learning my way around the codebase; I hope to find something requiring
a bit more creativity. If anyone has any suggestions about areas that
need some attention I'll happily consider them. I am considering writing
some code for HMMs that would build on some prototype code I wrote.
-------------- next part --------------
"""Test to make sure all parsers retrieve and read current
data formats.
"""
import requires_internet

import sys
import unittest

from Bio import ParserSupport
# ExpasyTest
from Bio import Prosite
from Bio.Prosite import Prodoc
from Bio.SwissProt import SProt
# PubMedTest
from Bio import PubMed, Medline
# EUtilsTest
from Bio import EUtils
from Bio.EUtils import DBIdsClient
from Bio import GenBank
from Bio import Fasta
# BlastTest
try:
    import cStringIO as StringIO
except ImportError:
    import StringIO
from Bio.Blast import NCBIWWW, NCBIXML

def run_tests(argv):
    test_suite = testing_suite()
    runner = unittest.TextTestRunner(sys.stdout, verbosity = 2)
    runner.run(test_suite)

def testing_suite():
    """Generate the suite of tests.
    """
    test_suite = unittest.TestSuite()

    test_loader = unittest.TestLoader()
    test_loader.testMethodPrefix = 't_'
    tests = [ExpasyTest, PubMedTest, EUtilsTest, BlastTest]

    for test in tests:
        cur_suite = test_loader.loadTestsFromTestCase(test)
        test_suite.addTest(cur_suite)

    return test_suite

class ExpasyTest(unittest.TestCase):
    """Test that parsers can read the current Expasy database formats
    """
    def setUp(self):
        pass
    def t_parse_prosite_record(self):
        """Retrieve a Prosite record and parse it
        """
        prosite_dict = Prosite.ExPASyDictionary(parser=Prosite.RecordParser())
        accession = 'PS00159'
        entry = prosite_dict[accession]
        self.assertEqual(entry.accession, accession)
    def t_parse_prodoc_record(self):
        """Retrieve a Prodoc record and parse it
        """
        prodoc_dict = Prodoc.ExPASyDictionary(parser=Prodoc.RecordParser())
        accession = 'PDOC00933'
        entry = prodoc_dict[accession]
        self.assertEqual(entry.accession, accession)
    def t_parse_sprot_record(self):
        """Retrieve a SwissProt record and parse it into Record format
        """
        sprot_record_dict = SProt.ExPASyDictionary(parser=SProt.RecordParser())
        accession = 'Q5TYW8'
        entry = sprot_record_dict[accession]
        self.failUnless(accession in entry.accessions)
    def t_parse_sprot_seq(self):
        """Retrieve a SwissProt record and parse it into Sequence format
        """
        sprot_seq_dict = SProt.ExPASyDictionary(parser=SProt.SequenceParser())
        accession = 'Q5TYW8'
        entry = sprot_seq_dict[accession]
        self.assertEqual(entry.id, accession)

class PubMedTest(unittest.TestCase):
    """Test that Medline parsers can read the current PubMed format
    """
    def setUp(self):
        pass
    def t_parse_pubmed_record(self):
        """Retrieve a PubMed record and parse it
        """
        pubmed_dict = PubMed.Dictionary(parser=Medline.RecordParser())
        pubmed_id = '3136164'
        entry = pubmed_dict[pubmed_id]
        self.assertEqual(entry.pubmed_id, pubmed_id)

class EUtilsTest(unittest.TestCase):
    """Test that GenBank, Fasta parsers can read  EUtils retrieved db formats
    """
    # Database primary IDs

    # http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi?
    # lists all E-Utility database names. Primary IDs, which are
    # always an integer, are listed below for a few databases:

    # Entrez Database   Primary ID      E-Utility Database Name
    # 3D Domains        3D SDI          domains
    # Domains           PSSM-ID         cdd
    # Genome            Genome ID       genome
    # Nucleotide        GI number       nucleotide
    # OMIM              MIM number      omim
    # PopSet            Popset ID       popset
    # Protein           GI number       protein
    # ProbeSet          GEO ID          geo
    # PubMed            PMID            pubmed
    # Structure         MMDB ID         structure
    # SNP               SNP ID          snp
    # Taxonomy          TAXID           taxonomy
    # UniGene           UniGene ID      unigene
    # UniSTS            UniSTS ID       unists

    # see Bio.EUtils.Config.py for supported EUtils databases

    def setUp(self):
        self.client = DBIdsClient.DBIdsClient()
    def t_parse_nucleotide_gb(self):
        """Use EUtils to retrieve and parse an NCBI nucleotide GenBank record
        """
        db = "nucleotide"
        gi = "57165207"
        result = self.client.search(gi, db, retmax=1)
        sfp = result[0].efetch(retmode="text", rettype="gb", \
                               seq_start=5458145, seq_stop=5458942)
        text = sfp.readlines()
        sfp.close()
        self.assertEqual(result.dbids.db, db) # sanity check, not a parser test
        locus_line = text[0]
        # now parse the text with GenBank parsers
        parser_input = StringIO.StringIO(''.join(text))
        # RecordParser
        parser = GenBank.RecordParser()
        record = parser.parse(parser_input)
        parser_input.reset()
        self.assertEqual(record.gi, gi)
        # split locus line to eliminate whitespace differences
        self.assertEqual(record._locus_line().split(), locus_line.split())
        # FeatureParser
        parser = GenBank.FeatureParser()
        record = parser.parse(parser_input)
        parser_input.reset()
        self.assertEqual(record.annotations['gi'], gi)
    def t_parse_protein_fasta(self):
        """Use EUtils to retrieve and parse an NCBI protein Fasta record
        """
        db = "protein"
        gi = "21220767"
        result = self.client.search(gi, db, retmax=1)
        sfp = result[0].efetch(retmode="text", rettype="fasta")
        text = sfp.readlines()
        sfp.close()
        self.assertEqual(result.dbids.db, db) # sanity check, not a parser test
        # now parse the text with Fasta parsers
        parser_input = StringIO.StringIO(''.join(text))
        # RecordParser
        parser = Fasta.RecordParser()
        record = parser.parse(parser_input)
        parser_input.reset()
        self.assert_(gi in record.title.split('|'))
        self.assertEqual(record.title, text[0][1:-1]) # strip > and \n from text[0]
        # SequenceParser
        parser = Fasta.SequenceParser()
        record = parser.parse(parser_input)
        parser_input.reset()
        self.assert_(gi in record.description.split('|'))
        self.assertEqual(record.description, text[0][1:-1]) # strip > and \n from text[0]

class BlastTest(unittest.TestCase):
    """Test that the Blast XML parser can read the current NCBI formats
    """
    def setUp(self):
        pass
    def t_parse_blast_xml(self):
        """Use NCBIWWW to retrieve Blast results and use NCBIXML to parse it
        """
        query = '''>sp|P09405|NUCL_MOUSE Nucleolin (Protein C23) - Mus musculus (Mouse).
VKLAKAGKTHGEAKKMAPPPKEVEEDSEDEEMSEDEDDSSGEEEVVIPQKKGKKATTTPA
KKVVVSQTKKAAVPTPAKKAAVTPGKKAVATPAKKNITPAKVIPTPGKKGAAQAKALVPT'''
        result_handle = NCBIWWW.qblast('blastp', 'swissprot', \
                                       query, expect=0.0001, format_type="XML")
        blast_results = result_handle.read()
        result_handle.close()
        blast_out = StringIO.StringIO(blast_results)
        parser = NCBIXML.BlastParser()
        b_record = parser.parse(blast_out)
        ### write output file for testing purposes ~35 kB
##        import os
##        save_fname = os.path.expanduser('~/tmp/blast_out.xml')
##        save_file = open(save_fname, 'w')
##        save_file.write(blast_results)
##        save_file.close()
        ###
        # When I ran this on 4 May 2006 I got max_score of 601 & 21 hits
        expected_score_cutoff = 600
        expected_min_hits = 20
        max_score = 0.0
        for alignment in b_record.alignments:
            for hsp in alignment.hsps:
                if hsp.score > max_score: max_score = hsp.score

        msg = 'max score (%g) < expected_score_cutoff(%g)' % \
              (max_score, expected_score_cutoff)
        self.assert_(max_score >= expected_score_cutoff, msg)

        msg = 'N(%d) < expected_min_hits(%d)' % \
              (len(b_record.alignments), expected_min_hits)
        self.assert_(len(b_record.alignments) >= expected_min_hits, msg)

if __name__ == "__main__":
    sys.exit(run_tests(sys.argv))