[Bioperl-l] sequence parsing using a grammar

Heikki Lehvaslaiho heikki@ebi.ac.uk
07 Mar 2002 14:24:25 +0000


--=-UcNW1fe1A2frykRMZte5
Content-Type: text/plain
Content-Transfer-Encoding: 7bit


Thanks to Andrew Macgregor's recent mail, I finally had a shot at using
Parse::RecDescent. After a little bit of fumbling I think I got the hang
of it.

Rule-based parsing is a really robust and powerful way of handling data
parsing. I really think this is the way forward, but I would like to hear
other opinions, too.

I'll attach a stub parser which correctly splits one EMBL entry into
fields and creates, for the time being, a very simple Bio::PrimarySeq
object.

Turn debugging on to see how all the fields, including multiline fields
and field blocks like the feature table, have been parsed out of the
entry.

I am not suggesting this for 1.0, but maybe for the next rewrite ...

        -Heikki

P.S. I tried to send this yesterday, but it was automatically rejected
with:

"Your message had a suspicious header.  This list no longer accepts
HTML mail attachments due to virus problems."

-- 
______ _/      _/_____________________________________________________
      _/      _/                      http://www.ebi.ac.uk/mutations/
     _/  _/  _/  Heikki Lehvaslaiho          heikki@ebi.ac.uk
    _/_/_/_/_/  EMBL Outstation, European Bioinformatics Institute
   _/  _/  _/  Wellcome Trust Genome Campus, Hinxton
  _/  _/  _/  Cambs. CB10 1SD, United Kingdom
     _/      Phone: +44 (0)1223 494 644   FAX: +44 (0)1223 494 468
___ _/_/_/_/_/________________________________________________________

--=-UcNW1fe1A2frykRMZte5
Content-Disposition: attachment; filename=embl_parser.pl
Content-Transfer-Encoding: quoted-printable
Content-Type: text/x-perl; charset=ISO-8859-1

#!/usr/local/bin/perl
use Parse::RecDescent;
use Bio::PrimarySeq;
use Data::Dumper;=20
use strict;

# Parse::RecDescent diagnostics (see its documentation): report fatal
# errors, all non-fatal warnings, and hints on how to remedy them.
$::RD_ERRORS =3D 1;
$::RD_WARN =3D 1;
$::RD_HINT =3D 1;
# Set to a true value to make the seq and generic actions dump %item.
$::debug =3D 0;

# Parse::RecDescent grammar that splits one EMBL flat-file entry into
# fields. The start rule is entry; its value is the Bio::PrimarySeq
# object that the id and seq actions fill in.
# NOTE(review): $seq is created once in the startup action, so repeated
# parses with one parser object presumably share it - confirm before
# parsing more than one entry.
my $grammar =3D
    q{
        {
            use Data::Dumper;
            my $tag;
            my $seq =3D Bio::PrimarySeq->new();
        }

        # Start rule: consume every field, then hand back the object.
        entry:   field(s)
                 { $seq; }

        field:   spacer
             |   id
             |   seq
             |   generic

        # ID line: the first captured word becomes the sequence id.
        id:      /ID/  /(\S+)\s+\S+\;\s+([^;]+)\;\s+(\S+)\;/
            {
                $seq->id($1);
            }

        # SQ header line followed by the sequence lines.
        seq:     /SQ/ ln /(.*?[ \t]*\n)+[^\/]*/
            {
                # Strip digits and non-word characters from the block.
                $item[-1] =3D~ s/[\d\W]+//g;
                # The header states the expected length in BP.
                my ($given_len) =3D $item{ln} =3D~ /Sequence (\d+) BP/;
                die("Reported and actual sequence length do not match")
                    unless $given_len =3D=3D length $item[-1];

                # Only the first 20 residues are stored for now.
                #$seq->seq($item[-1]);
                $seq->seq(substr$item[-1], 0, 20);

                print Dumper(\%item) if $::debug;
                1;
            }

        # Any other line type: remember its tag, then swallow the line
        # plus all directly following lines that start with that tag.
        generic: ...tag   { $tag =3D $item[1]; }
                 /.*\n($tag.*\n)*/
            {
                print Dumper(\%item) if $::debug;
                1;
            }

        ln:      /.*/
        # A bare R tag lets one generic match cover all R? lines.
        tag:     'R' | /\w\w/
        spacer:  'XX'
    };

# Build the parser. The constructor returns undef when the grammar is
# invalid, so stop here instead of failing later on a method call.
my $parse =3D Parse::RecDescent->new($grammar)
    or die "Bad grammar!\n";

my $data =3D=20
    qq(ID   HSANDREC   standard; RNA; HUM; 3569 BP.
XX
AC   M20132; J03180;
XX
SV   M20132.1
XX
DT   23-NOV-1989 (Rel. 21, Created)
DT   02-JUL-1999 (Rel. 60, Last updated, Version 4)
XX
DE   Human androgen receptor (AR) mRNA, complete cds.
XX
KW   androgen receptor.
XX
OS   Homo sapiens (human)
OC   Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Mammalia; Eutheria=
;
OC   Primates; Catarrhini; Hominidae; Homo.
XX
RN   [1]
RP   1-3569
RX   MEDLINE; 89112208.
RA   Lubahn D.B., Joseph D.R., Sar M., Tan J., Higgs H.N., Larson R.E.,
RA   French F.S., Wilson E.M.;
RT   "The human androgen receptor: complementary deoxyribonucleic acid clon=
ing,
RT   sequence analysis and gene expression in prostate";
RL   Mol. Endocrinol. 2(12):1265-1275(1988).
XX
DR   GDB; 120556; AR.
DR   SWISS-PROT; P10275; ANDR_HUMAN.
DR   TRANSFAC; T00040; T00040.
XX
CC   Draft entry and computer readable sequence [1] kindly submitted by
CC   E.M.Wilson, 18-AUG-1988.
XX
FH   Key             Location/Qualifiers
FH
FT   source          1..3569
FT                   /db_xref=3D"taxon:9606"
FT                   /organism=3D"Homo sapiens"
FT                   /map=3D"Xq11.2-q12"
FT   gene            363..3122
FT                   /gene=3D"AR"
FT   CDS             363..3122
FT                   /codon_start=3D1
FT                   /db_xref=3D"SWISS-PROT:P10275"
FT                   /note=3D"androgen receptor"
FT                   /gene=3D"AR"
FT                   /protein_id=3D"AAA51729.1"
FT                   /translation=3D"MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPG=
PRHPEAA
FT                   SAAPPGASLLLLQQQQQQQQQQQQQQQQQQQQQETSPRQQQQQQGEDGSPQAHR=
RGPTG
FT                   YLVLDEEQQPSQPQSALECHPERGCVPEPGAAVAASKGLPQQLPAPPDEDDSAA=
PSTLS
FT                   LLGPTFPGLSSCSADLKDILSEASTMQLLQQQQQEAVSEGSSSGRAREASGAPT=
SSKDN
FT                   YLGGTSTISDNAKELCKAVSVSMGLGVEALEHLSPGEQLRGDCMYAPLLGVPPA=
VRPTP
FT                   CAPLAECKGSLLDDSAGKSTEDTAEYSPFKGGYTKGLEGESLGCSGSAAAGSSG=
TLELP
FT                   STLSLYKSGALDEAAAYQSRDYYNFPLALAGPPPPPPPPHPHARIKLENPLDYG=
SAWAA
FT                   AAAQCRYGDLASLHGAGAAGPGSGSPSAAASSSWHTLFTAEEGQLYGPCGGGGG=
GGGGG
FT                   GGGGGGGGGGGGGGEAGAVAPYGYTRPPQGLAGQESDFTAPDVWYPGGMVSRVP=
YPSPT
FT                   CVKSEMGPWMDSYSGPYGDMRLETARDHVLPIDYYFPPQKTCLICGDEASGCHY=
GALTC
FT                   GSCKVFFKRAAEGKQKYLCASRNDCTIDKFRRKNCPSCRLRKCYEAGMTLGARK=
LKKLG
FT                   NLKLQEEGEASSTTSPTEETTQKLTVSHIEGYECQPIFLNVLEAIEPGVVCAGH=
DNNQP
FT                   DSFAALLSSLNELGERQLVHVVKWAKALPGFRNLHVDDQMAVIQYSWMGLMVFA=
MGWRS
FT                   FTNVNSRMLYFAPDLVFNEYRMHKSRMYSQCVRMRHLSQEFGWLQITPQEFLCM=
KALLL
FT                   FSIIPVDGLKNQKFFDELRMNYIKELDRIIACKRKNPTSCSRRFYQLTKLLDSV=
QPIAR
FT                   ELHQFTFDLLIKSHMVSVDFPEMMAEIISVQVPKILSGKVKPIYFHTQ"
XX
SQ   Sequence 3569 BP; 796 A; 1009 C; 974 G; 790 T; 0 other;
     taataactca gttcttattt gcacctactt cagtggacac tgaatttgga aggtggagga     =
   60
     ttttgttttt ttcttttaag atctgggcat cttttgaatc tacccttcaa gtattaagag     =
  120
     acagactgtg agcctagcag ggcagatctt gtccaccgtg tgtcttcttc tgcacgagac     =
  180
     tttgaggctg tcagagcgct ttttgcgtgg ttgctcccgc aagtttcctt ctctggagct     =
  240
     tcccgcaggt gggcagctag ctgcagcgac taccgcatca tcacagcctg ttgaactctt     =
  300
     ctgagcaaga gaaggggagg cggggtaagg gaagtaggtg gaagattcag ccaagctcaa     =
  360
     ggatggaagt gcagttaggg ctgggaaggg tctaccctcg gccgccgtcc aagacctacc     =
  420
     gaggagcttt ccagaatctg ttccagagcg tgcgcgaagt gatccagaac ccgggcccca     =
  480
     ggcacccaga ggccgcgagc gcagcacctc ccggcgccag tttgctgctg ctgcagcagc     =
  540
     agcagcagca gcagcagcag cagcagcagc agcagcagca gcagcagcag cagcaagaga     =
  600
     ctagccccag gcagcagcag cagcagcagg gtgaggatgg ttctccccaa gcccatcgta     =
  660
     gaggccccac aggctacctg gtcctggatg aggaacagca accttcacag ccgcagtcgg     =
  720
     ccctggagtg ccaccccgag agaggttgcg tcccagagcc tggagccgcc gtggccgcca     =
  780
     gcaaggggct gccgcagcag ctgccagcac ctccggacga ggatgactca gctgccccat     =
  840
     ccacgttgtc cctgctgggc cccactttcc ccggcttaag cagctgctcc gctgacctta     =
  900
     aagacatcct gagcgaggcc agcaccatgc aactccttca gcaacagcag caggaagcag     =
  960
     tatccgaagg cagcagcagc gggagagcga gggaggcctc gggggctccc acttcctcca     =
 1020
     aggacaatta cttagggggc acttcgacca tttctgacaa cgccaaggag ttgtgtaagg     =
 1080
     cagtgtcggt gtccatgggc ctgggtgtgg aggcgttgga gcatctgagt ccaggggaac     =
 1140
     agcttcgggg ggattgcatg tacgccccac ttttgggagt tccacccgct gtgcgtccca     =
 1200
     ctccttgtgc cccattggcc gaatgcaaag gttctctgct agacgacagc gcaggcaaga     =
 1260
     gcactgaaga tactgctgag tattcccctt tcaagggagg ttacaccaaa gggctagaag     =
 1320
     gcgagagcct aggctgctct ggcagcgctg cagcagggag ctccgggaca cttgaactgc     =
 1380
     cgtctaccct gtctctctac aagtccggag cactggacga ggcagctgcg taccagagtc     =
 1440
     gcgactacta caactttcca ctggctctgg ccggaccgcc gccccctccg ccgcctcccc     =
 1500
     atccccacgc tcgcatcaag ctggagaacc cgctggacta cggcagcgcc tgggcggctg     =
 1560
     cggcggcgca gtgccgctat ggggacctgg cgagcctgca tggcgcgggt gcagcgggac     =
 1620
     ccggttctgg gtcaccctca gccgccgctt cctcatcctg gcacactctc ttcacagccg     =
 1680
     aagaaggcca gttgtatgga ccgtgtggtg gtggtggggg tggtggcggc ggcggcggcg     =
 1740
     gcggcggcgg cggcggcggc ggcggcggcg gcggcggcga ggcgggagct gtagccccct     =
 1800
     acggctacac tcggccccct caggggctgg cgggccagga aagcgacttc accgcacctg     =
 1860
     atgtgtggta ccctggcggc atggtgagca gagtgcccta tcccagtccc acttgtgtca     =
 1920
     aaagcgaaat gggcccctgg atggatagct actccggacc ttacggggac atgcgtttgg     =
 1980
     agactgccag ggaccatgtt ttgcccattg actattactt tccaccccag aagacctgcc     =
 2040
     tgatctgtgg agatgaagct tctgggtgtc actatggagc tctcacatgt ggaagctgca     =
 2100
     aggtcttctt caaaagagcc gctgaaggga aacagaagta cctgtgcgcc agcagaaatg     =
 2160
     attgcactat tgataaattc cgaaggaaaa attgtccatc ttgtcgtctt cggaaatgtt     =
 2220
     atgaagcagg gatgactctg ggagcccgga agctgaagaa acttggtaat ctgaaactac     =
 2280
     aggaggaagg agaggcttcc agcaccacca gccccactga ggagacaacc cagaagctga     =
 2340
     cagtgtcaca cattgaaggc tatgaatgtc agcccatctt tctgaatgtc ctggaagcca     =
 2400
     ttgagccagg tgtagtgtgt gctggacacg acaacaacca gcccgactcc tttgcagcct     =
 2460
     tgctctctag cctcaatgaa ctgggagaga gacagcttgt acacgtggtc aagtgggcca     =
 2520
     aggccttgcc tggcttccgc aacttacacg tggacgacca gatggctgtc attcagtact     =
 2580
     cctggatggg gctcatggtg tttgccatgg gctggcgatc cttcaccaat gtcaactcca     =
 2640
     ggatgctcta cttcgcccct gatctggttt tcaatgagta ccgcatgcac aagtcccgga     =
 2700
     tgtacagcca gtgtgtccga atgaggcacc tctctcaaga gtttggatgg ctccaaatca     =
 2760
     ccccccagga attcctgtgc atgaaagcac tgctactctt cagcattatt ccagtggatg     =
 2820
     ggctgaaaaa tcaaaaattc tttgatgaac ttcgaatgaa ctacatcaag gaactcgatc     =
 2880
     gtatcattgc atgcaaaaga aaaaatccca catcctgctc aagacgcttc taccagctca     =
 2940
     ccaagctcct ggactccgtg cagcctattg cgagagagct gcatcagttc acttttgacc     =
 3000
     tgctaatcaa gtcacacatg gtgagcgtgg actttccgga aatgatggca gagatcatct     =
 3060
     ctgtgcaagt gcccaagatc ctttctggga aagtcaagcc catctatttc cacacccagt     =
 3120
     gaagcattgg aaaccctatt tccccacccc agctcatgcc ccctttcaga tgtcttctgc     =
 3180
     ctgttataac tctgcactac tcctctgcag tgccttgggg aatttcctct attgatgtac     =
 3240
     agtctgtcat gaacatgttc ctgaattcta tttgctgggc tttttttttc tctttctctc     =
 3300
     ctttcttttt cttcttccct ccctatctaa ccctcccatg gcaccttcag actttgcttc     =
 3360
     ccattgtggc tcctatctgt gttttgaatg gtgttgtatg cctttaaatc tgtgatgatc     =
 3420
     ctcatatggc ccagtgtcaa gttgtgcttg tttacagcac tactctgtgc cagccacaca     =
 3480
     aacgtttact tatcttatgc cacgggaagt ttagagagct aagattatct ggggaaatca     =
 3540
     aaacaaaaaa caagcaaaca aaaaaaaaa                                       =
 3569
//
);


# Run the start rule on the sample entry. A failed parse yields undef,
# so report it rather than dumping an empty result.
my $seq =3D $parse->entry($data);
defined $seq or die "Could not parse the EMBL entry\n";
print Dumper($seq);

--=-UcNW1fe1A2frykRMZte5--