[Bioperl-l] Re: pI calculation

Aaron J Mackey Aaron J. Mackey" <amackey@virginia.edu
Mon, 18 Mar 2002 10:12:40 -0500 (EST)


> do you know about a Perl-routine that calculates the isoelectric point
> of a protein ?

Yes; here's the one I use.  It relies on Inline::C being installed on your
computer, because it actually uses C code used by ComputepI/MW at Expasy,
provided by Elisabeth Gasteiger (see comments in C code):  bioperl people,
please feel free to include this in wherever you wish.

-Aaron

P.S. the algorithm could, of course, easily be converted to Perl, but I
use this for database loads of nr, so I needed it to be a little faster.

#!/usr/bin/perl -w

use strict;
use Inline C => 'DATA',
           CC => 'gcc',
           CCFLAGS => '-O2';

my $seq = 'MGTSTMSHDKLDHIGKPPSWQER'; # you get the idea
my $pI = pIcalc($seq);
my $mw = MWcalc($seq);

__DATA__

/*

Date: Mon, 14 May 2001 13:55:01 +0200
From: Elisabeth Gasteiger <Elisabeth.Gasteiger@isb-sib.ch>

[...]

The ComputepI/Mw tool is not available as a ready-to-use standalone
version, however, I can send you the pK table used and the algorithm.

[...]

*/

/* Begin code from E. Gasteiger at EXPASY: */

/*
// Table of pk values :
//  Note: the current algorithm does not use the last two columns. Each
//  row corresponds to an amino acid starting with Ala. J, O and U are
//  inexistant, but here only in order to have the complete alphabet.
//
//     Ct    Nt   Sm     Sc     Sn
*/

static double cPk[26][5] = {
  3.55, 7.59, 0.   , 0.   , 0.    , /* A */
  3.55, 7.50, 0.   , 0.   , 0.    , /* B */
  3.55, 7.50, 9.00 , 9.00 , 9.00  , /* C */
  4.55, 7.50, 4.05 , 4.05 , 4.05  , /* D */
  4.75, 7.70, 4.45 , 4.45 , 4.45  , /* E */
  3.55, 7.50, 0.   , 0.   , 0.    , /* F */
  3.55, 7.50, 0.   , 0.   , 0.    , /* G */
  3.55, 7.50, 5.98 , 5.98 , 5.98  , /* H */
  3.55, 7.50, 0.   , 0.   , 0.    , /* I */
  0.00, 0.00, 0.   , 0.   , 0.    , /* J */
  3.55, 7.50, 10.00, 10.00, 10.00 , /* K */
  3.55, 7.50, 0.   , 0.   , 0.    , /* L */
  3.55, 7.00, 0.   , 0.   , 0.    , /* M */
  3.55, 7.50, 0.   , 0.   , 0.    , /* N */
  0.00, 0.00, 0.   , 0.   , 0.    , /* O */
  3.55, 8.36, 0.   , 0.   , 0.    , /* P */
  3.55, 7.50, 0.   , 0.   , 0.    , /* Q */
  3.55, 7.50, 12.0 , 12.0 , 12.0  , /* R */
  3.55, 6.93, 0.   , 0.   , 0.    , /* S */
  3.55, 6.82, 0.   , 0.   , 0.    , /* T */
  0.00, 0.00, 0.   , 0.   , 0.    , /* U */
  3.55, 7.44, 0.   , 0.   , 0.    , /* V */
  3.55, 7.50, 0.   , 0.   , 0.    , /* W */
  3.55, 7.50, 0.   , 0.   , 0.    , /* X */
  3.55, 7.50, 10.00, 10.00, 10.00 , /* Y */
  3.55, 7.50, 0.   , 0.   , 0.   }; /* Z */

static double cMW[26] = {
  89.09,    /* A */
  132.65,   /* B */
  121.15,   /* C */
  133.1,    /* D */
  147.13,   /* E */
  165.19,   /* F */
  75.07,    /* G */
  155.16,   /* H */
  131.18,   /* I */
  0.00,     /* J */
  146.19,   /* K */
  131.18,   /* L */
  149.22,   /* M */
  132.12,   /* N */
  0.00,     /* O */
  115.13,   /* P */
  146.15,   /* Q */
  174.21,   /* R */
  105.09,   /* S */
  119.12,   /* T */
  168.05,   /* U selenocysteine, Sec */
  117.15,   /* V */
  204.22,   /* W */
  129.26,   /* X */
  181.19,   /* Y */
  146.73 }; /* Z */

#define H2O 18.015    /* mol. wt. of water */

#define PH_MIN 0.0   /* minimum pH value */
#define PH_MAX 14.0  /* maximum pH value */
#define MAXLOOP 2000 /* maximum number of iterations */
#define EPSI 0.0001  /* desired precision */

double pIcalc (char *sequence) {

  int i;
  char c;
  int R, H, K, D, E, C, Y;
  int comp[26] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  int nTermResidue;
  int cTermResidue;
  double phMin;
  double phMid;
  double phMax;
  double charge;
  double cter, nter, carg, chis, clys, casp, cglu, ccys, ctyr;

  R = 'R' - 'A';
  H = 'H' - 'A';
  K = 'K' - 'A';
  D = 'D' - 'A';
  E = 'E' - 'A';
  C = 'C' - 'A';
  Y = 'Y' - 'A';

  /*
    Compute the amino-acid composition.
  */
  i = 0;
  while (c = sequence[i++]) {
    comp[toupper(c) - 'A']++;
  }

  /*
    Look up N-terminal and C-terminal residue.
  */
  nTermResidue = toupper(sequence[0]) - 'A';
  cTermResidue = toupper(sequence[i - 1]) - 'A';

  phMin = PH_MIN;
  phMax = PH_MAX;

  for (i = 0, charge = 1.0; i < MAXLOOP && (phMax - phMin) > EPSI; i++) {
      phMid = phMin + (phMax - phMin) / 2;

      cter = exp10(-cPk[cTermResidue][0]) / (exp10(-cPk[cTermResidue][0]) + exp10(-phMid));
      nter = exp10(-phMid) / (exp10(-cPk[nTermResidue][1]) + exp10(-phMid));

      carg = (double) comp[R] * exp10(-phMid) / (exp10(-cPk[R][2]) + exp10(-phMid));
      chis = (double) comp[H] * exp10(-phMid) / (exp10(-cPk[H][2]) + exp10(-phMid));
      clys = (double) comp[K] * exp10(-phMid) / (exp10(-cPk[K][2]) + exp10(-phMid));

      casp = (double) comp[D] * exp10(-cPk[D][2]) / (exp10(-cPk[D][2]) + exp10(-phMid));
      cglu = (double) comp[E] * exp10(-cPk[E][2]) / (exp10(-cPk[E][2]) + exp10(-phMid));

      ccys = (double) comp[C] * exp10(-cPk[C][2]) / (exp10(-cPk[C][2]) + exp10(-phMid));
      ctyr = (double) comp[Y] * exp10(-cPk[Y][2]) / (exp10(-cPk[Y][2]) + exp10(-phMid));

      charge = carg + clys + chis + nter - (casp + cglu + ctyr + ccys + cter);

      if (charge > 0.0)
	phMin = phMid;
      else
	phMax = phMid;
  }
  return phMid;
}

double MWcalc (char* sequence) {

  int i;
  char c;
  int comp[26] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  double MW;

  /*
    Compute the amino-acid composition.
  */
  i = 0;
  while (c = sequence[i++]) {
    comp[toupper(c) - 'A']++;
  }

  /* subtract N - 1 water molecules: */
  MW = - (double) (i - 2)  * H2O;

  for(i = 0 ; i < 26 ; i++) {
    MW += (double) comp[i] * cMW[i];
  }

  return MW;
}