[Bioperl-l] Re: pI calculation
Aaron J Mackey
Aaron J. Mackey" <amackey@virginia.edu
Mon, 18 Mar 2002 10:12:40 -0500 (EST)
> do you know about a Perl-routine that calculates the isoelectric point
> of a protein ?
Yes; here's the one I use. It relies on Inline::C being installed on your
computer, because it actually uses C code used by ComputepI/MW at Expasy,
provided by Elisabeth Gasteiger (see comments in C code): bioperl people,
please feel free to include this in wherever you wish.
-Aaron
P.S. the algorithm could, of course, easily be converted to Perl, but I
use this for database loads of nr, so I needed it to be a little faster.
#!/usr/bin/perl -w
use strict;
use Inline C => 'DATA',
CC => 'gcc',
CCFLAGS => '-O2';
my $seq = 'MGTSTMSHDKLDHIGKPPSWQER'; # you get the idea
my $pI = pIcalc($seq);
my $mw = MWcalc($seq);
__DATA__
/*
Date: Mon, 14 May 2001 13:55:01 +0200
From: Elisabeth Gasteiger <Elisabeth.Gasteiger@isb-sib.ch>
[...]
The ComputepI/Mw tool is not available as a ready-to-use standalone
version, however, I can send you the pK table used and the algorithm.
[...]
*/
/* Begin code from E. Gasteiger at EXPASY: */
/*
// Table of pk values :
// Note: the current algorithm does not use the last two columns. Each
// row corresponds to an amino acid starting with Ala. J, O and U are
// inexistant, but here only in order to have the complete alphabet.
//
// Ct Nt Sm Sc Sn
*/
static double cPk[26][5] = {
3.55, 7.59, 0. , 0. , 0. , /* A */
3.55, 7.50, 0. , 0. , 0. , /* B */
3.55, 7.50, 9.00 , 9.00 , 9.00 , /* C */
4.55, 7.50, 4.05 , 4.05 , 4.05 , /* D */
4.75, 7.70, 4.45 , 4.45 , 4.45 , /* E */
3.55, 7.50, 0. , 0. , 0. , /* F */
3.55, 7.50, 0. , 0. , 0. , /* G */
3.55, 7.50, 5.98 , 5.98 , 5.98 , /* H */
3.55, 7.50, 0. , 0. , 0. , /* I */
0.00, 0.00, 0. , 0. , 0. , /* J */
3.55, 7.50, 10.00, 10.00, 10.00 , /* K */
3.55, 7.50, 0. , 0. , 0. , /* L */
3.55, 7.00, 0. , 0. , 0. , /* M */
3.55, 7.50, 0. , 0. , 0. , /* N */
0.00, 0.00, 0. , 0. , 0. , /* O */
3.55, 8.36, 0. , 0. , 0. , /* P */
3.55, 7.50, 0. , 0. , 0. , /* Q */
3.55, 7.50, 12.0 , 12.0 , 12.0 , /* R */
3.55, 6.93, 0. , 0. , 0. , /* S */
3.55, 6.82, 0. , 0. , 0. , /* T */
0.00, 0.00, 0. , 0. , 0. , /* U */
3.55, 7.44, 0. , 0. , 0. , /* V */
3.55, 7.50, 0. , 0. , 0. , /* W */
3.55, 7.50, 0. , 0. , 0. , /* X */
3.55, 7.50, 10.00, 10.00, 10.00 , /* Y */
3.55, 7.50, 0. , 0. , 0. }; /* Z */
static double cMW[26] = {
89.09, /* A */
132.65, /* B */
121.15, /* C */
133.1, /* D */
147.13, /* E */
165.19, /* F */
75.07, /* G */
155.16, /* H */
131.18, /* I */
0.00, /* J */
146.19, /* K */
131.18, /* L */
149.22, /* M */
132.12, /* N */
0.00, /* O */
115.13, /* P */
146.15, /* Q */
174.21, /* R */
105.09, /* S */
119.12, /* T */
168.05, /* U selenocysteine, Sec */
117.15, /* V */
204.22, /* W */
129.26, /* X */
181.19, /* Y */
146.73 }; /* Z */
#define H2O 18.015 /* mol. wt. of water */
#define PH_MIN 0.0 /* minimum pH value */
#define PH_MAX 14.0 /* maximum pH value */
#define MAXLOOP 2000 /* maximum number of iterations */
#define EPSI 0.0001 /* desired precision */
double pIcalc (char *sequence) {
int i;
char c;
int R, H, K, D, E, C, Y;
int comp[26] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int nTermResidue;
int cTermResidue;
double phMin;
double phMid;
double phMax;
double charge;
double cter, nter, carg, chis, clys, casp, cglu, ccys, ctyr;
R = 'R' - 'A';
H = 'H' - 'A';
K = 'K' - 'A';
D = 'D' - 'A';
E = 'E' - 'A';
C = 'C' - 'A';
Y = 'Y' - 'A';
/*
Compute the amino-acid composition.
*/
i = 0;
while (c = sequence[i++]) {
comp[toupper(c) - 'A']++;
}
/*
Look up N-terminal and C-terminal residue.
*/
nTermResidue = toupper(sequence[0]) - 'A';
cTermResidue = toupper(sequence[i - 1]) - 'A';
phMin = PH_MIN;
phMax = PH_MAX;
for (i = 0, charge = 1.0; i < MAXLOOP && (phMax - phMin) > EPSI; i++) {
phMid = phMin + (phMax - phMin) / 2;
cter = exp10(-cPk[cTermResidue][0]) / (exp10(-cPk[cTermResidue][0]) + exp10(-phMid));
nter = exp10(-phMid) / (exp10(-cPk[nTermResidue][1]) + exp10(-phMid));
carg = (double) comp[R] * exp10(-phMid) / (exp10(-cPk[R][2]) + exp10(-phMid));
chis = (double) comp[H] * exp10(-phMid) / (exp10(-cPk[H][2]) + exp10(-phMid));
clys = (double) comp[K] * exp10(-phMid) / (exp10(-cPk[K][2]) + exp10(-phMid));
casp = (double) comp[D] * exp10(-cPk[D][2]) / (exp10(-cPk[D][2]) + exp10(-phMid));
cglu = (double) comp[E] * exp10(-cPk[E][2]) / (exp10(-cPk[E][2]) + exp10(-phMid));
ccys = (double) comp[C] * exp10(-cPk[C][2]) / (exp10(-cPk[C][2]) + exp10(-phMid));
ctyr = (double) comp[Y] * exp10(-cPk[Y][2]) / (exp10(-cPk[Y][2]) + exp10(-phMid));
charge = carg + clys + chis + nter - (casp + cglu + ctyr + ccys + cter);
if (charge > 0.0)
phMin = phMid;
else
phMax = phMid;
}
return phMid;
}
double MWcalc (char* sequence) {
int i;
char c;
int comp[26] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
double MW;
/*
Compute the amino-acid composition.
*/
i = 0;
while (c = sequence[i++]) {
comp[toupper(c) - 'A']++;
}
/* subtract N - 1 water molecules: */
MW = - (double) (i - 2) * H2O;
for(i = 0 ; i < 26 ; i++) {
MW += (double) comp[i] * cMW[i];
}
return MW;
}