[Biojava-dev] Protein Alignment Exception
John Hawkins
john.hawkins at biotec.tu-dresden.de
Tue Dec 7 18:16:15 UTC 2010
Thanks Chris,
I am a little confused, when I go to the download page for BioJava it
appeared that 1.7.1 is still the latest.
How stable are the BioJava3 libraries ?
I agree that my SubstitutionMatrix code is ugly, but it was my quick
hack to get things going.
In the meantime I managed to solve my own problem, albeit in an
inelegant fashion.
The conflict was not between the alphabets of the two sequences, it was
between the alphabets in
the sequence and the one that was used to instantiate the
SubstitutionMatrix.
In essence I abandoned the idea of instantiating the Alphabet using
AlphabetManager.alphabetForName.
Instead I created the sequence objects first and took the alphabet from
the first sequence object.
The new code is below.
Included in this code is another quick hack to calculate percent
sequence identities from the alignments.
I couldn't find this function built into the existing codebase, which
seemed a little strange, is it something I
overlooked ?
---------------------------------------------------------------
package utilities;
import org.biojava.bio.alignment.NeedlemanWunsch;
import org.biojava.bio.alignment.SequenceAlignment;
import org.biojava.bio.alignment.SmithWaterman;
import org.biojava.bio.alignment.SubstitutionMatrix;
import org.biojava.bio.seq.ProteinTools;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.symbol.AlphabetManager;
import org.biojava.bio.symbol.FiniteAlphabet;
public class PairWiseAlignment {
private String seq1;
private String seq2;
private int score;
private int score2;
private String aln1;
private String aln2;
private double pid1;
private double pid2;
public PairWiseAlignment(String s1, String s2, int score, int
score2, String aln1, String aln2) {
super();
this.seq1 = s1;
this.seq2 = s2;
this.score = score;
this.score2 = score2;
this.aln1 = aln1;
this.aln2 = aln2;
this.pid1 = getPID( s1, s2, aln1);
this.pid2 = getPID( s1, s2, aln2);
}
/** This performs an alignment of two given sequences and
* returns the result in an object.
*
* @param seq1 a query sequence
* @param seq2 a target sequence
*/
public static PairWiseAlignment alignProteins(String sequence1,
String sequence2) {
int score = 0;
int score2 = 0;
String aln1= "";
String aln2= "";
try {
// LETS IGNORE THIS STEP AND TAKE THE ALPHABET OUT OF THE
SEQUENCES
//FiniteAlphabet alphabet = (FiniteAlphabet)
AlphabetManager.alphabetForName("PROTEIN");
Sequence query =
ProteinTools.createProteinSequence(sequence1, "query");
Sequence target =
ProteinTools.createProteinSequence(sequence2, "target");
FiniteAlphabet alphabet = (FiniteAlphabet) query.getAlphabet();
SubstitutionMatrix matrix = new SubstitutionMatrix(alphabet,
PairWiseAlignment.getSubstitutionMatrix(), "BLOSUM62");
// Define the default costs for sequence manipulation for the
global alignment.
SequenceAlignment aligner = new NeedlemanWunsch(
(short) 0, // match
(short) 3, // replace
(short) 2, // insert
(short) 2, // delete
(short) 1, // gapExtend
matrix // SubstitutionMatrix
);
// Perform an alignment and save the results.
score = aligner.pairwiseAlignment(
query, // first sequence
target // second one
);
aln1 = "Global alignment with Needleman-Wunsch:\n" +
aligner.getAlignmentString();
// Perform a local alginment from the sequences with
Smith-Waterman.
// Firstly, define the expenses (penalties) for every single
operation.
aligner = new SmithWaterman(
(short) -1, // match
(short) 3, // replace
(short) 2, // insert
(short) 2, // delete
(short) 1, // gapExtend
matrix // SubstitutionMatrix
);
// Perform the local alignment.
score2 = aligner.pairwiseAlignment(query, target);
//aligner.getAlignment(query, target).
aln2 = "\nLocal alignment with SmithWaterman:\n" +
aligner.getAlignmentString();
} catch (Exception exc) {
exc.printStackTrace();
}
return new PairWiseAlignment(sequence1, sequence2, score,
score2, aln1, aln2);
}
public static String getSubstitutionMatrix() {
//TODO: Load this as a resource
String result = "# Matrix made by matblas from
blosum62.iij \n" +
"# * column uses minimum score\n" +
"# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units\n" +
"# Blocks Database = /data/blocks_5.0/blocks.dat\n" +
"# Cluster Percentage: >= 62\n" +
"# Entropy = 0.6979, Expected = -0.5209\n" +
" A R N D C Q E G H I L K M F P S T W Y
V B Z X \n" +
"A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2
0 -2 -1 0 \n" +
"R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2
-3 -1 0 -1 \n" +
"N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2
-3 3 0 -1 \n" +
"D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3
-3 4 1 -1 \n" +
"C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2
-1 -3 -3 -2 \n" +
"Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1
-2 0 3 -1 \n" +
"E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2
-2 1 4 -1 \n" +
"G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3
-3 -1 -2 -1 \n" +
"H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2
-3 0 0 -1 \n" +
"I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1
3 -3 -3 -1 \n" +
"L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1
1 -4 -3 -1 \n" +
"K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2
-2 0 1 -1 \n" +
"M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1
1 -3 -1 -1 \n" +
"F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3
-1 -3 -3 -1 \n" +
"P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3
-2 -2 -1 -2 \n" +
"S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2
-2 0 0 0 \n" +
"T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2
0 -1 -1 0 \n" +
"W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2
-3 -4 -3 -2 \n" +
"Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7
-1 -3 -2 -1 \n" +
"V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1
4 -3 -2 -1 \n" +
"B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3
-3 4 1 -1 \n" +
"Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2
-2 1 4 -1 \n" +
"X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1
-1 -1 -1 -1 ";
return result;
}
public static void main (String args[]) {
String seq1=
"RRRVTVRKADAGGLGISIKGGRENKMPILISKIFKGLAADQTEALFVGDAILSVNGEDLSSATHDEAVQALKKTGKEVVLEVKYMK";
//String seq2=
"RRRVTVRKADAGGLGISIKGGRENKMPILISSIFKGLAADQTEALFVGDAILSVNGEDLSSATHDEAVQALKKTGKEVVLEVKYMK";
String seq2=
"LGEEDIPREPRRIVIHRGSTGLGFNIVGGEDGEGIFISFILAGGPADLSGELRKGDQILSVNGVDLRNASHEQAAIALKNAGQTVTIIAQYKPEEYSRFEAN";
PairWiseAlignment test = PairWiseAlignment.alignProteins(
seq1, seq2 ) ;
System.err.println("PID 1: " + test.getPid1() + "\n" +
test.getAln1() + "\n");
System.err.println("PID 2: " + test.getPid2() + "\n" +
test.getAln2() + "\n");
}
public int getScore() {
return score;
}
public void setScore(int score) {
this.score = score;
}
public int getScore2() {
return score2;
}
public void setScore2(int score2) {
this.score2 = score2;
}
public String getAln1() {
return aln1;
}
public void setAln1(String aln1) {
this.aln1 = aln1;
}
public String getAln2() {
return aln2;
}
public void setAln2(String aln2) {
this.aln2 = aln2;
}
public String getSeq1() {
return seq1;
}
public String getSeq2() {
return seq2;
}
public double getPid1() {
return pid1;
}
public double getPid2() {
return pid2;
}
public double getPID(String s1, String s2, String aln) {
// Calculate the Percent Sequence Identity for the alignment
// using the 'arithmetic mean sequence' length as the denominator
// 'Percent Sequence IdentityThe Need to Be Explicit'
// (2004) Alex C. W. May
double arthMean = ((double) s1.length() + s2.length() ) / 2.0;
int identicalChars = count(aln, '|');
double result =((double) identicalChars )/ arthMean;
return result;
}
public int count(String sourceString, char lookFor) {
if (sourceString == null) {
return -1;
}
int count = 0;
for (int i = 0; i < sourceString.length(); i++) {
final char c = sourceString.charAt(i);
if (c == lookFor) {
count++;
}
}
return count;
}
}
----------------------------------------------------------------
On 12/07/2010 06:11 PM, Chris Friedline wrote:
> Hi John,
>
> I do this all the time and at (very) quick glance your code looks fine
> from what I can remember about the legacy 1.7.1 codebase. However,
> when I did it, I normally loaded from external substitution matrix
> files (downloaded from NCBI) and used the PROTEIN-TERM alphabet.
>
> You might also want to get going with the BioJava3 libraries. I'm
> seeing a pretty big boost in performance now with beta2 and less code
> (~11k/sec on 8 CPUs).
>
> I also had some issues a while back with BioJava 1.7.1 and protein
> alignments that were fixed in the subversion repo, but not made part
> of the main binary release on biojava.org <http://biojava.org>. You
> might want to download and compile the 1.7.1 dev version and try those
> libraries, as well. There are some changes that haven't made it into
> the wiki with the dev version 1.7.1 (as I recall), but it's pretty
> straightforward to get it to work almost like what you have below.
>
> Chris
>
> On Tue, Dec 7, 2010 at 11:26 AM, John Hawkins
> <john.hawkins at biotec.tu-dresden.de
> <mailto:john.hawkins at biotec.tu-dresden.de>> wrote:
>
> Hi All,
>
> I have searched the archives of this list to try and solve my problem
> and I can not find an answer.
>
> I am attempting to use BioJava to perform some pairwise alignments of
> protein sequences.
>
> I have written a simple test class to check the functionality, and
> I am
> getting an exception which does not make sense to me.
>
> Using the code below I receive the Exception
>
> org.biojava.bio.BioRuntimeException: Alphabet missmatch occured:
> sequences with different alphabet cannot be aligned.
> at
> org.biojava.bio.alignment.NeedlemanWunsch.pairwiseAlignment(NeedlemanWunsch.java:635)
> at
> utilities.PairWiseAlignment.alignProteins(PairWiseAlignment.java:68)
> at utilities.PairWiseAlignment.main(PairWiseAlignment.java:184)
>
> This is occuring even when I input two identical protein sequences (as
> Strings).
> It appears that the ProteinTools.createProteinSequence method
> generates
> a unique alphabet for each sequence it is given,
> and I cannot see any way to force the resulting sequence to have the
> same underlying alphabet.
>
> Please forgive me I have overlooked something obvious.
>
> Regards
> John Hawkins
> ----------------------------------------------------------------------------------------------------------------------------
> package utilities;
>
> import org.biojava.bio.alignment.NeedlemanWunsch;
> import org.biojava.bio.alignment.SequenceAlignment;
> import org.biojava.bio.alignment.SmithWaterman;
> import org.biojava.bio.alignment.SubstitutionMatrix;
> import org.biojava.bio.seq.ProteinTools;
> import org.biojava.bio.seq.Sequence;
> import org.biojava.bio.symbol.AlphabetManager;
> import org.biojava.bio.symbol.FiniteAlphabet;
>
>
> public class PairWiseAlignment {
>
> private int score;
> private int score2;
> private String aln1;
> private String aln2;
>
> public PairWiseAlignment(int score, int score2, String aln1,
> String aln2) {
> super();
> this.score = score;
> this.score2 = score2;
> this.aln1 = aln1;
> this.aln2 = aln2;
> }
>
>
> /** This performs an alignment of two given sequences and
> * returns the result in an object.
> *
> * @param seq1 a query sequence
> * @param seq2 a target sequence
> */
> public static PairWiseAlignment alignProteins(String seq1, String
> seq2) {
> int score = 0;
> int score2 = 0;
> String aln1= "";
> String aln2= "";
> try {
> // The alphabet of the sequences. For this example DNA is
> choosen.
> FiniteAlphabet alphabet = (FiniteAlphabet)
> AlphabetManager.alphabetForName("PROTEIN");
> // Read the substitution matrix file.
> // For this example the matrix NUC.4.4 is good.
> SubstitutionMatrix matrix = new SubstitutionMatrix(alphabet,
> PairWiseAlignment.getSubstitutionMatrix(), "BLOSUM62");
> // Define the default costs for sequence manipulation for the
> global alignment.
> SequenceAlignment aligner = new NeedlemanWunsch(
> (short) 0, // match
> (short) 3, // replace
> (short) 2, // insert
> (short) 2, // delete
> (short) 1, // gapExtend
> matrix // SubstitutionMatrix
> );
> Sequence query = ProteinTools.createProteinSequence(seq1,
> "query");
> Sequence target = ProteinTools.createProteinSequence(seq2,
> "target");
>
> System.err.println("ALPHA 1: " + query.getAlphabet() );
> System.err.println("ALPHA 2: " + target.getAlphabet() );
>
> // Perform an alignment and save the results.
> score = aligner.pairwiseAlignment(
> query, // first sequence
> target // second one
> );
> // Print the alignment to the screen
> aln1 = "Global alignment with Needleman-Wunsch:\n" +
> aligner.getAlignmentString();
>
> // Perform a local alginment from the sequences with
> Smith-Waterman.
> // Firstly, define the expenses (penalties) for every single
> operation.
> aligner = new SmithWaterman(
> (short) -1, // match
> (short) 3, // replace
> (short) 2, // insert
> (short) 2, // delete
> (short) 1, // gapExtend
> matrix // SubstitutionMatrix
> );
> // Perform the local alignment.
> score2 = aligner.pairwiseAlignment(query, target);
>
> aln2 = "\nlocal alignment with SmithWaterman:\n" +
> aligner.getAlignmentString();
> } catch (Exception exc) {
> exc.printStackTrace();
> }
>
> return new PairWiseAlignment(score, score2, aln1, aln2);
>
> }
>
> public static String getSubstitutionMatrix() {
> //TODO: Load this as a resource
> String result = "# Matrix made by matblas from
> blosum62.iij \n" +
> "# * column uses minimum score\n" +
> "# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units\n" +
> "# Blocks Database = /data/blocks_5.0/blocks.dat\n" +
> "# Cluster Percentage: >= 62\n" +
> "# Entropy = 0.6979, Expected = -0.5209\n" +
> " A R N D C Q E G H I L K M F P S T W Y
> V B Z X \n" +
> "A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2
> 0 -2 -1 0 \n" +
> "R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2
> -3 -1 0 -1 \n" +
> "N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2
> -3 3 0 -1 \n" +
> "D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3
> -3 4 1 -1 \n" +
> "C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2
> -1 -3 -3 -2 \n" +
> "Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1
> -2 0 3 -1 \n" +
> "E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2
> -2 1 4 -1 \n" +
> "G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3
> -3 -1 -2 -1 \n" +
> "H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2
> -3 0 0 -1 \n" +
> "I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1
> 3 -3 -3 -1 \n" +
> "L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1
> 1 -4 -3 -1 \n" +
> "K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2
> -2 0 1 -1 \n" +
> "M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1
> 1 -3 -1 -1 \n" +
> "F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3
> -1 -3 -3 -1 \n" +
> "P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3
> -2 -2 -1 -2 \n" +
> "S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2
> -2 0 0 0 \n" +
> "T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2
> 0 -1 -1 0 \n" +
> "W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2
> -3 -4 -3 -2 \n" +
> "Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7
> -1 -3 -2 -1 \n" +
> "V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1
> 4 -3 -2 -1 \n" +
> "B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3
> -3 4 1 -1 \n" +
> "Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2
> -2 1 4 -1 \n" +
> "X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1
> -1 -1 -1 -1 ";
>
> return result;
> }
>
>
> public static void main (String args[]) {
> String seq1=
> "RRRVTVRKADAGGLGISIKGGRENKMPILISKIFKGLAADQTEALFVGDAILSVNGEDLSSATHDEAVQALKKTGKEVVLEVKYMK";
> String seq2=
> "LGEEDIPREPRRIVIHRGSTGLGFNIVGGEDGEGIFISFILAGGPADLSGELRKGDQILSVNGVDLRNASHEQAAIALKNAGQTVTIIAQYKPEEYSRFEAN";
> PairWiseAlignment test = PairWiseAlignment.alignProteins(
> seq2, seq2 ) ;
> System.err.println("SCORE 1: " + test.score + "\n" +
> test.getAln1() + "\n");
> System.err.println("SCORE 2: " + test.score2 + "\n" +
> test.getAln2() + "\n");
> }
>
>
> public int getScore() {
> return score;
> }
>
>
> public void setScore(int score) {
> this.score = score;
> }
>
>
> public int getScore2() {
> return score2;
> }
>
>
> public void setScore2(int score2) {
> this.score2 = score2;
> }
>
>
> public String getAln1() {
> return aln1;
> }
>
>
> public void setAln1(String aln1) {
> this.aln1 = aln1;
> }
>
>
> public String getAln2() {
> return aln2;
> }
>
>
> public void setAln2(String aln2) {
> this.aln2 = aln2;
> }
> }
>
> -------------------------------------------------------------------------------------------------------
>
>
> --
>
> Dr John Hawkins
> Post-Doctoral Researcher
>
> Technische Universität Dresden
> Biotechnology Center
> Tatzberg 47/49
> 01307 Dresden
>
> Tel.: +49 (0) 351 463-40083
> Fax: +49 (0) 351 463-40087
> E-Mail: john.hawkins at biotec.tu-dresden
> Webpage: www.biotec.tu-dresden.de <http://www.biotec.tu-dresden.de>
>
> _______________________________________________
> biojava-dev mailing list
> biojava-dev at lists.open-bio.org <mailto:biojava-dev at lists.open-bio.org>
> http://lists.open-bio.org/mailman/listinfo/biojava-dev
>
>
>
>
> --
> PhD Candidate, Integrative Life Sciences
> Virginia Commonwealth University
> Richmond, VA
>
--
Dr John Hawkins
Post-Doctoral Researcher
Technische Universität Dresden
Biotechnology Center
Tatzberg 47/49
01307 Dresden
Tel.: +49 (0) 351 463-40083
Fax: +49 (0) 351 463-40087
E-Mail: john.hawkins at biotec.tu-dresden
Webpage: www.biotec.tu-dresden.de
More information about the biojava-dev
mailing list