[Biopython] Comparing two already aligned sequences quickly to count gaps/matches/mismatches

Peter Cock p.j.a.cock at googlemail.com
Fri Feb 7 07:40:28 EST 2025


Good idea, thanks!

I often forget than numpy can work with string based arrays. My initial
working solution (below) was about x10 faster!

Peter


import timeit
import numpy as np


def counts(query_seq, subject_seq):
    gap = "-"
    matches = non_gap_mismatches = either_gapped = 0
    for q, s in zip(query_seq, subject_seq, strict=True):
        # Cache these two as booleans:
        q_gap = q == gap
        s_gap = s == gap
        if q_gap and s_gap:
            continue  # does not contribute to aln_length
        if q == s:
            matches += 1
        elif q_gap or s_gap:
            # Don't want to count this towards coverage (max 100%)
            either_gapped += 1
        else:
            non_gap_mismatches += 1
    return matches, non_gap_mismatches, either_gapped


def np_counts(q_array, s_array):
    q_non_gaps = q_array != b"-"
    s_non_gaps = s_array != b"-"
    naive_matches = q_array == s_array  # includes double gaps!
    matches = int((naive_matches & q_non_gaps).sum())
    one_gapped = q_non_gaps ^ s_non_gaps
    non_gap_mismatches = int((~naive_matches & ~one_gapped).sum())
    either_gapped = int(one_gapped.sum())
    return matches, non_gap_mismatches, either_gapped


SEQ1 = "ACGT-AACCGATG-ATCGTATCGTAG-CGCGTATATGGGG--TTTCGT" * 100000
SEQ2 = "ACGT-AGGTAGCATGCTA-G-CCGTAGCGCGTATATGGGG--TTACGT" * 100000
SEQ1a = np.array(list(SEQ1), "S1")  # bytes!
SEQ2a = np.array(list(SEQ2), "S1")  # bytes!
assert counts(SEQ1, SEQ2) == np_counts(SEQ1a, SEQ2a)
print(timeit.timeit("counts(SEQ1, SEQ2)", number=100, globals=globals()))
print(timeit.timeit("np_counts(SEQ1a, SEQ2a)", number=100,
globals=globals()))
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.open-bio.org/pipermail/biopython/attachments/20250207/67ab8121/attachment.htm>


More information about the Biopython mailing list