[Biopython] Comparing two already aligned sequences quickly to count gaps/matches/mismatches
    Peter Cock 
    p.j.a.cock at googlemail.com
       
    Fri Feb  7 07:40:28 EST 2025
    
    
  
Good idea, thanks!
I often forget than numpy can work with string based arrays. My initial
working solution (below) was about x10 faster!
Peter
import timeit
import numpy as np
def counts(query_seq, subject_seq):
    gap = "-"
    matches = non_gap_mismatches = either_gapped = 0
    for q, s in zip(query_seq, subject_seq, strict=True):
        # Cache these two as booleans:
        q_gap = q == gap
        s_gap = s == gap
        if q_gap and s_gap:
            continue  # does not contribute to aln_length
        if q == s:
            matches += 1
        elif q_gap or s_gap:
            # Don't want to count this towards coverage (max 100%)
            either_gapped += 1
        else:
            non_gap_mismatches += 1
    return matches, non_gap_mismatches, either_gapped
def np_counts(q_array, s_array):
    q_non_gaps = q_array != b"-"
    s_non_gaps = s_array != b"-"
    naive_matches = q_array == s_array  # includes double gaps!
    matches = int((naive_matches & q_non_gaps).sum())
    one_gapped = q_non_gaps ^ s_non_gaps
    non_gap_mismatches = int((~naive_matches & ~one_gapped).sum())
    either_gapped = int(one_gapped.sum())
    return matches, non_gap_mismatches, either_gapped
SEQ1 = "ACGT-AACCGATG-ATCGTATCGTAG-CGCGTATATGGGG--TTTCGT" * 100000
SEQ2 = "ACGT-AGGTAGCATGCTA-G-CCGTAGCGCGTATATGGGG--TTACGT" * 100000
SEQ1a = np.array(list(SEQ1), "S1")  # bytes!
SEQ2a = np.array(list(SEQ2), "S1")  # bytes!
assert counts(SEQ1, SEQ2) == np_counts(SEQ1a, SEQ2a)
print(timeit.timeit("counts(SEQ1, SEQ2)", number=100, globals=globals()))
print(timeit.timeit("np_counts(SEQ1a, SEQ2a)", number=100,
globals=globals()))
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.open-bio.org/pipermail/biopython/attachments/20250207/67ab8121/attachment.htm>
    
    
More information about the Biopython
mailing list