[Biopython] Comparing two already aligned sequences quickly to count gaps/matches/mismatches
Peter Cock
p.j.a.cock at googlemail.com
Fri Feb 7 07:40:28 EST 2025
Good idea, thanks!
I often forget than numpy can work with string based arrays. My initial
working solution (below) was about x10 faster!
Peter
import timeit
import numpy as np
def counts(query_seq, subject_seq):
gap = "-"
matches = non_gap_mismatches = either_gapped = 0
for q, s in zip(query_seq, subject_seq, strict=True):
# Cache these two as booleans:
q_gap = q == gap
s_gap = s == gap
if q_gap and s_gap:
continue # does not contribute to aln_length
if q == s:
matches += 1
elif q_gap or s_gap:
# Don't want to count this towards coverage (max 100%)
either_gapped += 1
else:
non_gap_mismatches += 1
return matches, non_gap_mismatches, either_gapped
def np_counts(q_array, s_array):
q_non_gaps = q_array != b"-"
s_non_gaps = s_array != b"-"
naive_matches = q_array == s_array # includes double gaps!
matches = int((naive_matches & q_non_gaps).sum())
one_gapped = q_non_gaps ^ s_non_gaps
non_gap_mismatches = int((~naive_matches & ~one_gapped).sum())
either_gapped = int(one_gapped.sum())
return matches, non_gap_mismatches, either_gapped
SEQ1 = "ACGT-AACCGATG-ATCGTATCGTAG-CGCGTATATGGGG--TTTCGT" * 100000
SEQ2 = "ACGT-AGGTAGCATGCTA-G-CCGTAGCGCGTATATGGGG--TTACGT" * 100000
SEQ1a = np.array(list(SEQ1), "S1") # bytes!
SEQ2a = np.array(list(SEQ2), "S1") # bytes!
assert counts(SEQ1, SEQ2) == np_counts(SEQ1a, SEQ2a)
print(timeit.timeit("counts(SEQ1, SEQ2)", number=100, globals=globals()))
print(timeit.timeit("np_counts(SEQ1a, SEQ2a)", number=100,
globals=globals()))
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.open-bio.org/pipermail/biopython/attachments/20250207/67ab8121/attachment.htm>
More information about the Biopython
mailing list