From 3f98774301b7f04938cfe364f962af1e3a42b1e9 Mon Sep 17 00:00:00 2001 From: mikessh Date: Fri, 10 Apr 2026 07:06:26 +0300 Subject: [PATCH 01/24] some setup --- CMakeLists.txt | 2 +- pyproject.toml | 15 ++++++++++----- quick_setup.sh | 4 ++++ requirements.txt | 3 +++ 4 files changed, 18 insertions(+), 6 deletions(-) create mode 100644 quick_setup.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 9703920..6060df7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,8 @@ cmake_minimum_required(VERSION 3.18) project(mir_cdrscore LANGUAGES CXX) +find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) find_package(pybind11 CONFIG REQUIRED) -find_package(Python REQUIRED COMPONENTS Interpreter Development.Module) pybind11_add_module(cdrscore MODULE mir/distances/cdrscore.cpp) target_compile_features(cdrscore PRIVATE cxx_std_17) diff --git a/pyproject.toml b/pyproject.toml index bc11b2d..4da2ddc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,19 +4,22 @@ requires = [ "pybind11", "numpy", "ninja" -] +] build-backend = "scikit_build_core.build" [project] name = "mirpy-lib" -version = "1.0.0" -authors = [{ name="antigenomics", email="elizaveta.k.vlasova@gmail.com" }] +version = "0.1.3" +authors = [ + { name="VEK239", email="elizaveta.k.vlasova@gmail.com" }, + { name="mikessh", email="mikhail.shugay@gmail.com"}, +] description = "A library for managing and analyzing immunosequencing data" readme = "README.md" requires-python = ">=3.11" classifiers = [ "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", + "License :: OSI Approved :: GPL-3.0 license", "Operating System :: OS Independent", ] dependencies = [ @@ -32,9 +35,10 @@ dependencies = [ "scikit-learn", "textdistance", "plotnine", + "stringzilla", "tcrtrie @ git+https://github.com/MikePodsytnik/TCRtrie@0.1.2-tcrtriepy", ] -license = {text = "MIT License"} +license = {text = "GPL-3.0 license"} [project.urls] Homepage = "https://github.com/antigenomics/mirpy" 
@@ -43,3 +47,4 @@ Issues = "https://github.com/antigenomics/mirpy/issues" [tool.scikit-build] wheel.packages = ["mir"] logging.level = "INFO" + diff --git a/quick_setup.sh b/quick_setup.sh new file mode 100644 index 0000000..bc0252b --- /dev/null +++ b/quick_setup.sh @@ -0,0 +1,4 @@ +python3 -m venv venv +. venv/bin/activate.fish +export CMAKE_POLICY_VERSION_MINIMUM=3.5 && pip install . +pip install pytest pylint \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 8f3e88c..554849f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,7 @@ statsmodels==0.14.1 textdistance==4.5.0 tcrtrie @ git+https://github.com/MikePodsytnik/TCRtrie@0.1.2-tcrtriepy tqdm==4.66.2 +stringzilla==4.6.0 +stringutils==1.0.6 umap-learn==0.5.3 +pybind11==2.11.0 From 5f4e0b23cfa6ed8dfe5c12721c021e23b3bace9a Mon Sep 17 00:00:00 2001 From: mikessh Date: Fri, 10 Apr 2026 07:06:43 +0300 Subject: [PATCH 02/24] wip --- mir/basic/sequence.py | 386 +++++++++++++++++++++++++++++++++++++++++ test/sequence_tests.py | 119 +++++++++++++ 2 files changed, 505 insertions(+) create mode 100644 mir/basic/sequence.py create mode 100644 test/sequence_tests.py diff --git a/mir/basic/sequence.py b/mir/basic/sequence.py new file mode 100644 index 0000000..9a8b3e0 --- /dev/null +++ b/mir/basic/sequence.py @@ -0,0 +1,386 @@ +"""Biological sequence types backed by NumPy byte arrays. + +This module defines alphabet-validated sequence classes for nucleotide and +amino-acid data. All sequences are stored as ``np.ndarray`` of dtype ``S1`` +(single-byte ASCII characters) so they can be operated on efficiently with +NumPy primitives and ``stringzilla``. + +Classes: + SequenceAlphabet -- Singleton alphabet definition. + AlphabetSequence -- Base class for alphabet-constrained sequences. + NucleotideSequence -- DNA sequence (A/T/G/C by default). + AminoAcidSequence -- Standard 20-AA + stop/unknown sequence. 
+ SimpleAminoAcidSequence -- Reduced amino-acid alphabet used for fuzzy + matching (groups physico-chemically similar AAs). +""" + +from __future__ import annotations + +import numpy as np +import stringzilla as sz +from typing import Self + + +#: Maps each standard amino-acid one-letter code (plus ``X``, ``*``, ``_``) +#: to a reduced symbol representing its physico-chemical class:: +#: +#: l aliphatic/hydrophobic (A, I, L, V) +#: b basic (R, H, K) +#: m amide (N, Q) +#: c acidic/charged (D, E) +#: s sulphur-containing (C, M) +#: h hydroxyl (S, T) +#: G glycine +#: F phenylalanine +#: P proline +#: W tryptophan +#: Y tyrosine +#: X unknown +#: * stop codon +#: _ gap +AMINO_ACID_TO_SIMPLE_AMINO_ACID: dict[str, str] = { + "A": "l", + "R": "b", + "N": "m", + "D": "c", + "C": "s", + "Q": "m", + "E": "c", + "G": "G", + "H": "b", + "I": "l", + "L": "l", + "K": "b", + "M": "s", + "F": "F", + "P": "P", + "S": "h", + "T": "h", + "W": "W", + "Y": "Y", + "V": "l", + "X": "X", + "*": "*", + "_": "_", +} + + +class SequenceAlphabet: + """Singleton-like immutable alphabet definition keyed by allowed symbols. + + Instances are cached by their ``allowed_symbols`` tuple so that two + ``SequenceAlphabet`` objects constructed with identical symbol sets are + guaranteed to be the *same* object (``is`` comparison holds). + + Attributes: + allowed_symbols (tuple[str, ...]): Immutable ordered collection of + single-character symbols that belong to this alphabet. + allowed_array (np.ndarray): ``S1``-dtype NumPy array of the same + symbols, pre-built for fast membership testing via ``np.isin``. 
+ """ + + _instances: dict[tuple[str, ...], "SequenceAlphabet"] = {} + + def __new__(cls, allowed_symbols: tuple[str, ...]) -> "SequenceAlphabet": + """Return the cached instance for *allowed_symbols*, creating it if needed.""" + key = tuple(allowed_symbols) + if key not in cls._instances: + instance = super().__new__(cls) + cls._instances[key] = instance + return cls._instances[key] + + def __init__(self, allowed_symbols: tuple[str, ...]) -> None: + """Initialise the alphabet (no-op when the cached instance already exists). + + Args: + allowed_symbols: Ordered tuple of single-character strings that + define the legal symbol set for this alphabet. + """ + if hasattr(self, "allowed_symbols"): + return + self.allowed_symbols = tuple(allowed_symbols) + self.allowed_array = np.array([c.encode("ascii") for c in self.allowed_symbols], dtype="S1") + + +#: Pre-built :class:`SequenceAlphabet` for the reduced amino-acid symbol set +#: derived from :data:`AMINO_ACID_TO_SIMPLE_AMINO_ACID`. +SIMPLE_AMINO_ACID_ALPHABET = SequenceAlphabet(tuple(dict.fromkeys(AMINO_ACID_TO_SIMPLE_AMINO_ACID.values()))) + + +class AlphabetSequence: + """Compact alphabet-validated sequence backed by a NumPy array. + + This is the abstract base class for all concrete sequence types in this + module. Direct instantiation is allowed but callers should prefer one of + the concrete subclasses (:class:`NucleotideSequence`, + :class:`AminoAcidSequence`, :class:`SimpleAminoAcidSequence`) which + provide a sensible default alphabet. + + Attributes: + content (np.ndarray): One-dimensional ``S1``-dtype array storing the + sequence as individual ASCII bytes. + alphabet (SequenceAlphabet): The alphabet that ``content`` was + validated against. + """ + + def __init__(self, content: np.ndarray, alphabet: SequenceAlphabet) -> None: + """Construct a validated sequence from an existing byte array. + + Args: + content: One-dimensional NumPy array with ``dtype='S1'``. + alphabet: Alphabet to validate *content* against. 
+ + Raises: + ValueError: If *content* does not have dtype ``S1``, is not + one-dimensional, or contains symbols absent from *alphabet*. + """ + if content.dtype != np.dtype("S1"): + raise ValueError("Sequence storage must have dtype S1") + if content.ndim != 1: + raise ValueError("Sequence storage must be one-dimensional") + + if not np.isin(content, alphabet.allowed_array).all(): + raise ValueError("Sequence contains symbols outside of alphabet") + + self.content = content + self.alphabet = alphabet + + @classmethod + def from_string(cls: type[Self], sequence: str, alphabet: SequenceAlphabet) -> Self: + """Create an instance by parsing a plain Python string. + + Args: + sequence: String whose characters must all belong to *alphabet*. + alphabet: Alphabet to validate the sequence against. + + Returns: + A new instance of the calling class backed by a freshly allocated + ``S1`` NumPy array. + + Raises: + ValueError: If any character in *sequence* is outside *alphabet*. + """ + sz_sequence = sz.Str(sequence) + sequence_bytes = bytes(sz_sequence) + array = np.frombuffer(memoryview(sequence_bytes), dtype="S1").copy() + return cls(array, alphabet) + + def to_string(self) -> str: + """Decode the byte array back to a plain Python string. + + Returns: + The sequence as a ``str``. + """ + return str(sz.Str(self.content.tobytes())) + + def substring(self, start: int, stop: int | None = None) -> "AlphabetSequence": + """Return a validated slice of this sequence. + + Uses the same slicing semantics as Python built-in strings: *start* is + inclusive, *stop* is exclusive, and ``None`` means "to the end". + + Args: + start: Index of the first character to include (0-based). + stop: Index of the first character to *exclude*, or ``None`` to + slice through to the end of the sequence. + + Returns: + A new instance of the same concrete class containing the requested + subsequence with the same alphabet. 
+ """ + view = sz.Str(self.content.tobytes()) + part = view[start:stop] + part_bytes = bytes(part) + sub_array = np.frombuffer(memoryview(part_bytes), dtype="S1").copy() + return self.__class__(sub_array, self.alphabet) + + def __len__(self) -> int: + """Return the number of characters in the sequence.""" + return int(self.content.shape[0]) + + +class NucleotideSequence(AlphabetSequence): + """A DNA nucleotide sequence restricted to the standard four-base alphabet. + + The default alphabet is ``("A", "T", "G", "C")``. A custom + :class:`SequenceAlphabet` may be supplied to support, for example, + ambiguity codes. + + Class Attributes: + DEFAULT_ALPHABET (SequenceAlphabet): Standard DNA alphabet ``{A, T, G, C}``. + """ + + DEFAULT_ALPHABET = SequenceAlphabet(("A", "T", "G", "C")) + + def __init__(self, content: np.ndarray, alphabet: SequenceAlphabet = DEFAULT_ALPHABET) -> None: + """Construct a nucleotide sequence from a byte array. + + Args: + content: One-dimensional ``S1``-dtype NumPy array. + alphabet: Alphabet to validate against (defaults to ``{A, T, G, C}``). + """ + super().__init__(content, alphabet) + + @classmethod + def from_string( + cls: type[Self], + sequence: str, + alphabet: SequenceAlphabet = DEFAULT_ALPHABET, + ) -> Self: + """Create a :class:`NucleotideSequence` from a plain string. + + Args: + sequence: DNA string (e.g. ``"ATCG"``). All characters must + belong to *alphabet*. + alphabet: Defaults to the standard four-base DNA alphabet. + + Returns: + A new :class:`NucleotideSequence` instance. + + Raises: + ValueError: If *sequence* contains characters outside *alphabet*. + """ + sz_sequence = sz.Str(sequence) + sequence_bytes = bytes(sz_sequence) + array = np.frombuffer(memoryview(sequence_bytes), dtype="S1").copy() + return cls(array, alphabet) + + +class AminoAcidSequence(AlphabetSequence): + """A standard amino-acid sequence using the 20-letter IUPAC alphabet. 
+ + In addition to the 20 canonical amino acids the alphabet includes: + + * ``*`` — stop codon + * ``_`` — gap + * ``X`` — unknown / any amino acid + + Class Attributes: + DEFAULT_ALPHABET (SequenceAlphabet): The 20 canonical AAs plus + ``*``, ``_``, and ``X``. + """ + + DEFAULT_ALPHABET = SequenceAlphabet( + ( + "A", "C", "D", "E", "F", "G", "H", "I", "K", "L", + "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", + "*", "_", "X", + ) + ) + + def __init__(self, content: np.ndarray, alphabet: SequenceAlphabet = DEFAULT_ALPHABET) -> None: + """Construct an amino-acid sequence from a byte array. + + Args: + content: One-dimensional ``S1``-dtype NumPy array. + alphabet: Alphabet to validate against (defaults to the standard + 20-AA + stop/gap/unknown alphabet). + """ + super().__init__(content, alphabet) + + @classmethod + def from_string( + cls: type[Self], + sequence: str, + alphabet: SequenceAlphabet = DEFAULT_ALPHABET, + ) -> Self: + """Create an :class:`AminoAcidSequence` from a plain string. + + Args: + sequence: Amino-acid string in single-letter code + (e.g. ``"CASSLAPGATNEKLFF"``). All characters must belong to + *alphabet*. + alphabet: Defaults to the standard amino-acid alphabet. + + Returns: + A new :class:`AminoAcidSequence` instance. + + Raises: + ValueError: If *sequence* contains characters outside *alphabet*. + """ + sz_sequence = sz.Str(sequence) + sequence_bytes = bytes(sz_sequence) + array = np.frombuffer(memoryview(sequence_bytes), dtype="S1").copy() + return cls(array, alphabet) + + def to_simple_amino_acid(self) -> "SimpleAminoAcidSequence": + """Convert to a :class:`SimpleAminoAcidSequence` using physico-chemical grouping. + + Each amino acid is mapped to a reduced symbol according to + :data:`AMINO_ACID_TO_SIMPLE_AMINO_ACID`. + + Returns: + A :class:`SimpleAminoAcidSequence` of the same length whose + symbols represent physico-chemical classes. 
+ """ + converted = "".join(AMINO_ACID_TO_SIMPLE_AMINO_ACID[s] for s in self.to_string()) + return SimpleAminoAcidSequence.from_string(converted) + + def matches_simple_amino_acid(self, simple_sequence: "SimpleAminoAcidSequence") -> bool: + """Check whether this sequence maps to a given simple amino-acid sequence. + + Converts ``self`` to the reduced alphabet and compares byte-for-byte + with *simple_sequence*. + + Args: + simple_sequence: A :class:`SimpleAminoAcidSequence` to compare + against. + + Returns: + ``True`` if the reduced representation of this sequence equals + *simple_sequence*, ``False`` otherwise. + """ + return self.to_simple_amino_acid().content.tobytes() == simple_sequence.content.tobytes() + + +class SimpleAminoAcidSequence(AlphabetSequence): + """An amino-acid sequence encoded in the reduced physico-chemical alphabet. + + Symbols are those produced by :data:`AMINO_ACID_TO_SIMPLE_AMINO_ACID`: + ``l``, ``b``, ``m``, ``c``, ``s``, ``h``, ``G``, ``F``, ``P``, ``W``, + ``Y``, ``X``, ``*``, ``_``. + + Instances are typically obtained via + :meth:`AminoAcidSequence.to_simple_amino_acid` rather than constructed + directly. + + Class Attributes: + DEFAULT_ALPHABET (SequenceAlphabet): :data:`SIMPLE_AMINO_ACID_ALPHABET`. + """ + + DEFAULT_ALPHABET = SIMPLE_AMINO_ACID_ALPHABET + + def __init__(self, content: np.ndarray, alphabet: SequenceAlphabet = DEFAULT_ALPHABET) -> None: + """Construct a simple amino-acid sequence from a byte array. + + Args: + content: One-dimensional ``S1``-dtype NumPy array. + alphabet: Alphabet to validate against (defaults to + :data:`SIMPLE_AMINO_ACID_ALPHABET`). + """ + super().__init__(content, alphabet) + + @classmethod + def from_string( + cls: type[Self], + sequence: str, + alphabet: SequenceAlphabet = DEFAULT_ALPHABET, + ) -> Self: + """Create a :class:`SimpleAminoAcidSequence` from a plain string. + + Args: + sequence: String using the reduced physico-chemical symbols + (e.g. ``"slhhllGGlhmcbllW"``). 
All characters must belong + to *alphabet*. + alphabet: Defaults to :data:`SIMPLE_AMINO_ACID_ALPHABET`. + + Returns: + A new :class:`SimpleAminoAcidSequence` instance. + + Raises: + ValueError: If *sequence* contains characters outside *alphabet*. + """ + sz_sequence = sz.Str(sequence) + sequence_bytes = bytes(sz_sequence) + array = np.frombuffer(memoryview(sequence_bytes), dtype="S1").copy() + return cls(array, alphabet) diff --git a/test/sequence_tests.py b/test/sequence_tests.py new file mode 100644 index 0000000..2a86f2d --- /dev/null +++ b/test/sequence_tests.py @@ -0,0 +1,119 @@ +"""Unit tests for :mod:`mir.basic.sequence`. + +Coverage: + SequenceAlphabet -- singleton caching behaviour. + AlphabetSequence -- construction, round-trip string conversion, + substring slicing, length, and alphabet + rejection. + NucleotideSequence -- DNA string parsing and slicing. + AminoAcidSequence -- protein string parsing, slicing, and + conversion to the reduced alphabet. + SimpleAminoAcidSequence -- reduced-alphabet string parsing and + slicing. +""" + +import unittest + +import numpy as np + +from mir.basic.sequence import ( + AminoAcidSequence, + NucleotideSequence, + SequenceAlphabet, + SimpleAminoAcidSequence, +) + + +class TestAlphabetSequence(unittest.TestCase): + """Tests for :class:`~mir.basic.sequence.AlphabetSequence` and its subclasses.""" + + def test_create_convert_and_substring(self) -> None: + """Sequences round-trip through ``from_string`` / ``to_string`` and slice correctly. + + Verifies that: + * ``DEFAULT_ALPHABET`` is the singleton cached by :class:`SequenceAlphabet`. + * The ``S1`` dtype is preserved after construction. + * ``to_string`` reconstructs the original string exactly. + * ``substring(start, stop)`` returns the expected subsequence for both + :class:`NucleotideSequence` and :class:`AminoAcidSequence`. 
+ """ + self.assertIs(NucleotideSequence.DEFAULT_ALPHABET, SequenceAlphabet(("A", "T", "G", "C"))) + + nt = NucleotideSequence.from_string("ATTAGACA") + self.assertEqual(nt.to_string(), "ATTAGACA") + self.assertEqual(nt.content.dtype, np.dtype("S1")) + self.assertEqual(nt.content.tobytes(), b"ATTAGACA") + self.assertEqual(nt.substring(2, 6).to_string(), "TAGA") + + aa = AminoAcidSequence.from_string("CASSLAPGATNEKLFF") + self.assertEqual(aa.to_string(), "CASSLAPGATNEKLFF") + self.assertEqual(aa.substring(4, 9).to_string(), "LAPGA") + + def test_empty_or_invalid_sequence(self) -> None: + """Empty sequences are valid; out-of-alphabet characters raise ``ValueError``. + + Verifies that: + * An empty :class:`NucleotideSequence` and :class:`AminoAcidSequence` + have length 0 and round-trip to ``""``. + * ``substring(0, 0)`` on a non-empty sequence returns an empty sequence. + * Constructing a :class:`NucleotideSequence` from ``"ATU"`` raises + ``ValueError`` (``U`` is not in the DNA alphabet). + * Constructing an :class:`AminoAcidSequence` from ``"B"`` raises + ``ValueError`` (``B`` is not a standard amino acid). + """ + empty_nt = NucleotideSequence.from_string("") + self.assertEqual(len(empty_nt), 0) + self.assertEqual(empty_nt.to_string(), "") + + empty_aa = AminoAcidSequence.from_string("") + self.assertEqual(len(empty_aa), 0) + self.assertEqual(empty_aa.to_string(), "") + + self.assertEqual(NucleotideSequence.from_string("ATTAGACA").substring(0, 0).to_string(), "") + + with self.assertRaises(ValueError): + NucleotideSequence.from_string("ATU") + + with self.assertRaises(ValueError): + AminoAcidSequence.from_string("B") + + +class TestSimpleAminoAcidSequence(unittest.TestCase): + """Tests for :class:`~mir.basic.sequence.SimpleAminoAcidSequence` and AA conversion.""" + + def test_amino_acid_to_simple_conversion_and_match(self) -> None: + """``to_simple_amino_acid`` applies the physico-chemical grouping map correctly. 
+ + Verifies that: + * The reduced string produced by :meth:`AminoAcidSequence.to_simple_amino_acid` + matches the expected character-by-character mapping from + :data:`AMINO_ACID_TO_SIMPLE_AMINO_ACID`. + * :meth:`AminoAcidSequence.matches_simple_amino_acid` returns ``True`` + for the sequence's own reduced form and ``False`` for an altered one. + """ + aa: AminoAcidSequence = AminoAcidSequence.from_string("CASTIVGGLSQDKIVW") + simple = aa.to_simple_amino_acid() + + self.assertEqual(simple.to_string(), "slhhllGGlhmcbllW") + self.assertTrue(aa.matches_simple_amino_acid(simple)) + self.assertFalse(aa.matches_simple_amino_acid(SimpleAminoAcidSequence.from_string("slhhllGGlhmcbllY"))) + + def test_simple_substrings(self) -> None: + """``substring`` on a :class:`SimpleAminoAcidSequence` slices correctly. + + Verifies that: + * A half-open slice returns the expected subsequence. + * ``substring(start, None)`` slices through to the end of the sequence. + * An out-of-alphabet character (``Z``) raises ``ValueError``. 
+ """ + simple = SimpleAminoAcidSequence.from_string("slhhllGGlhmcbllW") + self.assertEqual(simple.substring(0, 4).to_string(), "slhh") + self.assertEqual(simple.substring(6, 8).to_string(), "GG") + self.assertEqual(simple.substring(11, None).to_string(), "cbllW") + + with self.assertRaises(ValueError): + SimpleAminoAcidSequence.from_string("Z") + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From d35dd96b395f2e5f5478f5d6af813e1e9cf3bcf0 Mon Sep 17 00:00:00 2001 From: mikessh Date: Fri, 10 Apr 2026 20:40:19 +0300 Subject: [PATCH 03/24] wip tests fixing --- .vscode/launch.json | 2 +- .vscode/settings.json | 5 ++--- requirements.txt | 1 + test/test_repertoires/test_meta.csv | 5 ----- tests/assets/meta.csv | 5 +++++ .../test_repertoire_1.csv => tests/assets/repertoire_1.csv | 0 .../test_repertoire_2.csv => tests/assets/repertoire_2.csv | 0 .../test_repertoire_3.csv => tests/assets/repertoire_3.csv | 0 .../test_repertoire_4.csv => tests/assets/repertoire_4.csv | 0 .../benchmakrs/memory_benchmark.py | 2 -- test/repertoire_tests.py => tests/test_repertoire.py | 4 ++-- test/segments_tests.py => tests/test_segments.py | 0 test/sequence_tests.py => tests/test_sequence.py | 0 13 files changed, 11 insertions(+), 13 deletions(-) delete mode 100644 test/test_repertoires/test_meta.csv create mode 100644 tests/assets/meta.csv rename test/test_repertoires/test_repertoire_1.csv => tests/assets/repertoire_1.csv (100%) rename test/test_repertoires/test_repertoire_2.csv => tests/assets/repertoire_2.csv (100%) rename test/test_repertoires/test_repertoire_3.csv => tests/assets/repertoire_3.csv (100%) rename test/test_repertoires/test_repertoire_4.csv => tests/assets/repertoire_4.csv (100%) rename test/memory_test/testing_mamory.py => tests/benchmakrs/memory_benchmark.py (97%) rename test/repertoire_tests.py => tests/test_repertoire.py (93%) rename test/segments_tests.py => tests/test_segments.py (100%) rename test/sequence_tests.py => tests/test_sequence.py 
(100%) diff --git a/.vscode/launch.json b/.vscode/launch.json index 306f58e..92390e4 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -6,7 +6,7 @@ "configurations": [ { "name": "Python: Current File", - "type": "python", + "type": "debugpy", "request": "launch", "program": "${file}", "console": "integratedTerminal", diff --git a/.vscode/settings.json b/.vscode/settings.json index e20447e..4143014 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,11 +1,10 @@ { "python.testing.pytestArgs": [ - "mir" + "tests" ], "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, "[python]": { "editor.defaultFormatter": "ms-python.python" - }, - "python.formatting.provider": "none" + } } \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 554849f..7845217 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ stringzilla==4.6.0 stringutils==1.0.6 umap-learn==0.5.3 pybind11==2.11.0 +multipy==0.16 diff --git a/test/test_repertoires/test_meta.csv b/test/test_repertoires/test_meta.csv deleted file mode 100644 index d609e58..0000000 --- a/test/test_repertoires/test_meta.csv +++ /dev/null @@ -1,5 +0,0 @@ -,sample_id,file_name,status -0,id_1,test_repertoire_1.csv,healthy -0,id_2,test_repertoire_2.csv,healthy -0,id_3,test_repertoire_3.csv,ill -0,id_4,test_repertoire_4.csv,ill \ No newline at end of file diff --git a/tests/assets/meta.csv b/tests/assets/meta.csv new file mode 100644 index 0000000..9773890 --- /dev/null +++ b/tests/assets/meta.csv @@ -0,0 +1,5 @@ +,sample_id,file_name,status +0,id_1,repertoire_1.csv,healthy +0,id_2,repertoire_2.csv,healthy +0,id_3,repertoire_3.csv,ill +0,id_4,repertoire_4.csv,ill \ No newline at end of file diff --git a/test/test_repertoires/test_repertoire_1.csv b/tests/assets/repertoire_1.csv similarity index 100% rename from test/test_repertoires/test_repertoire_1.csv rename to tests/assets/repertoire_1.csv diff --git 
a/test/test_repertoires/test_repertoire_2.csv b/tests/assets/repertoire_2.csv similarity index 100% rename from test/test_repertoires/test_repertoire_2.csv rename to tests/assets/repertoire_2.csv diff --git a/test/test_repertoires/test_repertoire_3.csv b/tests/assets/repertoire_3.csv similarity index 100% rename from test/test_repertoires/test_repertoire_3.csv rename to tests/assets/repertoire_3.csv diff --git a/test/test_repertoires/test_repertoire_4.csv b/tests/assets/repertoire_4.csv similarity index 100% rename from test/test_repertoires/test_repertoire_4.csv rename to tests/assets/repertoire_4.csv diff --git a/test/memory_test/testing_mamory.py b/tests/benchmakrs/memory_benchmark.py similarity index 97% rename from test/memory_test/testing_mamory.py rename to tests/benchmakrs/memory_benchmark.py index 6e349a7..aaa6c6b 100644 --- a/test/memory_test/testing_mamory.py +++ b/tests/benchmakrs/memory_benchmark.py @@ -12,7 +12,6 @@ from mir.common.repertoire import Repertoire from mir.common.parser import AIRRParser -# ======= Настройки ======= input_path = "/projects/immunestatus/pogorelyy/airr_format/P1_0_F1_with_1.txt" proto_path = "tcremp_prototypes_olga.tsv" species = ["HomoSapiens"] @@ -21,7 +20,6 @@ metric = "dissimilarity" nproc = 32 llen, hlen = 5, 30 -# ========================== def report_memory(stage): process = psutil.Process(os.getpid()) diff --git a/test/repertoire_tests.py b/tests/test_repertoire.py similarity index 93% rename from test/repertoire_tests.py rename to tests/test_repertoire.py index ac08548..574317f 100644 --- a/test/repertoire_tests.py +++ b/tests/test_repertoire.py @@ -10,11 +10,11 @@ class TestRepertoireDataset(unittest.TestCase): def setUp(self): - self.meta = pd.read_csv('test_repertoires/test_meta.csv') + self.meta = pd.read_csv('assets/test_meta.csv') self.rd = RepertoireDataset.load(parser=VDJtoolsParser(sep=','), metadata=self.meta, threads=1, - paths=[f'test_repertoires/{x}' for x in self.meta.file_name]) + 
paths=[f'assets/{x}' for x in self.meta.file_name]) self.ill_rd, self.healthy_rd = self.rd.split_by_metadata_function(splitting_method=lambda x: x.status == 'ill') diff --git a/test/segments_tests.py b/tests/test_segments.py similarity index 100% rename from test/segments_tests.py rename to tests/test_segments.py diff --git a/test/sequence_tests.py b/tests/test_sequence.py similarity index 100% rename from test/sequence_tests.py rename to tests/test_sequence.py From 19bfc61ef691b4ba9f2d75a20159cfb1601c8d79 Mon Sep 17 00:00:00 2001 From: mikessh Date: Fri, 10 Apr 2026 21:01:02 +0300 Subject: [PATCH 04/24] wip sequence mask --- mir/basic/sequence.py | 174 ++++++++++++++++++++++++++++++++--------- tests/test_sequence.py | 101 ++++++++++++++++++------ 2 files changed, 214 insertions(+), 61 deletions(-) diff --git a/mir/basic/sequence.py b/mir/basic/sequence.py index 9a8b3e0..5efe592 100644 --- a/mir/basic/sequence.py +++ b/mir/basic/sequence.py @@ -1,17 +1,17 @@ """Biological sequence types backed by NumPy byte arrays. This module defines alphabet-validated sequence classes for nucleotide and -amino-acid data. All sequences are stored as ``np.ndarray`` of dtype ``S1`` +amino-acid data. All sequences are stored as ``np.ndarray`` of dtype ``S1`` (single-byte ASCII characters) so they can be operated on efficiently with NumPy primitives and ``stringzilla``. Classes: - SequenceAlphabet -- Singleton alphabet definition. - AlphabetSequence -- Base class for alphabet-constrained sequences. - NucleotideSequence -- DNA sequence (A/T/G/C by default). - AminoAcidSequence -- Standard 20-AA + stop/unknown sequence. - SimpleAminoAcidSequence -- Reduced amino-acid alphabet used for fuzzy - matching (groups physico-chemically similar AAs). + SequenceAlphabet -- Singleton alphabet definition. + AlphabetSequence -- Base class for alphabet-constrained sequences. + NucleotideSequence -- DNA sequence (A/T/G/C/N by default). + AminoAcidSequence -- Standard 20-AA + stop/unknown sequence. 
+ ReducedAminoAcidSequence -- Reduced amino-acid alphabet used for fuzzy + matching (groups physico-chemically similar AAs). """ from __future__ import annotations @@ -38,7 +38,7 @@ #: X unknown #: * stop codon #: _ gap -AMINO_ACID_TO_SIMPLE_AMINO_ACID: dict[str, str] = { +AMINO_ACID_TO_REDUCED_AMINO_ACID: dict[str, str] = { "A": "l", "R": "b", "N": "m", @@ -102,9 +102,17 @@ def __init__(self, allowed_symbols: tuple[str, ...]) -> None: self.allowed_array = np.array([c.encode("ascii") for c in self.allowed_symbols], dtype="S1") +#: Backwards-compatible alias for older name. +AMINO_ACID_TO_SIMPLE_AMINO_ACID = AMINO_ACID_TO_REDUCED_AMINO_ACID + + #: Pre-built :class:`SequenceAlphabet` for the reduced amino-acid symbol set -#: derived from :data:`AMINO_ACID_TO_SIMPLE_AMINO_ACID`. -SIMPLE_AMINO_ACID_ALPHABET = SequenceAlphabet(tuple(dict.fromkeys(AMINO_ACID_TO_SIMPLE_AMINO_ACID.values()))) +#: derived from :data:`AMINO_ACID_TO_REDUCED_AMINO_ACID`. +REDUCED_AMINO_ACID_ALPHABET = SequenceAlphabet(tuple(dict.fromkeys(AMINO_ACID_TO_REDUCED_AMINO_ACID.values()))) + + +#: Backwards-compatible alias for older name. +SIMPLE_AMINO_ACID_ALPHABET = REDUCED_AMINO_ACID_ALPHABET class AlphabetSequence: @@ -116,6 +124,11 @@ class AlphabetSequence: :class:`AminoAcidSequence`, :class:`SimpleAminoAcidSequence`) which provide a sensible default alphabet. + Class Attributes: + MASK_SYMBOL (str | None): Symbol used by :meth:`mask`. Subclasses + that support masking define this symbol (``"N"`` for nucleotides, + ``"X"`` for amino-acid alphabets). + Attributes: content (np.ndarray): One-dimensional ``S1``-dtype array storing the sequence as individual ASCII bytes. @@ -123,6 +136,8 @@ class AlphabetSequence: validated against. """ + MASK_SYMBOL: str | None = None + def __init__(self, content: np.ndarray, alphabet: SequenceAlphabet) -> None: """Construct a validated sequence from an existing byte array. 
@@ -194,9 +209,73 @@ def substring(self, start: int, stop: int | None = None) -> "AlphabetSequence": sub_array = np.frombuffer(memoryview(part_bytes), dtype="S1").copy() return self.__class__(sub_array, self.alphabet) + def mask(self, position: int | slice | tuple[int, int]) -> Self: + """Return a copy with one position or a range replaced by mask symbol. + + Args: + position: Either a single integer index, a :class:`slice`, or a + ``(start, stop)`` tuple. ``stop`` is exclusive. + + Returns: + A new sequence with masked positions replaced by ``MASK_SYMBOL``. + + Raises: + ValueError: If this sequence type does not define ``MASK_SYMBOL``. + TypeError: If *position* has unsupported type. + IndexError: If integer position is out of bounds. + """ + if self.MASK_SYMBOL is None: + raise ValueError(f"Masking is not supported for {self.__class__.__name__}") + + masked = self.content.copy() + mask_byte = np.array(self.MASK_SYMBOL.encode("ascii"), dtype="S1") + + if isinstance(position, int): + if position < 0: + position += len(self) + if position < 0 or position >= len(self): + raise IndexError("Mask position out of range") + masked[position] = mask_byte + elif isinstance(position, slice): + masked[position] = mask_byte + elif isinstance(position, tuple) and len(position) == 2: + start, stop = position + masked[slice(start, stop)] = mask_byte + else: + raise TypeError("position must be int, slice, or (start, stop) tuple") + + return self.__class__(masked, self.alphabet) + + def _is_masked(self, symbol: np.bytes_) -> bool: + """Return whether a symbol is the sequence-specific mask marker.""" + if self.MASK_SYMBOL is None: + return False + return symbol == self.MASK_SYMBOL.encode("ascii") + + def matches(self, other: "AlphabetSequence") -> bool: + """Compare two sequences, treating mask characters as wildcards. + + Matching requires equal lengths. At each position, symbols match if + they are equal or if either symbol is masked. 
+ """ + if len(self) != len(other): + return False + + for left, right in zip(self.content, other.content): + if left == right: + continue + if self._is_masked(left) or other._is_masked(right): + continue + return False + return True + def __len__(self) -> int: """Return the number of characters in the sequence.""" return int(self.content.shape[0]) + + def __str__(self) -> str: + """Return a human-readable string representation of the sequence.""" + return self.to_string() class NucleotideSequence(AlphabetSequence): @@ -207,10 +286,11 @@ class NucleotideSequence(AlphabetSequence): ambiguity codes. Class Attributes: - DEFAULT_ALPHABET (SequenceAlphabet): Standard DNA alphabet ``{A, T, G, C}``. + DEFAULT_ALPHABET (SequenceAlphabet): Standard DNA alphabet ``{A, T, G, C, N}``. """ - DEFAULT_ALPHABET = SequenceAlphabet(("A", "T", "G", "C")) + MASK_SYMBOL = "N" + DEFAULT_ALPHABET = SequenceAlphabet(("A", "T", "G", "C", "N")) def __init__(self, content: np.ndarray, alphabet: SequenceAlphabet = DEFAULT_ALPHABET) -> None: """Construct a nucleotide sequence from a byte array. @@ -267,6 +347,7 @@ class AminoAcidSequence(AlphabetSequence): "*", "_", "X", ) ) + MASK_SYMBOL = "X" def __init__(self, content: np.ndarray, alphabet: SequenceAlphabet = DEFAULT_ALPHABET) -> None: """Construct an amino-acid sequence from a byte array. @@ -303,60 +384,78 @@ def from_string( array = np.frombuffer(memoryview(sequence_bytes), dtype="S1").copy() return cls(array, alphabet) - def to_simple_amino_acid(self) -> "SimpleAminoAcidSequence": - """Convert to a :class:`SimpleAminoAcidSequence` using physico-chemical grouping. + def to_reduced_amino_acid(self) -> "ReducedAminoAcidSequence": + """Convert to :class:`ReducedAminoAcidSequence` using physico-chemical grouping. Each amino acid is mapped to a reduced symbol according to :data:`AMINO_ACID_TO_SIMPLE_AMINO_ACID`. 
Returns: - A :class:`SimpleAminoAcidSequence` of the same length whose + A :class:`ReducedAminoAcidSequence` of the same length whose symbols represent physico-chemical classes. """ - converted = "".join(AMINO_ACID_TO_SIMPLE_AMINO_ACID[s] for s in self.to_string()) - return SimpleAminoAcidSequence.from_string(converted) + converted = "".join(AMINO_ACID_TO_REDUCED_AMINO_ACID[s] for s in self.to_string()) + return ReducedAminoAcidSequence.from_string(converted) + + def to_simple_amino_acid(self) -> "ReducedAminoAcidSequence": + """Backwards-compatible alias for :meth:`to_reduced_amino_acid`.""" + return self.to_reduced_amino_acid() - def matches_simple_amino_acid(self, simple_sequence: "SimpleAminoAcidSequence") -> bool: - """Check whether this sequence maps to a given simple amino-acid sequence. + def matches_reduced_amino_acid(self, reduced_sequence: "ReducedAminoAcidSequence") -> bool: + """Check whether this sequence matches a reduced amino-acid sequence. - Converts ``self`` to the reduced alphabet and compares byte-for-byte - with *simple_sequence*. + Matching is performed position-wise and ignores masked characters + (``X`` in either sequence). Args: - simple_sequence: A :class:`SimpleAminoAcidSequence` to compare + reduced_sequence: A :class:`ReducedAminoAcidSequence` to compare against. Returns: - ``True`` if the reduced representation of this sequence equals - *simple_sequence*, ``False`` otherwise. + ``True`` if all non-masked positions are compatible with the + reduced amino-acid mapping, ``False`` otherwise. 
""" - return self.to_simple_amino_acid().content.tobytes() == simple_sequence.content.tobytes() + if len(self) != len(reduced_sequence): + return False + for aa_symbol, reduced_symbol in zip(self.content, reduced_sequence.content): + if self._is_masked(aa_symbol) or reduced_sequence._is_masked(reduced_symbol): + continue + mapped = AMINO_ACID_TO_REDUCED_AMINO_ACID[aa_symbol.decode("ascii")].encode("ascii") + if mapped != reduced_symbol: + return False + return True -class SimpleAminoAcidSequence(AlphabetSequence): - """An amino-acid sequence encoded in the reduced physico-chemical alphabet. + def matches_simple_amino_acid(self, simple_sequence: "ReducedAminoAcidSequence") -> bool: + """Backwards-compatible alias for :meth:`matches_reduced_amino_acid`.""" + return self.matches_reduced_amino_acid(simple_sequence) + + +class ReducedAminoAcidSequence(AlphabetSequence): + """A sequence encoded in the reduced physico-chemical amino-acid alphabet. Symbols are those produced by :data:`AMINO_ACID_TO_SIMPLE_AMINO_ACID`: ``l``, ``b``, ``m``, ``c``, ``s``, ``h``, ``G``, ``F``, ``P``, ``W``, ``Y``, ``X``, ``*``, ``_``. Instances are typically obtained via - :meth:`AminoAcidSequence.to_simple_amino_acid` rather than constructed + :meth:`AminoAcidSequence.to_reduced_amino_acid` rather than constructed directly. Class Attributes: - DEFAULT_ALPHABET (SequenceAlphabet): :data:`SIMPLE_AMINO_ACID_ALPHABET`. + DEFAULT_ALPHABET (SequenceAlphabet): :data:`REDUCED_AMINO_ACID_ALPHABET`. """ - DEFAULT_ALPHABET = SIMPLE_AMINO_ACID_ALPHABET + DEFAULT_ALPHABET = REDUCED_AMINO_ACID_ALPHABET + MASK_SYMBOL = "X" def __init__(self, content: np.ndarray, alphabet: SequenceAlphabet = DEFAULT_ALPHABET) -> None: - """Construct a simple amino-acid sequence from a byte array. + """Construct a reduced amino-acid sequence from a byte array. Args: content: One-dimensional ``S1``-dtype NumPy array. alphabet: Alphabet to validate against (defaults to - :data:`SIMPLE_AMINO_ACID_ALPHABET`). 
+ :data:`REDUCED_AMINO_ACID_ALPHABET`). """ super().__init__(content, alphabet) @@ -366,16 +465,16 @@ def from_string( sequence: str, alphabet: SequenceAlphabet = DEFAULT_ALPHABET, ) -> Self: - """Create a :class:`SimpleAminoAcidSequence` from a plain string. + """Create a :class:`ReducedAminoAcidSequence` from a plain string. Args: sequence: String using the reduced physico-chemical symbols (e.g. ``"slhhllGGlhmcbllW"``). All characters must belong to *alphabet*. - alphabet: Defaults to :data:`SIMPLE_AMINO_ACID_ALPHABET`. + alphabet: Defaults to :data:`REDUCED_AMINO_ACID_ALPHABET`. Returns: - A new :class:`SimpleAminoAcidSequence` instance. + A new :class:`ReducedAminoAcidSequence` instance. Raises: ValueError: If *sequence* contains characters outside *alphabet*. @@ -384,3 +483,8 @@ def from_string( sequence_bytes = bytes(sz_sequence) array = np.frombuffer(memoryview(sequence_bytes), dtype="S1").copy() return cls(array, alphabet) + + + +#: Backwards-compatible class alias. +SimpleAminoAcidSequence = ReducedAminoAcidSequence diff --git a/tests/test_sequence.py b/tests/test_sequence.py index 2a86f2d..1ab6170 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -6,10 +6,10 @@ substring slicing, length, and alphabet rejection. NucleotideSequence -- DNA string parsing and slicing. - AminoAcidSequence -- protein string parsing, slicing, and - conversion to the reduced alphabet. - SimpleAminoAcidSequence -- reduced-alphabet string parsing and - slicing. + AminoAcidSequence -- protein string parsing, slicing, reduction, + and mask-aware matching. + ReducedAminoAcidSequence -- reduced-alphabet parsing, slicing, + masking, and matching. 
""" import unittest @@ -19,8 +19,8 @@ from mir.basic.sequence import ( AminoAcidSequence, NucleotideSequence, + ReducedAminoAcidSequence, SequenceAlphabet, - SimpleAminoAcidSequence, ) @@ -37,7 +37,7 @@ def test_create_convert_and_substring(self) -> None: * ``substring(start, stop)`` returns the expected subsequence for both :class:`NucleotideSequence` and :class:`AminoAcidSequence`. """ - self.assertIs(NucleotideSequence.DEFAULT_ALPHABET, SequenceAlphabet(("A", "T", "G", "C"))) + self.assertIs(NucleotideSequence.DEFAULT_ALPHABET, SequenceAlphabet(("A", "T", "G", "C", "N"))) nt = NucleotideSequence.from_string("ATTAGACA") self.assertEqual(nt.to_string(), "ATTAGACA") @@ -71,48 +71,97 @@ def test_empty_or_invalid_sequence(self) -> None: self.assertEqual(NucleotideSequence.from_string("ATTAGACA").substring(0, 0).to_string(), "") + self.assertEqual(NucleotideSequence.from_string("ATN").to_string(), "ATN") + with self.assertRaises(ValueError): - NucleotideSequence.from_string("ATU") + NucleotideSequence.from_string("ATU") with self.assertRaises(ValueError): AminoAcidSequence.from_string("B") -class TestSimpleAminoAcidSequence(unittest.TestCase): - """Tests for :class:`~mir.basic.sequence.SimpleAminoAcidSequence` and AA conversion.""" +class TestReducedAminoAcidSequence(unittest.TestCase): + """Tests for :class:`~mir.basic.sequence.ReducedAminoAcidSequence` and AA conversion.""" - def test_amino_acid_to_simple_conversion_and_match(self) -> None: - """``to_simple_amino_acid`` applies the physico-chemical grouping map correctly. + def test_amino_acid_to_reduced_conversion_and_match(self) -> None: + """Reduced conversion and matching respect mapping and masks. Verifies that: - * The reduced string produced by :meth:`AminoAcidSequence.to_simple_amino_acid` + * The reduced string produced by :meth:`AminoAcidSequence.to_reduced_amino_acid` matches the expected character-by-character mapping from - :data:`AMINO_ACID_TO_SIMPLE_AMINO_ACID`. 
- * :meth:`AminoAcidSequence.matches_simple_amino_acid` returns ``True`` - for the sequence's own reduced form and ``False`` for an altered one. + :data:`AMINO_ACID_TO_REDUCED_AMINO_ACID`. + * :meth:`AminoAcidSequence.matches_reduced_amino_acid` returns ``True`` + for the sequence's own reduced form and ``False`` for an altered one + at an unmasked position. """ aa: AminoAcidSequence = AminoAcidSequence.from_string("CASTIVGGLSQDKIVW") - simple = aa.to_simple_amino_acid() + reduced = aa.to_reduced_amino_acid() + + self.assertEqual(reduced.to_string(), "slhhllGGlhmcbllW") + self.assertTrue(aa.matches_reduced_amino_acid(reduced)) + self.assertFalse(aa.matches_reduced_amino_acid(ReducedAminoAcidSequence.from_string("slhhllGGlhmcbllY"))) + + masked_aa = aa.mask(2) + self.assertTrue(masked_aa.matches_reduced_amino_acid(reduced)) + + masked_reduced = reduced.mask((2, 5)) + self.assertTrue(aa.matches_reduced_amino_acid(masked_reduced)) - self.assertEqual(simple.to_string(), "slhhllGGlhmcbllW") - self.assertTrue(aa.matches_simple_amino_acid(simple)) - self.assertFalse(aa.matches_simple_amino_acid(SimpleAminoAcidSequence.from_string("slhhllGGlhmcbllY"))) + def test_aa_to_reduced_backwards_compatible_aliases(self) -> None: + """Legacy simple-amino-acid aliases keep working.""" + aa = AminoAcidSequence.from_string("CAST") + reduced = aa.to_simple_amino_acid() + self.assertIsInstance(reduced, ReducedAminoAcidSequence) + self.assertTrue(aa.matches_simple_amino_acid(reduced)) - def test_simple_substrings(self) -> None: - """``substring`` on a :class:`SimpleAminoAcidSequence` slices correctly. + def test_reduced_substrings(self) -> None: + """``substring`` on a :class:`ReducedAminoAcidSequence` slices correctly. Verifies that: * A half-open slice returns the expected subsequence. * ``substring(start, None)`` slices through to the end of the sequence. * An out-of-alphabet character (``Z``) raises ``ValueError``. 
""" - simple = SimpleAminoAcidSequence.from_string("slhhllGGlhmcbllW") - self.assertEqual(simple.substring(0, 4).to_string(), "slhh") - self.assertEqual(simple.substring(6, 8).to_string(), "GG") - self.assertEqual(simple.substring(11, None).to_string(), "cbllW") + reduced = ReducedAminoAcidSequence.from_string("slhhllGGlhmcbllW") + self.assertEqual(reduced.substring(0, 4).to_string(), "slhh") + self.assertEqual(reduced.substring(6, 8).to_string(), "GG") + self.assertEqual(reduced.substring(11, None).to_string(), "cbllW") with self.assertRaises(ValueError): - SimpleAminoAcidSequence.from_string("Z") + ReducedAminoAcidSequence.from_string("Z") + + +class TestMaskAndMatch(unittest.TestCase): + """Tests for masking and wildcard-aware matching.""" + + def test_nucleotide_mask_single_and_range(self) -> None: + seq = NucleotideSequence.from_string("ATCGAT") + self.assertEqual(seq.mask(1).to_string(), "ANCGAT") + self.assertEqual(seq.mask((2, 5)).to_string(), "ATNNNT") + self.assertEqual(seq.mask(slice(0, 3)).to_string(), "NNNGAT") + + def test_amino_and_reduced_mask_single_and_range(self) -> None: + aa = AminoAcidSequence.from_string("CASTIV") + reduced = ReducedAminoAcidSequence.from_string("slhhll") + self.assertEqual(aa.mask(0).to_string(), "XASTIV") + self.assertEqual(aa.mask((1, 4)).to_string(), "CXXXIV") + self.assertEqual(reduced.mask(slice(2, 5)).to_string(), "slXXXl") + + def test_sequence_matching_ignores_mask_symbols(self) -> None: + nt1 = NucleotideSequence.from_string("ATCG") + nt2 = NucleotideSequence.from_string("ANNG") + self.assertTrue(nt1.matches(nt2)) + self.assertFalse(nt1.matches(NucleotideSequence.from_string("ANNA"))) + + aa1 = AminoAcidSequence.from_string("CAST") + aa2 = AminoAcidSequence.from_string("XASX") + self.assertTrue(aa1.matches(aa2)) + self.assertFalse(aa1.matches(AminoAcidSequence.from_string("XATX"))) + + red1 = ReducedAminoAcidSequence.from_string("slhh") + red2 = ReducedAminoAcidSequence.from_string("sXXh") + 
self.assertTrue(red1.matches(red2)) + self.assertFalse(red1.matches(ReducedAminoAcidSequence.from_string("sXXY"))) if __name__ == "__main__": From acd22b9cfcd08d6495ba674c629db584a8b3f17f Mon Sep 17 00:00:00 2001 From: mikessh Date: Fri, 10 Apr 2026 22:58:01 +0300 Subject: [PATCH 05/24] wip sequence --- mir/basic/sequence.py | 555 ++++++++++++++--------------------------- tests/test_sequence.py | 186 ++++++++------ 2 files changed, 300 insertions(+), 441 deletions(-) diff --git a/mir/basic/sequence.py b/mir/basic/sequence.py index 5efe592..3aa0ae9 100644 --- a/mir/basic/sequence.py +++ b/mir/basic/sequence.py @@ -1,490 +1,303 @@ -"""Biological sequence types backed by NumPy byte arrays. +"""Biological sequence types backed by immutable NumPy byte arrays. -This module defines alphabet-validated sequence classes for nucleotide and -amino-acid data. All sequences are stored as ``np.ndarray`` of dtype ``S1`` -(single-byte ASCII characters) so they can be operated on efficiently with -NumPy primitives and ``stringzilla``. +Sequence objects are lightweight and immutable: each instance stores only a +read-only ``np.ndarray`` of dtype ``S1``. The alphabet is validated on +construction and is defined at the class level (``DEFAULT_ALPHABET``). + +**Equality vs. matching:** ``__eq__`` and ``__hash__`` compare raw byte +content, so sequences work correctly as dictionary keys and set members. +The :meth:`~AlphabetSequence.matches` method is different: it treats mask +characters (``N`` for nucleotides, ``X`` for amino-acid alphabets) as +wildcards, so two sequences that *match* may not be *equal*. Classes: SequenceAlphabet -- Singleton alphabet definition. AlphabetSequence -- Base class for alphabet-constrained sequences. - NucleotideSequence -- DNA sequence (A/T/G/C/N by default). + NucleotideSequence -- DNA sequence (A/T/G/C/N). AminoAcidSequence -- Standard 20-AA + stop/unknown sequence. 
- ReducedAminoAcidSequence -- Reduced amino-acid alphabet used for fuzzy - matching (groups physico-chemically similar AAs). + ReducedAminoAcidSequence -- Reduced amino-acid alphabet for fuzzy matching. """ from __future__ import annotations import numpy as np -import stringzilla as sz from typing import Self +# --------------------------------------------------------------------------- +# Amino-acid → reduced-alphabet mapping +# --------------------------------------------------------------------------- + #: Maps each standard amino-acid one-letter code (plus ``X``, ``*``, ``_``) -#: to a reduced symbol representing its physico-chemical class:: -#: -#: l aliphatic/hydrophobic (A, I, L, V) -#: b basic (R, H, K) -#: m amide (N, Q) -#: c acidic/charged (D, E) -#: s sulphur-containing (C, M) -#: h hydroxyl (S, T) -#: G glycine -#: F phenylalanine -#: P proline -#: W tryptophan -#: Y tyrosine -#: X unknown -#: * stop codon -#: _ gap +#: to a reduced symbol representing its physico-chemical class. AMINO_ACID_TO_REDUCED_AMINO_ACID: dict[str, str] = { - "A": "l", - "R": "b", - "N": "m", - "D": "c", - "C": "s", - "Q": "m", - "E": "c", - "G": "G", - "H": "b", - "I": "l", - "L": "l", - "K": "b", - "M": "s", - "F": "F", - "P": "P", - "S": "h", - "T": "h", - "W": "W", - "Y": "Y", - "V": "l", - "X": "X", - "*": "*", - "_": "_", + "A": "l", "R": "b", "N": "m", "D": "c", "C": "s", "Q": "m", + "E": "c", "G": "G", "H": "b", "I": "l", "L": "l", "K": "b", + "M": "s", "F": "F", "P": "P", "S": "h", "T": "h", "W": "W", + "Y": "Y", "V": "l", "X": "X", "*": "*", "_": "_", } +#: Byte lookup table (128 entries, indexed by ASCII ordinal) for converting +#: amino-acid bytes to reduced-alphabet bytes without string intermediaries. 
+_AA_TO_REDUCED_LUT = np.zeros(128, dtype=np.uint8) +for _aa, _red in AMINO_ACID_TO_REDUCED_AMINO_ACID.items(): + _AA_TO_REDUCED_LUT[ord(_aa)] = ord(_red) + + +# --------------------------------------------------------------------------- +# Alphabet +# --------------------------------------------------------------------------- class SequenceAlphabet: - """Singleton-like immutable alphabet definition keyed by allowed symbols. + """Singleton immutable alphabet keyed by allowed symbols. - Instances are cached by their ``allowed_symbols`` tuple so that two - ``SequenceAlphabet`` objects constructed with identical symbol sets are - guaranteed to be the *same* object (``is`` comparison holds). + Instances are cached so that two objects with identical symbol sets are + the *same* object (``is`` holds). Attributes: - allowed_symbols (tuple[str, ...]): Immutable ordered collection of - single-character symbols that belong to this alphabet. - allowed_array (np.ndarray): ``S1``-dtype NumPy array of the same - symbols, pre-built for fast membership testing via ``np.isin``. + allowed_symbols: Ordered tuple of single-character symbols. + allowed_array: ``S1``-dtype NumPy array for fast ``np.isin`` tests. """ - _instances: dict[tuple[str, ...], "SequenceAlphabet"] = {} + _instances: dict[tuple[str, ...], SequenceAlphabet] = {} - def __new__(cls, allowed_symbols: tuple[str, ...]) -> "SequenceAlphabet": - """Return the cached instance for *allowed_symbols*, creating it if needed.""" + def __new__(cls, allowed_symbols: tuple[str, ...]) -> SequenceAlphabet: key = tuple(allowed_symbols) if key not in cls._instances: - instance = super().__new__(cls) - cls._instances[key] = instance + inst = super().__new__(cls) + cls._instances[key] = inst return cls._instances[key] def __init__(self, allowed_symbols: tuple[str, ...]) -> None: - """Initialise the alphabet (no-op when the cached instance already exists). 
- - Args: - allowed_symbols: Ordered tuple of single-character strings that - define the legal symbol set for this alphabet. - """ if hasattr(self, "allowed_symbols"): return self.allowed_symbols = tuple(allowed_symbols) - self.allowed_array = np.array([c.encode("ascii") for c in self.allowed_symbols], dtype="S1") - - -#: Backwards-compatible alias for older name. -AMINO_ACID_TO_SIMPLE_AMINO_ACID = AMINO_ACID_TO_REDUCED_AMINO_ACID - + self.allowed_array = np.array( + [c.encode("ascii") for c in self.allowed_symbols], dtype="S1", + ) -#: Pre-built :class:`SequenceAlphabet` for the reduced amino-acid symbol set -#: derived from :data:`AMINO_ACID_TO_REDUCED_AMINO_ACID`. -REDUCED_AMINO_ACID_ALPHABET = SequenceAlphabet(tuple(dict.fromkeys(AMINO_ACID_TO_REDUCED_AMINO_ACID.values()))) +REDUCED_AMINO_ACID_ALPHABET = SequenceAlphabet( + tuple(dict.fromkeys(AMINO_ACID_TO_REDUCED_AMINO_ACID.values())) +) -#: Backwards-compatible alias for older name. -SIMPLE_AMINO_ACID_ALPHABET = REDUCED_AMINO_ACID_ALPHABET +# --------------------------------------------------------------------------- +# Base sequence +# --------------------------------------------------------------------------- class AlphabetSequence: - """Compact alphabet-validated sequence backed by a NumPy array. + """Immutable alphabet-validated sequence backed by a read-only byte array. - This is the abstract base class for all concrete sequence types in this - module. Direct instantiation is allowed but callers should prefer one of - the concrete subclasses (:class:`NucleotideSequence`, - :class:`AminoAcidSequence`, :class:`SimpleAminoAcidSequence`) which - provide a sensible default alphabet. + Each instance stores **only** a read-only ``np.ndarray`` of dtype ``S1``. + The alphabet and mask symbol live on the class, not on instances. - Class Attributes: - MASK_SYMBOL (str | None): Symbol used by :meth:`mask`. 
Subclasses - that support masking define this symbol (``"N"`` for nucleotides, - ``"X"`` for amino-acid alphabets). + **Equality vs matching:** + ``__eq__`` / ``__hash__`` compare raw bytes (for ``dict`` / ``set`` use). + :meth:`matches` performs wildcard-aware comparison where mask characters + (``N`` for nucleotides, ``X`` for amino-acid types) count as matching + any symbol — so two sequences can *match* without being *equal*. - Attributes: - content (np.ndarray): One-dimensional ``S1``-dtype array storing the - sequence as individual ASCII bytes. - alphabet (SequenceAlphabet): The alphabet that ``content`` was - validated against. + Subclass protocol: + * ``DEFAULT_ALPHABET`` — :class:`SequenceAlphabet` with allowed symbols. + * ``_MASK_BYTE`` — ``b"N"``, ``b"X"``, or ``b""`` (no masking). """ - MASK_SYMBOL: str | None = None + __slots__ = ("_data",) - def __init__(self, content: np.ndarray, alphabet: SequenceAlphabet) -> None: - """Construct a validated sequence from an existing byte array. + DEFAULT_ALPHABET: SequenceAlphabet # set by subclasses + _MASK_BYTE: bytes = b"" - Args: - content: One-dimensional NumPy array with ``dtype='S1'``. - alphabet: Alphabet to validate *content* against. + def __init__(self, data: np.ndarray) -> None: + """Validate *data* against the class alphabet and freeze it. Raises: - ValueError: If *content* does not have dtype ``S1``, is not - one-dimensional, or contains symbols absent from *alphabet*. + ValueError: If dtype is not ``S1``, array is not 1-D, or + symbols fall outside ``DEFAULT_ALPHABET``. 
""" - if content.dtype != np.dtype("S1"): + if data.dtype != np.dtype("S1"): raise ValueError("Sequence storage must have dtype S1") - if content.ndim != 1: + if data.ndim != 1: raise ValueError("Sequence storage must be one-dimensional") - - if not np.isin(content, alphabet.allowed_array).all(): + if not np.isin(data, self.DEFAULT_ALPHABET.allowed_array).all(): raise ValueError("Sequence contains symbols outside of alphabet") + data.flags.writeable = False + self._data = data - self.content = content - self.alphabet = alphabet + # -- constructors ------------------------------------------------------- @classmethod - def from_string(cls: type[Self], sequence: str, alphabet: SequenceAlphabet) -> Self: - """Create an instance by parsing a plain Python string. + def from_string(cls: type[Self], sequence: str) -> Self: + """Create a sequence from a plain Python string.""" + arr = np.frombuffer(sequence.encode("ascii"), dtype="S1").copy() + return cls(arr) - Args: - sequence: String whose characters must all belong to *alphabet*. - alphabet: Alphabet to validate the sequence against. + # -- accessors ---------------------------------------------------------- - Returns: - A new instance of the calling class backed by a freshly allocated - ``S1`` NumPy array. + @property + def data(self) -> np.ndarray: + """Read-only ``S1`` byte array backing this sequence.""" + return self._data - Raises: - ValueError: If any character in *sequence* is outside *alphabet*. - """ - sz_sequence = sz.Str(sequence) - sequence_bytes = bytes(sz_sequence) - array = np.frombuffer(memoryview(sequence_bytes), dtype="S1").copy() - return cls(array, alphabet) + @property + def content(self) -> np.ndarray: + """Alias for :attr:`data` (backward compatibility).""" + return self._data def to_string(self) -> str: - """Decode the byte array back to a plain Python string. 
+ """Decode the byte array to a plain Python string.""" + return self._data.tobytes().decode("ascii") - Returns: - The sequence as a ``str``. - """ - return str(sz.Str(self.content.tobytes())) - - def substring(self, start: int, stop: int | None = None) -> "AlphabetSequence": - """Return a validated slice of this sequence. + def substring(self, start: int, stop: int | None = None) -> Self: + """Return a new sequence for the half-open range ``[start, stop)``.""" + return type(self)(self._data[start:stop].copy()) - Uses the same slicing semantics as Python built-in strings: *start* is - inclusive, *stop* is exclusive, and ``None`` means "to the end". - - Args: - start: Index of the first character to include (0-based). - stop: Index of the first character to *exclude*, or ``None`` to - slice through to the end of the sequence. - - Returns: - A new instance of the same concrete class containing the requested - subsequence with the same alphabet. - """ - view = sz.Str(self.content.tobytes()) - part = view[start:stop] - part_bytes = bytes(part) - sub_array = np.frombuffer(memoryview(part_bytes), dtype="S1").copy() - return self.__class__(sub_array, self.alphabet) + # -- masking ------------------------------------------------------------ def mask(self, position: int | slice | tuple[int, int]) -> Self: - """Return a copy with one position or a range replaced by mask symbol. + """Return a copy with the given position(s) replaced by the mask byte. Args: - position: Either a single integer index, a :class:`slice`, or a - ``(start, stop)`` tuple. ``stop`` is exclusive. - - Returns: - A new sequence with masked positions replaced by ``MASK_SYMBOL``. + position: Integer index, ``slice``, or ``(start, stop)`` tuple. Raises: - ValueError: If this sequence type does not define ``MASK_SYMBOL``. - TypeError: If *position* has unsupported type. - IndexError: If integer position is out of bounds. + ValueError: If this class does not support masking. 
+ IndexError: If an integer position is out of bounds. """ - if self.MASK_SYMBOL is None: - raise ValueError(f"Masking is not supported for {self.__class__.__name__}") - - masked = self.content.copy() - mask_byte = np.array(self.MASK_SYMBOL.encode("ascii"), dtype="S1") - + if not self._MASK_BYTE: + raise ValueError(f"Masking not supported for {type(self).__name__}") + buf = self._data.copy() + buf.flags.writeable = True + mv = np.array(self._MASK_BYTE, dtype="S1") if isinstance(position, int): if position < 0: position += len(self) if position < 0 or position >= len(self): raise IndexError("Mask position out of range") - masked[position] = mask_byte + buf[position] = mv elif isinstance(position, slice): - masked[position] = mask_byte + buf[position] = mv elif isinstance(position, tuple) and len(position) == 2: - start, stop = position - masked[slice(start, stop)] = mask_byte + buf[position[0]:position[1]] = mv else: raise TypeError("position must be int, slice, or (start, stop) tuple") + return type(self)(buf) - return self.__class__(masked, self.alphabet) - - def _is_masked(self, symbol: np.bytes_) -> bool: - """Return whether a symbol is the sequence-specific mask marker.""" - if self.MASK_SYMBOL is None: - return False - return symbol == self.MASK_SYMBOL.encode("ascii") + # -- wildcard matching (NOT equality) ----------------------------------- - def matches(self, other: "AlphabetSequence") -> bool: - """Compare two sequences, treating mask characters as wildcards. + def matches(self, other: AlphabetSequence) -> bool: + """Wildcard-aware positional comparison. - Matching requires equal lengths. At each position, symbols match if - they are equal or if either symbol is masked. + Returns ``True`` when the sequences have the same length and at + every position the symbols are equal **or** at least one side + carries a mask character. This is intentionally **not** the same + as ``__eq__`` which compares bytes exactly. 
""" if len(self) != len(other): return False - - for left, right in zip(self.content, other.content): - if left == right: - continue - if self._is_masked(left) or other._is_masked(right): - continue - return False - return True + if len(self) == 0: + return True + eq = self._data == other._data + if eq.all(): + return True + ok = eq.copy() + if self._MASK_BYTE: + ok |= self._data == np.array(self._MASK_BYTE, dtype="S1") + if other._MASK_BYTE: + ok |= other._data == np.array(other._MASK_BYTE, dtype="S1") + return bool(ok.all()) + + # -- equality & hashing (byte-exact) ------------------------------------ + + def __eq__(self, other: object) -> bool: + if type(self) is not type(other): + return NotImplemented + return self._data.tobytes() == other._data.tobytes() + + def __hash__(self) -> int: + return hash(self._data.tobytes()) def __len__(self) -> int: - """Return the number of characters in the sequence.""" - return int(self.content.shape[0]) - + return int(self._data.shape[0]) + def __str__(self) -> str: - """Return a human-readable string representation of the sequence.""" return self.to_string() + def __repr__(self) -> str: + return f"{type(self).__name__}({self.to_string()!r})" -class NucleotideSequence(AlphabetSequence): - """A DNA nucleotide sequence restricted to the standard four-base alphabet. - The default alphabet is ``("A", "T", "G", "C")``. A custom - :class:`SequenceAlphabet` may be supplied to support, for example, - ambiguity codes. +# --------------------------------------------------------------------------- +# Concrete sequence types +# --------------------------------------------------------------------------- - Class Attributes: - DEFAULT_ALPHABET (SequenceAlphabet): Standard DNA alphabet ``{A, T, G, C, N}``. +class NucleotideSequence(AlphabetSequence): + """DNA nucleotide sequence (``A``, ``T``, ``G``, ``C``, ``N``). + + ``N`` serves as the mask / ambiguity symbol. 
""" - MASK_SYMBOL = "N" + __slots__ = () DEFAULT_ALPHABET = SequenceAlphabet(("A", "T", "G", "C", "N")) - - def __init__(self, content: np.ndarray, alphabet: SequenceAlphabet = DEFAULT_ALPHABET) -> None: - """Construct a nucleotide sequence from a byte array. - - Args: - content: One-dimensional ``S1``-dtype NumPy array. - alphabet: Alphabet to validate against (defaults to ``{A, T, G, C}``). - """ - super().__init__(content, alphabet) - - @classmethod - def from_string( - cls: type[Self], - sequence: str, - alphabet: SequenceAlphabet = DEFAULT_ALPHABET, - ) -> Self: - """Create a :class:`NucleotideSequence` from a plain string. - - Args: - sequence: DNA string (e.g. ``"ATCG"``). All characters must - belong to *alphabet*. - alphabet: Defaults to the standard four-base DNA alphabet. - - Returns: - A new :class:`NucleotideSequence` instance. - - Raises: - ValueError: If *sequence* contains characters outside *alphabet*. - """ - sz_sequence = sz.Str(sequence) - sequence_bytes = bytes(sz_sequence) - array = np.frombuffer(memoryview(sequence_bytes), dtype="S1").copy() - return cls(array, alphabet) + _MASK_BYTE = b"N" class AminoAcidSequence(AlphabetSequence): - """A standard amino-acid sequence using the 20-letter IUPAC alphabet. + """Standard 20-letter amino-acid sequence. - In addition to the 20 canonical amino acids the alphabet includes: - - * ``*`` — stop codon - * ``_`` — gap - * ``X`` — unknown / any amino acid - - Class Attributes: - DEFAULT_ALPHABET (SequenceAlphabet): The 20 canonical AAs plus - ``*``, ``_``, and ``X``. + The alphabet includes ``*`` (stop), ``_`` (gap), and ``X`` (unknown). + ``X`` serves as the mask / wildcard symbol. 
""" - DEFAULT_ALPHABET = SequenceAlphabet( - ( - "A", "C", "D", "E", "F", "G", "H", "I", "K", "L", - "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", - "*", "_", "X", - ) - ) - MASK_SYMBOL = "X" - - def __init__(self, content: np.ndarray, alphabet: SequenceAlphabet = DEFAULT_ALPHABET) -> None: - """Construct an amino-acid sequence from a byte array. - - Args: - content: One-dimensional ``S1``-dtype NumPy array. - alphabet: Alphabet to validate against (defaults to the standard - 20-AA + stop/gap/unknown alphabet). - """ - super().__init__(content, alphabet) - - @classmethod - def from_string( - cls: type[Self], - sequence: str, - alphabet: SequenceAlphabet = DEFAULT_ALPHABET, - ) -> Self: - """Create an :class:`AminoAcidSequence` from a plain string. - - Args: - sequence: Amino-acid string in single-letter code - (e.g. ``"CASSLAPGATNEKLFF"``). All characters must belong to - *alphabet*. - alphabet: Defaults to the standard amino-acid alphabet. + __slots__ = () + DEFAULT_ALPHABET = SequenceAlphabet(( + "A", "C", "D", "E", "F", "G", "H", "I", "K", "L", + "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", + "*", "_", "X", + )) + _MASK_BYTE = b"X" - Returns: - A new :class:`AminoAcidSequence` instance. - - Raises: - ValueError: If *sequence* contains characters outside *alphabet*. - """ - sz_sequence = sz.Str(sequence) - sequence_bytes = bytes(sz_sequence) - array = np.frombuffer(memoryview(sequence_bytes), dtype="S1").copy() - return cls(array, alphabet) + def to_reduced_amino_acid(self) -> ReducedAminoAcidSequence: + """Convert to the reduced physico-chemical alphabet via byte LUT.""" + converted = _AA_TO_REDUCED_LUT[self._data.view(np.uint8)].view("S1").copy() + return ReducedAminoAcidSequence(converted) - def to_reduced_amino_acid(self) -> "ReducedAminoAcidSequence": - """Convert to :class:`ReducedAminoAcidSequence` using physico-chemical grouping. - - Each amino acid is mapped to a reduced symbol according to - :data:`AMINO_ACID_TO_SIMPLE_AMINO_ACID`. 
- - Returns: - A :class:`ReducedAminoAcidSequence` of the same length whose - symbols represent physico-chemical classes. - """ - converted = "".join(AMINO_ACID_TO_REDUCED_AMINO_ACID[s] for s in self.to_string()) - return ReducedAminoAcidSequence.from_string(converted) - - def to_simple_amino_acid(self) -> "ReducedAminoAcidSequence": + def to_simple_amino_acid(self) -> ReducedAminoAcidSequence: """Backwards-compatible alias for :meth:`to_reduced_amino_acid`.""" return self.to_reduced_amino_acid() - def matches_reduced_amino_acid(self, reduced_sequence: "ReducedAminoAcidSequence") -> bool: - """Check whether this sequence matches a reduced amino-acid sequence. - - Matching is performed position-wise and ignores masked characters - (``X`` in either sequence). + def matches_reduced_amino_acid(self, reduced: ReducedAminoAcidSequence) -> bool: + """Wildcard-aware match against a reduced amino-acid sequence. - Args: - reduced_sequence: A :class:`ReducedAminoAcidSequence` to compare - against. - - Returns: - ``True`` if all non-masked positions are compatible with the - reduced amino-acid mapping, ``False`` otherwise. + Each position of *self* is first mapped to the reduced alphabet via a + byte lookup table; then positions are compared treating ``X`` on + either side as a wildcard. Like :meth:`matches`, this is **not** an + equality test. 
""" - if len(self) != len(reduced_sequence): + if len(self) != len(reduced): return False - - for aa_symbol, reduced_symbol in zip(self.content, reduced_sequence.content): - if self._is_masked(aa_symbol) or reduced_sequence._is_masked(reduced_symbol): - continue - mapped = AMINO_ACID_TO_REDUCED_AMINO_ACID[aa_symbol.decode("ascii")].encode("ascii") - if mapped != reduced_symbol: - return False - return True - - def matches_simple_amino_acid(self, simple_sequence: "ReducedAminoAcidSequence") -> bool: + if len(self) == 0: + return True + converted = _AA_TO_REDUCED_LUT[self._data.view(np.uint8)].view("S1") + eq = converted == reduced._data + if eq.all(): + return True + mask_x = np.array(b"X", dtype="S1") + return bool((eq | (self._data == mask_x) | (reduced._data == mask_x)).all()) + + def matches_simple_amino_acid(self, simple: ReducedAminoAcidSequence) -> bool: """Backwards-compatible alias for :meth:`matches_reduced_amino_acid`.""" - return self.matches_reduced_amino_acid(simple_sequence) + return self.matches_reduced_amino_acid(simple) class ReducedAminoAcidSequence(AlphabetSequence): - """A sequence encoded in the reduced physico-chemical amino-acid alphabet. - - Symbols are those produced by :data:`AMINO_ACID_TO_SIMPLE_AMINO_ACID`: - ``l``, ``b``, ``m``, ``c``, ``s``, ``h``, ``G``, ``F``, ``P``, ``W``, - ``Y``, ``X``, ``*``, ``_``. + """Sequence in the reduced physico-chemical amino-acid alphabet. + Symbols: ``l b m c s h G F P W Y X * _``. ``X`` is the mask / wildcard. Instances are typically obtained via - :meth:`AminoAcidSequence.to_reduced_amino_acid` rather than constructed - directly. - - Class Attributes: - DEFAULT_ALPHABET (SequenceAlphabet): :data:`REDUCED_AMINO_ACID_ALPHABET`. + :meth:`AminoAcidSequence.to_reduced_amino_acid`. 
""" + __slots__ = () DEFAULT_ALPHABET = REDUCED_AMINO_ACID_ALPHABET - MASK_SYMBOL = "X" - - def __init__(self, content: np.ndarray, alphabet: SequenceAlphabet = DEFAULT_ALPHABET) -> None: - """Construct a reduced amino-acid sequence from a byte array. - - Args: - content: One-dimensional ``S1``-dtype NumPy array. - alphabet: Alphabet to validate against (defaults to - :data:`REDUCED_AMINO_ACID_ALPHABET`). - """ - super().__init__(content, alphabet) - - @classmethod - def from_string( - cls: type[Self], - sequence: str, - alphabet: SequenceAlphabet = DEFAULT_ALPHABET, - ) -> Self: - """Create a :class:`ReducedAminoAcidSequence` from a plain string. - - Args: - sequence: String using the reduced physico-chemical symbols - (e.g. ``"slhhllGGlhmcbllW"``). All characters must belong - to *alphabet*. - alphabet: Defaults to :data:`REDUCED_AMINO_ACID_ALPHABET`. - - Returns: - A new :class:`ReducedAminoAcidSequence` instance. - - Raises: - ValueError: If *sequence* contains characters outside *alphabet*. - """ - sz_sequence = sz.Str(sequence) - sequence_bytes = bytes(sz_sequence) - array = np.frombuffer(memoryview(sequence_bytes), dtype="S1").copy() - return cls(array, alphabet) - - - -#: Backwards-compatible class alias. -SimpleAminoAcidSequence = ReducedAminoAcidSequence + _MASK_BYTE = b"X" diff --git a/tests/test_sequence.py b/tests/test_sequence.py index 1ab6170..3232938 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -1,15 +1,13 @@ """Unit tests for :mod:`mir.basic.sequence`. Coverage: - SequenceAlphabet -- singleton caching behaviour. - AlphabetSequence -- construction, round-trip string conversion, - substring slicing, length, and alphabet - rejection. - NucleotideSequence -- DNA string parsing and slicing. - AminoAcidSequence -- protein string parsing, slicing, reduction, - and mask-aware matching. - ReducedAminoAcidSequence -- reduced-alphabet parsing, slicing, - masking, and matching. + SequenceAlphabet -- singleton caching. 
+ AlphabetSequence -- construction, string round-trip, substring, + immutability, ``__eq__``, ``__hash__``. + NucleotideSequence -- parsing, slicing, masking. + AminoAcidSequence -- parsing, slicing, reduced conversion, matching. + ReducedAminoAcidSequence -- parsing, slicing, masking, matching. + Equality vs matching -- ``matches()`` is wildcard-aware, ``==`` is not. """ import unittest @@ -19,30 +17,24 @@ from mir.basic.sequence import ( AminoAcidSequence, NucleotideSequence, - ReducedAminoAcidSequence, + ReducedAminoAcidSequence, SequenceAlphabet, ) class TestAlphabetSequence(unittest.TestCase): - """Tests for :class:`~mir.basic.sequence.AlphabetSequence` and its subclasses.""" + """Construction, round-trip, substring, immutability.""" def test_create_convert_and_substring(self) -> None: - """Sequences round-trip through ``from_string`` / ``to_string`` and slice correctly. - - Verifies that: - * ``DEFAULT_ALPHABET`` is the singleton cached by :class:`SequenceAlphabet`. - * The ``S1`` dtype is preserved after construction. - * ``to_string`` reconstructs the original string exactly. - * ``substring(start, stop)`` returns the expected subsequence for both - :class:`NucleotideSequence` and :class:`AminoAcidSequence`. 
- """ - self.assertIs(NucleotideSequence.DEFAULT_ALPHABET, SequenceAlphabet(("A", "T", "G", "C", "N"))) + self.assertIs( + NucleotideSequence.DEFAULT_ALPHABET, + SequenceAlphabet(("A", "T", "G", "C", "N")), + ) nt = NucleotideSequence.from_string("ATTAGACA") self.assertEqual(nt.to_string(), "ATTAGACA") - self.assertEqual(nt.content.dtype, np.dtype("S1")) - self.assertEqual(nt.content.tobytes(), b"ATTAGACA") + self.assertEqual(nt.data.dtype, np.dtype("S1")) + self.assertEqual(nt.data.tobytes(), b"ATTAGACA") self.assertEqual(nt.substring(2, 6).to_string(), "TAGA") aa = AminoAcidSequence.from_string("CASSLAPGATNEKLFF") @@ -50,17 +42,6 @@ def test_create_convert_and_substring(self) -> None: self.assertEqual(aa.substring(4, 9).to_string(), "LAPGA") def test_empty_or_invalid_sequence(self) -> None: - """Empty sequences are valid; out-of-alphabet characters raise ``ValueError``. - - Verifies that: - * An empty :class:`NucleotideSequence` and :class:`AminoAcidSequence` - have length 0 and round-trip to ``""``. - * ``substring(0, 0)`` on a non-empty sequence returns an empty sequence. - * Constructing a :class:`NucleotideSequence` from ``"ATU"`` raises - ``ValueError`` (``U`` is not in the DNA alphabet). - * Constructing an :class:`AminoAcidSequence` from ``"B"`` raises - ``ValueError`` (``B`` is not a standard amino acid). 
- """ empty_nt = NucleotideSequence.from_string("") self.assertEqual(len(empty_nt), 0) self.assertEqual(empty_nt.to_string(), "") @@ -69,59 +50,113 @@ def test_empty_or_invalid_sequence(self) -> None: self.assertEqual(len(empty_aa), 0) self.assertEqual(empty_aa.to_string(), "") - self.assertEqual(NucleotideSequence.from_string("ATTAGACA").substring(0, 0).to_string(), "") + self.assertEqual( + NucleotideSequence.from_string("ATTAGACA").substring(0, 0).to_string(), "" + ) self.assertEqual(NucleotideSequence.from_string("ATN").to_string(), "ATN") with self.assertRaises(ValueError): - NucleotideSequence.from_string("ATU") + NucleotideSequence.from_string("ATU") with self.assertRaises(ValueError): AminoAcidSequence.from_string("B") + def test_immutability(self) -> None: + """The underlying byte array is read-only.""" + nt = NucleotideSequence.from_string("ATCG") + with self.assertRaises(ValueError): + nt.data[0] = b"G" + + def test_no_extra_attributes(self) -> None: + """__slots__ prevents adding arbitrary instance attributes.""" + nt = NucleotideSequence.from_string("ATCG") + with self.assertRaises(AttributeError): + nt.foo = 42 # type: ignore[attr-defined] + + def test_content_backward_compat(self) -> None: + """The .content property still works.""" + nt = NucleotideSequence.from_string("ATCG") + self.assertIs(nt.content, nt.data) + + def test_repr(self) -> None: + nt = NucleotideSequence.from_string("ATCG") + self.assertEqual(repr(nt), "NucleotideSequence('ATCG')") + + +class TestEqualityAndHashing(unittest.TestCase): + """``__eq__`` and ``__hash__`` use raw bytes, not wildcard matching.""" + + def test_equal_sequences(self) -> None: + a = NucleotideSequence.from_string("ATCG") + b = NucleotideSequence.from_string("ATCG") + self.assertEqual(a, b) + self.assertEqual(hash(a), hash(b)) + + def test_unequal_sequences(self) -> None: + a = NucleotideSequence.from_string("ATCG") + b = NucleotideSequence.from_string("ATNG") + self.assertNotEqual(a, b) + + def 
test_masked_not_equal_but_matches(self) -> None: + """A masked sequence matches the original but is not equal.""" + orig = NucleotideSequence.from_string("ATCG") + masked = NucleotideSequence.from_string("ANNG") + self.assertNotEqual(orig, masked) + self.assertTrue(orig.matches(masked)) + + def test_set_and_dict_storage(self) -> None: + a = AminoAcidSequence.from_string("CAST") + b = AminoAcidSequence.from_string("CAST") + c = AminoAcidSequence.from_string("XAST") + s = {a, b, c} + self.assertEqual(len(s), 2) + d = {a: 1} + self.assertEqual(d[b], 1) + self.assertNotIn(c, d) + + def test_cross_type_not_equal(self) -> None: + """Different types with identical bytes are not equal.""" + aa = AminoAcidSequence.from_string("X") + red = ReducedAminoAcidSequence.from_string("X") + self.assertNotEqual(aa, red) + class TestReducedAminoAcidSequence(unittest.TestCase): - """Tests for :class:`~mir.basic.sequence.ReducedAminoAcidSequence` and AA conversion.""" - - def test_amino_acid_to_reduced_conversion_and_match(self) -> None: - """Reduced conversion and matching respect mapping and masks. - - Verifies that: - * The reduced string produced by :meth:`AminoAcidSequence.to_reduced_amino_acid` - matches the expected character-by-character mapping from - :data:`AMINO_ACID_TO_REDUCED_AMINO_ACID`. - * :meth:`AminoAcidSequence.matches_reduced_amino_acid` returns ``True`` - for the sequence's own reduced form and ``False`` for an altered one - at an unmasked position. 
- """ - aa: AminoAcidSequence = AminoAcidSequence.from_string("CASTIVGGLSQDKIVW") - reduced = aa.to_reduced_amino_acid() + """Reduced-alphabet conversion and matching.""" + def test_conversion_via_byte_lut(self) -> None: + aa = AminoAcidSequence.from_string("CASTIVGGLSQDKIVW") + reduced = aa.to_reduced_amino_acid() self.assertEqual(reduced.to_string(), "slhhllGGlhmcbllW") - self.assertTrue(aa.matches_reduced_amino_acid(reduced)) - self.assertFalse(aa.matches_reduced_amino_acid(ReducedAminoAcidSequence.from_string("slhhllGGlhmcbllY"))) - masked_aa = aa.mask(2) - self.assertTrue(masked_aa.matches_reduced_amino_acid(reduced)) + def test_match_and_mismatch(self) -> None: + aa = AminoAcidSequence.from_string("CASTIVGGLSQDKIVW") + reduced = aa.to_reduced_amino_acid() + self.assertTrue(aa.matches_reduced_amino_acid(reduced)) + self.assertFalse( + aa.matches_reduced_amino_acid( + ReducedAminoAcidSequence.from_string("slhhllGGlhmcbllY") + ) + ) + + def test_masked_aa_matches_reduced(self) -> None: + aa = AminoAcidSequence.from_string("CASTIVGGLSQDKIVW") + reduced = aa.to_reduced_amino_acid() + self.assertTrue(aa.mask(2).matches_reduced_amino_acid(reduced)) - masked_reduced = reduced.mask((2, 5)) - self.assertTrue(aa.matches_reduced_amino_acid(masked_reduced)) + def test_masked_reduced_matches_aa(self) -> None: + aa = AminoAcidSequence.from_string("CASTIVGGLSQDKIVW") + reduced = aa.to_reduced_amino_acid() + self.assertTrue(aa.matches_reduced_amino_acid(reduced.mask((2, 5)))) - def test_aa_to_reduced_backwards_compatible_aliases(self) -> None: - """Legacy simple-amino-acid aliases keep working.""" + def test_backwards_compatible_aliases(self) -> None: aa = AminoAcidSequence.from_string("CAST") reduced = aa.to_simple_amino_acid() self.assertIsInstance(reduced, ReducedAminoAcidSequence) self.assertTrue(aa.matches_simple_amino_acid(reduced)) def test_reduced_substrings(self) -> None: - """``substring`` on a :class:`ReducedAminoAcidSequence` slices correctly. 
- - Verifies that: - * A half-open slice returns the expected subsequence. - * ``substring(start, None)`` slices through to the end of the sequence. - * An out-of-alphabet character (``Z``) raises ``ValueError``. - """ reduced = ReducedAminoAcidSequence.from_string("slhhllGGlhmcbllW") self.assertEqual(reduced.substring(0, 4).to_string(), "slhh") self.assertEqual(reduced.substring(6, 8).to_string(), "GG") @@ -132,7 +167,7 @@ def test_reduced_substrings(self) -> None: class TestMaskAndMatch(unittest.TestCase): - """Tests for masking and wildcard-aware matching.""" + """Masking and wildcard-aware matching.""" def test_nucleotide_mask_single_and_range(self) -> None: seq = NucleotideSequence.from_string("ATCGAT") @@ -140,14 +175,14 @@ def test_nucleotide_mask_single_and_range(self) -> None: self.assertEqual(seq.mask((2, 5)).to_string(), "ATNNNT") self.assertEqual(seq.mask(slice(0, 3)).to_string(), "NNNGAT") - def test_amino_and_reduced_mask_single_and_range(self) -> None: + def test_amino_and_reduced_mask(self) -> None: aa = AminoAcidSequence.from_string("CASTIV") reduced = ReducedAminoAcidSequence.from_string("slhhll") self.assertEqual(aa.mask(0).to_string(), "XASTIV") self.assertEqual(aa.mask((1, 4)).to_string(), "CXXXIV") self.assertEqual(reduced.mask(slice(2, 5)).to_string(), "slXXXl") - def test_sequence_matching_ignores_mask_symbols(self) -> None: + def test_matching_ignores_mask_symbols(self) -> None: nt1 = NucleotideSequence.from_string("ATCG") nt2 = NucleotideSequence.from_string("ANNG") self.assertTrue(nt1.matches(nt2)) @@ -163,6 +198,17 @@ def test_sequence_matching_ignores_mask_symbols(self) -> None: self.assertTrue(red1.matches(red2)) self.assertFalse(red1.matches(ReducedAminoAcidSequence.from_string("sXXY"))) + def test_length_mismatch_does_not_match(self) -> None: + a = NucleotideSequence.from_string("ATC") + b = NucleotideSequence.from_string("ATCG") + self.assertFalse(a.matches(b)) + + def test_empty_sequences_match(self) -> None: + a = 
NucleotideSequence.from_string("") + b = NucleotideSequence.from_string("") + self.assertTrue(a.matches(b)) + self.assertEqual(a, b) + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() From b856dfef5baa3af1b1cd2cd262cdba49e191e685 Mon Sep 17 00:00:00 2001 From: mikessh Date: Fri, 10 Apr 2026 23:21:26 +0300 Subject: [PATCH 06/24] wip tokenizer --- mir/basic/sequence.py | 163 ++++++++++++++++++++++----------- mir/basic/tokens.py | 87 ++++++++++++++++++ tests/test_sequence.py | 2 +- tests/test_tokens.py | 129 ++++++++++++++++++++++++++ tests/test_tokens_benchmark.py | 78 ++++++++++++++++ 5 files changed, 402 insertions(+), 57 deletions(-) create mode 100644 mir/basic/tokens.py create mode 100644 tests/test_tokens.py create mode 100644 tests/test_tokens_benchmark.py diff --git a/mir/basic/sequence.py b/mir/basic/sequence.py index 3aa0ae9..b66f220 100644 --- a/mir/basic/sequence.py +++ b/mir/basic/sequence.py @@ -1,14 +1,14 @@ -"""Biological sequence types backed by immutable NumPy byte arrays. +"""Biological sequence types backed by immutable ``stringzilla.Str`` buffers. -Sequence objects are lightweight and immutable: each instance stores only a -read-only ``np.ndarray`` of dtype ``S1``. The alphabet is validated on -construction and is defined at the class level (``DEFAULT_ALPHABET``). +Sequence objects are lightweight and immutable. Each instance stores only a +``stringzilla.Str`` (a zero-copy, contiguous byte buffer). The alphabet is +validated on construction and defined at the class level. -**Equality vs. matching:** ``__eq__`` and ``__hash__`` compare raw byte -content, so sequences work correctly as dictionary keys and set members. -The :meth:`~AlphabetSequence.matches` method is different: it treats mask -characters (``N`` for nucleotides, ``X`` for amino-acid alphabets) as -wildcards, so two sequences that *match* may not be *equal*. +**Equality vs. 
matching:** ``__eq__`` and ``__hash__`` compare raw bytes so +sequences work correctly as ``dict`` keys and ``set`` members. +:meth:`~AlphabetSequence.matches` is different — it treats mask characters +(``N`` for nucleotides, ``X`` for amino-acid alphabets) as wildcards, so two +sequences that *match* may not be *equal*. Classes: SequenceAlphabet -- Singleton alphabet definition. @@ -21,6 +21,7 @@ from __future__ import annotations import numpy as np +import stringzilla as sz from typing import Self @@ -37,8 +38,8 @@ "Y": "Y", "V": "l", "X": "X", "*": "*", "_": "_", } -#: Byte lookup table (128 entries, indexed by ASCII ordinal) for converting -#: amino-acid bytes to reduced-alphabet bytes without string intermediaries. +#: NumPy uint8 LUT (128 entries) for fast vectorised conversion used by +#: :meth:`AminoAcidSequence.matches_reduced_amino_acid`. _AA_TO_REDUCED_LUT = np.zeros(128, dtype=np.uint8) for _aa, _red in AMINO_ACID_TO_REDUCED_AMINO_ACID.items(): _AA_TO_REDUCED_LUT[ord(_aa)] = ord(_red) @@ -56,7 +57,7 @@ class SequenceAlphabet: Attributes: allowed_symbols: Ordered tuple of single-character symbols. - allowed_array: ``S1``-dtype NumPy array for fast ``np.isin`` tests. + _allowed_set: ``frozenset`` of allowed byte values for O(1) lookup. """ _instances: dict[tuple[str, ...], SequenceAlphabet] = {} @@ -72,6 +73,10 @@ def __init__(self, allowed_symbols: tuple[str, ...]) -> None: if hasattr(self, "allowed_symbols"): return self.allowed_symbols = tuple(allowed_symbols) + self._allowed_set = frozenset( + c.encode("ascii") for c in self.allowed_symbols + ) + # Kept for any downstream code that uses np.isin against this. 
self.allowed_array = np.array( [c.encode("ascii") for c in self.allowed_symbols], dtype="S1", ) @@ -87,29 +92,32 @@ def __init__(self, allowed_symbols: tuple[str, ...]) -> None: # --------------------------------------------------------------------------- class AlphabetSequence: - """Immutable alphabet-validated sequence backed by a read-only byte array. + """Immutable alphabet-validated sequence backed by a ``stringzilla.Str``. - Each instance stores **only** a read-only ``np.ndarray`` of dtype ``S1``. - The alphabet and mask symbol live on the class, not on instances. + Each instance stores **only** a ``sz.Str`` buffer. The alphabet and mask + symbol live on the class, not on instances. **Equality vs matching:** ``__eq__`` / ``__hash__`` compare raw bytes (for ``dict`` / ``set`` use). :meth:`matches` performs wildcard-aware comparison where mask characters - (``N`` for nucleotides, ``X`` for amino-acid types) count as matching - any symbol — so two sequences can *match* without being *equal*. + (``N`` for nucleotides, ``X`` for amino-acid types) count as matching any + symbol — so two sequences can *match* without being *equal*. Subclass protocol: * ``DEFAULT_ALPHABET`` — :class:`SequenceAlphabet` with allowed symbols. * ``_MASK_BYTE`` — ``b"N"``, ``b"X"``, or ``b""`` (no masking). """ - __slots__ = ("_data",) + __slots__ = ("_sz",) DEFAULT_ALPHABET: SequenceAlphabet # set by subclasses _MASK_BYTE: bytes = b"" def __init__(self, data: np.ndarray) -> None: - """Validate *data* against the class alphabet and freeze it. + """Validate *data* against the class alphabet and store as ``sz.Str``. + + Args: + data: One-dimensional ``S1``-dtype NumPy array. 
Raises: ValueError: If dtype is not ``S1``, array is not 1-D, or @@ -119,10 +127,25 @@ def __init__(self, data: np.ndarray) -> None: raise ValueError("Sequence storage must have dtype S1") if data.ndim != 1: raise ValueError("Sequence storage must be one-dimensional") - if not np.isin(data, self.DEFAULT_ALPHABET.allowed_array).all(): - raise ValueError("Sequence contains symbols outside of alphabet") - data.flags.writeable = False - self._data = data + allowed = self.DEFAULT_ALPHABET._allowed_set + raw = data.tobytes() + for b in raw: + if b.to_bytes(1, "little") not in allowed: + raise ValueError("Sequence contains symbols outside of alphabet") + self._sz = sz.Str(raw) + + @classmethod + def _from_trusted_bytes(cls: type[Self], raw: bytes) -> Self: + """Fast-path constructor that skips alphabet validation. + + The caller **must** guarantee that every byte in *raw* belongs to + ``cls.DEFAULT_ALPHABET``. This is used internally by + :func:`~mir.basic.tokens.tokenize` and :meth:`substring` where the + source data has already been validated. 
+ """ + inst = object.__new__(cls) + inst._sz = sz.Str(raw) + return inst # -- constructors ------------------------------------------------------- @@ -136,21 +159,31 @@ def from_string(cls: type[Self], sequence: str) -> Self: @property def data(self) -> np.ndarray: - """Read-only ``S1`` byte array backing this sequence.""" - return self._data + """Read-only ``S1`` NumPy view of the sequence bytes.""" + arr = np.frombuffer(bytes(self._sz), dtype="S1").copy() + arr.flags.writeable = False + return arr @property def content(self) -> np.ndarray: """Alias for :attr:`data` (backward compatibility).""" - return self._data + return self.data def to_string(self) -> str: - """Decode the byte array to a plain Python string.""" - return self._data.tobytes().decode("ascii") + """Decode the sequence to a plain Python string.""" + return str(self._sz) + + def to_bytes(self) -> bytes: + """Return the raw byte content.""" + return bytes(self._sz) def substring(self, start: int, stop: int | None = None) -> Self: - """Return a new sequence for the half-open range ``[start, stop)``.""" - return type(self)(self._data[start:stop].copy()) + """Return a new sequence for the half-open range ``[start, stop)``. + + Uses ``sz.Str`` slicing for a zero-copy view, then stores a copy. 
+ """ + sliced = self._sz[start:stop] + return type(self)._from_trusted_bytes(bytes(sliced)) # -- masking ------------------------------------------------------------ @@ -166,22 +199,24 @@ def mask(self, position: int | slice | tuple[int, int]) -> Self: """ if not self._MASK_BYTE: raise ValueError(f"Masking not supported for {type(self).__name__}") - buf = self._data.copy() - buf.flags.writeable = True - mv = np.array(self._MASK_BYTE, dtype="S1") + buf = bytearray(bytes(self._sz)) + mask_val = self._MASK_BYTE[0] if isinstance(position, int): + n = len(buf) if position < 0: - position += len(self) - if position < 0 or position >= len(self): + position += n + if position < 0 or position >= n: raise IndexError("Mask position out of range") - buf[position] = mv + buf[position] = mask_val elif isinstance(position, slice): - buf[position] = mv + for i in range(*position.indices(len(buf))): + buf[i] = mask_val elif isinstance(position, tuple) and len(position) == 2: - buf[position[0]:position[1]] = mv + for i in range(position[0], position[1]): + buf[i] = mask_val else: raise TypeError("position must be int, slice, or (start, stop) tuple") - return type(self)(buf) + return type(self)._from_trusted_bytes(bytes(buf)) # -- wildcard matching (NOT equality) ----------------------------------- @@ -197,28 +232,30 @@ def matches(self, other: AlphabetSequence) -> bool: return False if len(self) == 0: return True - eq = self._data == other._data - if eq.all(): + sb = bytes(self._sz) + ob = bytes(other._sz) + if sb == ob: return True - ok = eq.copy() - if self._MASK_BYTE: - ok |= self._data == np.array(self._MASK_BYTE, dtype="S1") - if other._MASK_BYTE: - ok |= other._data == np.array(other._MASK_BYTE, dtype="S1") - return bool(ok.all()) + sm = self._MASK_BYTE[0] if self._MASK_BYTE else -1 + om = other._MASK_BYTE[0] if other._MASK_BYTE else -1 + for a, b in zip(sb, ob): + if a == b or a == sm or b == om: + continue + return False + return True # -- equality & hashing (byte-exact) 
------------------------------------ def __eq__(self, other: object) -> bool: if type(self) is not type(other): return NotImplemented - return self._data.tobytes() == other._data.tobytes() + return bytes(self._sz) == bytes(other._sz) def __hash__(self) -> int: - return hash(self._data.tobytes()) + return hash(bytes(self._sz)) def __len__(self) -> int: - return int(self._data.shape[0]) + return len(self._sz) def __str__(self) -> str: return self.to_string() @@ -258,9 +295,14 @@ class AminoAcidSequence(AlphabetSequence): _MASK_BYTE = b"X" def to_reduced_amino_acid(self) -> ReducedAminoAcidSequence: - """Convert to the reduced physico-chemical alphabet via byte LUT.""" - converted = _AA_TO_REDUCED_LUT[self._data.view(np.uint8)].view("S1").copy() - return ReducedAminoAcidSequence(converted) + """Convert to the reduced physico-chemical alphabet via ``sz.translate``. + + Uses the :data:`AMINO_ACID_TO_REDUCED_AMINO_ACID` char→char mapping + applied through ``stringzilla.Str.translate`` for native-speed + byte-level translation. + """ + translated: bytes = self._sz.translate(AMINO_ACID_TO_REDUCED_AMINO_ACID) + return ReducedAminoAcidSequence._from_trusted_bytes(translated) def to_simple_amino_acid(self) -> ReducedAminoAcidSequence: """Backwards-compatible alias for :meth:`to_reduced_amino_acid`.""" @@ -278,12 +320,17 @@ def matches_reduced_amino_acid(self, reduced: ReducedAminoAcidSequence) -> bool: return False if len(self) == 0: return True - converted = _AA_TO_REDUCED_LUT[self._data.view(np.uint8)].view("S1") - eq = converted == reduced._data + # Use numpy LUT for the comparison (avoids creating an intermediate + # ReducedAminoAcidSequence object). 
+ self_np = np.frombuffer(bytes(self._sz), dtype=np.uint8) + converted = _AA_TO_REDUCED_LUT[self_np].view("S1") + reduced_np = np.frombuffer(bytes(reduced._sz), dtype="S1") + eq = converted == reduced_np if eq.all(): return True mask_x = np.array(b"X", dtype="S1") - return bool((eq | (self._data == mask_x) | (reduced._data == mask_x)).all()) + self_s1 = self_np.view("S1") + return bool((eq | (self_s1 == mask_x) | (reduced_np == mask_x)).all()) def matches_simple_amino_acid(self, simple: ReducedAminoAcidSequence) -> bool: """Backwards-compatible alias for :meth:`matches_reduced_amino_acid`.""" @@ -301,3 +348,7 @@ class ReducedAminoAcidSequence(AlphabetSequence): __slots__ = () DEFAULT_ALPHABET = REDUCED_AMINO_ACID_ALPHABET _MASK_BYTE = b"X" + + +#: Backwards-compatible class alias. +SimpleAminoAcidSequence = ReducedAminoAcidSequence diff --git a/mir/basic/tokens.py b/mir/basic/tokens.py new file mode 100644 index 0000000..17cb546 --- /dev/null +++ b/mir/basic/tokens.py @@ -0,0 +1,87 @@ +"""K-mer tokenisation of :class:`~mir.basic.sequence.AlphabetSequence` objects. + +Uses ``stringzilla.Str`` slicing for zero-copy windowing and the fast +:meth:`~mir.basic.sequence.AlphabetSequence._from_trusted_bytes` constructor +to bypass per-k-mer alphabet validation (the source sequence was already +validated on construction). + +Functions: + tokenize -- Extract overlapping k-mers, optionally with gapped variants. +""" + +from __future__ import annotations + +import stringzilla as sz + +from mir.basic.sequence import AlphabetSequence + + +def tokenize( + sequence: AlphabetSequence, + k: int, + *, + gapped: bool = False, +) -> list[AlphabetSequence]: + """Split *sequence* into overlapping k-mers of length *k*. + + Uses ``sz.Str`` slicing (zero-copy view) for each window and + :meth:`AlphabetSequence._from_trusted_bytes` to construct k-mer objects + without re-validating the alphabet. 
+ + When *gapped* is ``True``, instead of plain k-mers, each window position + produces *k* gapped variants where exactly one position within the k-mer + is replaced by the mask byte (``N`` for nucleotides, ``X`` for amino-acid + types). For example, with ``k=3`` and amino-acid sequence ``CASSL``:: + + position 0 → XAS CXS CAX + position 1 → XSS AXS ASX + position 2 → XSL SXL SSX + + Args: + sequence: Input sequence to tokenize. + k: K-mer length. Must satisfy ``1 <= k <= len(sequence)``. + gapped: If ``True``, emit gapped (single-position masked) k-mers + rather than plain k-mers. + + Returns: + A flat list of k-mer sequences. Plain mode yields + ``len(sequence) - k + 1`` items; gapped mode yields + ``(len(sequence) - k + 1) * k`` items. + + Raises: + ValueError: If *k* < 1 or *k* > ``len(sequence)``. + """ + n = len(sequence) + if k < 1 or k > n: + raise ValueError( + f"k must be between 1 and sequence length ({n}), got {k}" + ) + + cls = type(sequence) + raw_sz = sequence._sz # stringzilla.Str — slicing is zero-copy + + if not gapped: + result: list[AlphabetSequence] = [] + for i in range(n - k + 1): + result.append(cls._from_trusted_bytes(bytes(raw_sz[i : i + k]))) + return result + + # Gapped mode: for each window spawn k variants, each with one + # position replaced by the mask byte. 
+ mask_byte = sequence._MASK_BYTE + if not mask_byte: + raise ValueError( + f"Gapped tokenisation requires a mask byte; " + f"{cls.__name__} does not define one" + ) + mask_val = mask_byte[0] + + result = [] + for i in range(n - k + 1): + window = bytes(raw_sz[i : i + k]) + for j in range(k): + buf = bytearray(window) + buf[j] = mask_val + result.append(cls._from_trusted_bytes(bytes(buf))) + return result + diff --git a/tests/test_sequence.py b/tests/test_sequence.py index 3232938..33bbca0 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -77,7 +77,7 @@ def test_no_extra_attributes(self) -> None: def test_content_backward_compat(self) -> None: """The .content property still works.""" nt = NucleotideSequence.from_string("ATCG") - self.assertIs(nt.content, nt.data) + np.testing.assert_array_equal(nt.content, nt.data) def test_repr(self) -> None: nt = NucleotideSequence.from_string("ATCG") diff --git a/tests/test_tokens.py b/tests/test_tokens.py new file mode 100644 index 0000000..f10dd0c --- /dev/null +++ b/tests/test_tokens.py @@ -0,0 +1,129 @@ +"""Unit tests for :mod:`mir.basic.tokens`.""" + +import unittest + +import numpy as np + +from mir.basic.sequence import ( + AminoAcidSequence, + NucleotideSequence, + ReducedAminoAcidSequence, +) +from mir.basic.tokens import tokenize + + +def _strs(seqs): + """Helper: list of sequences → list of str.""" + return [s.to_string() for s in seqs] + + +class TestTokenizePlain(unittest.TestCase): + """Plain (non-gapped) k-mer extraction.""" + + def test_amino_acid_k3(self) -> None: + """CASSL → CAS ASS SSL.""" + aa = AminoAcidSequence.from_string("CASSL") + kmers = tokenize(aa, k=3) + self.assertEqual(_strs(kmers), ["CAS", "ASS", "SSL"]) + self.assertIsInstance(kmers[0], AminoAcidSequence) + + def test_nucleotide_k4(self) -> None: + nt = NucleotideSequence.from_string("ATCGAT") + kmers = tokenize(nt, k=4) + self.assertEqual(_strs(kmers), ["ATCG", "TCGA", "CGAT"]) + self.assertIsInstance(kmers[0], 
NucleotideSequence) + + def test_reduced_k2(self) -> None: + red = ReducedAminoAcidSequence.from_string("slhh") + kmers = tokenize(red, k=2) + self.assertEqual(_strs(kmers), ["sl", "lh", "hh"]) + + def test_k_equals_length(self) -> None: + """When k == len, a single k-mer equal to the sequence is returned.""" + aa = AminoAcidSequence.from_string("CAST") + kmers = tokenize(aa, k=4) + self.assertEqual(len(kmers), 1) + self.assertEqual(kmers[0], aa) + + def test_k_equals_one(self) -> None: + nt = NucleotideSequence.from_string("ATG") + kmers = tokenize(nt, k=1) + self.assertEqual(_strs(kmers), ["A", "T", "G"]) + + def test_invalid_k(self) -> None: + aa = AminoAcidSequence.from_string("CAST") + with self.assertRaises(ValueError): + tokenize(aa, k=0) + with self.assertRaises(ValueError): + tokenize(aa, k=5) + + def test_kmers_are_independent_copies(self) -> None: + """Returned k-mers own their data and don't share buffers.""" + aa = AminoAcidSequence.from_string("CASSL") + kmers = tokenize(aa, k=3) + self.assertFalse(np.shares_memory(kmers[0].data, kmers[1].data)) + + +class TestTokenizeGapped(unittest.TestCase): + """Gapped k-mer extraction (single-position mask variants).""" + + def test_amino_acid_gapped_k3(self) -> None: + """CASSL → 3 windows × 3 gap positions = 9 gapped k-mers.""" + aa = AminoAcidSequence.from_string("CASSL") + gapped = tokenize(aa, k=3, gapped=True) + self.assertEqual(len(gapped), 9) + expected = [ + # window CAS + "XAS", "CXS", "CAX", + # window ASS + "XSS", "AXS", "ASX", + # window SSL + "XSL", "SXL", "SSX", + ] + self.assertEqual(_strs(gapped), expected) + self.assertIsInstance(gapped[0], AminoAcidSequence) + + def test_nucleotide_gapped_k2(self) -> None: + nt = NucleotideSequence.from_string("ATG") + gapped = tokenize(nt, k=2, gapped=True) + expected = [ + "NT", "AN", # AT + "NG", "TN", # TG + ] + self.assertEqual(_strs(gapped), expected) + + def test_reduced_gapped_k2(self) -> None: + red = ReducedAminoAcidSequence.from_string("slh") + 
gapped = tokenize(red, k=2, gapped=True) + expected = ["Xl", "sX", "Xh", "lX"] + self.assertEqual(_strs(gapped), expected) + + def test_gapped_k1(self) -> None: + """With k=1, each gapped k-mer is just the mask character.""" + aa = AminoAcidSequence.from_string("CA") + gapped = tokenize(aa, k=1, gapped=True) + self.assertEqual(_strs(gapped), ["X", "X"]) + + def test_gapped_invalid_k(self) -> None: + aa = AminoAcidSequence.from_string("CAST") + with self.assertRaises(ValueError): + tokenize(aa, k=0, gapped=True) + with self.assertRaises(ValueError): + tokenize(aa, k=5, gapped=True) + + def test_gapped_kmers_match_plain_kmers(self) -> None: + """Each gapped k-mer should wildcard-match its corresponding plain k-mer.""" + aa = AminoAcidSequence.from_string("CASSL") + plain = tokenize(aa, k=3) + gapped = tokenize(aa, k=3, gapped=True) + for i, kmer in enumerate(plain): + variants = gapped[i * 3 : (i + 1) * 3] + for var in variants: + self.assertTrue( + kmer.matches(var), + f"{kmer} should match {var}", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_tokens_benchmark.py b/tests/test_tokens_benchmark.py new file mode 100644 index 0000000..2c4f58e --- /dev/null +++ b/tests/test_tokens_benchmark.py @@ -0,0 +1,78 @@ +"""Benchmark: tokenize() vs naive Python string slicing for 3-mer extraction. + +Generates N=10 000 random amino-acid sequences of length 15 and compares +wall-clock time for splitting each into overlapping 3-mers using: + +1. ``tokenize()`` from :mod:`mir.basic.tokens` (sequence + memoryview path). +2. Naive Python: plain string slicing producing ``list[str]``. + +Run with ``python -m unittest -v tests/test_tokens_benchmark.py``. 
+""" + +import random +import string +import time +import unittest + +from mir.basic.sequence import AminoAcidSequence +from mir.basic.tokens import tokenize + +N = 10_000 +SEQ_LEN = 15 +K = 3 + +# 20 canonical amino acids +_AA_LETTERS = "ACDEFGHIKLMNPQRSTVWY" + + +def _random_aa_strings(n: int, length: int) -> list[str]: + rng = random.Random(42) + return ["".join(rng.choices(_AA_LETTERS, k=length)) for _ in range(n)] + + +class TestTokenizeBenchmark(unittest.TestCase): + """Wall-clock comparison of tokenize() vs naive string slicing.""" + + def test_benchmark_3mer_tokenization(self) -> None: + strings = _random_aa_strings(N, SEQ_LEN) + + # -- naive Python string slicing ------------------------------------ + t0 = time.perf_counter() + naive_total = 0 + for s in strings: + kmers = [s[i : i + K] for i in range(len(s) - K + 1)] + naive_total += len(kmers) + t_naive = time.perf_counter() - t0 + + # -- tokenize (sequence objects) ------------------------------------ + sequences = [AminoAcidSequence.from_string(s) for s in strings] + t0 = time.perf_counter() + tok_total = 0 + for seq in sequences: + kmers = tokenize(seq, k=K) + tok_total += len(kmers) + t_tokenize = time.perf_counter() - t0 + + # Both must produce the same number of k-mers + self.assertEqual(naive_total, tok_total) + + expected_per_seq = SEQ_LEN - K + 1 + self.assertEqual(naive_total, N * expected_per_seq) + + print( + f"\n{'Method':<22} {'Time (s)':>10} {'k-mers/s':>14}\n" + f"{'-' * 48}" + ) + for label, elapsed in [ + ("naive str slicing", t_naive), + ("tokenize()", t_tokenize), + ]: + rate = tok_total / elapsed if elapsed > 0 else float("inf") + print(f"{label:<22} {elapsed:>10.4f} {rate:>14,.0f}") + + ratio = t_tokenize / t_naive if t_naive > 0 else float("inf") + print(f"\ntokenize / naive ratio: {ratio:.2f}x") + + +if __name__ == "__main__": + unittest.main() From df3e064247255daefd55802a9f3613b11fc617a0 Mon Sep 17 00:00:00 2001 From: mikessh Date: Sat, 11 Apr 2026 04:19:01 +0300 Subject: [PATCH 
07/24] roll back to python-native methods as they turn out to be faster --- mir/basic/sequence.py | 508 +++++++++++++------------------ mir/basic/tokens.py | 148 +++++---- tests/test_memory_benchmark.py | 133 ++++++++ tests/test_sequence.py | 349 ++++++++++----------- tests/test_sequence_benchmark.py | 289 ++++++++++++++++++ tests/test_tokens.py | 148 ++++----- tests/test_tokens_benchmark.py | 169 +++++++--- 7 files changed, 1069 insertions(+), 675 deletions(-) create mode 100644 tests/test_memory_benchmark.py create mode 100644 tests/test_sequence_benchmark.py diff --git a/mir/basic/sequence.py b/mir/basic/sequence.py index b66f220..3024e50 100644 --- a/mir/basic/sequence.py +++ b/mir/basic/sequence.py @@ -1,354 +1,260 @@ -"""Biological sequence types backed by immutable ``stringzilla.Str`` buffers. - -Sequence objects are lightweight and immutable. Each instance stores only a -``stringzilla.Str`` (a zero-copy, contiguous byte buffer). The alphabet is -validated on construction and defined at the class level. - -**Equality vs. matching:** ``__eq__`` and ``__hash__`` compare raw bytes so -sequences work correctly as ``dict`` keys and ``set`` members. -:meth:`~AlphabetSequence.matches` is different — it treats mask characters -(``N`` for nucleotides, ``X`` for amino-acid alphabets) as wildcards, so two -sequences that *match* may not be *equal*. - -Classes: - SequenceAlphabet -- Singleton alphabet definition. - AlphabetSequence -- Base class for alphabet-constrained sequences. - NucleotideSequence -- DNA sequence (A/T/G/C/N). - AminoAcidSequence -- Standard 20-AA + stop/unknown sequence. - ReducedAminoAcidSequence -- Reduced amino-acid alphabet for fuzzy matching. +"""Biological sequence validation, translation, masking, and matching. + +All functions operate on plain ``str`` or ``bytes`` — no wrapper classes. +Alphabet membership is checked via 256-byte lookup tables (``bytes``) for +O(1) per-character validation. 
Translation uses ``bytes.translate`` with a +pre-built table for native-speed conversion. + +Alphabets +--------- +Three predefined alphabets are provided as module-level ``bytes`` lookup +tables (256 entries, 1 = allowed, 0 = disallowed): + +* ``NT_ALPHABET`` — DNA nucleotides ``ATGCN`` (``N`` = mask). +* ``AA_ALPHABET`` — 20 amino acids + ``*_X`` (``X`` = mask). +* ``REDUCED_AA_ALPHABET`` — Physico-chemical reduced alphabet (``X`` = mask). + +Functions +--------- +* ``make_alphabet`` — Build a 256-byte LUT from a string of allowed chars. +* ``validate`` — Check every byte belongs to an alphabet. +* ``translate`` — Byte-level translation via ``bytes.translate``. +* ``mask`` — Replace position(s) with a mask character. +* ``matches`` — Wildcard-aware positional comparison. +* ``aa_to_reduced`` — Convert amino-acid sequence to reduced alphabet. +* ``matches_aa_reduced``— Cross-alphabet wildcard match (AA vs reduced). """ from __future__ import annotations -import numpy as np -import stringzilla as sz -from typing import Self +Seq = str | bytes | bytearray + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _to_bytes(seq: Seq) -> bytes: + """Normalise *seq* to ``bytes``. Strings are ASCII-encoded.""" + return seq.encode("ascii") if isinstance(seq, str) else bytes(seq) + + +# --------------------------------------------------------------------------- +# Alphabet construction +# --------------------------------------------------------------------------- + +def make_alphabet(chars: str) -> bytes: + """Build a 256-byte lookup table where allowed positions are ``1``. + + Args: + chars: String of allowed ASCII characters. + + Returns: + A 256-byte ``bytes`` object usable as a fast membership LUT. 
+ """ + lut = bytearray(256) + for ch in chars: + lut[ord(ch)] = 1 + return bytes(lut) + + +# --------------------------------------------------------------------------- +# Pre-built alphabets +# --------------------------------------------------------------------------- + +NT_CHARS = "ATGCN" +AA_CHARS = "ACDEFGHIKLMNPQRSTVWY*_X" +REDUCED_AA_CHARS = "lbmcshGFPWYX*_" + +NT_ALPHABET: bytes = make_alphabet(NT_CHARS) +AA_ALPHABET: bytes = make_alphabet(AA_CHARS) +REDUCED_AA_ALPHABET: bytes = make_alphabet(REDUCED_AA_CHARS) + +NT_MASK = ord("N") +AA_MASK = ord("X") +REDUCED_AA_MASK = ord("X") # --------------------------------------------------------------------------- # Amino-acid → reduced-alphabet mapping # --------------------------------------------------------------------------- -#: Maps each standard amino-acid one-letter code (plus ``X``, ``*``, ``_``) -#: to a reduced symbol representing its physico-chemical class. -AMINO_ACID_TO_REDUCED_AMINO_ACID: dict[str, str] = { +#: Per-character mapping from standard amino-acid codes to reduced symbols. +AA_TO_REDUCED: dict[str, str] = { "A": "l", "R": "b", "N": "m", "D": "c", "C": "s", "Q": "m", "E": "c", "G": "G", "H": "b", "I": "l", "L": "l", "K": "b", "M": "s", "F": "F", "P": "P", "S": "h", "T": "h", "W": "W", "Y": "Y", "V": "l", "X": "X", "*": "*", "_": "_", } -#: NumPy uint8 LUT (128 entries) for fast vectorised conversion used by -#: :meth:`AminoAcidSequence.matches_reduced_amino_acid`. -_AA_TO_REDUCED_LUT = np.zeros(128, dtype=np.uint8) -for _aa, _red in AMINO_ACID_TO_REDUCED_AMINO_ACID.items(): - _AA_TO_REDUCED_LUT[ord(_aa)] = ord(_red) +#: ``bytes.translate`` table for fast AA → reduced conversion. +AA_TO_REDUCED_TABLE: bytes = bytes.maketrans( + "".join(AA_TO_REDUCED.keys()).encode(), + "".join(AA_TO_REDUCED.values()).encode(), +) + +#: 256-byte LUT mapping each AA byte to its reduced byte (for matching). 
+_AA_TO_REDUCED_LUT: bytes +_lut = bytearray(256) +for _aa, _red in AA_TO_REDUCED.items(): + _lut[ord(_aa)] = ord(_red) +_AA_TO_REDUCED_LUT = bytes(_lut) +del _lut, _aa, _red # --------------------------------------------------------------------------- -# Alphabet +# Validation # --------------------------------------------------------------------------- -class SequenceAlphabet: - """Singleton immutable alphabet keyed by allowed symbols. +def validate(seq: Seq, alphabet: bytes) -> bytes: + """Validate that every byte of *seq* belongs to *alphabet*. - Instances are cached so that two objects with identical symbol sets are - the *same* object (``is`` holds). + Accepts ``str``, ``bytes``, or ``bytearray``. Strings are + ASCII-encoded first. - Attributes: - allowed_symbols: Ordered tuple of single-character symbols. - _allowed_set: ``frozenset`` of allowed byte values for O(1) lookup. - """ + Args: + seq: Input sequence. + alphabet: 256-byte LUT (1 = allowed). - _instances: dict[tuple[str, ...], SequenceAlphabet] = {} - - def __new__(cls, allowed_symbols: tuple[str, ...]) -> SequenceAlphabet: - key = tuple(allowed_symbols) - if key not in cls._instances: - inst = super().__new__(cls) - cls._instances[key] = inst - return cls._instances[key] - - def __init__(self, allowed_symbols: tuple[str, ...]) -> None: - if hasattr(self, "allowed_symbols"): - return - self.allowed_symbols = tuple(allowed_symbols) - self._allowed_set = frozenset( - c.encode("ascii") for c in self.allowed_symbols - ) - # Kept for any downstream code that uses np.isin against this. - self.allowed_array = np.array( - [c.encode("ascii") for c in self.allowed_symbols], dtype="S1", - ) - - -REDUCED_AMINO_ACID_ALPHABET = SequenceAlphabet( - tuple(dict.fromkeys(AMINO_ACID_TO_REDUCED_AMINO_ACID.values())) -) + Returns: + The validated sequence as ``bytes``. + + Raises: + ValueError: If any byte falls outside the alphabet. 
+ """ + raw = _to_bytes(seq) + for b in raw: + if not alphabet[b]: + raise ValueError( + f"Sequence contains symbol {chr(b)!r} outside of alphabet" + ) + return raw # --------------------------------------------------------------------------- -# Base sequence +# Translation # --------------------------------------------------------------------------- -class AlphabetSequence: - """Immutable alphabet-validated sequence backed by a ``stringzilla.Str``. - - Each instance stores **only** a ``sz.Str`` buffer. The alphabet and mask - symbol live on the class, not on instances. +def translate(seq: Seq, table: bytes) -> bytes: + """Translate *seq* byte-by-byte using a ``bytes.maketrans`` *table*. - **Equality vs matching:** - ``__eq__`` / ``__hash__`` compare raw bytes (for ``dict`` / ``set`` use). - :meth:`matches` performs wildcard-aware comparison where mask characters - (``N`` for nucleotides, ``X`` for amino-acid types) count as matching any - symbol — so two sequences can *match* without being *equal*. + Args: + seq: Input sequence (``str``, ``bytes``, or ``bytearray``). + table: A 256-byte translation table (from ``bytes.maketrans``). - Subclass protocol: - * ``DEFAULT_ALPHABET`` — :class:`SequenceAlphabet` with allowed symbols. - * ``_MASK_BYTE`` — ``b"N"``, ``b"X"``, or ``b""`` (no masking). + Returns: + Translated ``bytes``. """ + return _to_bytes(seq).translate(table) - __slots__ = ("_sz",) - - DEFAULT_ALPHABET: SequenceAlphabet # set by subclasses - _MASK_BYTE: bytes = b"" - - def __init__(self, data: np.ndarray) -> None: - """Validate *data* against the class alphabet and store as ``sz.Str``. - - Args: - data: One-dimensional ``S1``-dtype NumPy array. - - Raises: - ValueError: If dtype is not ``S1``, array is not 1-D, or - symbols fall outside ``DEFAULT_ALPHABET``. 
- """ - if data.dtype != np.dtype("S1"): - raise ValueError("Sequence storage must have dtype S1") - if data.ndim != 1: - raise ValueError("Sequence storage must be one-dimensional") - allowed = self.DEFAULT_ALPHABET._allowed_set - raw = data.tobytes() - for b in raw: - if b.to_bytes(1, "little") not in allowed: - raise ValueError("Sequence contains symbols outside of alphabet") - self._sz = sz.Str(raw) - - @classmethod - def _from_trusted_bytes(cls: type[Self], raw: bytes) -> Self: - """Fast-path constructor that skips alphabet validation. - - The caller **must** guarantee that every byte in *raw* belongs to - ``cls.DEFAULT_ALPHABET``. This is used internally by - :func:`~mir.basic.tokens.tokenize` and :meth:`substring` where the - source data has already been validated. - """ - inst = object.__new__(cls) - inst._sz = sz.Str(raw) - return inst - - # -- constructors ------------------------------------------------------- - - @classmethod - def from_string(cls: type[Self], sequence: str) -> Self: - """Create a sequence from a plain Python string.""" - arr = np.frombuffer(sequence.encode("ascii"), dtype="S1").copy() - return cls(arr) - - # -- accessors ---------------------------------------------------------- - - @property - def data(self) -> np.ndarray: - """Read-only ``S1`` NumPy view of the sequence bytes.""" - arr = np.frombuffer(bytes(self._sz), dtype="S1").copy() - arr.flags.writeable = False - return arr - - @property - def content(self) -> np.ndarray: - """Alias for :attr:`data` (backward compatibility).""" - return self.data - - def to_string(self) -> str: - """Decode the sequence to a plain Python string.""" - return str(self._sz) - - def to_bytes(self) -> bytes: - """Return the raw byte content.""" - return bytes(self._sz) - - def substring(self, start: int, stop: int | None = None) -> Self: - """Return a new sequence for the half-open range ``[start, stop)``. - - Uses ``sz.Str`` slicing for a zero-copy view, then stores a copy. 
- """ - sliced = self._sz[start:stop] - return type(self)._from_trusted_bytes(bytes(sliced)) - - # -- masking ------------------------------------------------------------ - - def mask(self, position: int | slice | tuple[int, int]) -> Self: - """Return a copy with the given position(s) replaced by the mask byte. - - Args: - position: Integer index, ``slice``, or ``(start, stop)`` tuple. - - Raises: - ValueError: If this class does not support masking. - IndexError: If an integer position is out of bounds. - """ - if not self._MASK_BYTE: - raise ValueError(f"Masking not supported for {type(self).__name__}") - buf = bytearray(bytes(self._sz)) - mask_val = self._MASK_BYTE[0] - if isinstance(position, int): - n = len(buf) - if position < 0: - position += n - if position < 0 or position >= n: - raise IndexError("Mask position out of range") - buf[position] = mask_val - elif isinstance(position, slice): - for i in range(*position.indices(len(buf))): - buf[i] = mask_val - elif isinstance(position, tuple) and len(position) == 2: - for i in range(position[0], position[1]): - buf[i] = mask_val - else: - raise TypeError("position must be int, slice, or (start, stop) tuple") - return type(self)._from_trusted_bytes(bytes(buf)) - - # -- wildcard matching (NOT equality) ----------------------------------- - - def matches(self, other: AlphabetSequence) -> bool: - """Wildcard-aware positional comparison. - - Returns ``True`` when the sequences have the same length and at - every position the symbols are equal **or** at least one side - carries a mask character. This is intentionally **not** the same - as ``__eq__`` which compares bytes exactly. 
- """ - if len(self) != len(other): - return False - if len(self) == 0: - return True - sb = bytes(self._sz) - ob = bytes(other._sz) - if sb == ob: - return True - sm = self._MASK_BYTE[0] if self._MASK_BYTE else -1 - om = other._MASK_BYTE[0] if other._MASK_BYTE else -1 - for a, b in zip(sb, ob): - if a == b or a == sm or b == om: - continue - return False - return True - - # -- equality & hashing (byte-exact) ------------------------------------ - - def __eq__(self, other: object) -> bool: - if type(self) is not type(other): - return NotImplemented - return bytes(self._sz) == bytes(other._sz) - def __hash__(self) -> int: - return hash(bytes(self._sz)) +def aa_to_reduced(seq: Seq) -> bytes: + """Convert an amino-acid sequence to the reduced physico-chemical alphabet. - def __len__(self) -> int: - return len(self._sz) + Uses ``bytes.translate`` with a pre-built table for native speed. - def __str__(self) -> str: - return self.to_string() + Args: + seq: Amino-acid sequence (``str``, ``bytes``, or ``bytearray``). - def __repr__(self) -> str: - return f"{type(self).__name__}({self.to_string()!r})" + Returns: + Reduced-alphabet ``bytes``. + """ + return _to_bytes(seq).translate(AA_TO_REDUCED_TABLE) # --------------------------------------------------------------------------- -# Concrete sequence types +# Masking # --------------------------------------------------------------------------- -class NucleotideSequence(AlphabetSequence): - """DNA nucleotide sequence (``A``, ``T``, ``G``, ``C``, ``N``). +def mask(seq: Seq, position: int | slice | tuple[int, int], mask_byte: int) -> bytes: + """Return a copy of *seq* with the given position(s) replaced by *mask_byte*. + + Args: + seq: Input sequence. + position: Single index, ``slice``, or ``(start, stop)`` half-open range. + mask_byte: Replacement byte value (e.g. ``ord('N')`` or ``NT_MASK``). - ``N`` serves as the mask / ambiguity symbol. + Returns: + New ``bytes`` with the specified positions masked. 
+ + Raises: + IndexError: If a single-index position is out of bounds. """ + buf = bytearray(_to_bytes(seq)) + if isinstance(position, int): + n = len(buf) + if position < 0: + position += n + if position < 0 or position >= n: + raise IndexError("Mask position out of range") + buf[position] = mask_byte + elif isinstance(position, slice): + for i in range(*position.indices(len(buf))): + buf[i] = mask_byte + elif isinstance(position, tuple) and len(position) == 2: + for i in range(position[0], position[1]): + buf[i] = mask_byte + else: + raise TypeError("position must be int, slice, or (start, stop) tuple") + return bytes(buf) - __slots__ = () - DEFAULT_ALPHABET = SequenceAlphabet(("A", "T", "G", "C", "N")) - _MASK_BYTE = b"N" +# --------------------------------------------------------------------------- +# Wildcard matching +# --------------------------------------------------------------------------- -class AminoAcidSequence(AlphabetSequence): - """Standard 20-letter amino-acid sequence. +def matches(a: Seq, b: Seq, mask_byte: int) -> bool: + """Wildcard-aware positional comparison. - The alphabet includes ``*`` (stop), ``_`` (gap), and ``X`` (unknown). - ``X`` serves as the mask / wildcard symbol. - """ + Returns ``True`` when *a* and *b* have the same length and at every + position the bytes are equal **or** at least one side carries + *mask_byte*. This is **not** the same as ``a == b``. + + Args: + a: First sequence. + b: Second sequence. + mask_byte: The wildcard byte value (e.g. ``NT_MASK``). - __slots__ = () - DEFAULT_ALPHABET = SequenceAlphabet(( - "A", "C", "D", "E", "F", "G", "H", "I", "K", "L", - "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", - "*", "_", "X", - )) - _MASK_BYTE = b"X" - - def to_reduced_amino_acid(self) -> ReducedAminoAcidSequence: - """Convert to the reduced physico-chemical alphabet via ``sz.translate``. 
- - Uses the :data:`AMINO_ACID_TO_REDUCED_AMINO_ACID` char→char mapping - applied through ``stringzilla.Str.translate`` for native-speed - byte-level translation. - """ - translated: bytes = self._sz.translate(AMINO_ACID_TO_REDUCED_AMINO_ACID) - return ReducedAminoAcidSequence._from_trusted_bytes(translated) - - def to_simple_amino_acid(self) -> ReducedAminoAcidSequence: - """Backwards-compatible alias for :meth:`to_reduced_amino_acid`.""" - return self.to_reduced_amino_acid() - - def matches_reduced_amino_acid(self, reduced: ReducedAminoAcidSequence) -> bool: - """Wildcard-aware match against a reduced amino-acid sequence. - - Each position of *self* is first mapped to the reduced alphabet via a - byte lookup table; then positions are compared treating ``X`` on - either side as a wildcard. Like :meth:`matches`, this is **not** an - equality test. - """ - if len(self) != len(reduced): - return False - if len(self) == 0: - return True - # Use numpy LUT for the comparison (avoids creating an intermediate - # ReducedAminoAcidSequence object). - self_np = np.frombuffer(bytes(self._sz), dtype=np.uint8) - converted = _AA_TO_REDUCED_LUT[self_np].view("S1") - reduced_np = np.frombuffer(bytes(reduced._sz), dtype="S1") - eq = converted == reduced_np - if eq.all(): - return True - mask_x = np.array(b"X", dtype="S1") - self_s1 = self_np.view("S1") - return bool((eq | (self_s1 == mask_x) | (reduced_np == mask_x)).all()) - - def matches_simple_amino_acid(self, simple: ReducedAminoAcidSequence) -> bool: - """Backwards-compatible alias for :meth:`matches_reduced_amino_acid`.""" - return self.matches_reduced_amino_acid(simple) - - -class ReducedAminoAcidSequence(AlphabetSequence): - """Sequence in the reduced physico-chemical amino-acid alphabet. - - Symbols: ``l b m c s h G F P W Y X * _``. ``X`` is the mask / wildcard. - Instances are typically obtained via - :meth:`AminoAcidSequence.to_reduced_amino_acid`. + Returns: + ``True`` if the sequences match, ``False`` otherwise. 
""" + ba = _to_bytes(a) + bb = _to_bytes(b) + if len(ba) != len(bb): + return False + if ba == bb: + return True + for x, y in zip(ba, bb): + if x == y or x == mask_byte or y == mask_byte: + continue + return False + return True + + +def matches_aa_reduced(aa_seq: Seq, reduced_seq: Seq) -> bool: + """Wildcard-aware match between an amino-acid and a reduced-alphabet sequence. - __slots__ = () - DEFAULT_ALPHABET = REDUCED_AMINO_ACID_ALPHABET - _MASK_BYTE = b"X" + Each byte of *aa_seq* is first mapped to the reduced alphabet via a + byte LUT, then compared against *reduced_seq*. ``X`` (mask) on either + side counts as a wildcard. + Args: + aa_seq: Amino-acid sequence. + reduced_seq: Reduced-alphabet sequence. -#: Backwards-compatible class alias. -SimpleAminoAcidSequence = ReducedAminoAcidSequence + Returns: + ``True`` if every position matches (accounting for wildcards). + """ + ba = _to_bytes(aa_seq) + br = _to_bytes(reduced_seq) + if len(ba) != len(br): + return False + if len(ba) == 0: + return True + lut = _AA_TO_REDUCED_LUT + mask_x = AA_MASK + for a, r in zip(ba, br): + conv = lut[a] + if conv == r or a == mask_x or r == mask_x: + continue + return False + return True diff --git a/mir/basic/tokens.py b/mir/basic/tokens.py index 17cb546..45833c4 100644 --- a/mir/basic/tokens.py +++ b/mir/basic/tokens.py @@ -1,87 +1,113 @@ -"""K-mer tokenisation of :class:`~mir.basic.sequence.AlphabetSequence` objects. - -Uses ``stringzilla.Str`` slicing for zero-copy windowing and the fast -:meth:`~mir.basic.sequence.AlphabetSequence._from_trusted_bytes` constructor -to bypass per-k-mer alphabet validation (the source sequence was already -validated on construction). - -Functions: - tokenize -- Extract overlapping k-mers, optionally with gapped variants. +"""K-mer tokenisation for biological sequences. + +Provides plain and gapped k-mer extraction operating on ``str`` or ``bytes`` +inputs. 
Both approaches use bytes slicing internally (``str.encode`` is +virtually free for short ASCII sequences and ``bytes`` slicing is faster than +``str`` slicing in CPython). + +Functions +--------- +* ``tokenize`` — Overlapping k-mers as a ``list[bytes]``. +* ``tokenize_gapped`` — Gapped (single-position masked) k-mers as ``list[bytes]``. +* ``tokenize_str`` — Same as ``tokenize`` returning ``list[str]``. +* ``tokenize_gapped_str`` — Same as ``tokenize_gapped`` returning ``list[str]``. """ from __future__ import annotations -import stringzilla as sz - -from mir.basic.sequence import AlphabetSequence +from mir.basic.sequence import Seq, _to_bytes -def tokenize( - sequence: AlphabetSequence, - k: int, - *, - gapped: bool = False, -) -> list[AlphabetSequence]: - """Split *sequence* into overlapping k-mers of length *k*. +# --------------------------------------------------------------------------- +# Plain k-mers +# --------------------------------------------------------------------------- - Uses ``sz.Str`` slicing (zero-copy view) for each window and - :meth:`AlphabetSequence._from_trusted_bytes` to construct k-mer objects - without re-validating the alphabet. +def tokenize(seq: Seq, k: int) -> list[bytes]: + """Extract overlapping k-mers of length *k* from *seq*. - When *gapped* is ``True``, instead of plain k-mers, each window position - produces *k* gapped variants where exactly one position within the k-mer - is replaced by the mask byte (``N`` for nucleotides, ``X`` for amino-acid - types). For example, with ``k=3`` and amino-acid sequence ``CASSL``:: - - position 0 → XAS CXS CAX - position 1 → XSS AXS ASX - position 2 → XSL SXL SSX + Uses ``bytes`` slicing for speed; accepts ``str``, ``bytes``, + or ``bytearray``. Args: - sequence: Input sequence to tokenize. - k: K-mer length. Must satisfy ``1 <= k <= len(sequence)``. - gapped: If ``True``, emit gapped (single-position masked) k-mers - rather than plain k-mers. + seq: Input sequence. + k: K-mer length. 
Must satisfy ``1 <= k <= len(seq)``. Returns: - A flat list of k-mer sequences. Plain mode yields - ``len(sequence) - k + 1`` items; gapped mode yields - ``(len(sequence) - k + 1) * k`` items. + List of ``bytes`` k-mers (length ``len(seq) - k + 1``). Raises: - ValueError: If *k* < 1 or *k* > ``len(sequence)``. + ValueError: If *k* < 1 or *k* > ``len(seq)``. """ - n = len(sequence) + raw = _to_bytes(seq) + n = len(raw) if k < 1 or k > n: raise ValueError( f"k must be between 1 and sequence length ({n}), got {k}" ) + return [raw[i : i + k] for i in range(n - k + 1)] + + +def tokenize_str(seq: Seq, k: int) -> list[str]: + """Like :func:`tokenize` but returns ``list[str]``. + + Internally converts to bytes, tokenizes, then decodes each k-mer. + """ + return [km.decode("ascii") for km in tokenize(seq, k)] + + +# --------------------------------------------------------------------------- +# Gapped k-mers +# --------------------------------------------------------------------------- + +def tokenize_gapped(seq: Seq, k: int, mask_byte: int) -> list[bytes]: + """Extract gapped k-mers: for each window, *k* variants with one + position replaced by *mask_byte*. + + For window ``CAS`` with mask ``X`` (88):: - cls = type(sequence) - raw_sz = sequence._sz # stringzilla.Str — slicing is zero-copy + XAS CXS CAX - if not gapped: - result: list[AlphabetSequence] = [] - for i in range(n - k + 1): - result.append(cls._from_trusted_bytes(bytes(raw_sz[i : i + k]))) - return result + Args: + seq: Input sequence. + k: K-mer length. Must satisfy ``1 <= k <= len(seq)``. + mask_byte: Replacement byte value (e.g. ``ord('X')``). + + Returns: + List of ``bytes`` gapped k-mers. + Length is ``(len(seq) - k + 1) * k``. - # Gapped mode: for each window spawn k variants, each with one - # position replaced by the mask byte. - mask_byte = sequence._MASK_BYTE - if not mask_byte: + Raises: + ValueError: If *k* < 1 or *k* > ``len(seq)``. 
+ """ + raw = _to_bytes(seq) + n = len(raw) + if k < 1 or k > n: raise ValueError( - f"Gapped tokenisation requires a mask byte; " - f"{cls.__name__} does not define one" + f"k must be between 1 and sequence length ({n}), got {k}" ) - mask_val = mask_byte[0] - - result = [] - for i in range(n - k + 1): - window = bytes(raw_sz[i : i + k]) + n_windows = n - k + 1 + n_gapped = n_windows * k + out = bytearray(n_gapped * k) + offset = 0 + for i in range(n_windows): + window = raw[i : i + k] for j in range(k): - buf = bytearray(window) - buf[j] = mask_val - result.append(cls._from_trusted_bytes(bytes(buf))) - return result + out[offset : offset + k] = window + out[offset + j] = mask_byte + offset += k + frozen = bytes(out) + return [frozen[i * k : (i + 1) * k] for i in range(n_gapped)] + +def tokenize_gapped_str(seq: Seq, k: int, mask_char: str) -> list[str]: + """Like :func:`tokenize_gapped` but returns ``list[str]``. + + Args: + seq: Input sequence. + k: K-mer length. + mask_char: Single-character mask string (e.g. ``"X"``). + """ + return [ + km.decode("ascii") + for km in tokenize_gapped(seq, k, ord(mask_char)) + ] diff --git a/tests/test_memory_benchmark.py b/tests/test_memory_benchmark.py new file mode 100644 index 0000000..0b5267b --- /dev/null +++ b/tests/test_memory_benchmark.py @@ -0,0 +1,133 @@ +"""Memory benchmark for k-mer tokenisation. + +Uses ``tracemalloc`` to measure memory for: + +1. Plain k-mers: tokenize() vs naive str slicing vs naive bytes slicing. +2. Gapped k-mers: tokenize_gapped() vs naive approaches. + +Run with ``python -m pytest tests/test_memory_benchmark.py -s``. 
+""" + +import random +import tracemalloc +import unittest + +from mir.basic.sequence import AA_MASK +from mir.basic.tokens import tokenize, tokenize_gapped + +N = 100_000 +SEQ_LEN = 15 +K = 3 +MASK_STR = "X" + +_AA_LETTERS = "ACDEFGHIKLMNPQRSTVWY" + + +def _random_strings(n: int, length: int) -> list[str]: + rng = random.Random(42) + return ["".join(rng.choices(_AA_LETTERS, k=length)) for _ in range(n)] + + +def _fmt(nbytes: int) -> str: + return f"{nbytes / 1024:.1f} KiB" + + +class TestMemoryBenchmark(unittest.TestCase): + + def test_plain_kmer_memory(self) -> None: + """Compare memory: tokenize() vs naive str/bytes slicing.""" + strings = _random_strings(N, SEQ_LEN) + byte_strings = [s.encode() for s in strings] + expected = N * (SEQ_LEN - K + 1) + + # naive str slices + tracemalloc.start() + str_kmers = [] + for s in strings: + str_kmers.extend(s[i : i + K] for i in range(len(s) - K + 1)) + cur_str, peak_str = tracemalloc.get_traced_memory() + tracemalloc.stop() + + # naive bytes slices + tracemalloc.start() + bytes_kmers = [] + for b in byte_strings: + bytes_kmers.extend(b[i : i + K] for i in range(len(b) - K + 1)) + cur_bytes, peak_bytes = tracemalloc.get_traced_memory() + tracemalloc.stop() + + # tokenize(bytes) + tracemalloc.start() + tok_kmers = [] + for b in byte_strings: + tok_kmers.extend(tokenize(b, K)) + cur_tok, peak_tok = tracemalloc.get_traced_memory() + tracemalloc.stop() + + self.assertEqual(len(str_kmers), expected) + self.assertEqual(len(bytes_kmers), expected) + self.assertEqual(len(tok_kmers), expected) + + print( + f"\n{'Approach':<32} {'Count':>8} {'Current':>12} {'Peak':>12} " + f"{'Per-item':>10}\n" + f"{'-' * 76}" + ) + for lbl, count, cur, peak in [ + ("naive str slices", len(str_kmers), cur_str, peak_str), + ("naive bytes slices", len(bytes_kmers), cur_bytes, peak_bytes), + ("tokenize(bytes)", len(tok_kmers), cur_tok, peak_tok), + ]: + per = cur / count if count else 0 + print( + f"{lbl:<32} {count:>8} {_fmt(cur):>12} {_fmt(peak):>12} 
" + f"{per:>8.0f} B" + ) + + def test_gapped_kmer_memory(self) -> None: + """Compare memory: tokenize_gapped() vs naive gapped str slicing.""" + strings = _random_strings(N, SEQ_LEN) + byte_strings = [s.encode() for s in strings] + n_windows = SEQ_LEN - K + 1 + expected = N * n_windows * K + + # naive str gapped + tracemalloc.start() + str_gapped = [] + for s in strings: + for i in range(len(s) - K + 1): + w = s[i : i + K] + for j in range(K): + str_gapped.append(w[:j] + MASK_STR + w[j + 1 :]) + cur_str, peak_str = tracemalloc.get_traced_memory() + tracemalloc.stop() + + # tokenize_gapped(bytes) + tracemalloc.start() + tok_gapped = [] + for b in byte_strings: + tok_gapped.extend(tokenize_gapped(b, K, AA_MASK)) + cur_tok, peak_tok = tracemalloc.get_traced_memory() + tracemalloc.stop() + + self.assertEqual(len(str_gapped), expected) + self.assertEqual(len(tok_gapped), expected) + + print( + f"\n{'Approach':<32} {'Count':>8} {'Current':>12} {'Peak':>12} " + f"{'Per-item':>10}\n" + f"{'-' * 76}" + ) + for lbl, count, cur, peak in [ + ("naive str gapped", len(str_gapped), cur_str, peak_str), + ("tokenize_gapped(bytes)", len(tok_gapped), cur_tok, peak_tok), + ]: + per = cur / count if count else 0 + print( + f"{lbl:<32} {count:>8} {_fmt(cur):>12} {_fmt(peak):>12} " + f"{per:>8.0f} B" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_sequence.py b/tests/test_sequence.py index 33bbca0..04903a8 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -1,213 +1,190 @@ -"""Unit tests for :mod:`mir.basic.sequence`. +"""Unit tests for :mod:`mir.basic.sequence` functions. Coverage: - SequenceAlphabet -- singleton caching. - AlphabetSequence -- construction, string round-trip, substring, - immutability, ``__eq__``, ``__hash__``. - NucleotideSequence -- parsing, slicing, masking. - AminoAcidSequence -- parsing, slicing, reduced conversion, matching. - ReducedAminoAcidSequence -- parsing, slicing, masking, matching. 
- Equality vs matching -- ``matches()`` is wildcard-aware, ``==`` is not. + make_alphabet / validate — alphabet construction and validation. + aa_to_reduced / translate — byte-level translation. + mask — single-index, range, and slice masking. + matches — wildcard-aware comparison. + matches_aa_reduced — cross-alphabet wildcard match. + str / bytes duality — every function accepts both types. """ import unittest -import numpy as np - from mir.basic.sequence import ( - AminoAcidSequence, - NucleotideSequence, - ReducedAminoAcidSequence, - SequenceAlphabet, + AA_ALPHABET, + AA_MASK, + AA_TO_REDUCED_TABLE, + NT_ALPHABET, + NT_MASK, + REDUCED_AA_ALPHABET, + REDUCED_AA_MASK, + aa_to_reduced, + make_alphabet, + mask, + matches, + matches_aa_reduced, + translate, + validate, ) -class TestAlphabetSequence(unittest.TestCase): - """Construction, round-trip, substring, immutability.""" +class TestMakeAlphabet(unittest.TestCase): - def test_create_convert_and_substring(self) -> None: - self.assertIs( - NucleotideSequence.DEFAULT_ALPHABET, - SequenceAlphabet(("A", "T", "G", "C", "N")), - ) + def test_custom_alphabet(self) -> None: + lut = make_alphabet("AB") + self.assertEqual(len(lut), 256) + self.assertEqual(lut[ord("A")], 1) + self.assertEqual(lut[ord("B")], 1) + self.assertEqual(lut[ord("C")], 0) - nt = NucleotideSequence.from_string("ATTAGACA") - self.assertEqual(nt.to_string(), "ATTAGACA") - self.assertEqual(nt.data.dtype, np.dtype("S1")) - self.assertEqual(nt.data.tobytes(), b"ATTAGACA") - self.assertEqual(nt.substring(2, 6).to_string(), "TAGA") + def test_predefined_nt(self) -> None: + for ch in "ATGCN": + self.assertEqual(NT_ALPHABET[ord(ch)], 1) + self.assertEqual(NT_ALPHABET[ord("U")], 0) - aa = AminoAcidSequence.from_string("CASSLAPGATNEKLFF") - self.assertEqual(aa.to_string(), "CASSLAPGATNEKLFF") - self.assertEqual(aa.substring(4, 9).to_string(), "LAPGA") + def test_predefined_aa(self) -> None: + for ch in "ACDEFGHIKLMNPQRSTVWYX*_": + 
self.assertEqual(AA_ALPHABET[ord(ch)], 1) + self.assertEqual(AA_ALPHABET[ord("B")], 0) - def test_empty_or_invalid_sequence(self) -> None: - empty_nt = NucleotideSequence.from_string("") - self.assertEqual(len(empty_nt), 0) - self.assertEqual(empty_nt.to_string(), "") + def test_predefined_reduced(self) -> None: + for ch in "lbmcshGFPWYX*_": + self.assertEqual(REDUCED_AA_ALPHABET[ord(ch)], 1) + self.assertEqual(REDUCED_AA_ALPHABET[ord("Z")], 0) - empty_aa = AminoAcidSequence.from_string("") - self.assertEqual(len(empty_aa), 0) - self.assertEqual(empty_aa.to_string(), "") - self.assertEqual( - NucleotideSequence.from_string("ATTAGACA").substring(0, 0).to_string(), "" - ) +class TestValidate(unittest.TestCase): - self.assertEqual(NucleotideSequence.from_string("ATN").to_string(), "ATN") + def test_valid_nt_str(self) -> None: + self.assertEqual(validate("ATTAGACA", NT_ALPHABET), b"ATTAGACA") - with self.assertRaises(ValueError): - NucleotideSequence.from_string("ATU") + def test_valid_nt_bytes(self) -> None: + self.assertEqual(validate(b"ATN", NT_ALPHABET), b"ATN") + + def test_valid_aa_bytearray(self) -> None: + self.assertEqual(validate(bytearray(b"CAST"), AA_ALPHABET), b"CAST") + def test_empty(self) -> None: + self.assertEqual(validate("", NT_ALPHABET), b"") + self.assertEqual(validate(b"", AA_ALPHABET), b"") + + def test_invalid_nt(self) -> None: with self.assertRaises(ValueError): - AminoAcidSequence.from_string("B") + validate("ATU", NT_ALPHABET) - def test_immutability(self) -> None: - """The underlying byte array is read-only.""" - nt = NucleotideSequence.from_string("ATCG") + def test_invalid_aa(self) -> None: with self.assertRaises(ValueError): - nt.data[0] = b"G" - - def test_no_extra_attributes(self) -> None: - """__slots__ prevents adding arbitrary instance attributes.""" - nt = NucleotideSequence.from_string("ATCG") - with self.assertRaises(AttributeError): - nt.foo = 42 # type: ignore[attr-defined] - - def test_content_backward_compat(self) -> None: - 
"""The .content property still works.""" - nt = NucleotideSequence.from_string("ATCG") - np.testing.assert_array_equal(nt.content, nt.data) - - def test_repr(self) -> None: - nt = NucleotideSequence.from_string("ATCG") - self.assertEqual(repr(nt), "NucleotideSequence('ATCG')") - - -class TestEqualityAndHashing(unittest.TestCase): - """``__eq__`` and ``__hash__`` use raw bytes, not wildcard matching.""" - - def test_equal_sequences(self) -> None: - a = NucleotideSequence.from_string("ATCG") - b = NucleotideSequence.from_string("ATCG") - self.assertEqual(a, b) - self.assertEqual(hash(a), hash(b)) - - def test_unequal_sequences(self) -> None: - a = NucleotideSequence.from_string("ATCG") - b = NucleotideSequence.from_string("ATNG") - self.assertNotEqual(a, b) - - def test_masked_not_equal_but_matches(self) -> None: - """A masked sequence matches the original but is not equal.""" - orig = NucleotideSequence.from_string("ATCG") - masked = NucleotideSequence.from_string("ANNG") - self.assertNotEqual(orig, masked) - self.assertTrue(orig.matches(masked)) - - def test_set_and_dict_storage(self) -> None: - a = AminoAcidSequence.from_string("CAST") - b = AminoAcidSequence.from_string("CAST") - c = AminoAcidSequence.from_string("XAST") - s = {a, b, c} - self.assertEqual(len(s), 2) - d = {a: 1} - self.assertEqual(d[b], 1) - self.assertNotIn(c, d) - - def test_cross_type_not_equal(self) -> None: - """Different types with identical bytes are not equal.""" - aa = AminoAcidSequence.from_string("X") - red = ReducedAminoAcidSequence.from_string("X") - self.assertNotEqual(aa, red) - - -class TestReducedAminoAcidSequence(unittest.TestCase): - """Reduced-alphabet conversion and matching.""" - - def test_conversion_via_byte_lut(self) -> None: - aa = AminoAcidSequence.from_string("CASTIVGGLSQDKIVW") - reduced = aa.to_reduced_amino_acid() - self.assertEqual(reduced.to_string(), "slhhllGGlhmcbllW") - - def test_match_and_mismatch(self) -> None: - aa = 
AminoAcidSequence.from_string("CASTIVGGLSQDKIVW") - reduced = aa.to_reduced_amino_acid() - self.assertTrue(aa.matches_reduced_amino_acid(reduced)) - self.assertFalse( - aa.matches_reduced_amino_acid( - ReducedAminoAcidSequence.from_string("slhhllGGlhmcbllY") - ) - ) - - def test_masked_aa_matches_reduced(self) -> None: - aa = AminoAcidSequence.from_string("CASTIVGGLSQDKIVW") - reduced = aa.to_reduced_amino_acid() - self.assertTrue(aa.mask(2).matches_reduced_amino_acid(reduced)) - - def test_masked_reduced_matches_aa(self) -> None: - aa = AminoAcidSequence.from_string("CASTIVGGLSQDKIVW") - reduced = aa.to_reduced_amino_acid() - self.assertTrue(aa.matches_reduced_amino_acid(reduced.mask((2, 5)))) - - def test_backwards_compatible_aliases(self) -> None: - aa = AminoAcidSequence.from_string("CAST") - reduced = aa.to_simple_amino_acid() - self.assertIsInstance(reduced, ReducedAminoAcidSequence) - self.assertTrue(aa.matches_simple_amino_acid(reduced)) - - def test_reduced_substrings(self) -> None: - reduced = ReducedAminoAcidSequence.from_string("slhhllGGlhmcbllW") - self.assertEqual(reduced.substring(0, 4).to_string(), "slhh") - self.assertEqual(reduced.substring(6, 8).to_string(), "GG") - self.assertEqual(reduced.substring(11, None).to_string(), "cbllW") + validate("B", AA_ALPHABET) + def test_invalid_reduced(self) -> None: with self.assertRaises(ValueError): - ReducedAminoAcidSequence.from_string("Z") - - -class TestMaskAndMatch(unittest.TestCase): - """Masking and wildcard-aware matching.""" - - def test_nucleotide_mask_single_and_range(self) -> None: - seq = NucleotideSequence.from_string("ATCGAT") - self.assertEqual(seq.mask(1).to_string(), "ANCGAT") - self.assertEqual(seq.mask((2, 5)).to_string(), "ATNNNT") - self.assertEqual(seq.mask(slice(0, 3)).to_string(), "NNNGAT") - - def test_amino_and_reduced_mask(self) -> None: - aa = AminoAcidSequence.from_string("CASTIV") - reduced = ReducedAminoAcidSequence.from_string("slhhll") - 
self.assertEqual(aa.mask(0).to_string(), "XASTIV") - self.assertEqual(aa.mask((1, 4)).to_string(), "CXXXIV") - self.assertEqual(reduced.mask(slice(2, 5)).to_string(), "slXXXl") - - def test_matching_ignores_mask_symbols(self) -> None: - nt1 = NucleotideSequence.from_string("ATCG") - nt2 = NucleotideSequence.from_string("ANNG") - self.assertTrue(nt1.matches(nt2)) - self.assertFalse(nt1.matches(NucleotideSequence.from_string("ANNA"))) - - aa1 = AminoAcidSequence.from_string("CAST") - aa2 = AminoAcidSequence.from_string("XASX") - self.assertTrue(aa1.matches(aa2)) - self.assertFalse(aa1.matches(AminoAcidSequence.from_string("XATX"))) - - red1 = ReducedAminoAcidSequence.from_string("slhh") - red2 = ReducedAminoAcidSequence.from_string("sXXh") - self.assertTrue(red1.matches(red2)) - self.assertFalse(red1.matches(ReducedAminoAcidSequence.from_string("sXXY"))) - - def test_length_mismatch_does_not_match(self) -> None: - a = NucleotideSequence.from_string("ATC") - b = NucleotideSequence.from_string("ATCG") - self.assertFalse(a.matches(b)) - - def test_empty_sequences_match(self) -> None: - a = NucleotideSequence.from_string("") - b = NucleotideSequence.from_string("") - self.assertTrue(a.matches(b)) - self.assertEqual(a, b) + validate("Z", REDUCED_AA_ALPHABET) + + +class TestTranslateAndReduce(unittest.TestCase): + + def test_aa_to_reduced_str(self) -> None: + self.assertEqual(aa_to_reduced("CASTIVGGLSQDKIVW"), b"slhhllGGlhmcbllW") + + def test_aa_to_reduced_bytes(self) -> None: + self.assertEqual(aa_to_reduced(b"CASTIVGGLSQDKIVW"), b"slhhllGGlhmcbllW") + + def test_generic_translate(self) -> None: + self.assertEqual(translate("CAST", AA_TO_REDUCED_TABLE), b"slhh") + + def test_empty_translate(self) -> None: + self.assertEqual(aa_to_reduced(""), b"") + + +class TestMask(unittest.TestCase): + + def test_single_nt(self) -> None: + self.assertEqual(mask("ATCGAT", 1, NT_MASK), b"ANCGAT") + + def test_range_nt(self) -> None: + self.assertEqual(mask("ATCGAT", (2, 5), NT_MASK), 
b"ATNNNT") + + def test_slice_nt(self) -> None: + self.assertEqual(mask("ATCGAT", slice(0, 3), NT_MASK), b"NNNGAT") + + def test_aa_single(self) -> None: + self.assertEqual(mask("CASTIV", 0, AA_MASK), b"XASTIV") + + def test_aa_range(self) -> None: + self.assertEqual(mask("CASTIV", (1, 4), AA_MASK), b"CXXXIV") + + def test_reduced_slice(self) -> None: + self.assertEqual(mask("slhhll", slice(2, 5), REDUCED_AA_MASK), b"slXXXl") + + def test_bytes_input(self) -> None: + self.assertEqual(mask(b"ATCG", 0, NT_MASK), b"NTCG") + + def test_out_of_range(self) -> None: + with self.assertRaises(IndexError): + mask("AT", 5, NT_MASK) + + +class TestMatches(unittest.TestCase): + + def test_identical(self) -> None: + self.assertTrue(matches("ATCG", "ATCG", NT_MASK)) + + def test_wildcard_match(self) -> None: + self.assertTrue(matches("ATCG", "ANNG", NT_MASK)) + + def test_no_match(self) -> None: + self.assertFalse(matches("ATCG", "ANNA", NT_MASK)) + + def test_length_mismatch(self) -> None: + self.assertFalse(matches("ATC", "ATCG", NT_MASK)) + + def test_empty(self) -> None: + self.assertTrue(matches("", "", NT_MASK)) + + def test_aa_wildcard(self) -> None: + self.assertTrue(matches("CAST", "XASX", AA_MASK)) + self.assertFalse(matches("CAST", "XATX", AA_MASK)) + + def test_reduced_wildcard(self) -> None: + self.assertTrue(matches("slhh", "sXXh", REDUCED_AA_MASK)) + self.assertFalse(matches("slhh", "sXXY", REDUCED_AA_MASK)) + + def test_bytes_input(self) -> None: + self.assertTrue(matches(b"ATCG", b"ANNG", NT_MASK)) + + def test_mixed_str_bytes(self) -> None: + self.assertTrue(matches("ATCG", b"ANNG", NT_MASK)) + + +class TestMatchesAaReduced(unittest.TestCase): + + def test_match(self) -> None: + reduced = aa_to_reduced("CASTIVGGLSQDKIVW") + self.assertTrue(matches_aa_reduced("CASTIVGGLSQDKIVW", reduced)) + + def test_mismatch(self) -> None: + self.assertFalse(matches_aa_reduced("CASTIVGGLSQDKIVW", b"slhhllGGlhmcbllY")) + + def test_masked_aa(self) -> None: + reduced = 
aa_to_reduced("CASTIVGGLSQDKIVW") + masked_aa = mask("CASTIVGGLSQDKIVW", 2, AA_MASK) + self.assertTrue(matches_aa_reduced(masked_aa, reduced)) + + def test_masked_reduced(self) -> None: + reduced = aa_to_reduced("CASTIVGGLSQDKIVW") + masked_red = mask(reduced, (2, 5), REDUCED_AA_MASK) + self.assertTrue(matches_aa_reduced("CASTIVGGLSQDKIVW", masked_red)) + + def test_empty(self) -> None: + self.assertTrue(matches_aa_reduced("", "")) + + def test_length_mismatch(self) -> None: + self.assertFalse(matches_aa_reduced("CAS", "sl")) + + def test_bytes_input(self) -> None: + reduced = aa_to_reduced(b"CAST") + self.assertTrue(matches_aa_reduced(b"CAST", reduced)) if __name__ == "__main__": diff --git a/tests/test_sequence_benchmark.py b/tests/test_sequence_benchmark.py new file mode 100644 index 0000000..230a3ff --- /dev/null +++ b/tests/test_sequence_benchmark.py @@ -0,0 +1,289 @@ +"""Speed benchmarks for sequence operations: validation, translation, +slicing, matching, and cross-alphabet matching. + +Each benchmark compares the ``mir.basic.sequence`` function against one +or more naive Python implementations. + +Run with ``python -m pytest tests/test_sequence_benchmark.py -s``. 
+""" + +import random +import time +import unittest + +from mir.basic.sequence import ( + AA_ALPHABET, + AA_MASK, + AA_TO_REDUCED, + AA_TO_REDUCED_TABLE, + NT_MASK, + _AA_TO_REDUCED_LUT, + _to_bytes, + aa_to_reduced, + matches, + matches_aa_reduced, + validate, +) + +N = 10_000 +SEQ_LEN = 15 +K = 3 + +_AA_LETTERS = "ACDEFGHIKLMNPQRSTVWY" +_AA_SET = frozenset(_AA_LETTERS + "*_X") + + +def _random_strings(n: int, length: int) -> list[str]: + rng = random.Random(42) + return ["".join(rng.choices(_AA_LETTERS, k=length)) for _ in range(n)] + + +def _print_table(title: str, rows: list[tuple[str, float, int]]) -> None: + print( + f"\n{title}\n" + f"{'Method':<40} {'Time (s)':>10} {'ops/s':>14}\n" + f"{'-' * 66}" + ) + for label, elapsed, count in rows: + rate = count / elapsed if elapsed > 0 else float("inf") + print(f"{label:<40} {elapsed:>10.4f} {rate:>14,.0f}") + + +class TestValidationBenchmark(unittest.TestCase): + + def test_validate_lut_vs_set(self) -> None: + """Alphabet validation: LUT (bytes[256]) vs frozenset membership.""" + strings = _random_strings(N, SEQ_LEN) + byte_strings = [s.encode() for s in strings] + + # LUT validation (validate function) + t0 = time.perf_counter() + for b in byte_strings: + validate(b, AA_ALPHABET) + t_lut = time.perf_counter() - t0 + + # frozenset[int] validation + aa_ords = frozenset(ord(c) for c in _AA_SET) + t0 = time.perf_counter() + for b in byte_strings: + for ch in b: + if ch not in aa_ords: + raise ValueError + t_fset = time.perf_counter() - t0 + + # naive str 'in' check + t0 = time.perf_counter() + for s in strings: + for ch in s: + if ch not in _AA_SET: + raise ValueError + t_str_in = time.perf_counter() - t0 + + _print_table( + f"Validation (N={N:,}, len={SEQ_LEN})", + [ + ("validate() [bytes LUT]", t_lut, N), + ("frozenset[int] loop", t_fset, N), + ("str 'in' frozenset[str]", t_str_in, N), + ], + ) + + +class TestTranslationBenchmark(unittest.TestCase): + + def test_translate_lut_vs_dict(self) -> None: + """Translation: 
bytes.translate vs dict lookup vs manual byte loop.""" + strings = _random_strings(N, SEQ_LEN) + byte_strings = [s.encode() for s in strings] + + # bytes.translate (aa_to_reduced) + t0 = time.perf_counter() + for b in byte_strings: + _ = b.translate(AA_TO_REDUCED_TABLE) + t_translate = time.perf_counter() - t0 + + # aa_to_reduced with str input (includes encode) + t0 = time.perf_counter() + for s in strings: + _ = aa_to_reduced(s) + t_aa_str = time.perf_counter() - t0 + + # naive dict[str,str] lookup + join + t0 = time.perf_counter() + for s in strings: + _ = "".join(AA_TO_REDUCED.get(ch, ch) for ch in s) + t_dict_join = time.perf_counter() - t0 + + # manual byte LUT loop + lut = _AA_TO_REDUCED_LUT + t0 = time.perf_counter() + for b in byte_strings: + _ = bytes(lut[ch] for ch in b) + t_manual = time.perf_counter() - t0 + + _print_table( + f"Translation AA→reduced (N={N:,}, len={SEQ_LEN})", + [ + ("bytes.translate (bytes in)", t_translate, N), + ("aa_to_reduced (str in)", t_aa_str, N), + ("dict[str,str] + join", t_dict_join, N), + ("manual byte LUT loop", t_manual, N), + ], + ) + ratio = t_dict_join / t_translate if t_translate > 0 else float("inf") + print(f"\ndict+join / bytes.translate: {ratio:.1f}x slower") + + +class TestSlicingBenchmark(unittest.TestCase): + + def test_bytes_vs_str_slicing(self) -> None: + """Substring slicing: bytes[i:j] vs str[i:j] at various k.""" + strings = _random_strings(N, SEQ_LEN) + byte_strings = [s.encode() for s in strings] + + for k in (3, 5, 10): + n_slices = SEQ_LEN - k + 1 + expected = N * n_slices + + # str slicing + t0 = time.perf_counter() + cnt = 0 + for s in strings: + for i in range(len(s) - k + 1): + _ = s[i : i + k] + cnt += 1 + t_str = time.perf_counter() - t0 + + # bytes slicing + t0 = time.perf_counter() + cnt2 = 0 + for b in byte_strings: + for i in range(len(b) - k + 1): + _ = b[i : i + k] + cnt2 += 1 + t_bytes = time.perf_counter() - t0 + + # str slicing via encode→slice→decode + t0 = time.perf_counter() + cnt3 = 
0 + for s in strings: + b = s.encode() + for i in range(len(b) - k + 1): + _ = b[i : i + k] + cnt3 += 1 + t_enc_slice = time.perf_counter() - t0 + + self.assertEqual(cnt, expected) + self.assertEqual(cnt2, expected) + self.assertEqual(cnt3, expected) + + _print_table( + f"Slicing k={k} (N={N:,}, len={SEQ_LEN}, {n_slices} slices/seq)", + [ + ("str[i:i+k]", t_str, expected), + ("bytes[i:i+k]", t_bytes, expected), + ("str.encode + bytes[i:i+k]", t_enc_slice, expected), + ], + ) + ratio = t_str / t_bytes if t_bytes > 0 else float("inf") + print(f" str/bytes ratio: {ratio:.2f}x") + + +class TestMatchingBenchmark(unittest.TestCase): + + def test_matches_vs_naive(self) -> None: + """Wildcard matching: matches() vs naive Python loop.""" + rng = random.Random(42) + strings_a = _random_strings(N, SEQ_LEN) + # create pairs: 50% identical, 50% with 1 mask position + strings_b = [] + for s in strings_a: + if rng.random() < 0.5: + strings_b.append(s) + else: + pos = rng.randint(0, SEQ_LEN - 1) + strings_b.append(s[:pos] + "X" + s[pos + 1 :]) + + bytes_a = [s.encode() for s in strings_a] + bytes_b = [s.encode() for s in strings_b] + + # matches() function + t0 = time.perf_counter() + res1 = 0 + for a, b in zip(bytes_a, bytes_b): + if matches(a, b, AA_MASK): + res1 += 1 + t_func = time.perf_counter() - t0 + + # naive Python: zip + compare + mask_val = AA_MASK + t0 = time.perf_counter() + res2 = 0 + for a, b in zip(bytes_a, bytes_b): + if len(a) == len(b) and all( + x == y or x == mask_val or y == mask_val + for x, y in zip(a, b) + ): + res2 += 1 + t_naive = time.perf_counter() - t0 + + # naive str comparison + t0 = time.perf_counter() + res3 = 0 + for a, b in zip(strings_a, strings_b): + if len(a) == len(b) and all( + x == y or x == "X" or y == "X" + for x, y in zip(a, b) + ): + res3 += 1 + t_str = time.perf_counter() - t0 + + self.assertEqual(res1, res2) + self.assertEqual(res1, res3) + + _print_table( + f"Wildcard matching (N={N:,}, len={SEQ_LEN})", + [ + ("matches() [bytes]", 
t_func, N), + ("naive bytes zip+all", t_naive, N), + ("naive str zip+all", t_str, N), + ], + ) + + def test_matches_aa_reduced_vs_naive(self) -> None: + """Cross-alphabet matching: matches_aa_reduced() vs naive.""" + strings = _random_strings(N, SEQ_LEN) + reduced = [aa_to_reduced(s) for s in strings] + + bytes_aa = [s.encode() for s in strings] + + # matches_aa_reduced() + t0 = time.perf_counter() + cnt = 0 + for a, r in zip(bytes_aa, reduced): + if matches_aa_reduced(a, r): + cnt += 1 + t_func = time.perf_counter() - t0 + + # naive: translate then compare + t0 = time.perf_counter() + cnt2 = 0 + for a, r in zip(bytes_aa, reduced): + if a.translate(AA_TO_REDUCED_TABLE) == r: + cnt2 += 1 + t_naive = time.perf_counter() - t0 + + self.assertEqual(cnt, N) + self.assertEqual(cnt2, N) + + _print_table( + f"Cross-alphabet matching (N={N:,}, len={SEQ_LEN})", + [ + ("matches_aa_reduced()", t_func, N), + ("translate + bytes ==", t_naive, N), + ], + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_tokens.py b/tests/test_tokens.py index f10dd0c..9d7fe92 100644 --- a/tests/test_tokens.py +++ b/tests/test_tokens.py @@ -1,129 +1,109 @@ -"""Unit tests for :mod:`mir.basic.tokens`.""" +"""Unit tests for :mod:`mir.basic.tokens` functions.""" import unittest -import numpy as np +from mir.basic.sequence import AA_MASK, NT_MASK, REDUCED_AA_MASK, matches +from mir.basic.tokens import tokenize, tokenize_gapped, tokenize_gapped_str, tokenize_str -from mir.basic.sequence import ( - AminoAcidSequence, - NucleotideSequence, - ReducedAminoAcidSequence, -) -from mir.basic.tokens import tokenize +class TestTokenize(unittest.TestCase): + """Plain k-mer extraction (bytes output).""" -def _strs(seqs): - """Helper: list of sequences → list of str.""" - return [s.to_string() for s in seqs] + def test_aa_k3(self) -> None: + self.assertEqual(tokenize("CASSL", 3), [b"CAS", b"ASS", b"SSL"]) - -class TestTokenizePlain(unittest.TestCase): - """Plain (non-gapped) k-mer 
extraction.""" - - def test_amino_acid_k3(self) -> None: - """CASSL → CAS ASS SSL.""" - aa = AminoAcidSequence.from_string("CASSL") - kmers = tokenize(aa, k=3) - self.assertEqual(_strs(kmers), ["CAS", "ASS", "SSL"]) - self.assertIsInstance(kmers[0], AminoAcidSequence) - - def test_nucleotide_k4(self) -> None: - nt = NucleotideSequence.from_string("ATCGAT") - kmers = tokenize(nt, k=4) - self.assertEqual(_strs(kmers), ["ATCG", "TCGA", "CGAT"]) - self.assertIsInstance(kmers[0], NucleotideSequence) + def test_nt_k4(self) -> None: + self.assertEqual(tokenize("ATCGAT", 4), [b"ATCG", b"TCGA", b"CGAT"]) def test_reduced_k2(self) -> None: - red = ReducedAminoAcidSequence.from_string("slhh") - kmers = tokenize(red, k=2) - self.assertEqual(_strs(kmers), ["sl", "lh", "hh"]) + self.assertEqual(tokenize("slhh", 2), [b"sl", b"lh", b"hh"]) def test_k_equals_length(self) -> None: - """When k == len, a single k-mer equal to the sequence is returned.""" - aa = AminoAcidSequence.from_string("CAST") - kmers = tokenize(aa, k=4) - self.assertEqual(len(kmers), 1) - self.assertEqual(kmers[0], aa) + self.assertEqual(tokenize("CAST", 4), [b"CAST"]) def test_k_equals_one(self) -> None: - nt = NucleotideSequence.from_string("ATG") - kmers = tokenize(nt, k=1) - self.assertEqual(_strs(kmers), ["A", "T", "G"]) + self.assertEqual(tokenize("ATG", 1), [b"A", b"T", b"G"]) + + def test_bytes_input(self) -> None: + self.assertEqual(tokenize(b"CASSL", 3), [b"CAS", b"ASS", b"SSL"]) + + def test_bytearray_input(self) -> None: + self.assertEqual(tokenize(bytearray(b"ATG"), 1), [b"A", b"T", b"G"]) def test_invalid_k(self) -> None: - aa = AminoAcidSequence.from_string("CAST") with self.assertRaises(ValueError): - tokenize(aa, k=0) + tokenize("CAST", 0) with self.assertRaises(ValueError): - tokenize(aa, k=5) + tokenize("CAST", 5) + + +class TestTokenizeStr(unittest.TestCase): + """Plain k-mer extraction (str output).""" + + def test_basic(self) -> None: + self.assertEqual(tokenize_str("CASSL", 3), ["CAS", 
"ASS", "SSL"]) - def test_kmers_are_independent_copies(self) -> None: - """Returned k-mers own their data and don't share buffers.""" - aa = AminoAcidSequence.from_string("CASSL") - kmers = tokenize(aa, k=3) - self.assertFalse(np.shares_memory(kmers[0].data, kmers[1].data)) + def test_bytes_input(self) -> None: + self.assertEqual(tokenize_str(b"ATG", 1), ["A", "T", "G"]) class TestTokenizeGapped(unittest.TestCase): - """Gapped k-mer extraction (single-position mask variants).""" + """Gapped k-mer extraction (bytes output).""" - def test_amino_acid_gapped_k3(self) -> None: - """CASSL → 3 windows × 3 gap positions = 9 gapped k-mers.""" - aa = AminoAcidSequence.from_string("CASSL") - gapped = tokenize(aa, k=3, gapped=True) + def test_aa_gapped_k3(self) -> None: + gapped = tokenize_gapped("CASSL", 3, AA_MASK) self.assertEqual(len(gapped), 9) expected = [ - # window CAS - "XAS", "CXS", "CAX", - # window ASS - "XSS", "AXS", "ASX", - # window SSL - "XSL", "SXL", "SSX", + b"XAS", b"CXS", b"CAX", + b"XSS", b"AXS", b"ASX", + b"XSL", b"SXL", b"SSX", ] - self.assertEqual(_strs(gapped), expected) - self.assertIsInstance(gapped[0], AminoAcidSequence) + self.assertEqual(gapped, expected) - def test_nucleotide_gapped_k2(self) -> None: - nt = NucleotideSequence.from_string("ATG") - gapped = tokenize(nt, k=2, gapped=True) - expected = [ - "NT", "AN", # AT - "NG", "TN", # TG - ] - self.assertEqual(_strs(gapped), expected) + def test_nt_gapped_k2(self) -> None: + gapped = tokenize_gapped("ATG", 2, NT_MASK) + self.assertEqual(gapped, [b"NT", b"AN", b"NG", b"TN"]) def test_reduced_gapped_k2(self) -> None: - red = ReducedAminoAcidSequence.from_string("slh") - gapped = tokenize(red, k=2, gapped=True) - expected = ["Xl", "sX", "Xh", "lX"] - self.assertEqual(_strs(gapped), expected) + gapped = tokenize_gapped("slh", 2, REDUCED_AA_MASK) + self.assertEqual(gapped, [b"Xl", b"sX", b"Xh", b"lX"]) def test_gapped_k1(self) -> None: - """With k=1, each gapped k-mer is just the mask character.""" - 
aa = AminoAcidSequence.from_string("CA") - gapped = tokenize(aa, k=1, gapped=True) - self.assertEqual(_strs(gapped), ["X", "X"]) + gapped = tokenize_gapped("CA", 1, AA_MASK) + self.assertEqual(gapped, [b"X", b"X"]) - def test_gapped_invalid_k(self) -> None: - aa = AminoAcidSequence.from_string("CAST") + def test_invalid_k(self) -> None: with self.assertRaises(ValueError): - tokenize(aa, k=0, gapped=True) + tokenize_gapped("CAST", 0, AA_MASK) with self.assertRaises(ValueError): - tokenize(aa, k=5, gapped=True) + tokenize_gapped("CAST", 5, AA_MASK) + + def test_bytes_input(self) -> None: + gapped = tokenize_gapped(b"ATG", 2, NT_MASK) + self.assertEqual(gapped, [b"NT", b"AN", b"NG", b"TN"]) - def test_gapped_kmers_match_plain_kmers(self) -> None: + def test_gapped_match_plain(self) -> None: """Each gapped k-mer should wildcard-match its corresponding plain k-mer.""" - aa = AminoAcidSequence.from_string("CASSL") - plain = tokenize(aa, k=3) - gapped = tokenize(aa, k=3, gapped=True) + plain = tokenize("CASSL", 3) + gapped = tokenize_gapped("CASSL", 3, AA_MASK) for i, kmer in enumerate(plain): variants = gapped[i * 3 : (i + 1) * 3] for var in variants: self.assertTrue( - kmer.matches(var), + matches(kmer, var, AA_MASK), f"{kmer} should match {var}", ) +class TestTokenizeGappedStr(unittest.TestCase): + """Gapped k-mer extraction (str output).""" + + def test_basic(self) -> None: + gapped = tokenize_gapped_str("CASSL", 3, "X") + self.assertEqual(len(gapped), 9) + self.assertEqual(gapped[0], "XAS") + self.assertIsInstance(gapped[0], str) + + if __name__ == "__main__": unittest.main() diff --git a/tests/test_tokens_benchmark.py b/tests/test_tokens_benchmark.py index 2c4f58e..57f2e20 100644 --- a/tests/test_tokens_benchmark.py +++ b/tests/test_tokens_benchmark.py @@ -1,77 +1,160 @@ -"""Benchmark: tokenize() vs naive Python string slicing for 3-mer extraction. +"""Speed benchmark: tokenize / tokenize_gapped vs naive Python. 
-Generates N=10 000 random amino-acid sequences of length 15 and compares -wall-clock time for splitting each into overlapping 3-mers using: +Compares bytes-based tokenisation functions against naive ``str`` slicing +for both plain and gapped k-mers. Also benchmarks ``str`` vs ``bytes`` +input to verify conversion overhead is negligible. -1. ``tokenize()`` from :mod:`mir.basic.tokens` (sequence + memoryview path). -2. Naive Python: plain string slicing producing ``list[str]``. - -Run with ``python -m unittest -v tests/test_tokens_benchmark.py``. +Run with ``python -m pytest tests/test_tokens_benchmark.py -s``. """ import random -import string import time import unittest -from mir.basic.sequence import AminoAcidSequence -from mir.basic.tokens import tokenize +from mir.basic.sequence import AA_MASK +from mir.basic.tokens import tokenize, tokenize_gapped N = 10_000 SEQ_LEN = 15 K = 3 +MASK_STR = "X" -# 20 canonical amino acids _AA_LETTERS = "ACDEFGHIKLMNPQRSTVWY" -def _random_aa_strings(n: int, length: int) -> list[str]: +def _random_strings(n: int, length: int) -> list[str]: rng = random.Random(42) return ["".join(rng.choices(_AA_LETTERS, k=length)) for _ in range(n)] +def _print_table(title: str, rows: list[tuple[str, float, int]]) -> None: + print( + f"\n{title}\n" + f"{'Method':<36} {'Time (s)':>10} {'items/s':>14}\n" + f"{'-' * 62}" + ) + for label, elapsed, count in rows: + rate = count / elapsed if elapsed > 0 else float("inf") + print(f"{label:<36} {elapsed:>10.4f} {rate:>14,.0f}") + + class TestTokenizeBenchmark(unittest.TestCase): - """Wall-clock comparison of tokenize() vs naive string slicing.""" - def test_benchmark_3mer_tokenization(self) -> None: - strings = _random_aa_strings(N, SEQ_LEN) + def test_plain_kmers(self) -> None: + """Plain k-mers: tokenize(bytes) vs naive str slicing.""" + strings = _random_strings(N, SEQ_LEN) + byte_strings = [s.encode() for s in strings] + expected = N * (SEQ_LEN - K + 1) - # -- naive Python string slicing 
------------------------------------ + # naive str slicing + t0 = time.perf_counter() + cnt = 0 + for s in strings: + for i in range(len(s) - K + 1): + _ = s[i : i + K] + cnt += 1 + t_naive_str = time.perf_counter() - t0 + + # naive bytes slicing + t0 = time.perf_counter() + cnt2 = 0 + for b in byte_strings: + for i in range(len(b) - K + 1): + _ = b[i : i + K] + cnt2 += 1 + t_naive_bytes = time.perf_counter() - t0 + + # tokenize(str input) t0 = time.perf_counter() - naive_total = 0 + cnt3 = 0 for s in strings: - kmers = [s[i : i + K] for i in range(len(s) - K + 1)] - naive_total += len(kmers) - t_naive = time.perf_counter() - t0 + cnt3 += len(tokenize(s, K)) + t_tok_str = time.perf_counter() - t0 - # -- tokenize (sequence objects) ------------------------------------ - sequences = [AminoAcidSequence.from_string(s) for s in strings] + # tokenize(bytes input) t0 = time.perf_counter() - tok_total = 0 - for seq in sequences: - kmers = tokenize(seq, k=K) - tok_total += len(kmers) - t_tokenize = time.perf_counter() - t0 + cnt4 = 0 + for b in byte_strings: + cnt4 += len(tokenize(b, K)) + t_tok_bytes = time.perf_counter() - t0 + + self.assertEqual(cnt, expected) + self.assertEqual(cnt2, expected) + self.assertEqual(cnt3, expected) + self.assertEqual(cnt4, expected) + + _print_table( + f"Plain {K}-mers (N={N:,}, len={SEQ_LEN})", + [ + ("naive str slicing", t_naive_str, expected), + ("naive bytes slicing", t_naive_bytes, expected), + ("tokenize(str input)", t_tok_str, expected), + ("tokenize(bytes input)", t_tok_bytes, expected), + ], + ) + ratio = t_tok_bytes / t_naive_str if t_naive_str > 0 else float("inf") + print(f"\ntokenize(bytes) / naive str: {ratio:.2f}x") - # Both must produce the same number of k-mers - self.assertEqual(naive_total, tok_total) + def test_gapped_kmers(self) -> None: + """Gapped k-mers: tokenize_gapped vs naive str concatenation.""" + strings = _random_strings(N, SEQ_LEN) + byte_strings = [s.encode() for s in strings] + n_windows = SEQ_LEN - K + 1 + 
expected = N * n_windows * K - expected_per_seq = SEQ_LEN - K + 1 - self.assertEqual(naive_total, N * expected_per_seq) + # naive str: slice + replace + t0 = time.perf_counter() + cnt = 0 + for s in strings: + for i in range(len(s) - K + 1): + w = s[i : i + K] + for j in range(K): + _ = w[:j] + MASK_STR + w[j + 1 :] + cnt += 1 + t_naive_str = time.perf_counter() - t0 + + # naive bytes: slice + replace + mask_b = bytes([AA_MASK]) + t0 = time.perf_counter() + cnt2 = 0 + for b in byte_strings: + for i in range(len(b) - K + 1): + w = b[i : i + K] + for j in range(K): + _ = w[:j] + mask_b + w[j + 1 :] + cnt2 += 1 + t_naive_bytes = time.perf_counter() - t0 + + # tokenize_gapped(str input) + t0 = time.perf_counter() + cnt3 = 0 + for s in strings: + cnt3 += len(tokenize_gapped(s, K, AA_MASK)) + t_tok_str = time.perf_counter() - t0 - print( - f"\n{'Method':<22} {'Time (s)':>10} {'k-mers/s':>14}\n" - f"{'-' * 48}" + # tokenize_gapped(bytes input) + t0 = time.perf_counter() + cnt4 = 0 + for b in byte_strings: + cnt4 += len(tokenize_gapped(b, K, AA_MASK)) + t_tok_bytes = time.perf_counter() - t0 + + self.assertEqual(cnt, expected) + self.assertEqual(cnt2, expected) + self.assertEqual(cnt3, expected) + self.assertEqual(cnt4, expected) + + _print_table( + f"Gapped {K}-mers (N={N:,}, len={SEQ_LEN})", + [ + ("naive str slice+replace", t_naive_str, expected), + ("naive bytes slice+replace", t_naive_bytes, expected), + ("tokenize_gapped(str input)", t_tok_str, expected), + ("tokenize_gapped(bytes input)", t_tok_bytes, expected), + ], ) - for label, elapsed in [ - ("naive str slicing", t_naive), - ("tokenize()", t_tokenize), - ]: - rate = tok_total / elapsed if elapsed > 0 else float("inf") - print(f"{label:<22} {elapsed:>10.4f} {rate:>14,.0f}") - - ratio = t_tokenize / t_naive if t_naive > 0 else float("inf") - print(f"\ntokenize / naive ratio: {ratio:.2f}x") + ratio = t_tok_bytes / t_naive_str if t_naive_str > 0 else float("inf") + print(f"\ntokenize_gapped(bytes) / naive str: 
{ratio:.2f}x") if __name__ == "__main__": From 7b8fcfd1da66f3cea29d911172d7fc5a4d53d341 Mon Sep 17 00:00:00 2001 From: mikessh Date: Sat, 11 Apr 2026 17:26:55 +0300 Subject: [PATCH 08/24] wip token summary --- mir/basic/token_tables.py | 260 ++++++++++++++ tests/test_token_tables.py | 670 +++++++++++++++++++++++++++++++++++++ 2 files changed, 930 insertions(+) create mode 100644 mir/basic/token_tables.py create mode 100644 tests/test_token_tables.py diff --git a/mir/basic/token_tables.py b/mir/basic/token_tables.py new file mode 100644 index 0000000..00ac8c7 --- /dev/null +++ b/mir/basic/token_tables.py @@ -0,0 +1,260 @@ +"""Rearrangement-level k-mer indexing. + +Provides a lightweight ``Rearrangement`` type and hashable k-mer +named-tuples, together with functions that build inverted indices and +summary statistics from rearrangement lists. + +Functions +--------- +* ``tokenize_rearrangements`` — ``dict[Kmer, list[KmerMatch]]`` with + position tracking. +* ``summarize_rearrangements`` — ``dict[Kmer, KmerStats]`` (full key). +* ``summarize_annotations`` — ``dict[KmerSeq, dict[KmerAnnotation, KmerStats]]`` + keyed by (locus, seq) only, mapping to per-(v_gene, c_gene, position) + counts. + +All functions accept an optional *mask_byte* for gapped k-mers. +No runtime type checks — relies on static typing. +""" + +from __future__ import annotations + +from typing import NamedTuple + +from mir.basic.sequence import Seq, _to_bytes + + +# --------------------------------------------------------------------------- +# Types +# --------------------------------------------------------------------------- + +class Rearrangement: + """Immune receptor rearrangement with minimal annotation. + + Uses ``__slots__`` for memory efficiency. 
+ """ + + __slots__ = ("locus", "id", "v_gene", "c_gene", "junction_aa", + "duplicate_count") + + def __init__( + self, + locus: str, + id: str, + v_gene: str, + c_gene: str, + junction_aa: str, + duplicate_count: int, + ) -> None: + self.locus = locus + self.id = id + self.v_gene = v_gene + self.c_gene = c_gene + self.junction_aa = junction_aa + self.duplicate_count = duplicate_count + + +class Kmer(NamedTuple): + """Annotated k-mer: sequence plus the gene context it was drawn from. + + Hashable by default (NamedTuple), so it can serve as a ``dict`` key. + """ + + locus: str + v_gene: str + c_gene: str + seq: bytes + + +class KmerMatch(NamedTuple): + """A single k-mer occurrence linking back to its source.""" + + rearrangement: Rearrangement + position: int + + +class KmerSeq(NamedTuple): + """Reduced k-mer key: locus + sequence only (ignores gene annotation).""" + + locus: str + seq: bytes + + +class KmerAnnotation(NamedTuple): + """Parent annotation for a k-mer occurrence.""" + + v_gene: str + c_gene: str + position: int + + +class KmerStats(NamedTuple): + """Aggregate statistics for a single k-mer (or annotation bucket).""" + + rearrangement_count: int + duplicate_count: int + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + +def _plain_kmers(raw: bytes, k: int) -> list[tuple[bytes, int]]: + """Overlapping k-mers from *raw* with their start positions.""" + return [(raw[i : i + k], i) for i in range(len(raw) - k + 1)] + + +def _gapped_kmers(raw: bytes, k: int, mask_byte: int) -> list[tuple[bytes, int]]: + """Gapped k-mers (each position masked once) with window start positions.""" + n = len(raw) + n_windows = n - k + 1 + buf = bytearray(k) + result: list[tuple[bytes, int]] = [] + for i in range(n_windows): + window = raw[i : i + k] + for j in range(k): + buf[:] = window + buf[j] = mask_byte + result.append((bytes(buf), i)) + return 
result + + +# --------------------------------------------------------------------------- +# Indexing +# --------------------------------------------------------------------------- + +def tokenize_rearrangements( + rearrangements: list[Rearrangement], + k: int, + mask_byte: int | None = None, +) -> dict[Kmer, list[KmerMatch]]: + """Build an inverted index from annotated k-mers to their source + rearrangements, tracking the position each k-mer was extracted from. + + When *mask_byte* is given, gapped k-mers are produced instead (each + position masked once per window, as in :func:`tokens.tokenize_gapped`). + + Args: + rearrangements: Input rearrangements. + k: K-mer length. + mask_byte: If not ``None``, replacement byte for gapped k-mers + (e.g. ``ord('X')``). ``None`` (default) produces plain k-mers. + + Returns: + Dict mapping each :class:`Kmer` to a list of :class:`KmerMatch` + (rearrangement, position) pairs. + """ + _kmers = _gapped_kmers if mask_byte is not None else _plain_kmers + _mb = mask_byte + index: dict[Kmer, list[KmerMatch]] = {} + for r in rearrangements: + raw = _to_bytes(r.junction_aa) + if k > len(raw): + continue + locus = r.locus + v_gene = r.v_gene + c_gene = r.c_gene + pairs = _kmers(raw, k, _mb) if _mb is not None else _kmers(raw, k) + for s, pos in pairs: + key = Kmer(locus, v_gene, c_gene, s) + match = KmerMatch(r, pos) + lst = index.get(key) + if lst is None: + index[key] = [match] + else: + lst.append(match) + return index + + +def summarize_rearrangements( + rearrangements: list[Rearrangement], + k: int, + mask_byte: int | None = None, +) -> dict[Kmer, KmerStats]: + """Compute per-kmer summary statistics (full :class:`Kmer` key). + + For each :class:`Kmer` the result contains: + + * ``rearrangement_count`` — number of rearrangements contributing + that k-mer. + * ``duplicate_count`` — sum of :attr:`Rearrangement.duplicate_count` + across those rearrangements. + + Args: + rearrangements: Input rearrangements. + k: K-mer length. 
+ mask_byte: If not ``None``, produce gapped k-mers. + + Returns: + Dict mapping each :class:`Kmer` to its :class:`KmerStats`. + """ + _kmers = _gapped_kmers if mask_byte is not None else _plain_kmers + _mb = mask_byte + counts: dict[Kmer, int] = {} + dups: dict[Kmer, int] = {} + for r in rearrangements: + raw = _to_bytes(r.junction_aa) + if k > len(raw): + continue + locus = r.locus + v_gene = r.v_gene + c_gene = r.c_gene + dc = r.duplicate_count + pairs = _kmers(raw, k, _mb) if _mb is not None else _kmers(raw, k) + for s, _pos in pairs: + key = Kmer(locus, v_gene, c_gene, s) + counts[key] = counts.get(key, 0) + 1 + dups[key] = dups.get(key, 0) + dc + return {k: KmerStats(counts[k], dups[k]) for k in counts} + + +def summarize_annotations( + rearrangements: list[Rearrangement], + k: int, + mask_byte: int | None = None, +) -> dict[KmerSeq, dict[KmerAnnotation, KmerStats]]: + """Compute per-kmer summary keyed by (locus, seq) only, with + per-(v_gene, c_gene, position) breakdowns. + + The outer key is a :class:`KmerSeq` — just locus and k-mer bytes, + ignoring gene annotation. The inner dict maps each unique + :class:`KmerAnnotation` (v_gene, c_gene, position) to a + :class:`KmerStats` holding rearrangement_count and duplicate_count. + + Args: + rearrangements: Input rearrangements. + k: K-mer length. + mask_byte: If not ``None``, produce gapped k-mers. + + Returns: + Nested dict ``KmerSeq → KmerAnnotation → KmerStats``. 
+ """ + _kmers = _gapped_kmers if mask_byte is not None else _plain_kmers + _mb = mask_byte + # Accumulate into flat (KmerSeq, KmerAnnotation) → (count, dup_sum) + counts: dict[tuple[KmerSeq, KmerAnnotation], int] = {} + dups: dict[tuple[KmerSeq, KmerAnnotation], int] = {} + for r in rearrangements: + raw = _to_bytes(r.junction_aa) + if k > len(raw): + continue + locus = r.locus + v_gene = r.v_gene + c_gene = r.c_gene + dc = r.duplicate_count + pairs = _kmers(raw, k, _mb) if _mb is not None else _kmers(raw, k) + for s, pos in pairs: + ks = KmerSeq(locus, s) + ka = KmerAnnotation(v_gene, c_gene, pos) + flat_key = (ks, ka) + counts[flat_key] = counts.get(flat_key, 0) + 1 + dups[flat_key] = dups.get(flat_key, 0) + dc + # Pivot into nested dict + result: dict[KmerSeq, dict[KmerAnnotation, KmerStats]] = {} + for (ks, ka), cnt in counts.items(): + inner = result.get(ks) + if inner is None: + inner = {} + result[ks] = inner + inner[ka] = KmerStats(cnt, dups[(ks, ka)]) + return result diff --git a/tests/test_token_tables.py b/tests/test_token_tables.py new file mode 100644 index 0000000..b6e0ae0 --- /dev/null +++ b/tests/test_token_tables.py @@ -0,0 +1,670 @@ +"""Tests and benchmarks for token_tables: Rearrangement / Kmer indexing.""" + +from __future__ import annotations + +import time + +import pytest + +from mir.basic.token_tables import ( + Kmer, + KmerAnnotation, + KmerMatch, + KmerSeq, + KmerStats, + Rearrangement, + summarize_annotations, + summarize_rearrangements, + tokenize_rearrangements, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_rearrangement( + junction_aa: str = "CASSLAPGATNEKLFF", + *, + locus: str = "TRB", + id: str = "r1", + v_gene: str = "TRBV5-1", + c_gene: str = "TRBC1", + duplicate_count: int = 10, +) -> Rearrangement: + return Rearrangement(locus, id, v_gene, c_gene, junction_aa, duplicate_count) + + +# 
--------------------------------------------------------------------------- +# Unit tests — types +# --------------------------------------------------------------------------- + +class TestRearrangement: + def test_slots(self): + r = _make_rearrangement() + assert r.locus == "TRB" + assert r.junction_aa == "CASSLAPGATNEKLFF" + assert r.duplicate_count == 10 + assert not hasattr(r, "__dict__") + + def test_fields(self): + r = _make_rearrangement() + assert r.id == "r1" + assert r.v_gene == "TRBV5-1" + assert r.c_gene == "TRBC1" + + +class TestKmer: + def test_hashable(self): + k = Kmer("TRB", "TRBV5-1", "TRBC1", b"CASS") + d = {k: 1} + assert d[k] == 1 + + def test_equal_by_value(self): + a = Kmer("TRB", "TRBV5-1", "TRBC1", b"CASS") + b = Kmer("TRB", "TRBV5-1", "TRBC1", b"CASS") + assert a == b + assert hash(a) == hash(b) + + def test_not_equal_different_seq(self): + a = Kmer("TRB", "TRBV5-1", "TRBC1", b"CASS") + b = Kmer("TRB", "TRBV5-1", "TRBC1", b"CASX") + assert a != b + + def test_not_equal_different_gene(self): + a = Kmer("TRB", "TRBV5-1", "TRBC1", b"CASS") + b = Kmer("TRB", "TRBV5-2", "TRBC1", b"CASS") + assert a != b + + +# --------------------------------------------------------------------------- +# Unit tests — tokenize_rearrangements +# --------------------------------------------------------------------------- + +class TestTokenizeRearrangements: + def test_basic_indexing(self): + r = _make_rearrangement("CASSLAP") + idx = tokenize_rearrangements([r], k=4) + # 7 - 4 + 1 = 4 k-mers: CASS, ASSL, SSLA, SLAP + assert len(idx) == 4 + assert all(len(v) == 1 and v[0].rearrangement is r for v in idx.values()) + + def test_true_lookup(self): + """K-mer known to exist can be found.""" + r = _make_rearrangement("CASSLAPGATNEKLFF") + idx = tokenize_rearrangements([r], k=5) + key = Kmer("TRB", "TRBV5-1", "TRBC1", b"LAPGA") + assert key in idx + assert idx[key][0].rearrangement is r + + def test_false_lookup(self): + """K-mer not present is absent from the index.""" 
+ r = _make_rearrangement("CASSLAPGATNEKLFF") + idx = tokenize_rearrangements([r], k=5) + missing = Kmer("TRB", "TRBV5-1", "TRBC1", b"ZZZZZ") + assert missing not in idx + + def test_false_lookup_wrong_gene(self): + """Same sequence but different gene annotation → not found.""" + r = _make_rearrangement("CASSLAPGATNEKLFF") + idx = tokenize_rearrangements([r], k=5) + wrong_gene = Kmer("TRB", "TRBV99", "TRBC1", b"LAPGA") + assert wrong_gene not in idx + + def test_multiple_rearrangements_shared_kmer(self): + """Two rearrangements sharing a k-mer both appear in the list.""" + r1 = _make_rearrangement("CASSLA", id="r1") + r2 = _make_rearrangement("CASSXY", id="r2") + idx = tokenize_rearrangements([r1, r2], k=4) + shared = Kmer("TRB", "TRBV5-1", "TRBC1", b"CASS") + assert shared in idx + rearrangements = [m.rearrangement for m in idx[shared]] + assert r1 in rearrangements + assert r2 in rearrangements + + def test_skip_short_junction(self): + """Rearrangement with junction shorter than k is silently skipped.""" + r = _make_rearrangement("CA") + idx = tokenize_rearrangements([r], k=5) + assert len(idx) == 0 + + def test_empty_input(self): + idx = tokenize_rearrangements([], k=3) + assert idx == {} + + def test_different_loci(self): + r_trb = Rearrangement("TRB", "r1", "TRBV5-1", "TRBC1", "CASSLA", 1) + r_tra = Rearrangement("TRA", "r2", "TRAV12", "TRAC", "CASSLA", 1) + idx = tokenize_rearrangements([r_trb, r_tra], k=4) + key_trb = Kmer("TRB", "TRBV5-1", "TRBC1", b"CASS") + key_tra = Kmer("TRA", "TRAV12", "TRAC", b"CASS") + assert key_trb in idx and idx[key_trb][0].rearrangement is r_trb + assert key_tra in idx and idx[key_tra][0].rearrangement is r_tra + + def test_positions_plain(self): + """Plain k-mers record correct extraction positions.""" + r = _make_rearrangement("CASSLAP") + idx = tokenize_rearrangements([r], k=4) + # CASS@0, ASSL@1, SSLA@2, SLAP@3 + assert idx[Kmer("TRB", "TRBV5-1", "TRBC1", b"CASS")][0].position == 0 + assert idx[Kmer("TRB", "TRBV5-1", "TRBC1", 
b"ASSL")][0].position == 1 + assert idx[Kmer("TRB", "TRBV5-1", "TRBC1", b"SSLA")][0].position == 2 + assert idx[Kmer("TRB", "TRBV5-1", "TRBC1", b"SLAP")][0].position == 3 + + def test_positions_gapped(self): + """Gapped k-mers from the same window share the window position.""" + r = _make_rearrangement("CASSLA") + idx = tokenize_rearrangements([r], k=4, mask_byte=ord("X")) + # Window 0 (CASS) → XASS, CXSS, CAXS, CASX all at position 0 + for seq in [b"XASS", b"CXSS", b"CAXS", b"CASX"]: + key = Kmer("TRB", "TRBV5-1", "TRBC1", seq) + if key in idx: + assert idx[key][0].position == 0 + # Window 1 (ASSL) → position 1 + key1 = Kmer("TRB", "TRBV5-1", "TRBC1", b"XSSL") + assert key1 in idx and idx[key1][0].position == 1 + + +# --------------------------------------------------------------------------- +# Benchmark +# --------------------------------------------------------------------------- + +class TestTokenizeRearrangementsBenchmark: + N = 100_000 + K = 5 + JUNCTION = "CASSLAPGATNEKLFF" # 16 aa → 12 k-mers per rearrangement + + @pytest.fixture(scope="class") + def rearrangements(self): + return [ + Rearrangement("TRB", f"r{i}", "TRBV5-1", "TRBC1", + self.JUNCTION, 10) + for i in range(self.N) + ] + + def test_benchmark_tokenize(self, rearrangements): + # Warm-up + tokenize_rearrangements(rearrangements[:1000], self.K) + + t0 = time.perf_counter() + idx = tokenize_rearrangements(rearrangements, self.K) + elapsed = time.perf_counter() - t0 + + n_kmers = len(self.JUNCTION) - self.K + 1 + print( + f"\ntokenize_rearrangements: {self.N:,} rearrangements, " + f"k={self.K}, {n_kmers} kmers/seq → " + f"{len(idx):,} unique Kmer keys, " + f"{elapsed:.3f}s " + f"({self.N / elapsed:,.0f} rearrangements/s)" + ) + + def test_benchmark_lookup(self, rearrangements): + idx = tokenize_rearrangements(rearrangements, self.K) + key_hit = Kmer("TRB", "TRBV5-1", "TRBC1", b"LAPGA") + key_miss = Kmer("TRB", "TRBV5-1", "TRBC1", b"ZZZZZ") + + n_lookups = 1_000_000 + t0 = time.perf_counter() + for 
_ in range(n_lookups): + _ = key_hit in idx + hit_elapsed = time.perf_counter() - t0 + + t0 = time.perf_counter() + for _ in range(n_lookups): + _ = key_miss in idx + miss_elapsed = time.perf_counter() - t0 + + print( + f"\nlookup: {n_lookups:,} hits in {hit_elapsed:.3f}s " + f"({n_lookups / hit_elapsed:,.0f} ops/s), " + f"{n_lookups:,} misses in {miss_elapsed:.3f}s " + f"({n_lookups / miss_elapsed:,.0f} ops/s)" + ) + + +# --------------------------------------------------------------------------- +# Unit tests — gapped k-mers +# --------------------------------------------------------------------------- + +MASK = ord("X") + + +class TestTokenizeRearrangementsGapped: + def test_gapped_kmer_count(self): + """Each window produces k gapped variants.""" + r = _make_rearrangement("CASSLAP") # 7 aa, k=4 → 4 windows × 4 = 16 + idx = tokenize_rearrangements([r], k=4, mask_byte=MASK) + # All keys must contain exactly one X + for key in idx: + assert key.seq.count(MASK) == 1 + # Total unique gapped k-mers ≤ 16 (some may collide) + total_refs = sum(len(v) for v in idx.values()) + assert total_refs == 16 + + def test_gapped_true_lookup(self): + r = _make_rearrangement("CASSLA") + idx = tokenize_rearrangements([r], k=4, mask_byte=MASK) + # Window CASS → gapped: XASS, CXSS, CAXS, CASX + assert Kmer("TRB", "TRBV5-1", "TRBC1", b"XASS") in idx + assert Kmer("TRB", "TRBV5-1", "TRBC1", b"CXSS") in idx + assert Kmer("TRB", "TRBV5-1", "TRBC1", b"CASX") in idx + + def test_gapped_false_lookup(self): + r = _make_rearrangement("CASSLA") + idx = tokenize_rearrangements([r], k=4, mask_byte=MASK) + assert Kmer("TRB", "TRBV5-1", "TRBC1", b"XXSS") not in idx + + def test_gapped_no_mask_is_plain(self): + """mask_byte=None gives identical result to plain call.""" + r = _make_rearrangement("CASSLA") + plain = tokenize_rearrangements([r], k=4) + explicit_none = tokenize_rearrangements([r], k=4, mask_byte=None) + assert plain.keys() == explicit_none.keys() + + +# 
--------------------------------------------------------------------------- +# Unit tests — summarize_rearrangements +# --------------------------------------------------------------------------- + +class TestSummarizeRearrangements: + def test_single_rearrangement(self): + r = _make_rearrangement("CASSLA", duplicate_count=5) + stats = summarize_rearrangements([r], k=4) + # 3 k-mers: CASS, ASSL, SSLA + assert len(stats) == 3 + for v in stats.values(): + assert v.rearrangement_count == 1 + assert v.duplicate_count == 5 + + def test_two_rearrangements_shared_kmer(self): + r1 = _make_rearrangement("CASSLA", id="r1", duplicate_count=3) + r2 = _make_rearrangement("CASSXY", id="r2", duplicate_count=7) + stats = summarize_rearrangements([r1, r2], k=4) + shared = Kmer("TRB", "TRBV5-1", "TRBC1", b"CASS") + assert shared in stats + assert stats[shared].rearrangement_count == 2 + assert stats[shared].duplicate_count == 10 # 3 + 7 + + def test_unique_kmers(self): + r1 = _make_rearrangement("CASSLA", id="r1", duplicate_count=2) + r2 = _make_rearrangement("CASSXY", id="r2", duplicate_count=8) + stats = summarize_rearrangements([r1, r2], k=4) + unique_r1 = Kmer("TRB", "TRBV5-1", "TRBC1", b"SSLA") + unique_r2 = Kmer("TRB", "TRBV5-1", "TRBC1", b"SSXY") + assert stats[unique_r1] == KmerStats(1, 2) + assert stats[unique_r2] == KmerStats(1, 8) + + def test_empty(self): + assert summarize_rearrangements([], k=3) == {} + + def test_skip_short(self): + r = _make_rearrangement("CA", duplicate_count=99) + assert summarize_rearrangements([r], k=5) == {} + + def test_different_loci_separate(self): + r1 = Rearrangement("TRB", "r1", "V1", "C1", "CASSLA", 1) + r2 = Rearrangement("TRA", "r2", "V2", "C2", "CASSLA", 4) + stats = summarize_rearrangements([r1, r2], k=4) + k_trb = Kmer("TRB", "V1", "C1", b"CASS") + k_tra = Kmer("TRA", "V2", "C2", b"CASS") + assert stats[k_trb] == KmerStats(1, 1) + assert stats[k_tra] == KmerStats(1, 4) + + def test_gapped_summary(self): + r = 
_make_rearrangement("CASSLA", duplicate_count=6) + stats = summarize_rearrangements([r], k=4, mask_byte=MASK) + # Gapped: 3 windows × 4 positions = 12 total k-mer emissions + # Each maps to 1 rearrangement with dup_count 6 + for v in stats.values(): + assert v.duplicate_count % 6 == 0 + assert v.rearrangement_count >= 1 + # All keys contain exactly one mask + for key in stats: + assert key.seq.count(MASK) == 1 + + def test_gapped_shared_summary(self): + r1 = _make_rearrangement("CASSLA", id="r1", duplicate_count=2) + r2 = _make_rearrangement("CASSXY", id="r2", duplicate_count=3) + stats = summarize_rearrangements([r1, r2], k=4, mask_byte=MASK) + # Both produce gapped XASS from window CASS + shared = Kmer("TRB", "TRBV5-1", "TRBC1", b"XASS") + assert shared in stats + assert stats[shared].rearrangement_count == 2 + assert stats[shared].duplicate_count == 5 + + +# --------------------------------------------------------------------------- +# Benchmark — summarize +# --------------------------------------------------------------------------- + +class TestSummarizeRearrangementsBenchmark: + N = 100_000 + K = 5 + JUNCTION = "CASSLAPGATNEKLFF" + + @pytest.fixture(scope="class") + def rearrangements(self): + return [ + Rearrangement("TRB", f"r{i}", "TRBV5-1", "TRBC1", + self.JUNCTION, 10) + for i in range(self.N) + ] + + def test_benchmark_summarize_plain(self, rearrangements): + summarize_rearrangements(rearrangements[:1000], self.K) + t0 = time.perf_counter() + stats = summarize_rearrangements(rearrangements, self.K) + elapsed = time.perf_counter() - t0 + print( + f"\nsummarize (plain): {self.N:,} rearrangements, k={self.K} → " + f"{len(stats):,} unique keys, {elapsed:.3f}s " + f"({self.N / elapsed:,.0f} rearrangements/s)" + ) + + def test_benchmark_summarize_gapped(self, rearrangements): + summarize_rearrangements(rearrangements[:1000], self.K, mask_byte=MASK) + t0 = time.perf_counter() + stats = summarize_rearrangements(rearrangements, self.K, mask_byte=MASK) + elapsed 
= time.perf_counter() - t0 + print( + f"\nsummarize (gapped): {self.N:,} rearrangements, k={self.K} → " + f"{len(stats):,} unique keys, {elapsed:.3f}s " + f"({self.N / elapsed:,.0f} rearrangements/s)" + ) + + +# --------------------------------------------------------------------------- +# Unit tests — summarize_annotations +# --------------------------------------------------------------------------- + +class TestSummarizeAnnotations: + def test_single_rearrangement_positions(self): + """Each k-mer gets a separate position annotation.""" + r = _make_rearrangement("CASSLA", duplicate_count=5) + ann = summarize_annotations([r], k=4) + # 3 plain k-mers at positions 0, 1, 2 + assert len(ann) == 3 + ks_cass = KmerSeq("TRB", b"CASS") + assert ks_cass in ann + inner = ann[ks_cass] + assert KmerAnnotation("TRBV5-1", "TRBC1", 0) in inner + assert inner[KmerAnnotation("TRBV5-1", "TRBC1", 0)] == KmerStats(1, 5) + + def test_different_genes_merge_under_same_kmer_seq(self): + """Same locus+seq but different v_gene → single KmerSeq key, + two KmerAnnotation entries.""" + r1 = Rearrangement("TRB", "r1", "TRBV5-1", "TRBC1", "CASSLA", 3) + r2 = Rearrangement("TRB", "r2", "TRBV6-2", "TRBC2", "CASSLA", 7) + ann = summarize_annotations([r1, r2], k=4) + ks = KmerSeq("TRB", b"CASS") + assert ks in ann + inner = ann[ks] + a1 = KmerAnnotation("TRBV5-1", "TRBC1", 0) + a2 = KmerAnnotation("TRBV6-2", "TRBC2", 0) + assert a1 in inner and inner[a1] == KmerStats(1, 3) + assert a2 in inner and inner[a2] == KmerStats(1, 7) + + def test_different_loci_separate(self): + r_trb = Rearrangement("TRB", "r1", "V1", "C1", "CASSLA", 1) + r_tra = Rearrangement("TRA", "r2", "V2", "C2", "CASSLA", 4) + ann = summarize_annotations([r_trb, r_tra], k=4) + ks_trb = KmerSeq("TRB", b"CASS") + ks_tra = KmerSeq("TRA", b"CASS") + assert ks_trb in ann and ks_tra in ann + assert len(ann[ks_trb]) == 1 + assert len(ann[ks_tra]) == 1 + + def test_shared_kmer_same_gene_accumulates(self): + """Two rearrangements with 
identical gene annotations at same position + accumulate counts.""" + r1 = _make_rearrangement("CASSLA", id="r1", duplicate_count=2) + r2 = _make_rearrangement("CASSXY", id="r2", duplicate_count=8) + ann = summarize_annotations([r1, r2], k=4) + ks = KmerSeq("TRB", b"CASS") + a = KmerAnnotation("TRBV5-1", "TRBC1", 0) + assert ann[ks][a] == KmerStats(2, 10) + + def test_position_distinguishes_annotations(self): + """Same k-mer at different positions → separate KmerAnnotation entries.""" + # ACASS has "AS" at position 2 (from ACAS→no, let's be precise) + # Use junction where same 3-mer appears twice: CASCA → k=3: CAS@0, ASC@1, SCA@2 + # No repeated k-mer there. Use CASSCAS → CAS@0, ASS@1, SSC@2, SCA@3, CAS@4 + r = _make_rearrangement("CASSCAS", duplicate_count=1) + ann = summarize_annotations([r], k=3) + ks_cas = KmerSeq("TRB", b"CAS") + assert ks_cas in ann + inner = ann[ks_cas] + # CAS appears at position 0 and position 4 + a0 = KmerAnnotation("TRBV5-1", "TRBC1", 0) + a4 = KmerAnnotation("TRBV5-1", "TRBC1", 4) + assert a0 in inner and inner[a0] == KmerStats(1, 1) + assert a4 in inner and inner[a4] == KmerStats(1, 1) + + def test_gapped_annotations(self): + r = _make_rearrangement("CASSLA", duplicate_count=6) + ann = summarize_annotations([r], k=4, mask_byte=MASK) + # All outer keys should have locus only + for ks in ann: + assert isinstance(ks, KmerSeq) + assert ks.locus == "TRB" + # Gapped k-mers from window 0 (CASS) should have position 0 + ks_xass = KmerSeq("TRB", b"XASS") + assert ks_xass in ann + inner = ann[ks_xass] + assert KmerAnnotation("TRBV5-1", "TRBC1", 0) in inner + + def test_empty(self): + assert summarize_annotations([], k=3) == {} + + def test_skip_short(self): + r = _make_rearrangement("CA", duplicate_count=99) + assert summarize_annotations([r], k=5) == {} + + def test_gapped_different_genes_merge(self): + """Gapped: different v_gene rearrangements with same locus+seq + merge under one KmerSeq.""" + r1 = Rearrangement("TRB", "r1", "TRBV5-1", 
"TRBC1", "CASSLA", 2) + r2 = Rearrangement("TRB", "r2", "TRBV6-2", "TRBC2", "CASSLA", 3) + ann = summarize_annotations([r1, r2], k=4, mask_byte=MASK) + ks = KmerSeq("TRB", b"XASS") + assert ks in ann + inner = ann[ks] + a1 = KmerAnnotation("TRBV5-1", "TRBC1", 0) + a2 = KmerAnnotation("TRBV6-2", "TRBC2", 0) + assert a1 in inner and inner[a1] == KmerStats(1, 2) + assert a2 in inner and inner[a2] == KmerStats(1, 3) + + +# --------------------------------------------------------------------------- +# Benchmark — summarize_annotations +# --------------------------------------------------------------------------- + +class TestSummarizeAnnotationsBenchmark: + N = 100_000 + K = 5 + JUNCTION = "CASSLAPGATNEKLFF" + + @pytest.fixture(scope="class") + def rearrangements(self): + return [ + Rearrangement("TRB", f"r{i}", "TRBV5-1", "TRBC1", + self.JUNCTION, 10) + for i in range(self.N) + ] + + def test_benchmark_annotations_plain(self, rearrangements): + summarize_annotations(rearrangements[:1000], self.K) + t0 = time.perf_counter() + ann = summarize_annotations(rearrangements, self.K) + elapsed = time.perf_counter() - t0 + total_annotations = sum(len(v) for v in ann.values()) + print( + f"\nsummarize_annotations (plain): {self.N:,} rearrangements, " + f"k={self.K} → {len(ann):,} KmerSeq keys, " + f"{total_annotations:,} annotations, " + f"{elapsed:.3f}s ({self.N / elapsed:,.0f} rearrangements/s)" + ) + + +# --------------------------------------------------------------------------- +# OLGA-based realistic benchmark +# --------------------------------------------------------------------------- + +class TestOlgaKmerSummary: + """Generate 10,000 human TCR-beta rearrangements via OLGA and validate + biological expectations on k-mer incidence.""" + + N = 10_000 + K = 3 + + @pytest.fixture(scope="class") + def olga_rearrangements(self): + from mir.basic.pgen import OlgaModel + + model = OlgaModel(chain="TRB") + seqs = model.generate_sequences_with_meta(self.N, pgens=False) + return [ 
+ Rearrangement( + locus="TRB", + id=f"olga_{i}", + v_gene=rec["v_gene"].split("*")[0], # strip allele + c_gene="", + junction_aa=rec["cdr3"], + duplicate_count=1, + ) + for i, rec in enumerate(seqs) + ] + + @pytest.fixture(scope="class") + def annotations(self, olga_rearrangements): + return summarize_annotations(olga_rearrangements, self.K) + + # -- CSA: V-gene–specific, beginning of junction ----------------------- + + def test_csa_present(self, annotations): + """CSA should be a common k-mer (most CDR3s start with C).""" + ks = KmerSeq("TRB", b"CSA") + assert ks in annotations + total = sum(st.rearrangement_count for st in annotations[ks].values()) + # ~6% of 10k rearrangements start with CSA → expect 400-900 + assert 300 <= total <= 1200, f"CSA total count {total} outside [300, 1200]" + + def test_csa_linked_to_trbv20_1(self, annotations): + """CSA at position 0 should be predominantly from TRBV20-1.""" + ks = KmerSeq("TRB", b"CSA") + inner = annotations[ks] + # Collect annotations at position 0 + pos0 = {ka: st for ka, st in inner.items() if ka.position == 0} + assert len(pos0) > 0, "CSA should appear at position 0" + # TRBV20-1 should dominate among pos-0 annotations + total = sum(st.rearrangement_count for st in pos0.values()) + trbv20_count = sum( + st.rearrangement_count + for ka, st in pos0.items() + if ka.v_gene == "TRBV20-1" + ) + fraction = trbv20_count / total + print( + f"\nCSA@pos0: {trbv20_count}/{total} from TRBV20-1 " + f"({fraction:.1%})" + ) + # Observed ~96.6%; TRBV20-1 encodes the CSA motif + assert 0.85 <= fraction <= 1.0, ( + f"Expected TRBV20-1 fraction 0.85–1.0 for CSA@pos0, got {fraction:.3f}" + ) + + def test_csa_at_beginning(self, annotations): + """CSA occurrences should overwhelmingly be at position 0.""" + ks = KmerSeq("TRB", b"CSA") + inner = annotations[ks] + total = sum(st.rearrangement_count for st in inner.values()) + at_pos0 = sum( + st.rearrangement_count + for ka, st in inner.items() + if ka.position == 0 + ) + fraction = 
at_pos0 / total + print(f"\nCSA: {at_pos0}/{total} at position 0 ({fraction:.1%})") + # Observed ~100%; CSA is a V-gene–encoded motif at CDR3 start + assert 0.95 <= fraction <= 1.0, ( + f"Expected ≥95% CSA at position 0, got {fraction:.3f}" + ) + + # -- GGG: V-gene–agnostic, middle of junction ------------------------- + + def test_ggg_present(self, annotations): + """GGG should appear in the repertoire.""" + ks = KmerSeq("TRB", b"GGG") + assert ks in annotations, ( + "GGG not found — unlikely for 10k TRB sequences" + ) + total = sum(st.rearrangement_count for st in annotations[ks].values()) + # ~307 observed; GGG arises from random N/D insertions + assert 100 <= total <= 800, f"GGG total count {total} outside [100, 800]" + + def test_ggg_v_gene_agnostic(self, annotations): + """GGG should come from multiple V genes, not just one.""" + ks = KmerSeq("TRB", b"GGG") + inner = annotations[ks] + v_genes = {ka.v_gene for ka in inner} + print(f"\nGGG: {len(v_genes)} distinct V genes — {sorted(v_genes)}") + # Observed ~45; GGG arises from N/D insertions, not V-gene–encoded + assert 20 <= len(v_genes) <= 60, ( + f"Expected 20–60 V genes for GGG, got {len(v_genes)}" + ) + + def test_ggg_middle_position(self, annotations, olga_rearrangements): + """GGG should predominantly come from the middle portion of + junction_aa, not the very start or end.""" + ks = KmerSeq("TRB", b"GGG") + inner = annotations[ks] + # Compute median junction length for context + lengths = [len(r.junction_aa) for r in olga_rearrangements] + median_len = sorted(lengths)[len(lengths) // 2] + # Count how many GGG hits are at interior positions (not 0, not last) + total = sum(st.rearrangement_count for st in inner.values()) + interior = sum( + st.rearrangement_count + for ka, st in inner.items() + if 1 <= ka.position <= median_len - self.K - 1 + ) + fraction = interior / total + print( + f"\nGGG: {interior}/{total} at interior positions ({fraction:.1%}), " + f"median junction length={median_len}" + ) + # 
Observed ~97.7%; GGG comes from N/D insertions in the junction core + assert 0.90 <= fraction <= 1.0, ( + f"Expected ≥90% GGG in middle, got {fraction:.3f}" + ) + # Median junction length for human TRB is typically 14-16 aa + assert 12 <= median_len <= 18, ( + f"Median junction length {median_len} outside expected [12, 18]" + ) + + # -- Timing ------------------------------------------------------------ + + def test_benchmark_olga_summarize(self, olga_rearrangements): + """Time the full summarize_annotations pipeline on OLGA data.""" + # Warm-up + summarize_annotations(olga_rearrangements[:500], self.K) + + t0 = time.perf_counter() + ann = summarize_annotations(olga_rearrangements, self.K) + elapsed = time.perf_counter() - t0 + total_kmer_seqs = len(ann) + total_annotations = sum(len(v) for v in ann.values()) + print( + f"\nOLGA summarize_annotations: {self.N:,} rearrangements, " + f"k={self.K} → {total_kmer_seqs:,} KmerSeq keys, " + f"{total_annotations:,} annotations, " + f"{elapsed:.3f}s ({self.N / elapsed:,.0f} rearrangements/s)" + ) + # Observed ~6,191 unique 3-mers, ~76,215 annotations for 10k seqs + assert 4_000 <= total_kmer_seqs <= 9_000, ( + f"KmerSeq count {total_kmer_seqs} outside [4000, 9000]" + ) + assert 50_000 <= total_annotations <= 120_000, ( + f"Annotation count {total_annotations} outside [50000, 120000]" + ) From 165e3113ffa7cd7c1f997a8ce9f431b65497f40f Mon Sep 17 00:00:00 2001 From: mikessh Date: Sat, 11 Apr 2026 17:53:34 +0300 Subject: [PATCH 09/24] add polars implementation for kmers --- mir/basic/token_tables.py | 55 +++-- mir/basic/token_tables_pl.py | 189 +++++++++++++++++ tests/test_token_tables.py | 52 ++--- tests/test_token_tables_impl.py | 364 ++++++++++++++++++++++++++++++++ 4 files changed, 619 insertions(+), 41 deletions(-) create mode 100644 mir/basic/token_tables_pl.py create mode 100644 tests/test_token_tables_impl.py diff --git a/mir/basic/token_tables.py b/mir/basic/token_tables.py index 00ac8c7..539d4e7 100644 --- 
a/mir/basic/token_tables.py +++ b/mir/basic/token_tables.py @@ -32,6 +32,14 @@ class Rearrangement: """Immune receptor rearrangement with minimal annotation. Uses ``__slots__`` for memory efficiency. + + Attributes: + locus: Chain locus (e.g. ``"TRB"``, ``"TRA"``). + id: Unique integer identifier. + v_gene: Variable gene name. + c_gene: Constant gene name. + junction_aa: Amino-acid junction (CDR3) sequence. + duplicate_count: Number of duplicate reads. """ __slots__ = ("locus", "id", "v_gene", "c_gene", "junction_aa", @@ -40,7 +48,7 @@ class Rearrangement: def __init__( self, locus: str, - id: str, + id: int, v_gene: str, c_gene: str, junction_aa: str, @@ -89,7 +97,12 @@ class KmerAnnotation(NamedTuple): class KmerStats(NamedTuple): - """Aggregate statistics for a single k-mer (or annotation bucket).""" + """Aggregate statistics for a single k-mer (or annotation bucket). + + Attributes: + rearrangement_count: Number of *unique* rearrangement IDs. + duplicate_count: Sum of ``Rearrangement.duplicate_count``. + """ rearrangement_count: int duplicate_count: int @@ -175,8 +188,8 @@ def summarize_rearrangements( For each :class:`Kmer` the result contains: - * ``rearrangement_count`` — number of rearrangements contributing - that k-mer. + * ``rearrangement_count`` — number of *unique* rearrangement IDs + contributing that k-mer. * ``duplicate_count`` — sum of :attr:`Rearrangement.duplicate_count` across those rearrangements. 
@@ -190,7 +203,7 @@ def summarize_rearrangements( """ _kmers = _gapped_kmers if mask_byte is not None else _plain_kmers _mb = mask_byte - counts: dict[Kmer, int] = {} + ids: dict[Kmer, set[int]] = {} dups: dict[Kmer, int] = {} for r in rearrangements: raw = _to_bytes(r.junction_aa) @@ -199,13 +212,19 @@ def summarize_rearrangements( locus = r.locus v_gene = r.v_gene c_gene = r.c_gene + rid = r.id dc = r.duplicate_count pairs = _kmers(raw, k, _mb) if _mb is not None else _kmers(raw, k) for s, _pos in pairs: key = Kmer(locus, v_gene, c_gene, s) - counts[key] = counts.get(key, 0) + 1 - dups[key] = dups.get(key, 0) + dc - return {k: KmerStats(counts[k], dups[k]) for k in counts} + id_set = ids.get(key) + if id_set is None: + ids[key] = {rid} + dups[key] = dc + else: + id_set.add(rid) + dups[key] += dc + return {k: KmerStats(len(ids[k]), dups[k]) for k in ids} def summarize_annotations( @@ -219,7 +238,8 @@ def summarize_annotations( The outer key is a :class:`KmerSeq` — just locus and k-mer bytes, ignoring gene annotation. The inner dict maps each unique :class:`KmerAnnotation` (v_gene, c_gene, position) to a - :class:`KmerStats` holding rearrangement_count and duplicate_count. + :class:`KmerStats` holding rearrangement_count (unique IDs) and + duplicate_count. Args: rearrangements: Input rearrangements. 
@@ -231,8 +251,7 @@ def summarize_annotations( """ _kmers = _gapped_kmers if mask_byte is not None else _plain_kmers _mb = mask_byte - # Accumulate into flat (KmerSeq, KmerAnnotation) → (count, dup_sum) - counts: dict[tuple[KmerSeq, KmerAnnotation], int] = {} + ids: dict[tuple[KmerSeq, KmerAnnotation], set[int]] = {} dups: dict[tuple[KmerSeq, KmerAnnotation], int] = {} for r in rearrangements: raw = _to_bytes(r.junction_aa) @@ -241,20 +260,26 @@ def summarize_annotations( locus = r.locus v_gene = r.v_gene c_gene = r.c_gene + rid = r.id dc = r.duplicate_count pairs = _kmers(raw, k, _mb) if _mb is not None else _kmers(raw, k) for s, pos in pairs: ks = KmerSeq(locus, s) ka = KmerAnnotation(v_gene, c_gene, pos) flat_key = (ks, ka) - counts[flat_key] = counts.get(flat_key, 0) + 1 - dups[flat_key] = dups.get(flat_key, 0) + dc + id_set = ids.get(flat_key) + if id_set is None: + ids[flat_key] = {rid} + dups[flat_key] = dc + else: + id_set.add(rid) + dups[flat_key] += dc # Pivot into nested dict result: dict[KmerSeq, dict[KmerAnnotation, KmerStats]] = {} - for (ks, ka), cnt in counts.items(): + for (ks, ka), id_set in ids.items(): inner = result.get(ks) if inner is None: inner = {} result[ks] = inner - inner[ka] = KmerStats(cnt, dups[(ks, ka)]) + inner[ka] = KmerStats(len(id_set), dups[(ks, ka)]) return result diff --git a/mir/basic/token_tables_pl.py b/mir/basic/token_tables_pl.py new file mode 100644 index 0000000..96e79e5 --- /dev/null +++ b/mir/basic/token_tables_pl.py @@ -0,0 +1,189 @@ +"""Polars-based rearrangement k-mer indexing and summarisation. + +Mirrors the object-based API in :mod:`token_tables` using Polars +DataFrames. The rearrangement table has columns: + + ``id`` (Int64), ``locus`` (Utf8), ``v_gene`` (Utf8), + ``c_gene`` (Utf8), ``junction_aa`` (Utf8), ``duplicate_count`` (Int64). + +Functions +--------- +* ``expand_kmers`` — Expand each rearrangement row into one row + per k-mer, adding ``kmer_pos`` and ``kmer_seq`` columns. 
+* ``summarize_by_gene`` — Group by (locus, v_gene, c_gene, kmer_seq) + → rearrangement_count, duplicate_count. +* ``summarize_by_pos`` — Group by (locus, kmer_seq, kmer_pos). +* ``summarize_by_v`` — Group by (locus, kmer_seq, v_gene). +* ``summarize_by_c`` — Group by (locus, kmer_seq, c_gene). +* ``fetch_by_kmer`` — Rows from the original table matching + (locus, kmer_seq). +* ``fetch_by_annotated_kmer``— Rows matching (locus, v_gene, c_gene, kmer_seq). +""" + +from __future__ import annotations + +import polars as pl + + +# --------------------------------------------------------------------------- +# K-mer expansion +# --------------------------------------------------------------------------- + +def expand_kmers(df: pl.DataFrame, k: int) -> pl.DataFrame: + """Expand rearrangement table: one row per overlapping k-mer. + + For each rearrangement with ``junction_aa`` of length *n ≥ k*, produces + *n − k + 1* rows with new columns ``kmer_pos`` (``Int64``) and + ``kmer_seq`` (``Utf8``). Rearrangements shorter than *k* are dropped. + + Args: + df: Rearrangement table with at least ``id``, ``locus``, + ``v_gene``, ``c_gene``, ``junction_aa``, ``duplicate_count``. + k: K-mer length. + + Returns: + Expanded :class:`polars.DataFrame`. 
+ """ + jlen = df["junction_aa"].str.len_chars() + df_valid = df.filter(jlen >= k) + if df_valid.height == 0: + return df_valid.with_columns( + pl.lit(None, dtype=pl.Int64).alias("kmer_pos"), + pl.lit(None, dtype=pl.Utf8).alias("kmer_seq"), + ) + n_kmers = df_valid["junction_aa"].str.len_chars() - k + 1 + df_with_n = df_valid.with_columns(n_kmers.alias("_n_kmers")) + # Repeat each row n_kmers times, then assign positions + rows = df_with_n.with_columns( + pl.col("_n_kmers").map_elements( + lambda n: list(range(n)), return_dtype=pl.List(pl.Int64) + ).alias("kmer_pos") + ).explode("kmer_pos").drop("_n_kmers") + # Extract k-mer at each position + rows = rows.with_columns( + pl.col("junction_aa").str.slice( + pl.col("kmer_pos").cast(pl.UInt32), k + ).alias("kmer_seq") + ) + return rows + + +# --------------------------------------------------------------------------- +# Summary tables +# --------------------------------------------------------------------------- + +def _summarize(expanded: pl.DataFrame, group_cols: list[str]) -> pl.DataFrame: + """Group *expanded* by *group_cols* and compute summary stats.""" + return ( + expanded + .group_by(group_cols) + .agg( + pl.col("id").n_unique().alias("rearrangement_count"), + pl.col("duplicate_count").sum().alias("duplicate_count"), + ) + ) + + +def summarize_by_gene(expanded: pl.DataFrame) -> pl.DataFrame: + """Group by (locus, v_gene, c_gene, kmer_seq). + + Returns columns: locus, v_gene, c_gene, kmer_seq, + rearrangement_count, duplicate_count. + """ + return _summarize(expanded, ["locus", "v_gene", "c_gene", "kmer_seq"]) + + +def summarize_by_pos(expanded: pl.DataFrame) -> pl.DataFrame: + """Group by (locus, kmer_seq, kmer_pos). + + Returns columns: locus, kmer_seq, kmer_pos, + rearrangement_count, duplicate_count. + """ + return _summarize(expanded, ["locus", "kmer_seq", "kmer_pos"]) + + +def summarize_by_v(expanded: pl.DataFrame) -> pl.DataFrame: + """Group by (locus, kmer_seq, v_gene). 
+ + Returns columns: locus, kmer_seq, v_gene, + rearrangement_count, duplicate_count. + """ + return _summarize(expanded, ["locus", "kmer_seq", "v_gene"]) + + +def summarize_by_c(expanded: pl.DataFrame) -> pl.DataFrame: + """Group by (locus, kmer_seq, c_gene). + + Returns columns: locus, kmer_seq, c_gene, + rearrangement_count, duplicate_count. + """ + return _summarize(expanded, ["locus", "kmer_seq", "c_gene"]) + + +# --------------------------------------------------------------------------- +# Fetch +# --------------------------------------------------------------------------- + +def fetch_by_kmer( + df: pl.DataFrame, + expanded: pl.DataFrame, + locus: str, + kmer_seq: str, +) -> pl.DataFrame: + """Return rows from the original rearrangement table whose + ``junction_aa`` contains the given k-mer at the specified locus. + + Args: + df: Original rearrangement table. + expanded: Expanded k-mer table (from :func:`expand_kmers`). + locus: Locus string to match. + kmer_seq: K-mer sequence string to match. + + Returns: + Subset of *df* (original columns only, deduplicated by ``id``). + """ + ids = ( + expanded + .filter( + (pl.col("locus") == locus) & (pl.col("kmer_seq") == kmer_seq) + ) + .select("id") + .unique() + ) + return df.join(ids, on="id", how="inner") + + +def fetch_by_annotated_kmer( + df: pl.DataFrame, + expanded: pl.DataFrame, + locus: str, + v_gene: str, + c_gene: str, + kmer_seq: str, +) -> pl.DataFrame: + """Return rows from the original rearrangement table matching a fully + annotated k-mer query (locus, v_gene, c_gene, kmer_seq). + + Args: + df: Original rearrangement table. + expanded: Expanded k-mer table (from :func:`expand_kmers`). + locus: Locus string to match. + v_gene: V-gene name to match. + c_gene: C-gene name to match. + kmer_seq: K-mer sequence string to match. + + Returns: + Subset of *df* (original columns only, deduplicated by ``id``). 
+ """ + ids = ( + expanded + .filter( + (pl.col("locus") == locus) + & (pl.col("v_gene") == v_gene) + & (pl.col("c_gene") == c_gene) + & (pl.col("kmer_seq") == kmer_seq) + ) + .select("id") + .unique() + ) + return df.join(ids, on="id", how="inner") diff --git a/tests/test_token_tables.py b/tests/test_token_tables.py index b6e0ae0..aad6bc3 100644 --- a/tests/test_token_tables.py +++ b/tests/test_token_tables.py @@ -27,7 +27,7 @@ def _make_rearrangement( junction_aa: str = "CASSLAPGATNEKLFF", *, locus: str = "TRB", - id: str = "r1", + id: int = 1, v_gene: str = "TRBV5-1", c_gene: str = "TRBC1", duplicate_count: int = 10, @@ -49,7 +49,7 @@ def test_slots(self): def test_fields(self): r = _make_rearrangement() - assert r.id == "r1" + assert r.id == 1 assert r.v_gene == "TRBV5-1" assert r.c_gene == "TRBC1" @@ -113,8 +113,8 @@ def test_false_lookup_wrong_gene(self): def test_multiple_rearrangements_shared_kmer(self): """Two rearrangements sharing a k-mer both appear in the list.""" - r1 = _make_rearrangement("CASSLA", id="r1") - r2 = _make_rearrangement("CASSXY", id="r2") + r1 = _make_rearrangement("CASSLA", id=1) + r2 = _make_rearrangement("CASSXY", id=2) idx = tokenize_rearrangements([r1, r2], k=4) shared = Kmer("TRB", "TRBV5-1", "TRBC1", b"CASS") assert shared in idx @@ -133,8 +133,8 @@ def test_empty_input(self): assert idx == {} def test_different_loci(self): - r_trb = Rearrangement("TRB", "r1", "TRBV5-1", "TRBC1", "CASSLA", 1) - r_tra = Rearrangement("TRA", "r2", "TRAV12", "TRAC", "CASSLA", 1) + r_trb = Rearrangement("TRB", 1, "TRBV5-1", "TRBC1", "CASSLA", 1) + r_tra = Rearrangement("TRA", 2, "TRAV12", "TRAC", "CASSLA", 1) idx = tokenize_rearrangements([r_trb, r_tra], k=4) key_trb = Kmer("TRB", "TRBV5-1", "TRBC1", b"CASS") key_tra = Kmer("TRA", "TRAV12", "TRAC", b"CASS") @@ -177,7 +177,7 @@ class TestTokenizeRearrangementsBenchmark: @pytest.fixture(scope="class") def rearrangements(self): return [ - Rearrangement("TRB", f"r{i}", "TRBV5-1", "TRBC1", + 
Rearrangement("TRB", i, "TRBV5-1", "TRBC1", self.JUNCTION, 10) for i in range(self.N) ] @@ -278,8 +278,8 @@ def test_single_rearrangement(self): assert v.duplicate_count == 5 def test_two_rearrangements_shared_kmer(self): - r1 = _make_rearrangement("CASSLA", id="r1", duplicate_count=3) - r2 = _make_rearrangement("CASSXY", id="r2", duplicate_count=7) + r1 = _make_rearrangement("CASSLA", id=1, duplicate_count=3) + r2 = _make_rearrangement("CASSXY", id=2, duplicate_count=7) stats = summarize_rearrangements([r1, r2], k=4) shared = Kmer("TRB", "TRBV5-1", "TRBC1", b"CASS") assert shared in stats @@ -287,8 +287,8 @@ def test_two_rearrangements_shared_kmer(self): assert stats[shared].duplicate_count == 10 # 3 + 7 def test_unique_kmers(self): - r1 = _make_rearrangement("CASSLA", id="r1", duplicate_count=2) - r2 = _make_rearrangement("CASSXY", id="r2", duplicate_count=8) + r1 = _make_rearrangement("CASSLA", id=1, duplicate_count=2) + r2 = _make_rearrangement("CASSXY", id=2, duplicate_count=8) stats = summarize_rearrangements([r1, r2], k=4) unique_r1 = Kmer("TRB", "TRBV5-1", "TRBC1", b"SSLA") unique_r2 = Kmer("TRB", "TRBV5-1", "TRBC1", b"SSXY") @@ -303,8 +303,8 @@ def test_skip_short(self): assert summarize_rearrangements([r], k=5) == {} def test_different_loci_separate(self): - r1 = Rearrangement("TRB", "r1", "V1", "C1", "CASSLA", 1) - r2 = Rearrangement("TRA", "r2", "V2", "C2", "CASSLA", 4) + r1 = Rearrangement("TRB", 1, "V1", "C1", "CASSLA", 1) + r2 = Rearrangement("TRA", 2, "V2", "C2", "CASSLA", 4) stats = summarize_rearrangements([r1, r2], k=4) k_trb = Kmer("TRB", "V1", "C1", b"CASS") k_tra = Kmer("TRA", "V2", "C2", b"CASS") @@ -324,8 +324,8 @@ def test_gapped_summary(self): assert key.seq.count(MASK) == 1 def test_gapped_shared_summary(self): - r1 = _make_rearrangement("CASSLA", id="r1", duplicate_count=2) - r2 = _make_rearrangement("CASSXY", id="r2", duplicate_count=3) + r1 = _make_rearrangement("CASSLA", id=1, duplicate_count=2) + r2 = _make_rearrangement("CASSXY", 
id=2, duplicate_count=3) stats = summarize_rearrangements([r1, r2], k=4, mask_byte=MASK) # Both produce gapped XASS from window CASS shared = Kmer("TRB", "TRBV5-1", "TRBC1", b"XASS") @@ -346,7 +346,7 @@ class TestSummarizeRearrangementsBenchmark: @pytest.fixture(scope="class") def rearrangements(self): return [ - Rearrangement("TRB", f"r{i}", "TRBV5-1", "TRBC1", + Rearrangement("TRB", i, "TRBV5-1", "TRBC1", self.JUNCTION, 10) for i in range(self.N) ] @@ -394,8 +394,8 @@ def test_single_rearrangement_positions(self): def test_different_genes_merge_under_same_kmer_seq(self): """Same locus+seq but different v_gene → single KmerSeq key, two KmerAnnotation entries.""" - r1 = Rearrangement("TRB", "r1", "TRBV5-1", "TRBC1", "CASSLA", 3) - r2 = Rearrangement("TRB", "r2", "TRBV6-2", "TRBC2", "CASSLA", 7) + r1 = Rearrangement("TRB", 1, "TRBV5-1", "TRBC1", "CASSLA", 3) + r2 = Rearrangement("TRB", 2, "TRBV6-2", "TRBC2", "CASSLA", 7) ann = summarize_annotations([r1, r2], k=4) ks = KmerSeq("TRB", b"CASS") assert ks in ann @@ -406,8 +406,8 @@ def test_different_genes_merge_under_same_kmer_seq(self): assert a2 in inner and inner[a2] == KmerStats(1, 7) def test_different_loci_separate(self): - r_trb = Rearrangement("TRB", "r1", "V1", "C1", "CASSLA", 1) - r_tra = Rearrangement("TRA", "r2", "V2", "C2", "CASSLA", 4) + r_trb = Rearrangement("TRB", 1, "V1", "C1", "CASSLA", 1) + r_tra = Rearrangement("TRA", 2, "V2", "C2", "CASSLA", 4) ann = summarize_annotations([r_trb, r_tra], k=4) ks_trb = KmerSeq("TRB", b"CASS") ks_tra = KmerSeq("TRA", b"CASS") @@ -418,8 +418,8 @@ def test_different_loci_separate(self): def test_shared_kmer_same_gene_accumulates(self): """Two rearrangements with identical gene annotations at same position accumulate counts.""" - r1 = _make_rearrangement("CASSLA", id="r1", duplicate_count=2) - r2 = _make_rearrangement("CASSXY", id="r2", duplicate_count=8) + r1 = _make_rearrangement("CASSLA", id=1, duplicate_count=2) + r2 = _make_rearrangement("CASSXY", id=2, 
duplicate_count=8) ann = summarize_annotations([r1, r2], k=4) ks = KmerSeq("TRB", b"CASS") a = KmerAnnotation("TRBV5-1", "TRBC1", 0) @@ -464,8 +464,8 @@ def test_skip_short(self): def test_gapped_different_genes_merge(self): """Gapped: different v_gene rearrangements with same locus+seq merge under one KmerSeq.""" - r1 = Rearrangement("TRB", "r1", "TRBV5-1", "TRBC1", "CASSLA", 2) - r2 = Rearrangement("TRB", "r2", "TRBV6-2", "TRBC2", "CASSLA", 3) + r1 = Rearrangement("TRB", 1, "TRBV5-1", "TRBC1", "CASSLA", 2) + r2 = Rearrangement("TRB", 2, "TRBV6-2", "TRBC2", "CASSLA", 3) ann = summarize_annotations([r1, r2], k=4, mask_byte=MASK) ks = KmerSeq("TRB", b"XASS") assert ks in ann @@ -488,7 +488,7 @@ class TestSummarizeAnnotationsBenchmark: @pytest.fixture(scope="class") def rearrangements(self): return [ - Rearrangement("TRB", f"r{i}", "TRBV5-1", "TRBC1", + Rearrangement("TRB", i, "TRBV5-1", "TRBC1", self.JUNCTION, 10) for i in range(self.N) ] @@ -527,7 +527,7 @@ def olga_rearrangements(self): return [ Rearrangement( locus="TRB", - id=f"olga_{i}", + id=i, v_gene=rec["v_gene"].split("*")[0], # strip allele c_gene="", junction_aa=rec["cdr3"], diff --git a/tests/test_token_tables_impl.py b/tests/test_token_tables_impl.py new file mode 100644 index 0000000..c8d5d4f --- /dev/null +++ b/tests/test_token_tables_impl.py @@ -0,0 +1,364 @@ +"""Tests for Polars k-mer implementation and benchmarks comparing +Polars vs the naive object-based implementation in token_tables.py. + +Provides memory and time measurements for both approaches. 
+""" + +from __future__ import annotations + +import gc +import time +import tracemalloc + +import polars as pl +import pytest + +from mir.basic import token_tables_pl as plmod +from mir.basic.token_tables import ( + Kmer, + Rearrangement, + summarize_annotations, + summarize_rearrangements, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_pl_df(rows: list[dict]) -> pl.DataFrame: + """Build a Polars rearrangement DataFrame from a list of dicts.""" + return pl.DataFrame(rows).cast({ + "id": pl.Int64, + "duplicate_count": pl.Int64, + }) + + +def _row( + junction_aa: str = "CASSLAPGATNEKLFF", + *, + locus: str = "TRB", + id: int = 1, + v_gene: str = "TRBV5-1", + c_gene: str = "TRBC1", + duplicate_count: int = 10, +) -> dict: + return dict( + id=id, locus=locus, v_gene=v_gene, c_gene=c_gene, + junction_aa=junction_aa, duplicate_count=duplicate_count, + ) + + +def _rows_to_rearrangements(rows: list[dict]) -> list[Rearrangement]: + return [ + Rearrangement(d["locus"], d["id"], d["v_gene"], d["c_gene"], + d["junction_aa"], d["duplicate_count"]) + for d in rows + ] + + +# =================================================================== +# Polars unit tests +# =================================================================== + + +class TestExpandKmersPl: + def test_basic(self): + df = _make_pl_df([_row("CASSLAP")]) + ex = plmod.expand_kmers(df, k=4) + assert ex.height == 4 + assert set(ex["kmer_seq"].to_list()) == {"CASS", "ASSL", "SSLA", "SLAP"} + assert set(ex["kmer_pos"].to_list()) == {0, 1, 2, 3} + + def test_skip_short(self): + df = _make_pl_df([_row("CA")]) + assert plmod.expand_kmers(df, k=5).height == 0 + + def test_empty(self): + df = _make_pl_df([_row("CASSLA")]) + assert plmod.expand_kmers(df.head(0), k=3).height == 0 + + def test_multiple_rows(self): + df = _make_pl_df([ + _row("CASSLA", id=1, duplicate_count=3), + 
_row("CASSXY", id=2, duplicate_count=7), + ]) + assert plmod.expand_kmers(df, k=4).height == 6 + + +class TestSummarizeByGenePl: + def test_single(self): + df = _make_pl_df([_row("CASSLA", duplicate_count=5)]) + ex = plmod.expand_kmers(df, k=4) + s = plmod.summarize_by_gene(ex) + assert s.height == 3 + for row in s.iter_rows(named=True): + assert row["rearrangement_count"] == 1 + assert row["duplicate_count"] == 5 + + def test_shared_kmer(self): + df = _make_pl_df([ + _row("CASSLA", id=1, duplicate_count=3), + _row("CASSXY", id=2, duplicate_count=7), + ]) + s = plmod.summarize_by_gene(plmod.expand_kmers(df, k=4)) + cass = s.filter(pl.col("kmer_seq") == "CASS") + assert cass["rearrangement_count"][0] == 2 + assert cass["duplicate_count"][0] == 10 + + def test_different_loci(self): + df = _make_pl_df([ + _row("CASSLA", id=1, locus="TRB", v_gene="V1", c_gene="C1", duplicate_count=1), + _row("CASSLA", id=2, locus="TRA", v_gene="V2", c_gene="C2", duplicate_count=4), + ]) + s = plmod.summarize_by_gene(plmod.expand_kmers(df, k=4)) + trb = s.filter((pl.col("locus") == "TRB") & (pl.col("kmer_seq") == "CASS")) + tra = s.filter((pl.col("locus") == "TRA") & (pl.col("kmer_seq") == "CASS")) + assert trb["rearrangement_count"][0] == 1 and trb["duplicate_count"][0] == 1 + assert tra["rearrangement_count"][0] == 1 and tra["duplicate_count"][0] == 4 + + +class TestSummarizeByPosPl: + def test_positions(self): + df = _make_pl_df([_row("CASSLA", duplicate_count=5)]) + s = plmod.summarize_by_pos(plmod.expand_kmers(df, k=4)) + assert s.height == 3 + for row in s.iter_rows(named=True): + assert row["rearrangement_count"] == 1 + assert row["duplicate_count"] == 5 + + +class TestSummarizeByVPl: + def test_different_v_genes(self): + df = _make_pl_df([ + _row("CASSLA", id=1, v_gene="TRBV5-1", duplicate_count=3), + _row("CASSLA", id=2, v_gene="TRBV6-2", duplicate_count=7), + ]) + s = plmod.summarize_by_v(plmod.expand_kmers(df, k=4)) + cass_v5 = s.filter((pl.col("kmer_seq") == "CASS") & 
(pl.col("v_gene") == "TRBV5-1")) + cass_v6 = s.filter((pl.col("kmer_seq") == "CASS") & (pl.col("v_gene") == "TRBV6-2")) + assert cass_v5["rearrangement_count"][0] == 1 and cass_v5["duplicate_count"][0] == 3 + assert cass_v6["rearrangement_count"][0] == 1 and cass_v6["duplicate_count"][0] == 7 + + +class TestSummarizeByCPl: + def test_different_c_genes(self): + df = _make_pl_df([ + _row("CASSLA", id=1, c_gene="TRBC1", duplicate_count=2), + _row("CASSLA", id=2, c_gene="TRBC2", duplicate_count=8), + ]) + s = plmod.summarize_by_c(plmod.expand_kmers(df, k=4)) + cass_c1 = s.filter((pl.col("kmer_seq") == "CASS") & (pl.col("c_gene") == "TRBC1")) + cass_c2 = s.filter((pl.col("kmer_seq") == "CASS") & (pl.col("c_gene") == "TRBC2")) + assert cass_c1["rearrangement_count"][0] == 1 and cass_c1["duplicate_count"][0] == 2 + assert cass_c2["rearrangement_count"][0] == 1 and cass_c2["duplicate_count"][0] == 8 + + +class TestFetchPl: + @pytest.fixture() + def data(self): + df = _make_pl_df([ + _row("CASSLA", id=1, v_gene="TRBV5-1", c_gene="TRBC1", duplicate_count=3), + _row("CASSXY", id=2, v_gene="TRBV5-1", c_gene="TRBC1", duplicate_count=7), + _row("TTTXYZ", id=3, locus="TRA", v_gene="TRAV12", c_gene="TRAC", duplicate_count=1), + ]) + ex = plmod.expand_kmers(df, k=4) + return df, ex + + def test_fetch_by_kmer(self, data): + df, ex = data + assert set(plmod.fetch_by_kmer(df, ex, "TRB", "CASS")["id"].to_list()) == {1, 2} + + def test_fetch_by_kmer_miss(self, data): + df, ex = data + assert plmod.fetch_by_kmer(df, ex, "TRB", "ZZZZ").height == 0 + + def test_fetch_by_annotated_kmer(self, data): + df, ex = data + result = plmod.fetch_by_annotated_kmer(df, ex, "TRB", "TRBV5-1", "TRBC1", "CASS") + assert set(result["id"].to_list()) == {1, 2} + + def test_fetch_by_annotated_kmer_wrong_gene(self, data): + df, ex = data + assert plmod.fetch_by_annotated_kmer(df, ex, "TRB", "TRBV99", "TRBC1", "CASS").height == 0 + + def test_fetch_different_locus(self, data): + df, ex = data + assert 
set(plmod.fetch_by_kmer(df, ex, "TRA", "TTXY")["id"].to_list()) == {3} + + def test_fetch_original_columns(self, data): + df, ex = data + result = plmod.fetch_by_kmer(df, ex, "TRB", "CASS") + assert set(result.columns) == set(df.columns) + + +# =================================================================== +# Cross-implementation: Polars vs naive (object-based) +# =================================================================== + + +class TestCrossImplementation: + """Verify Polars and object-based (naive) produce consistent results.""" + + @pytest.fixture() + def shared_input(self): + dicts = [ + _row("CASSLA", id=1, duplicate_count=3, v_gene="TRBV5-1", c_gene="TRBC1"), + _row("CASSXY", id=2, duplicate_count=7, v_gene="TRBV5-1", c_gene="TRBC1"), + _row("CASSLA", id=3, duplicate_count=2, v_gene="TRBV6-2", c_gene="TRBC2"), + ] + objs = _rows_to_rearrangements(dicts) + pl_df = _make_pl_df(dicts) + return objs, pl_df + + def test_expand_row_count(self, shared_input): + objs, pl_df = shared_input + k = 4 + ex_pl = plmod.expand_kmers(pl_df, k) + # naive: each rearrangement with len >= k produces len-k+1 k-mers + naive_count = sum(max(0, len(r.junction_aa) - k + 1) for r in objs) + assert ex_pl.height == naive_count == 9 + + def test_summarize_by_gene_matches_naive(self, shared_input): + objs, pl_df = shared_input + k = 4 + # Polars + s_pl = plmod.summarize_by_gene(plmod.expand_kmers(pl_df, k)).sort( + ["locus", "v_gene", "c_gene", "kmer_seq"] + ) + # Naive (object-based) + s_obj = summarize_rearrangements(objs, k) + + # For each Polars summary row, verify it matches the naive result + for row in s_pl.iter_rows(named=True): + key = Kmer(row["locus"], row["v_gene"], row["c_gene"], + row["kmer_seq"].encode("ascii")) + assert key in s_obj + assert row["rearrangement_count"] == s_obj[key].rearrangement_count + assert row["duplicate_count"] == s_obj[key].duplicate_count + + # Same total number of groups + assert s_pl.height == len(s_obj) + + +# 
=================================================================== +# Benchmark: naive (object-based) vs Polars — time and memory +# =================================================================== + + +def _measure(func, label: str) -> dict: + """Run *func*, returning wall time (s) and peak memory (bytes).""" + gc.collect() + tracemalloc.start() + t0 = time.perf_counter() + result = func() + elapsed = time.perf_counter() - t0 + _, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + return {"label": label, "elapsed": elapsed, "peak_mem": peak, "result": result} + + +class TestBenchmarkImplementations: + """Compare time and memory: naive (token_tables.py) vs Polars on + 10,000 OLGA-generated TCR-beta rearrangements.""" + + N = 10_000 + K = 3 + + @pytest.fixture(scope="class") + def olga_data(self): + from mir.basic.pgen import OlgaModel + + model = OlgaModel(chain="TRB") + seqs = model.generate_sequences_with_meta(self.N, pgens=False) + dicts = [ + _row( + rec["cdr3"], + id=i, + locus="TRB", + v_gene=rec["v_gene"].split("*")[0], + c_gene="", + duplicate_count=1, + ) + for i, rec in enumerate(seqs) + ] + objs = _rows_to_rearrangements(dicts) + pl_df = _make_pl_df(dicts) + return objs, pl_df + + def test_benchmark_naive_summarize(self, olga_data): + objs, _ = olga_data + # warm-up + summarize_rearrangements(objs[:500], self.K) + m = _measure(lambda: summarize_rearrangements(objs, self.K), "naive") + print( + f"\n[naive] summarize_rearrangements: {self.N:,} seqs, k={self.K} → " + f"{len(m['result']):,} keys, " + f"time={m['elapsed']:.3f}s, peak_mem={m['peak_mem'] / 1024:.0f} KiB" + ) + + def test_benchmark_polars_summarize(self, olga_data): + _, pl_df = olga_data + # warm-up + plmod.summarize_by_gene(plmod.expand_kmers(pl_df.head(500), self.K)) + + def run(): + ex = plmod.expand_kmers(pl_df, self.K) + return plmod.summarize_by_gene(ex) + + m = _measure(run, "polars") + print( + f"\n[polars] expand + summarize_by_gene: {self.N:,} seqs, k={self.K} → " + 
f"{m['result'].height:,} summary rows, " + f"time={m['elapsed']:.3f}s, peak_mem={m['peak_mem'] / 1024:.0f} KiB" + ) + + def test_benchmark_naive_annotations(self, olga_data): + objs, _ = olga_data + summarize_annotations(objs[:500], self.K) + m = _measure(lambda: summarize_annotations(objs, self.K), "naive_ann") + total = sum(len(v) for v in m["result"].values()) + print( + f"\n[naive] summarize_annotations: {self.N:,} seqs, k={self.K} → " + f"{len(m['result']):,} KmerSeq, {total:,} annotations, " + f"time={m['elapsed']:.3f}s, peak_mem={m['peak_mem'] / 1024:.0f} KiB" + ) + + def test_benchmark_polars_all_summaries(self, olga_data): + _, pl_df = olga_data + plmod.expand_kmers(pl_df.head(500), self.K) + + def run(): + ex = plmod.expand_kmers(pl_df, self.K) + return { + "by_gene": plmod.summarize_by_gene(ex), + "by_pos": plmod.summarize_by_pos(ex), + "by_v": plmod.summarize_by_v(ex), + "by_c": plmod.summarize_by_c(ex), + } + + m = _measure(run, "polars_all") + r = m["result"] + print( + f"\n[polars] expand + 4 summaries: {self.N:,} seqs, k={self.K} → " + f"by_gene={r['by_gene'].height:,}, by_pos={r['by_pos'].height:,}, " + f"by_v={r['by_v'].height:,}, by_c={r['by_c'].height:,}, " + f"time={m['elapsed']:.3f}s, peak_mem={m['peak_mem'] / 1024:.0f} KiB" + ) + + def test_benchmark_fetch(self, olga_data): + _, pl_df = olga_data + ex = plmod.expand_kmers(pl_df, self.K) + top = ex.group_by("kmer_seq").len().sort("len", descending=True).head(1) + kmer_seq = top["kmer_seq"][0] + + n_lookups = 1000 + t0 = time.perf_counter() + for _ in range(n_lookups): + plmod.fetch_by_kmer(pl_df, ex, "TRB", kmer_seq) + elapsed = time.perf_counter() - t0 + print( + f"\n[polars] fetch_by_kmer '{kmer_seq}': " + f"{n_lookups:,} lookups in {elapsed:.3f}s " + f"({n_lookups / elapsed:,.0f} ops/s)" + ) From 6a3a8898e52e8a1e03a52f3775cd131d41e20485 Mon Sep 17 00:00:00 2001 From: mikessh Date: Sat, 11 Apr 2026 18:55:26 +0300 Subject: [PATCH 10/24] C implementation wip --- CMakeLists.txt | 17 +- 
mir/basic/{sequence.py => alphabets.py} | 144 ++----- mir/basic/mirseq.cpp | 367 ++++++++++++++++++ mir/basic/token_tables.py | 2 +- mir/basic/tokens.py | 68 +--- mir/distances/seqdist.py | 31 ++ requirements.txt | 1 + tests/test_alphabets.py | 214 +++++++++++ tests/test_memory_benchmark.py | 133 ------- tests/test_mirseq.py | 487 ++++++++++++++++++++++++ tests/test_mirseq_benchmark.py | 148 +++++++ tests/test_sequence.py | 191 ---------- tests/test_sequence_benchmark.py | 289 -------------- tests/test_tokens.py | 125 +++--- tests/test_tokens_benchmark.py | 161 -------- 15 files changed, 1380 insertions(+), 998 deletions(-) rename mir/basic/{sequence.py => alphabets.py} (61%) create mode 100644 mir/basic/mirseq.cpp create mode 100644 mir/distances/seqdist.py create mode 100644 tests/test_alphabets.py delete mode 100644 tests/test_memory_benchmark.py create mode 100644 tests/test_mirseq.py create mode 100644 tests/test_mirseq_benchmark.py delete mode 100644 tests/test_sequence.py delete mode 100644 tests/test_sequence_benchmark.py delete mode 100644 tests/test_tokens_benchmark.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 6060df7..e2377a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,10 @@ cmake_minimum_required(VERSION 3.18) -project(mir_cdrscore LANGUAGES CXX) +project(mir_native LANGUAGES CXX) find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) find_package(pybind11 CONFIG REQUIRED) +# --- cdrscore (mir.distances) --- pybind11_add_module(cdrscore MODULE mir/distances/cdrscore.cpp) target_compile_features(cdrscore PRIVATE cxx_std_17) if (MSVC) @@ -11,8 +12,20 @@ if (MSVC) else() target_compile_options(cdrscore PRIVATE -O3) endif() - set_target_properties(cdrscore PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/mir/distances" ) install(TARGETS cdrscore LIBRARY DESTINATION mir/distances) + +# --- mirseq (mir.basic) --- +pybind11_add_module(mirseq MODULE mir/basic/mirseq.cpp) +target_compile_features(mirseq 
PRIVATE cxx_std_17) +if (MSVC) + target_compile_options(mirseq PRIVATE /O2 /DNOMINMAX) +else() + target_compile_options(mirseq PRIVATE -O3) +endif() +set_target_properties(mirseq PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/mir/basic" +) +install(TARGETS mirseq LIBRARY DESTINATION mir/basic) diff --git a/mir/basic/sequence.py b/mir/basic/alphabets.py similarity index 61% rename from mir/basic/sequence.py rename to mir/basic/alphabets.py index 3024e50..9bfdcd9 100644 --- a/mir/basic/sequence.py +++ b/mir/basic/alphabets.py @@ -1,32 +1,38 @@ -"""Biological sequence validation, translation, masking, and matching. +"""Alphabets, constants, and amino-acid → reduced-alphabet translation. -All functions operate on plain ``str`` or ``bytes`` — no wrapper classes. -Alphabet membership is checked via 256-byte lookup tables (``bytes``) for -O(1) per-character validation. Translation uses ``bytes.translate`` with a -pre-built table for native-speed conversion. +This module holds the lightweight, GC-friendly parts that are faster in +pure Python (``bytes.translate``) than in C. Heavy-lifting functions +(codon translation, tokenisation, distances) live in the ``mirseq`` +C extension. -Alphabets ---------- -Three predefined alphabets are provided as module-level ``bytes`` lookup -tables (256 entries, 1 = allowed, 0 = disallowed): +Types +----- +* ``Seq`` — Union type ``str | bytes | bytearray``. -* ``NT_ALPHABET`` — DNA nucleotides ``ATGCN`` (``N`` = mask). -* ``AA_ALPHABET`` — 20 amino acids + ``*_X`` (``X`` = mask). -* ``REDUCED_AA_ALPHABET`` — Physico-chemical reduced alphabet (``X`` = mask). +Helpers +------- +* ``_to_bytes`` — Normalise *Seq* to ``bytes``. -Functions +Alphabets --------- -* ``make_alphabet`` — Build a 256-byte LUT from a string of allowed chars. +* ``NT_ALPHABET`` / ``AA_ALPHABET`` / ``REDUCED_AA_ALPHABET`` — 256-byte LUTs. +* ``NT_MASK`` / ``AA_MASK`` / ``REDUCED_AA_MASK`` — Mask byte values. 
+ +Translation +----------- +* ``aa_to_reduced`` — AA → reduced via ``bytes.translate`` (fastest path). * ``validate`` — Check every byte belongs to an alphabet. -* ``translate`` — Byte-level translation via ``bytes.translate``. * ``mask`` — Replace position(s) with a mask character. * ``matches`` — Wildcard-aware positional comparison. -* ``aa_to_reduced`` — Convert amino-acid sequence to reduced alphabet. * ``matches_aa_reduced``— Cross-alphabet wildcard match (AA vs reduced). """ from __future__ import annotations +# --------------------------------------------------------------------------- +# Type alias +# --------------------------------------------------------------------------- + Seq = str | bytes | bytearray # --------------------------------------------------------------------------- @@ -43,14 +49,7 @@ def _to_bytes(seq: Seq) -> bytes: # --------------------------------------------------------------------------- def make_alphabet(chars: str) -> bytes: - """Build a 256-byte lookup table where allowed positions are ``1``. - - Args: - chars: String of allowed ASCII characters. - - Returns: - A 256-byte ``bytes`` object usable as a fast membership LUT. - """ + """Build a 256-byte lookup table where allowed positions are ``1``.""" lut = bytearray(256) for ch in chars: lut[ord(ch)] = 1 @@ -78,7 +77,6 @@ def make_alphabet(chars: str) -> bytes: # Amino-acid → reduced-alphabet mapping # --------------------------------------------------------------------------- -#: Per-character mapping from standard amino-acid codes to reduced symbols. AA_TO_REDUCED: dict[str, str] = { "A": "l", "R": "b", "N": "m", "D": "c", "C": "s", "Q": "m", "E": "c", "G": "G", "H": "b", "I": "l", "L": "l", "K": "b", @@ -86,13 +84,11 @@ def make_alphabet(chars: str) -> bytes: "Y": "Y", "V": "l", "X": "X", "*": "*", "_": "_", } -#: ``bytes.translate`` table for fast AA → reduced conversion. 
AA_TO_REDUCED_TABLE: bytes = bytes.maketrans( "".join(AA_TO_REDUCED.keys()).encode(), "".join(AA_TO_REDUCED.values()).encode(), ) -#: 256-byte LUT mapping each AA byte to its reduced byte (for matching). _AA_TO_REDUCED_LUT: bytes _lut = bytearray(256) for _aa, _red in AA_TO_REDUCED.items(): @@ -102,25 +98,24 @@ def make_alphabet(chars: str) -> bytes: # --------------------------------------------------------------------------- -# Validation +# Translation (aa_to_reduced — fastest in Python via bytes.translate) # --------------------------------------------------------------------------- -def validate(seq: Seq, alphabet: bytes) -> bytes: - """Validate that every byte of *seq* belongs to *alphabet*. +def aa_to_reduced(seq: Seq) -> bytes: + """Convert an amino-acid sequence to the reduced physico-chemical alphabet. - Accepts ``str``, ``bytes``, or ``bytearray``. Strings are - ASCII-encoded first. + Uses ``bytes.translate`` with a pre-built table — faster than C for + this particular operation. + """ + return _to_bytes(seq).translate(AA_TO_REDUCED_TABLE) - Args: - seq: Input sequence. - alphabet: 256-byte LUT (1 = allowed). - Returns: - The validated sequence as ``bytes``. +# --------------------------------------------------------------------------- +# Validation +# --------------------------------------------------------------------------- - Raises: - ValueError: If any byte falls outside the alphabet. - """ +def validate(seq: Seq, alphabet: bytes) -> bytes: + """Validate every byte of *seq* belongs to *alphabet* (256-byte LUT).""" raw = _to_bytes(seq) for b in raw: if not alphabet[b]: @@ -130,55 +125,12 @@ def validate(seq: Seq, alphabet: bytes) -> bytes: return raw -# --------------------------------------------------------------------------- -# Translation -# --------------------------------------------------------------------------- - -def translate(seq: Seq, table: bytes) -> bytes: - """Translate *seq* byte-by-byte using a ``bytes.maketrans`` *table*. 
- - Args: - seq: Input sequence (``str``, ``bytes``, or ``bytearray``). - table: A 256-byte translation table (from ``bytes.maketrans``). - - Returns: - Translated ``bytes``. - """ - return _to_bytes(seq).translate(table) - - -def aa_to_reduced(seq: Seq) -> bytes: - """Convert an amino-acid sequence to the reduced physico-chemical alphabet. - - Uses ``bytes.translate`` with a pre-built table for native speed. - - Args: - seq: Amino-acid sequence (``str``, ``bytes``, or ``bytearray``). - - Returns: - Reduced-alphabet ``bytes``. - """ - return _to_bytes(seq).translate(AA_TO_REDUCED_TABLE) - - # --------------------------------------------------------------------------- # Masking # --------------------------------------------------------------------------- def mask(seq: Seq, position: int | slice | tuple[int, int], mask_byte: int) -> bytes: - """Return a copy of *seq* with the given position(s) replaced by *mask_byte*. - - Args: - seq: Input sequence. - position: Single index, ``slice``, or ``(start, stop)`` half-open range. - mask_byte: Replacement byte value (e.g. ``ord('N')`` or ``NT_MASK``). - - Returns: - New ``bytes`` with the specified positions masked. - - Raises: - IndexError: If a single-index position is out of bounds. - """ + """Return a copy of *seq* with the given position(s) replaced by *mask_byte*.""" buf = bytearray(_to_bytes(seq)) if isinstance(position, int): n = len(buf) @@ -207,15 +159,7 @@ def matches(a: Seq, b: Seq, mask_byte: int) -> bool: Returns ``True`` when *a* and *b* have the same length and at every position the bytes are equal **or** at least one side carries - *mask_byte*. This is **not** the same as ``a == b``. - - Args: - a: First sequence. - b: Second sequence. - mask_byte: The wildcard byte value (e.g. ``NT_MASK``). - - Returns: - ``True`` if the sequences match, ``False`` otherwise. + *mask_byte*. 
""" ba = _to_bytes(a) bb = _to_bytes(b) @@ -231,19 +175,7 @@ def matches(a: Seq, b: Seq, mask_byte: int) -> bool: def matches_aa_reduced(aa_seq: Seq, reduced_seq: Seq) -> bool: - """Wildcard-aware match between an amino-acid and a reduced-alphabet sequence. - - Each byte of *aa_seq* is first mapped to the reduced alphabet via a - byte LUT, then compared against *reduced_seq*. ``X`` (mask) on either - side counts as a wildcard. - - Args: - aa_seq: Amino-acid sequence. - reduced_seq: Reduced-alphabet sequence. - - Returns: - ``True`` if every position matches (accounting for wildcards). - """ + """Wildcard-aware match between an amino-acid and a reduced-alphabet sequence.""" ba = _to_bytes(aa_seq) br = _to_bytes(reduced_seq) if len(ba) != len(br): diff --git a/mir/basic/mirseq.cpp b/mir/basic/mirseq.cpp new file mode 100644 index 0000000..682d4ed --- /dev/null +++ b/mir/basic/mirseq.cpp @@ -0,0 +1,367 @@ +/* + * mirseq — C-native sequence translation, tokenization, and distances. + * + * Compiled as a pybind11 module. All functions accept Python str or bytes + * via py::bytes / std::string_view and return Python list[str] or list[bytes]. + * + * Lookup tables are compile-time constants (constexpr arrays) — no heap + * allocation, no GC interaction. 
+ * + * Functions: + * translate_linear(nt_seq) → str amino-acid translation (linear) + * translate_bidi(nt_seq) → str amino-acid translation (bidirectional) + * aa_to_reduced(aa_seq) → str reduced amino-acid alphabet + * tokenize_bytes(seq, k) → list[bytes] sliding window k-mers + * tokenize_str(seq, k) → list[str] sliding window k-mers + * tokenize_gapped_bytes(seq,k,m) → list[bytes] gapped k-mers + * tokenize_gapped_str(seq,k,m) → list[str] gapped k-mers + * hamming(a, b) → int hamming distance + * levenshtein(a, b) → int levenshtein distance + */ + +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; + +/* ================================================================ + * Codon table (64 entries, indexed by 2-bit packed nucleotides) + * A=0, T=1, G=2, C=3. Any codon containing N → 'X'. + * ================================================================ */ + +// Nucleotide to 2-bit index: A=0, T=1, G=2, C=3, else -1 +struct NtIdx { + int v[256]; + constexpr NtIdx() : v{} { + for (int i = 0; i < 256; ++i) v[i] = -1; + v['A'] = 0; v['T'] = 1; v['G'] = 2; v['C'] = 3; + } +}; +static constexpr NtIdx nt_idx{}; + +// Standard genetic code: index = n1*16 + n2*4 + n3 +// Order: A=0 T=1 G=2 C=3 +static constexpr char CODON_TABLE[64] = { + // AAA AAT AAG AAC ATA ATT ATG ATC AGA AGT AGG AGC ACA ACT ACG ACC + 'K','N','K','N', 'I','I','M','I', 'R','S','R','S', 'T','T','T','T', + // TAA TAT TAG TAC TTA TTT TTG TTC TGA TGT TGG TGC TCA TCT TCG TCC + '*','Y','*','Y', 'L','F','L','F', '*','C','W','C', 'S','S','S','S', + // GAA GAT GAG GAC GTA GTT GTG GTC GGA GGT GGG GGC GCA GCT GCG GCC + 'E','D','E','D', 'V','V','V','V', 'G','G','G','G', 'A','A','A','A', + // CAA CAT CAG CAC CTA CTT CTG CTC CGA CGT CGG CGC CCA CCT CCG CCC + 'Q','H','Q','H', 'L','L','L','L', 'R','R','R','R', 'P','P','P','P', +}; + +static inline char translate_codon(unsigned char n1, unsigned char n2, unsigned char n3) { + int i1 = nt_idx.v[n1], i2 = nt_idx.v[n2], i3 
= nt_idx.v[n3]; + if (i1 < 0 || i2 < 0 || i3 < 0) return 'X'; + return CODON_TABLE[i1 * 16 + i2 * 4 + i3]; +} + +/* ================================================================ + * Amino acid → reduced alphabet lookup table (256 entries) + * ================================================================ */ + +struct ReducedLut { + char v[256]; + constexpr ReducedLut() : v{} { + for (int i = 0; i < 256; ++i) v[i] = 0; + v['A'] = 'l'; v['R'] = 'b'; v['N'] = 'm'; v['D'] = 'c'; + v['C'] = 's'; v['Q'] = 'm'; v['E'] = 'c'; v['G'] = 'G'; + v['H'] = 'b'; v['I'] = 'l'; v['L'] = 'l'; v['K'] = 'b'; + v['M'] = 's'; v['F'] = 'F'; v['P'] = 'P'; v['S'] = 'h'; + v['T'] = 'h'; v['W'] = 'W'; v['Y'] = 'Y'; v['V'] = 'l'; + v['X'] = 'X'; v['*'] = '*'; v['_'] = '_'; + } +}; +static constexpr ReducedLut reduced_lut{}; + +/* ================================================================ + * Helper: extract raw pointer + length from str or bytes + * ================================================================ */ + +struct SeqView { + const char* data; + size_t len; +}; + +static SeqView to_view(const py::object& obj) { + if (py::isinstance(obj)) { + // str → UTF-8 (ASCII subset) + Py_ssize_t sz = 0; + const char* p = PyUnicode_AsUTF8AndSize(obj.ptr(), &sz); + if (!p) throw py::error_already_set(); + return {p, (size_t)sz}; + } + if (py::isinstance(obj)) { + char* buf = nullptr; + Py_ssize_t sz = 0; + PyBytes_AsStringAndSize(obj.ptr(), &buf, &sz); + return {buf, (size_t)sz}; + } + if (py::isinstance(obj)) { + const char* buf = PyByteArray_AS_STRING(obj.ptr()); + size_t sz = (size_t)PyByteArray_GET_SIZE(obj.ptr()); + return {buf, sz}; + } + throw py::type_error("expected str, bytes, or bytearray"); +} + +/* ================================================================ + * Translation: linear + * ================================================================ */ + +static py::str translate_linear(const py::object& obj) { + auto sv = to_view(obj); + size_t n = sv.len; + size_t 
full_codons = n / 3; + bool incomplete = (n % 3) != 0; + size_t out_len = full_codons + (incomplete ? 1 : 0); + std::string result(out_len, '\0'); + const char* s = sv.data; + for (size_t i = 0; i < full_codons; ++i) { + result[i] = translate_codon((unsigned char)s[i*3], + (unsigned char)s[i*3+1], + (unsigned char)s[i*3+2]); + } + if (incomplete) result[full_codons] = '_'; + return py::str(result); +} + +/* ================================================================ + * Translation: bidirectional + * ================================================================ */ + +static py::str translate_bidi(const py::object& obj) { + auto sv = to_view(obj); + size_t n = sv.len; + if (n == 0) return py::str(""); + + size_t remainder = n % 3; + if (remainder == 0) { + // Exact multiple of 3: just translate linearly + size_t n_codons = n / 3; + std::string result(n_codons, '\0'); + const char* s = sv.data; + for (size_t i = 0; i < n_codons; ++i) + result[i] = translate_codon((unsigned char)s[i*3], + (unsigned char)s[i*3+1], + (unsigned char)s[i*3+2]); + return py::str(result); + } + + // Not multiple of 3: bidirectional with gap + size_t n_codons = n / 3; // full codons available + // Determine forward and reverse codon counts + // For long sequences (>= 9 codons worth = 27 nt): gap after 4th codon from start + // For shorter sequences: gap in the middle + size_t fwd_codons, rev_codons; + if (n >= 9 * 3) { + fwd_codons = 4; + rev_codons = n_codons - 4; + } else { + fwd_codons = n_codons / 2; + rev_codons = n_codons - fwd_codons; + } + + // out_len = fwd + 1 (gap) + rev + size_t out_len = fwd_codons + 1 + rev_codons; + std::string result(out_len, '\0'); + const char* s = sv.data; + + // Forward codons from start + for (size_t i = 0; i < fwd_codons; ++i) + result[i] = translate_codon((unsigned char)s[i*3], + (unsigned char)s[i*3+1], + (unsigned char)s[i*3+2]); + + // Gap + result[fwd_codons] = '_'; + + // Reverse codons from end + for (size_t i = 0; i < rev_codons; ++i) { 
+ size_t nt_pos = n - (rev_codons - i) * 3; + result[fwd_codons + 1 + i] = translate_codon( + (unsigned char)s[nt_pos], + (unsigned char)s[nt_pos+1], + (unsigned char)s[nt_pos+2]); + } + + return py::str(result); +} + +/* ================================================================ + * AA → reduced alphabet + * ================================================================ */ + +static py::str c_aa_to_reduced(const py::object& obj) { + auto sv = to_view(obj); + std::string result(sv.len, '\0'); + for (size_t i = 0; i < sv.len; ++i) { + char c = reduced_lut.v[(unsigned char)sv.data[i]]; + result[i] = c ? c : sv.data[i]; // pass-through unmapped + } + return py::str(result); +} + +/* ================================================================ + * Tokenization: sliding window → list[bytes] / list[str] + * ================================================================ */ + +static py::list c_tokenize_bytes(const py::object& obj, int k) { + auto sv = to_view(obj); + int n = (int)sv.len; + if (k < 1 || k > n) + throw std::invalid_argument("k must be between 1 and sequence length"); + int count = n - k + 1; + py::list result(count); + for (int i = 0; i < count; ++i) + result[i] = py::bytes(sv.data + i, k); + return result; +} + +static py::list c_tokenize_str(const py::object& obj, int k) { + auto sv = to_view(obj); + int n = (int)sv.len; + if (k < 1 || k > n) + throw std::invalid_argument("k must be between 1 and sequence length"); + int count = n - k + 1; + py::list result(count); + for (int i = 0; i < count; ++i) + result[i] = py::str(std::string(sv.data + i, k)); + return result; +} + +/* ================================================================ + * Tokenization: sliding window + mask → list[bytes] / list[str] + * ================================================================ */ + +static py::list c_tokenize_gapped_bytes(const py::object& obj, int k, int mask_byte) { + auto sv = to_view(obj); + int n = (int)sv.len; + if (k < 1 || k > n) + throw 
std::invalid_argument("k must be between 1 and sequence length"); + int n_windows = n - k + 1; + int total = n_windows * k; + py::list result(total); + // Temporary buffer for each gapped k-mer + char* buf = (char*)alloca(k); + int idx = 0; + for (int i = 0; i < n_windows; ++i) { + for (int j = 0; j < k; ++j) { + std::memcpy(buf, sv.data + i, k); + buf[j] = (char)mask_byte; + result[idx++] = py::bytes(buf, k); + } + } + return result; +} + +static py::list c_tokenize_gapped_str(const py::object& obj, int k, int mask_byte) { + auto sv = to_view(obj); + int n = (int)sv.len; + if (k < 1 || k > n) + throw std::invalid_argument("k must be between 1 and sequence length"); + int n_windows = n - k + 1; + int total = n_windows * k; + py::list result(total); + char* buf = (char*)alloca(k); + int idx = 0; + for (int i = 0; i < n_windows; ++i) { + for (int j = 0; j < k; ++j) { + std::memcpy(buf, sv.data + i, k); + buf[j] = (char)mask_byte; + result[idx++] = py::str(std::string(buf, k)); + } + } + return result; +} + +/* ================================================================ + * Hamming distance + * ================================================================ */ + +static int c_hamming(const py::object& a, const py::object& b) { + auto sa = to_view(a); + auto sb = to_view(b); + if (sa.len != sb.len) + throw std::invalid_argument("sequences must have equal length for hamming distance"); + int d = 0; + for (size_t i = 0; i < sa.len; ++i) + d += (sa.data[i] != sb.data[i]); + return d; +} + +/* ================================================================ + * Levenshtein distance (classic DP, two-row, O(min(m,n)) space) + * ================================================================ */ + +static int c_levenshtein(const py::object& a, const py::object& b) { + auto sa = to_view(a); + auto sb = to_view(b); + size_t m = sa.len, n = sb.len; + // Ensure m <= n for space optimisation + const char* s = sa.data; + const char* t = sb.data; + if (m > n) { std::swap(s, 
t); std::swap(m, n); } + std::vector prev(m + 1), curr(m + 1); + for (size_t i = 0; i <= m; ++i) prev[i] = (int)i; + for (size_t j = 1; j <= n; ++j) { + curr[0] = (int)j; + for (size_t i = 1; i <= m; ++i) { + int cost = (s[i-1] != t[j-1]) ? 1 : 0; + int del_ = prev[i] + 1; + int ins = curr[i-1] + 1; + int sub = prev[i-1] + cost; + curr[i] = std::min({del_, ins, sub}); + } + std::swap(prev, curr); + } + return prev[m]; +} + +/* ================================================================ + * Module definition + * ================================================================ */ + +PYBIND11_MODULE(mirseq, m) { + m.doc() = "C-native sequence translation, tokenization, and distances"; + + // Translation + m.def("translate_linear", &translate_linear, + py::arg("seq"), + "Translate nucleotide sequence to amino acids (linear, incomplete codon → '_')"); + m.def("translate_bidi", &translate_bidi, + py::arg("seq"), + "Translate nucleotide sequence to amino acids (bidirectional, gap '_' inserted)"); + m.def("aa_to_reduced", &c_aa_to_reduced, + py::arg("seq"), + "Convert amino acid sequence to reduced alphabet"); + + // Tokenization + m.def("tokenize_bytes", &c_tokenize_bytes, + py::arg("seq"), py::arg("k"), + "Sliding window k-mers as list[bytes]"); + m.def("tokenize_str", &c_tokenize_str, + py::arg("seq"), py::arg("k"), + "Sliding window k-mers as list[str]"); + m.def("tokenize_gapped_bytes", &c_tokenize_gapped_bytes, + py::arg("seq"), py::arg("k"), py::arg("mask_byte"), + "Gapped k-mers (each position masked) as list[bytes]"); + m.def("tokenize_gapped_str", &c_tokenize_gapped_str, + py::arg("seq"), py::arg("k"), py::arg("mask_byte"), + "Gapped k-mers (each position masked) as list[str]"); + + // Distances + m.def("hamming", &c_hamming, + py::arg("a"), py::arg("b"), + "Hamming distance between two equal-length sequences"); + m.def("levenshtein", &c_levenshtein, + py::arg("a"), py::arg("b"), + "Levenshtein (edit) distance between two sequences"); +} diff --git 
a/mir/basic/token_tables.py b/mir/basic/token_tables.py index 539d4e7..269897e 100644 --- a/mir/basic/token_tables.py +++ b/mir/basic/token_tables.py @@ -21,7 +21,7 @@ from typing import NamedTuple -from mir.basic.sequence import Seq, _to_bytes +from mir.basic.alphabets import Seq, _to_bytes # --------------------------------------------------------------------------- diff --git a/mir/basic/tokens.py b/mir/basic/tokens.py index 45833c4..4d3dd6f 100644 --- a/mir/basic/tokens.py +++ b/mir/basic/tokens.py @@ -1,13 +1,11 @@ """K-mer tokenisation for biological sequences. -Provides plain and gapped k-mer extraction operating on ``str`` or ``bytes`` -inputs. Both approaches use bytes slicing internally (``str.encode`` is -virtually free for short ASCII sequences and ``bytes`` slicing is faster than -``str`` slicing in CPython). +Thin wrappers around the ``mirseq`` C extension. Accepts ``str``, +``bytes``, or ``bytearray`` inputs. Functions --------- -* ``tokenize`` — Overlapping k-mers as a ``list[bytes]``. +* ``tokenize`` — Overlapping k-mers as ``list[bytes]``. * ``tokenize_gapped`` — Gapped (single-position masked) k-mers as ``list[bytes]``. * ``tokenize_str`` — Same as ``tokenize`` returning ``list[str]``. * ``tokenize_gapped_str`` — Same as ``tokenize_gapped`` returning ``list[str]``. @@ -15,7 +13,13 @@ from __future__ import annotations -from mir.basic.sequence import Seq, _to_bytes +from mir.basic.mirseq import ( + tokenize_bytes as _c_tokenize_bytes, + tokenize_str as _c_tokenize_str, + tokenize_gapped_bytes as _c_tokenize_gapped_bytes, + tokenize_gapped_str as _c_tokenize_gapped_str, +) +from mir.basic.alphabets import Seq # --------------------------------------------------------------------------- @@ -25,8 +29,7 @@ def tokenize(seq: Seq, k: int) -> list[bytes]: """Extract overlapping k-mers of length *k* from *seq*. - Uses ``bytes`` slicing for speed; accepts ``str``, ``bytes``, - or ``bytearray``. + Delegates to the ``mirseq`` C extension for speed. 
Args: seq: Input sequence. @@ -34,25 +37,13 @@ def tokenize(seq: Seq, k: int) -> list[bytes]: Returns: List of ``bytes`` k-mers (length ``len(seq) - k + 1``). - - Raises: - ValueError: If *k* < 1 or *k* > ``len(seq)``. """ - raw = _to_bytes(seq) - n = len(raw) - if k < 1 or k > n: - raise ValueError( - f"k must be between 1 and sequence length ({n}), got {k}" - ) - return [raw[i : i + k] for i in range(n - k + 1)] + return _c_tokenize_bytes(seq, k) def tokenize_str(seq: Seq, k: int) -> list[str]: - """Like :func:`tokenize` but returns ``list[str]``. - - Internally converts to bytes, tokenizes, then decodes each k-mer. - """ - return [km.decode("ascii") for km in tokenize(seq, k)] + """Like :func:`tokenize` but returns ``list[str]``.""" + return _c_tokenize_str(seq, k) # --------------------------------------------------------------------------- @@ -63,9 +54,7 @@ def tokenize_gapped(seq: Seq, k: int, mask_byte: int) -> list[bytes]: """Extract gapped k-mers: for each window, *k* variants with one position replaced by *mask_byte*. - For window ``CAS`` with mask ``X`` (88):: - - XAS CXS CAX + Delegates to the ``mirseq`` C extension for speed. Args: seq: Input sequence. @@ -75,28 +64,8 @@ def tokenize_gapped(seq: Seq, k: int, mask_byte: int) -> list[bytes]: Returns: List of ``bytes`` gapped k-mers. Length is ``(len(seq) - k + 1) * k``. - - Raises: - ValueError: If *k* < 1 or *k* > ``len(seq)``. 
""" - raw = _to_bytes(seq) - n = len(raw) - if k < 1 or k > n: - raise ValueError( - f"k must be between 1 and sequence length ({n}), got {k}" - ) - n_windows = n - k + 1 - n_gapped = n_windows * k - out = bytearray(n_gapped * k) - offset = 0 - for i in range(n_windows): - window = raw[i : i + k] - for j in range(k): - out[offset : offset + k] = window - out[offset + j] = mask_byte - offset += k - frozen = bytes(out) - return [frozen[i * k : (i + 1) * k] for i in range(n_gapped)] + return _c_tokenize_gapped_bytes(seq, k, mask_byte) def tokenize_gapped_str(seq: Seq, k: int, mask_char: str) -> list[str]: @@ -107,7 +76,4 @@ def tokenize_gapped_str(seq: Seq, k: int, mask_char: str) -> list[str]: k: K-mer length. mask_char: Single-character mask string (e.g. ``"X"``). """ - return [ - km.decode("ascii") - for km in tokenize_gapped(seq, k, ord(mask_char)) - ] + return _c_tokenize_gapped_str(seq, k, ord(mask_char)) diff --git a/mir/distances/seqdist.py b/mir/distances/seqdist.py new file mode 100644 index 0000000..c7dd40b --- /dev/null +++ b/mir/distances/seqdist.py @@ -0,0 +1,31 @@ +"""Thin Python wrappers around the C-native distance functions in ``mirseq``. + +Functions +--------- +* ``hamming(a, b)`` — Hamming distance (equal-length sequences). +* ``levenshtein(a, b)`` — Levenshtein (edit) distance. +""" + +from __future__ import annotations + +from mir.basic.alphabets import Seq +from mir.basic.mirseq import hamming as _c_hamming, levenshtein as _c_levenshtein + + +def hamming(a: Seq, b: Seq) -> int: + """Hamming distance between two equal-length sequences. + + Accepts ``str``, ``bytes``, or ``bytearray``. + + Raises: + ValueError: If the sequences differ in length. + """ + return _c_hamming(a, b) + + +def levenshtein(a: Seq, b: Seq) -> int: + """Levenshtein (edit) distance between two sequences. + + Accepts ``str``, ``bytes``, or ``bytearray``. 
+ """ + return _c_levenshtein(a, b) diff --git a/requirements.txt b/requirements.txt index 7845217..f332ade 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ stringutils==1.0.6 umap-learn==0.5.3 pybind11==2.11.0 multipy==0.16 +polars==1.39.3 \ No newline at end of file diff --git a/tests/test_alphabets.py b/tests/test_alphabets.py new file mode 100644 index 0000000..3a17f6b --- /dev/null +++ b/tests/test_alphabets.py @@ -0,0 +1,214 @@ +"""Unit tests for ``mir.basic.alphabets``. + +Covers: Seq helpers, alphabet LUTs, aa_to_reduced (Python path), +validate, mask, matches, matches_aa_reduced. + +Run with ``python -m pytest tests/test_alphabets.py -v``. +""" + +import unittest + +from mir.basic.alphabets import ( + Seq, + _to_bytes, + make_alphabet, + NT_ALPHABET, + AA_ALPHABET, + REDUCED_AA_ALPHABET, + NT_CHARS, + AA_CHARS, + REDUCED_AA_CHARS, + NT_MASK, + AA_MASK, + REDUCED_AA_MASK, + AA_TO_REDUCED, + AA_TO_REDUCED_TABLE, + _AA_TO_REDUCED_LUT, + aa_to_reduced, + validate, + mask, + matches, + matches_aa_reduced, +) + + +# ── _to_bytes ───────────────────────────────────────────────────── + +class TestToBytes(unittest.TestCase): + + def test_str(self) -> None: + self.assertEqual(_to_bytes("CAST"), b"CAST") + + def test_bytes(self) -> None: + self.assertEqual(_to_bytes(b"CAST"), b"CAST") + + def test_bytearray(self) -> None: + self.assertEqual(_to_bytes(bytearray(b"CAST")), b"CAST") + + def test_empty(self) -> None: + self.assertEqual(_to_bytes(""), b"") + + +# ── Alphabet LUTs ───────────────────────────────────────────────── + +class TestAlphabets(unittest.TestCase): + + def test_nt_lut_size(self) -> None: + self.assertEqual(len(NT_ALPHABET), 256) + + def test_nt_chars_allowed(self) -> None: + for ch in NT_CHARS: + self.assertEqual(NT_ALPHABET[ord(ch)], 1, ch) + + def test_nt_lowercase_disallowed(self) -> None: + for ch in "atgcn": + self.assertEqual(NT_ALPHABET[ord(ch)], 0, ch) + + def test_aa_chars_allowed(self) -> None: + for ch in AA_CHARS: + 
self.assertEqual(AA_ALPHABET[ord(ch)], 1, ch) + + def test_reduced_chars_allowed(self) -> None: + for ch in REDUCED_AA_CHARS: + self.assertEqual(REDUCED_AA_ALPHABET[ord(ch)], 1, ch) + + def test_make_alphabet_custom(self) -> None: + lut = make_alphabet("AB") + self.assertEqual(lut[ord("A")], 1) + self.assertEqual(lut[ord("B")], 1) + self.assertEqual(lut[ord("C")], 0) + + +# ── AA → reduced ───────────────────────────────────────────────── + +class TestAaToReduced(unittest.TestCase): + + def test_str_input(self) -> None: + self.assertEqual(aa_to_reduced("CASTIVGGLSQDKIVW"), b"slhhllGGlhmcbllW") + + def test_bytes_input(self) -> None: + self.assertEqual(aa_to_reduced(b"CASTIVGGLSQDKIVW"), b"slhhllGGlhmcbllW") + + def test_empty(self) -> None: + self.assertEqual(aa_to_reduced(""), b"") + + def test_specials(self) -> None: + self.assertEqual(aa_to_reduced("*_X"), b"*_X") + + def test_each_aa(self) -> None: + for aa, exp in AA_TO_REDUCED.items(): + with self.subTest(aa=aa): + self.assertEqual(aa_to_reduced(aa), exp.encode()) + + def test_table_consistency(self) -> None: + for aa, exp in AA_TO_REDUCED.items(): + with self.subTest(aa=aa): + self.assertEqual(AA_TO_REDUCED_TABLE[ord(aa)], ord(exp)) + + def test_lut_consistency(self) -> None: + for aa, exp in AA_TO_REDUCED.items(): + with self.subTest(aa=aa): + self.assertEqual(_AA_TO_REDUCED_LUT[ord(aa)], ord(exp)) + + +# ── validate ────────────────────────────────────────────────────── + +class TestValidate(unittest.TestCase): + + def test_valid_nt(self) -> None: + self.assertEqual(validate("ATGCN", NT_ALPHABET), b"ATGCN") + + def test_valid_aa(self) -> None: + self.assertEqual(validate("CASTIVW*_X", AA_ALPHABET), b"CASTIVW*_X") + + def test_invalid_nt_lowercase(self) -> None: + with self.assertRaises(ValueError): + validate("atgc", NT_ALPHABET) + + def test_invalid_aa_number(self) -> None: + with self.assertRaises(ValueError): + validate("CAST1", AA_ALPHABET) + + def test_empty(self) -> None: + 
self.assertEqual(validate("", NT_ALPHABET), b"") + + +# ── mask ────────────────────────────────────────────────────────── + +class TestMask(unittest.TestCase): + + def test_single_position(self) -> None: + self.assertEqual(mask("CAST", 0, AA_MASK), b"XAST") + self.assertEqual(mask("CAST", 3, AA_MASK), b"CASX") + + def test_negative_position(self) -> None: + self.assertEqual(mask("CAST", -1, AA_MASK), b"CASX") + + def test_out_of_range(self) -> None: + with self.assertRaises(IndexError): + mask("CA", 5, AA_MASK) + + def test_slice_position(self) -> None: + self.assertEqual(mask("CASTIV", slice(1, 3), AA_MASK), b"CXXTIV") + + def test_tuple_position(self) -> None: + self.assertEqual(mask("CASTIV", (1, 3), AA_MASK), b"CXXTIV") + + def test_bad_position_type(self) -> None: + with self.assertRaises(TypeError): + mask("CAST", [0], AA_MASK) # type: ignore[arg-type] + + +# ── matches ─────────────────────────────────────────────────────── + +class TestMatches(unittest.TestCase): + + def test_identical(self) -> None: + self.assertTrue(matches("CAST", "CAST", AA_MASK)) + + def test_wildcard_on_a(self) -> None: + self.assertTrue(matches("XAST", "CAST", AA_MASK)) + + def test_wildcard_on_b(self) -> None: + self.assertTrue(matches("CAST", "XAST", AA_MASK)) + + def test_mismatch(self) -> None: + self.assertFalse(matches("CAST", "GAST", AA_MASK)) + + def test_length_mismatch(self) -> None: + self.assertFalse(matches("CAST", "CAS", AA_MASK)) + + def test_empty(self) -> None: + self.assertTrue(matches("", "", AA_MASK)) + + def test_all_wildcards(self) -> None: + self.assertTrue(matches("XXX", "CAS", AA_MASK)) + + +# ── matches_aa_reduced ──────────────────────────────────────────── + +class TestMatchesAaReduced(unittest.TestCase): + + def test_matching_pair(self) -> None: + aa = "CASTIVGGLSQDKIVW" + reduced = aa_to_reduced(aa).decode() + self.assertTrue(matches_aa_reduced(aa, reduced)) + + def test_mismatch(self) -> None: + self.assertFalse(matches_aa_reduced("C", "G")) + + def 
test_wildcard_aa_side(self) -> None: + self.assertTrue(matches_aa_reduced("X", "s")) + + def test_wildcard_reduced_side(self) -> None: + self.assertTrue(matches_aa_reduced("C", "X")) + + def test_length_mismatch(self) -> None: + self.assertFalse(matches_aa_reduced("CA", "s")) + + def test_empty(self) -> None: + self.assertTrue(matches_aa_reduced("", "")) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_memory_benchmark.py b/tests/test_memory_benchmark.py deleted file mode 100644 index 0b5267b..0000000 --- a/tests/test_memory_benchmark.py +++ /dev/null @@ -1,133 +0,0 @@ -"""Memory benchmark for k-mer tokenisation. - -Uses ``tracemalloc`` to measure memory for: - -1. Plain k-mers: tokenize() vs naive str slicing vs naive bytes slicing. -2. Gapped k-mers: tokenize_gapped() vs naive approaches. - -Run with ``python -m pytest tests/test_memory_benchmark.py -s``. -""" - -import random -import tracemalloc -import unittest - -from mir.basic.sequence import AA_MASK -from mir.basic.tokens import tokenize, tokenize_gapped - -N = 100_000 -SEQ_LEN = 15 -K = 3 -MASK_STR = "X" - -_AA_LETTERS = "ACDEFGHIKLMNPQRSTVWY" - - -def _random_strings(n: int, length: int) -> list[str]: - rng = random.Random(42) - return ["".join(rng.choices(_AA_LETTERS, k=length)) for _ in range(n)] - - -def _fmt(nbytes: int) -> str: - return f"{nbytes / 1024:.1f} KiB" - - -class TestMemoryBenchmark(unittest.TestCase): - - def test_plain_kmer_memory(self) -> None: - """Compare memory: tokenize() vs naive str/bytes slicing.""" - strings = _random_strings(N, SEQ_LEN) - byte_strings = [s.encode() for s in strings] - expected = N * (SEQ_LEN - K + 1) - - # naive str slices - tracemalloc.start() - str_kmers = [] - for s in strings: - str_kmers.extend(s[i : i + K] for i in range(len(s) - K + 1)) - cur_str, peak_str = tracemalloc.get_traced_memory() - tracemalloc.stop() - - # naive bytes slices - tracemalloc.start() - bytes_kmers = [] - for b in byte_strings: - bytes_kmers.extend(b[i : i + 
K] for i in range(len(b) - K + 1)) - cur_bytes, peak_bytes = tracemalloc.get_traced_memory() - tracemalloc.stop() - - # tokenize(bytes) - tracemalloc.start() - tok_kmers = [] - for b in byte_strings: - tok_kmers.extend(tokenize(b, K)) - cur_tok, peak_tok = tracemalloc.get_traced_memory() - tracemalloc.stop() - - self.assertEqual(len(str_kmers), expected) - self.assertEqual(len(bytes_kmers), expected) - self.assertEqual(len(tok_kmers), expected) - - print( - f"\n{'Approach':<32} {'Count':>8} {'Current':>12} {'Peak':>12} " - f"{'Per-item':>10}\n" - f"{'-' * 76}" - ) - for lbl, count, cur, peak in [ - ("naive str slices", len(str_kmers), cur_str, peak_str), - ("naive bytes slices", len(bytes_kmers), cur_bytes, peak_bytes), - ("tokenize(bytes)", len(tok_kmers), cur_tok, peak_tok), - ]: - per = cur / count if count else 0 - print( - f"{lbl:<32} {count:>8} {_fmt(cur):>12} {_fmt(peak):>12} " - f"{per:>8.0f} B" - ) - - def test_gapped_kmer_memory(self) -> None: - """Compare memory: tokenize_gapped() vs naive gapped str slicing.""" - strings = _random_strings(N, SEQ_LEN) - byte_strings = [s.encode() for s in strings] - n_windows = SEQ_LEN - K + 1 - expected = N * n_windows * K - - # naive str gapped - tracemalloc.start() - str_gapped = [] - for s in strings: - for i in range(len(s) - K + 1): - w = s[i : i + K] - for j in range(K): - str_gapped.append(w[:j] + MASK_STR + w[j + 1 :]) - cur_str, peak_str = tracemalloc.get_traced_memory() - tracemalloc.stop() - - # tokenize_gapped(bytes) - tracemalloc.start() - tok_gapped = [] - for b in byte_strings: - tok_gapped.extend(tokenize_gapped(b, K, AA_MASK)) - cur_tok, peak_tok = tracemalloc.get_traced_memory() - tracemalloc.stop() - - self.assertEqual(len(str_gapped), expected) - self.assertEqual(len(tok_gapped), expected) - - print( - f"\n{'Approach':<32} {'Count':>8} {'Current':>12} {'Peak':>12} " - f"{'Per-item':>10}\n" - f"{'-' * 76}" - ) - for lbl, count, cur, peak in [ - ("naive str gapped", len(str_gapped), cur_str, peak_str), 
- ("tokenize_gapped(bytes)", len(tok_gapped), cur_tok, peak_tok), - ]: - per = cur / count if count else 0 - print( - f"{lbl:<32} {count:>8} {_fmt(cur):>12} {_fmt(peak):>12} " - f"{per:>8.0f} B" - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_mirseq.py b/tests/test_mirseq.py new file mode 100644 index 0000000..fc4df68 --- /dev/null +++ b/tests/test_mirseq.py @@ -0,0 +1,487 @@ +"""Unit tests for the ``mirseq`` C extension and ``alphabets`` module. + +Covers: + - Codon translation: linear and bidirectional (comprehensive) + - AA → reduced alphabet (C and Python paths) + - Tokenization: plain bytes/str, gapped bytes/str + - Distances: Hamming, Levenshtein + - Cross-checking against tokens.py wrappers + +Run with ``python -m pytest tests/test_mirseq.py -v``. +""" + +import unittest + +from mir.basic import mirseq +from mir.basic.alphabets import ( + AA_MASK, + AA_TO_REDUCED, + aa_to_reduced as py_aa_to_reduced, + matches, +) +from mir.basic.tokens import ( + tokenize as py_tokenize, + tokenize_gapped as py_tokenize_gapped, + tokenize_str as py_tokenize_str, + tokenize_gapped_str as py_tokenize_gapped_str, +) +from mir.distances.seqdist import hamming, levenshtein + + +# ── helpers ──────────────────────────────────────────────────────── + +_CODON_MAP = { + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*", + "TGT": "C", "TGC": "C", "TGA": "*", "TGG": "W", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": 
"G", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", +} + + +def _py_translate_linear(nt: str) -> str: + out = [] + for i in range(0, len(nt) - 2, 3): + codon = nt[i:i + 3] + out.append("X" if "N" in codon else _CODON_MAP[codon]) + if len(nt) % 3 != 0: + out.append("_") + return "".join(out) + + +def _py_translate_bidi(nt: str) -> str: + n = len(nt) + if n == 0: + return "" + if n % 3 == 0: + return _py_translate_linear(nt) + n_codons = n // 3 + fwd_codons = 4 if n >= 27 else n_codons // 2 + rev_codons = n_codons - fwd_codons + result = [] + for i in range(fwd_codons): + codon = nt[i * 3:i * 3 + 3] + result.append("X" if "N" in codon else _CODON_MAP[codon]) + result.append("_") + for i in range(rev_codons): + pos = n - (rev_codons - i) * 3 + codon = nt[pos:pos + 3] + result.append("X" if "N" in codon else _CODON_MAP[codon]) + return "".join(result) + + +# ── Translation: linear ─────────────────────────────────────────── + +class TestTranslateLinear(unittest.TestCase): + + def test_single_codon_atg(self) -> None: + self.assertEqual(mirseq.translate_linear("ATG"), "M") + + def test_multiple_codons(self) -> None: + self.assertEqual(mirseq.translate_linear("ATGGCTTGA"), "MA*") + + def test_incomplete_trailing_codon(self) -> None: + self.assertEqual(mirseq.translate_linear("ATGGCTTGAA"), "MA*_") + + def test_single_nt(self) -> None: + self.assertEqual(mirseq.translate_linear("A"), "_") + + def test_two_nt(self) -> None: + self.assertEqual(mirseq.translate_linear("AT"), "_") + + def test_n_codon(self) -> None: + self.assertEqual(mirseq.translate_linear("ATGNCTTGA"), "MX*") + + def test_all_n(self) -> None: + self.assertEqual(mirseq.translate_linear("NNN"), "X") + + def test_empty(self) -> None: + self.assertEqual(mirseq.translate_linear(""), "") + + def test_bytes_input(self) -> None: + self.assertEqual(mirseq.translate_linear(b"ATGGCTTGA"), "MA*") + + def test_stop_codons(self) -> None: + 
self.assertEqual(mirseq.translate_linear("TAATGATAG"), "***") + + def test_all_codons(self) -> None: + for codon, aa in _CODON_MAP.items(): + with self.subTest(codon=codon): + self.assertEqual(mirseq.translate_linear(codon), aa) + + def test_cross_check_reference(self) -> None: + seqs = ["ATGGCTTGA", "ATGGCTTGAA", "ATGNCTTGA", "NNN", + "TTTTTCTTATTG", "GCAGCCGCGGCG"] + for seq in seqs: + with self.subTest(seq=seq): + self.assertEqual( + mirseq.translate_linear(seq), _py_translate_linear(seq)) + + +# ── Translation: bidirectional (comprehensive) ──────────────────── + +class TestTranslateBidi(unittest.TestCase): + + def test_divisible_by_3(self) -> None: + self.assertEqual(mirseq.translate_bidi("ATGGCTTGA"), "MA*") + + def test_empty(self) -> None: + self.assertEqual(mirseq.translate_bidi(""), "") + + def test_bytes_input(self) -> None: + self.assertEqual(mirseq.translate_bidi(b"ATGGCTTGA"), "MA*") + + # -- short sequences (< 27 nt): gap in middle ---------------------- + + def test_4nt(self) -> None: + # 1 codon, fwd=0, rev=1 + self.assertEqual(mirseq.translate_bidi("TGAA"), "_E") + + def test_5nt(self) -> None: + self.assertEqual(mirseq.translate_bidi("ATGAA"), "_E") + + def test_7nt(self) -> None: + # 2 codons, fwd=1, rev=1 + self.assertEqual(mirseq.translate_bidi("ATGGCTA"), "M_L") + + def test_8nt(self) -> None: + self.assertEqual(mirseq.translate_bidi("ATGGCTAA"), "M_*") + + def test_10nt(self) -> None: + # 3 codons, fwd=1, rev=2 + self.assertEqual(mirseq.translate_bidi("ATGGCTTGAA"), "M_LE") + + def test_11nt(self) -> None: + # ATGGCTTGAAC: fwd=ATG→M, gap, rev=TTG→L, AAC→N + self.assertEqual(mirseq.translate_bidi("ATGGCTTGAAC"), "M_LN") + + def test_13nt(self) -> None: + # 4 codons, fwd=2, rev=2 + self.assertEqual(mirseq.translate_bidi("ATGGCTTGAAACT"), "MA_ET") + + def test_16nt(self) -> None: + # 5 codons, fwd=2, rev=3 + result = mirseq.translate_bidi("ATGGCTTGAAACTAAG") + self.assertEqual(result, "MA_ETK") + + def test_25nt(self) -> None: + # 8 codons, 
fwd=4, rev=4 + # ATG*8+A → fwd reads ATG ATG ATG ATG, rev reads TGA TGA TGA TGA + nt = "ATG" * 8 + "A" + result = mirseq.translate_bidi(nt) + self.assertEqual(len(result), 9) + self.assertEqual(result[:4], "MMMM") + self.assertEqual(result[4], "_") + self.assertEqual(result[5:], "****") + + def test_26nt(self) -> None: + nt = "ATG" * 8 + "AT" + result = mirseq.translate_bidi(nt) + self.assertEqual(len(result), 9) + self.assertEqual(result[4], "_") + + # -- boundary: 27 nt (9*3) — no gap -------------------------------- + + def test_27nt_exact(self) -> None: + nt = "ATG" * 9 + result = mirseq.translate_bidi(nt) + self.assertEqual(result, "M" * 9) + self.assertNotIn("_", result) + + # -- long sequences (>= 27 nt): gap after 4th codon ---------------- + + def test_28nt(self) -> None: + # fwd reads ATG*4, rev reads TGA*5 → ***** + nt = "ATG" * 9 + "A" + result = mirseq.translate_bidi(nt) + self.assertEqual(len(result), 10) + self.assertEqual(result[:4], "MMMM") + self.assertEqual(result[4], "_") + self.assertEqual(result[5:], "*****") + + def test_29nt(self) -> None: + nt = "ATG" * 9 + "AT" + result = mirseq.translate_bidi(nt) + self.assertEqual(len(result), 10) + self.assertEqual(result[4], "_") + + def test_31nt(self) -> None: + nt = "ATG" * 10 + "A" + result = mirseq.translate_bidi(nt) + self.assertEqual(len(result), 11) + self.assertEqual(result[:4], "MMMM") + self.assertEqual(result[4], "_") + + def test_46nt(self) -> None: + nt = "ATG" * 15 + "A" + result = mirseq.translate_bidi(nt) + self.assertEqual(len(result), 16) + self.assertEqual(result[4], "_") + + # -- N nucleotides -------------------------------------------------- + + def test_n_in_forward(self) -> None: + self.assertEqual(mirseq.translate_bidi("NNGNCTTGA"), "XX*") + + def test_n_in_reverse_short(self) -> None: + self.assertEqual(mirseq.translate_bidi("ATGNCTNG"), "M_X") + + # -- cross-check against Python reference --------------------------- + + def test_cross_check_short(self) -> None: + nt_base = 
"ATGGCTTGAAACTAAGTTTTTCATA" + for length in range(1, 27): + nt = (nt_base * 2)[:length] + with self.subTest(length=length): + self.assertEqual( + mirseq.translate_bidi(nt), _py_translate_bidi(nt)) + + def test_cross_check_long(self) -> None: + nt_base = "ATGGCTTGAAACTAAGTTTTTCATA" * 3 + for length in range(27, 51): + nt = nt_base[:length] + with self.subTest(length=length): + self.assertEqual( + mirseq.translate_bidi(nt), _py_translate_bidi(nt)) + + # -- structural properties ------------------------------------------ + + def test_gap_count(self) -> None: + for length in range(1, 50): + nt = ("ATG" * 20)[:length] + result = mirseq.translate_bidi(nt) + expected_gaps = 0 if length % 3 == 0 else 1 + self.assertEqual(result.count("_"), expected_gaps, f"length={length}") + + def test_output_length(self) -> None: + for length in range(1, 50): + nt = ("ATG" * 20)[:length] + result = mirseq.translate_bidi(nt) + n_codons = length // 3 + expected_len = n_codons if length % 3 == 0 else n_codons + 1 + self.assertEqual(len(result), expected_len, f"length={length}") + + +# ── AA → reduced ───────────────────────────────────────────────── + +class TestAaToReduced(unittest.TestCase): + + def test_basic(self) -> None: + self.assertEqual(mirseq.aa_to_reduced("CASTIVGGLSQDKIVW"), + "slhhllGGlhmcbllW") + + def test_matches_python(self) -> None: + seq = "CASTIVGGLSQDKIVW" + self.assertEqual(mirseq.aa_to_reduced(seq), + py_aa_to_reduced(seq).decode("ascii")) + + def test_special_chars(self) -> None: + self.assertEqual(mirseq.aa_to_reduced("*_X"), "*_X") + + def test_empty(self) -> None: + self.assertEqual(mirseq.aa_to_reduced(""), "") + + def test_bytes_input(self) -> None: + self.assertEqual(mirseq.aa_to_reduced(b"CAST"), "slhh") + + def test_all_aa_mapped(self) -> None: + for aa, expected in AA_TO_REDUCED.items(): + with self.subTest(aa=aa): + self.assertEqual(mirseq.aa_to_reduced(aa), expected) + + +# ── Tokenize bytes ──────────────────────────────────────────────── + +class 
TestTokenizeBytes(unittest.TestCase): + + def test_aa_k3(self) -> None: + self.assertEqual(mirseq.tokenize_bytes("CASSL", 3), + [b"CAS", b"ASS", b"SSL"]) + + def test_nt_k4(self) -> None: + self.assertEqual(mirseq.tokenize_bytes("ATCGAT", 4), + [b"ATCG", b"TCGA", b"CGAT"]) + + def test_k_equals_len(self) -> None: + self.assertEqual(mirseq.tokenize_bytes("CAST", 4), [b"CAST"]) + + def test_k1(self) -> None: + self.assertEqual(mirseq.tokenize_bytes("ATG", 1), + [b"A", b"T", b"G"]) + + def test_bytes_input(self) -> None: + self.assertEqual(mirseq.tokenize_bytes(b"CASSL", 3), + [b"CAS", b"ASS", b"SSL"]) + + def test_invalid_k(self) -> None: + with self.assertRaises(Exception): + mirseq.tokenize_bytes("CAST", 0) + with self.assertRaises(Exception): + mirseq.tokenize_bytes("CAST", 5) + + def test_cross_check_wrapper(self) -> None: + for seq in ["CASSL", "ATCGATCGATCG", "slhhllGG"]: + for k in [1, 2, 3, 4]: + if k > len(seq): + continue + with self.subTest(seq=seq, k=k): + self.assertEqual( + mirseq.tokenize_bytes(seq, k), + py_tokenize(seq, k)) + + +# ── Tokenize str ────────────────────────────────────────────────── + +class TestTokenizeStr(unittest.TestCase): + + def test_basic(self) -> None: + self.assertEqual(mirseq.tokenize_str("CASSL", 3), + ["CAS", "ASS", "SSL"]) + + def test_cross_check_wrapper(self) -> None: + for seq in ["CASSL", "ATCGATCGATCG"]: + for k in [1, 2, 3]: + with self.subTest(seq=seq, k=k): + self.assertEqual( + mirseq.tokenize_str(seq, k), + py_tokenize_str(seq, k)) + + +# ── Tokenize gapped bytes ──────────────────────────────────────── + +class TestTokenizeGappedBytes(unittest.TestCase): + + def test_aa_gapped_k3(self) -> None: + expected = [ + b"XAS", b"CXS", b"CAX", + b"XSS", b"AXS", b"ASX", + b"XSL", b"SXL", b"SSX", + ] + self.assertEqual( + mirseq.tokenize_gapped_bytes("CASSL", 3, AA_MASK), expected) + + def test_nt_gapped_k2(self) -> None: + self.assertEqual( + mirseq.tokenize_gapped_bytes("ATG", 2, ord("N")), + [b"NT", b"AN", b"NG", b"TN"]) 
+ + def test_gapped_k1(self) -> None: + self.assertEqual( + mirseq.tokenize_gapped_bytes("CA", 1, AA_MASK), [b"X", b"X"]) + + def test_invalid_k(self) -> None: + with self.assertRaises(Exception): + mirseq.tokenize_gapped_bytes("CAST", 0, AA_MASK) + + def test_cross_check_wrapper(self) -> None: + for seq in ["CASSL", "ATCGAT", "slhh"]: + for k in [1, 2, 3]: + if k > len(seq): + continue + with self.subTest(seq=seq, k=k): + self.assertEqual( + mirseq.tokenize_gapped_bytes(seq, k, AA_MASK), + py_tokenize_gapped(seq, k, AA_MASK)) + + def test_gapped_match_plain(self) -> None: + plain = mirseq.tokenize_bytes("CASSL", 3) + gapped = mirseq.tokenize_gapped_bytes("CASSL", 3, AA_MASK) + for i, kmer in enumerate(plain): + for var in gapped[i * 3 : (i + 1) * 3]: + self.assertTrue(matches(kmer, var, AA_MASK)) + + +# ── Tokenize gapped str ────────────────────────────────────────── + +class TestTokenizeGappedStr(unittest.TestCase): + + def test_basic(self) -> None: + gapped = mirseq.tokenize_gapped_str("CASSL", 3, AA_MASK) + self.assertEqual(len(gapped), 9) + self.assertEqual(gapped[0], "XAS") + self.assertIsInstance(gapped[0], str) + + def test_cross_check_wrapper(self) -> None: + self.assertEqual( + mirseq.tokenize_gapped_str("CASSL", 3, AA_MASK), + py_tokenize_gapped_str("CASSL", 3, "X")) + + +# ── Hamming distance ────────────────────────────────────────────── + +class TestHamming(unittest.TestCase): + + def test_identical(self) -> None: + self.assertEqual(mirseq.hamming("CAST", "CAST"), 0) + + def test_one_mismatch(self) -> None: + self.assertEqual(mirseq.hamming("CAST", "CAAT"), 1) + + def test_all_mismatch(self) -> None: + self.assertEqual(mirseq.hamming("AAAA", "TTTT"), 4) + + def test_empty(self) -> None: + self.assertEqual(mirseq.hamming("", ""), 0) + + def test_length_mismatch_raises(self) -> None: + with self.assertRaises(Exception): + mirseq.hamming("ABC", "AB") + + def test_bytes_input(self) -> None: + self.assertEqual(mirseq.hamming(b"CAST", b"CAAT"), 1) + + def 
test_wrapper(self) -> None: + self.assertEqual(hamming("CAST", "CAAT"), 1) + + +# ── Levenshtein distance ───────────────────────────────────────── + +class TestLevenshtein(unittest.TestCase): + + def test_classic(self) -> None: + self.assertEqual(mirseq.levenshtein("kitten", "sitting"), 3) + + def test_identical(self) -> None: + self.assertEqual(mirseq.levenshtein("CAST", "CAST"), 0) + + def test_insertion(self) -> None: + self.assertEqual(mirseq.levenshtein("ABC", "ABCD"), 1) + + def test_deletion(self) -> None: + self.assertEqual(mirseq.levenshtein("ABCD", "ABC"), 1) + + def test_substitution(self) -> None: + self.assertEqual(mirseq.levenshtein("ABC", "AXC"), 1) + + def test_empty_vs_nonempty(self) -> None: + self.assertEqual(mirseq.levenshtein("", "ABC"), 3) + self.assertEqual(mirseq.levenshtein("ABC", ""), 3) + + def test_both_empty(self) -> None: + self.assertEqual(mirseq.levenshtein("", ""), 0) + + def test_bytes_input(self) -> None: + self.assertEqual(mirseq.levenshtein(b"kitten", b"sitting"), 3) + + def test_wrapper(self) -> None: + self.assertEqual(levenshtein("kitten", "sitting"), 3) + + def test_symmetric(self) -> None: + self.assertEqual( + mirseq.levenshtein("CASSL", "CASSQL"), + mirseq.levenshtein("CASSQL", "CASSL")) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_mirseq_benchmark.py b/tests/test_mirseq_benchmark.py new file mode 100644 index 0000000..5e10b4d --- /dev/null +++ b/tests/test_mirseq_benchmark.py @@ -0,0 +1,148 @@ +"""Benchmarks: C (mirseq) vs pure-Python for key operations. + +Run with ``python -m pytest tests/test_mirseq_benchmark.py -v -s``. 
+""" + +import time +import unittest + +from mir.basic import mirseq +from mir.basic.alphabets import ( + AA_MASK, + AA_TO_REDUCED_TABLE, + _to_bytes, + aa_to_reduced as py_aa_to_reduced, +) + + +def _time_fn(fn, *args, n: int = 5000) -> float: + start = time.perf_counter() + for _ in range(n): + fn(*args) + return time.perf_counter() - start + + +class TestBenchmarks(unittest.TestCase): + + def _report(self, name: str, py_t: float, c_t: float) -> None: + ratio = py_t / c_t if c_t > 0 else float("inf") + print(f" {name:30s} Python={py_t:.4f}s C={c_t:.4f}s speedup={ratio:.1f}x") + + # ── translate_linear ────────────────────────────────────────── + + def test_translate_linear_speed(self) -> None: + nt = "ATG" * 40 + n = 10_000 + + def py_translate(s: str) -> str: + codon_map = { + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*", + "TGT": "C", "TGC": "C", "TGA": "*", "TGG": "W", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + } + out = [] + for i in range(0, len(s) - 2, 3): + codon = s[i:i + 3] + out.append(codon_map.get(codon, "X")) + return "".join(out) + + py_t = _time_fn(py_translate, nt, n=n) + c_t = _time_fn(mirseq.translate_linear, nt, n=n) + self._report("translate_linear", py_t, c_t) + + # ── aa_to_reduced ───────────────────────────────────────────── + + def test_aa_to_reduced_speed(self) -> None: + aa = 
"CASTIVGGLSQDKIVW" * 5 + n = 20_000 + py_t = _time_fn(py_aa_to_reduced, aa, n=n) + c_t = _time_fn(mirseq.aa_to_reduced, aa, n=n) + self._report("aa_to_reduced (Python bytes.translate vs C)", py_t, c_t) + + # ── tokenize_bytes ──────────────────────────────────────────── + + def test_tokenize_bytes_speed(self) -> None: + seq = "CASTIVGGLSQDKIVW" * 5 + n = 10_000 + + def py_tokenize(s: str, k: int) -> list[bytes]: + b = s.encode() + return [b[i:i + k] for i in range(len(b) - k + 1)] + + py_t = _time_fn(py_tokenize, seq, 3, n=n) + c_t = _time_fn(mirseq.tokenize_bytes, seq, 3, n=n) + self._report("tokenize_bytes", py_t, c_t) + + # ── tokenize_gapped_bytes ───────────────────────────────────── + + def test_tokenize_gapped_bytes_speed(self) -> None: + seq = "CASTIVGGLSQDKIVW" * 3 + n = 5_000 + + def py_tokenize_gapped(s: str, k: int, m: int) -> list[bytes]: + b = s.encode() + out = [] + for i in range(len(b) - k + 1): + kmer = bytearray(b[i:i + k]) + for j in range(k): + v = bytearray(kmer) + v[j] = m + out.append(bytes(v)) + return out + + py_t = _time_fn(py_tokenize_gapped, seq, 3, AA_MASK, n=n) + c_t = _time_fn(mirseq.tokenize_gapped_bytes, seq, 3, AA_MASK, n=n) + self._report("tokenize_gapped_bytes", py_t, c_t) + + # ── hamming ─────────────────────────────────────────────────── + + def test_hamming_speed(self) -> None: + a = "CASTIVGGLSQDKIVW" * 5 + b = "CASTIVGGLSQEKIVW" * 5 + n = 20_000 + + def py_hamming(s1: str, s2: str) -> int: + return sum(c1 != c2 for c1, c2 in zip(s1, s2)) + + py_t = _time_fn(py_hamming, a, b, n=n) + c_t = _time_fn(mirseq.hamming, a, b, n=n) + self._report("hamming", py_t, c_t) + + # ── levenshtein ─────────────────────────────────────────────── + + def test_levenshtein_speed(self) -> None: + a = "CASTIVGGLSQDKIVW" * 3 + b = "CASTGGLSQEKIVW" * 3 + n = 5_000 + + def py_levenshtein(s1: str, s2: str) -> int: + m, n_ = len(s1), len(s2) + prev = list(range(n_ + 1)) + for i in range(1, m + 1): + curr = [i] + [0] * n_ + for j in range(1, n_ + 1): + 
cost = 0 if s1[i - 1] == s2[j - 1] else 1 + curr[j] = min(curr[j - 1] + 1, prev[j] + 1, prev[j - 1] + cost) + prev = curr + return prev[n_] + + py_t = _time_fn(py_levenshtein, a, b, n=n) + c_t = _time_fn(mirseq.levenshtein, a, b, n=n) + self._report("levenshtein", py_t, c_t) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_sequence.py b/tests/test_sequence.py deleted file mode 100644 index 04903a8..0000000 --- a/tests/test_sequence.py +++ /dev/null @@ -1,191 +0,0 @@ -"""Unit tests for :mod:`mir.basic.sequence` functions. - -Coverage: - make_alphabet / validate — alphabet construction and validation. - aa_to_reduced / translate — byte-level translation. - mask — single-index, range, and slice masking. - matches — wildcard-aware comparison. - matches_aa_reduced — cross-alphabet wildcard match. - str / bytes duality — every function accepts both types. -""" - -import unittest - -from mir.basic.sequence import ( - AA_ALPHABET, - AA_MASK, - AA_TO_REDUCED_TABLE, - NT_ALPHABET, - NT_MASK, - REDUCED_AA_ALPHABET, - REDUCED_AA_MASK, - aa_to_reduced, - make_alphabet, - mask, - matches, - matches_aa_reduced, - translate, - validate, -) - - -class TestMakeAlphabet(unittest.TestCase): - - def test_custom_alphabet(self) -> None: - lut = make_alphabet("AB") - self.assertEqual(len(lut), 256) - self.assertEqual(lut[ord("A")], 1) - self.assertEqual(lut[ord("B")], 1) - self.assertEqual(lut[ord("C")], 0) - - def test_predefined_nt(self) -> None: - for ch in "ATGCN": - self.assertEqual(NT_ALPHABET[ord(ch)], 1) - self.assertEqual(NT_ALPHABET[ord("U")], 0) - - def test_predefined_aa(self) -> None: - for ch in "ACDEFGHIKLMNPQRSTVWYX*_": - self.assertEqual(AA_ALPHABET[ord(ch)], 1) - self.assertEqual(AA_ALPHABET[ord("B")], 0) - - def test_predefined_reduced(self) -> None: - for ch in "lbmcshGFPWYX*_": - self.assertEqual(REDUCED_AA_ALPHABET[ord(ch)], 1) - self.assertEqual(REDUCED_AA_ALPHABET[ord("Z")], 0) - - -class TestValidate(unittest.TestCase): - - def 
test_valid_nt_str(self) -> None: - self.assertEqual(validate("ATTAGACA", NT_ALPHABET), b"ATTAGACA") - - def test_valid_nt_bytes(self) -> None: - self.assertEqual(validate(b"ATN", NT_ALPHABET), b"ATN") - - def test_valid_aa_bytearray(self) -> None: - self.assertEqual(validate(bytearray(b"CAST"), AA_ALPHABET), b"CAST") - - def test_empty(self) -> None: - self.assertEqual(validate("", NT_ALPHABET), b"") - self.assertEqual(validate(b"", AA_ALPHABET), b"") - - def test_invalid_nt(self) -> None: - with self.assertRaises(ValueError): - validate("ATU", NT_ALPHABET) - - def test_invalid_aa(self) -> None: - with self.assertRaises(ValueError): - validate("B", AA_ALPHABET) - - def test_invalid_reduced(self) -> None: - with self.assertRaises(ValueError): - validate("Z", REDUCED_AA_ALPHABET) - - -class TestTranslateAndReduce(unittest.TestCase): - - def test_aa_to_reduced_str(self) -> None: - self.assertEqual(aa_to_reduced("CASTIVGGLSQDKIVW"), b"slhhllGGlhmcbllW") - - def test_aa_to_reduced_bytes(self) -> None: - self.assertEqual(aa_to_reduced(b"CASTIVGGLSQDKIVW"), b"slhhllGGlhmcbllW") - - def test_generic_translate(self) -> None: - self.assertEqual(translate("CAST", AA_TO_REDUCED_TABLE), b"slhh") - - def test_empty_translate(self) -> None: - self.assertEqual(aa_to_reduced(""), b"") - - -class TestMask(unittest.TestCase): - - def test_single_nt(self) -> None: - self.assertEqual(mask("ATCGAT", 1, NT_MASK), b"ANCGAT") - - def test_range_nt(self) -> None: - self.assertEqual(mask("ATCGAT", (2, 5), NT_MASK), b"ATNNNT") - - def test_slice_nt(self) -> None: - self.assertEqual(mask("ATCGAT", slice(0, 3), NT_MASK), b"NNNGAT") - - def test_aa_single(self) -> None: - self.assertEqual(mask("CASTIV", 0, AA_MASK), b"XASTIV") - - def test_aa_range(self) -> None: - self.assertEqual(mask("CASTIV", (1, 4), AA_MASK), b"CXXXIV") - - def test_reduced_slice(self) -> None: - self.assertEqual(mask("slhhll", slice(2, 5), REDUCED_AA_MASK), b"slXXXl") - - def test_bytes_input(self) -> None: - 
self.assertEqual(mask(b"ATCG", 0, NT_MASK), b"NTCG") - - def test_out_of_range(self) -> None: - with self.assertRaises(IndexError): - mask("AT", 5, NT_MASK) - - -class TestMatches(unittest.TestCase): - - def test_identical(self) -> None: - self.assertTrue(matches("ATCG", "ATCG", NT_MASK)) - - def test_wildcard_match(self) -> None: - self.assertTrue(matches("ATCG", "ANNG", NT_MASK)) - - def test_no_match(self) -> None: - self.assertFalse(matches("ATCG", "ANNA", NT_MASK)) - - def test_length_mismatch(self) -> None: - self.assertFalse(matches("ATC", "ATCG", NT_MASK)) - - def test_empty(self) -> None: - self.assertTrue(matches("", "", NT_MASK)) - - def test_aa_wildcard(self) -> None: - self.assertTrue(matches("CAST", "XASX", AA_MASK)) - self.assertFalse(matches("CAST", "XATX", AA_MASK)) - - def test_reduced_wildcard(self) -> None: - self.assertTrue(matches("slhh", "sXXh", REDUCED_AA_MASK)) - self.assertFalse(matches("slhh", "sXXY", REDUCED_AA_MASK)) - - def test_bytes_input(self) -> None: - self.assertTrue(matches(b"ATCG", b"ANNG", NT_MASK)) - - def test_mixed_str_bytes(self) -> None: - self.assertTrue(matches("ATCG", b"ANNG", NT_MASK)) - - -class TestMatchesAaReduced(unittest.TestCase): - - def test_match(self) -> None: - reduced = aa_to_reduced("CASTIVGGLSQDKIVW") - self.assertTrue(matches_aa_reduced("CASTIVGGLSQDKIVW", reduced)) - - def test_mismatch(self) -> None: - self.assertFalse(matches_aa_reduced("CASTIVGGLSQDKIVW", b"slhhllGGlhmcbllY")) - - def test_masked_aa(self) -> None: - reduced = aa_to_reduced("CASTIVGGLSQDKIVW") - masked_aa = mask("CASTIVGGLSQDKIVW", 2, AA_MASK) - self.assertTrue(matches_aa_reduced(masked_aa, reduced)) - - def test_masked_reduced(self) -> None: - reduced = aa_to_reduced("CASTIVGGLSQDKIVW") - masked_red = mask(reduced, (2, 5), REDUCED_AA_MASK) - self.assertTrue(matches_aa_reduced("CASTIVGGLSQDKIVW", masked_red)) - - def test_empty(self) -> None: - self.assertTrue(matches_aa_reduced("", "")) - - def test_length_mismatch(self) -> None: - 
self.assertFalse(matches_aa_reduced("CAS", "sl")) - - def test_bytes_input(self) -> None: - reduced = aa_to_reduced(b"CAST") - self.assertTrue(matches_aa_reduced(b"CAST", reduced)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_sequence_benchmark.py b/tests/test_sequence_benchmark.py deleted file mode 100644 index 230a3ff..0000000 --- a/tests/test_sequence_benchmark.py +++ /dev/null @@ -1,289 +0,0 @@ -"""Speed benchmarks for sequence operations: validation, translation, -slicing, matching, and cross-alphabet matching. - -Each benchmark compares the ``mir.basic.sequence`` function against one -or more naive Python implementations. - -Run with ``python -m pytest tests/test_sequence_benchmark.py -s``. -""" - -import random -import time -import unittest - -from mir.basic.sequence import ( - AA_ALPHABET, - AA_MASK, - AA_TO_REDUCED, - AA_TO_REDUCED_TABLE, - NT_MASK, - _AA_TO_REDUCED_LUT, - _to_bytes, - aa_to_reduced, - matches, - matches_aa_reduced, - validate, -) - -N = 10_000 -SEQ_LEN = 15 -K = 3 - -_AA_LETTERS = "ACDEFGHIKLMNPQRSTVWY" -_AA_SET = frozenset(_AA_LETTERS + "*_X") - - -def _random_strings(n: int, length: int) -> list[str]: - rng = random.Random(42) - return ["".join(rng.choices(_AA_LETTERS, k=length)) for _ in range(n)] - - -def _print_table(title: str, rows: list[tuple[str, float, int]]) -> None: - print( - f"\n{title}\n" - f"{'Method':<40} {'Time (s)':>10} {'ops/s':>14}\n" - f"{'-' * 66}" - ) - for label, elapsed, count in rows: - rate = count / elapsed if elapsed > 0 else float("inf") - print(f"{label:<40} {elapsed:>10.4f} {rate:>14,.0f}") - - -class TestValidationBenchmark(unittest.TestCase): - - def test_validate_lut_vs_set(self) -> None: - """Alphabet validation: LUT (bytes[256]) vs frozenset membership.""" - strings = _random_strings(N, SEQ_LEN) - byte_strings = [s.encode() for s in strings] - - # LUT validation (validate function) - t0 = time.perf_counter() - for b in byte_strings: - validate(b, AA_ALPHABET) - t_lut = 
time.perf_counter() - t0 - - # frozenset[int] validation - aa_ords = frozenset(ord(c) for c in _AA_SET) - t0 = time.perf_counter() - for b in byte_strings: - for ch in b: - if ch not in aa_ords: - raise ValueError - t_fset = time.perf_counter() - t0 - - # naive str 'in' check - t0 = time.perf_counter() - for s in strings: - for ch in s: - if ch not in _AA_SET: - raise ValueError - t_str_in = time.perf_counter() - t0 - - _print_table( - f"Validation (N={N:,}, len={SEQ_LEN})", - [ - ("validate() [bytes LUT]", t_lut, N), - ("frozenset[int] loop", t_fset, N), - ("str 'in' frozenset[str]", t_str_in, N), - ], - ) - - -class TestTranslationBenchmark(unittest.TestCase): - - def test_translate_lut_vs_dict(self) -> None: - """Translation: bytes.translate vs dict lookup vs manual byte loop.""" - strings = _random_strings(N, SEQ_LEN) - byte_strings = [s.encode() for s in strings] - - # bytes.translate (aa_to_reduced) - t0 = time.perf_counter() - for b in byte_strings: - _ = b.translate(AA_TO_REDUCED_TABLE) - t_translate = time.perf_counter() - t0 - - # aa_to_reduced with str input (includes encode) - t0 = time.perf_counter() - for s in strings: - _ = aa_to_reduced(s) - t_aa_str = time.perf_counter() - t0 - - # naive dict[str,str] lookup + join - t0 = time.perf_counter() - for s in strings: - _ = "".join(AA_TO_REDUCED.get(ch, ch) for ch in s) - t_dict_join = time.perf_counter() - t0 - - # manual byte LUT loop - lut = _AA_TO_REDUCED_LUT - t0 = time.perf_counter() - for b in byte_strings: - _ = bytes(lut[ch] for ch in b) - t_manual = time.perf_counter() - t0 - - _print_table( - f"Translation AA→reduced (N={N:,}, len={SEQ_LEN})", - [ - ("bytes.translate (bytes in)", t_translate, N), - ("aa_to_reduced (str in)", t_aa_str, N), - ("dict[str,str] + join", t_dict_join, N), - ("manual byte LUT loop", t_manual, N), - ], - ) - ratio = t_dict_join / t_translate if t_translate > 0 else float("inf") - print(f"\ndict+join / bytes.translate: {ratio:.1f}x slower") - - -class 
TestSlicingBenchmark(unittest.TestCase): - - def test_bytes_vs_str_slicing(self) -> None: - """Substring slicing: bytes[i:j] vs str[i:j] at various k.""" - strings = _random_strings(N, SEQ_LEN) - byte_strings = [s.encode() for s in strings] - - for k in (3, 5, 10): - n_slices = SEQ_LEN - k + 1 - expected = N * n_slices - - # str slicing - t0 = time.perf_counter() - cnt = 0 - for s in strings: - for i in range(len(s) - k + 1): - _ = s[i : i + k] - cnt += 1 - t_str = time.perf_counter() - t0 - - # bytes slicing - t0 = time.perf_counter() - cnt2 = 0 - for b in byte_strings: - for i in range(len(b) - k + 1): - _ = b[i : i + k] - cnt2 += 1 - t_bytes = time.perf_counter() - t0 - - # str slicing via encode→slice→decode - t0 = time.perf_counter() - cnt3 = 0 - for s in strings: - b = s.encode() - for i in range(len(b) - k + 1): - _ = b[i : i + k] - cnt3 += 1 - t_enc_slice = time.perf_counter() - t0 - - self.assertEqual(cnt, expected) - self.assertEqual(cnt2, expected) - self.assertEqual(cnt3, expected) - - _print_table( - f"Slicing k={k} (N={N:,}, len={SEQ_LEN}, {n_slices} slices/seq)", - [ - ("str[i:i+k]", t_str, expected), - ("bytes[i:i+k]", t_bytes, expected), - ("str.encode + bytes[i:i+k]", t_enc_slice, expected), - ], - ) - ratio = t_str / t_bytes if t_bytes > 0 else float("inf") - print(f" str/bytes ratio: {ratio:.2f}x") - - -class TestMatchingBenchmark(unittest.TestCase): - - def test_matches_vs_naive(self) -> None: - """Wildcard matching: matches() vs naive Python loop.""" - rng = random.Random(42) - strings_a = _random_strings(N, SEQ_LEN) - # create pairs: 50% identical, 50% with 1 mask position - strings_b = [] - for s in strings_a: - if rng.random() < 0.5: - strings_b.append(s) - else: - pos = rng.randint(0, SEQ_LEN - 1) - strings_b.append(s[:pos] + "X" + s[pos + 1 :]) - - bytes_a = [s.encode() for s in strings_a] - bytes_b = [s.encode() for s in strings_b] - - # matches() function - t0 = time.perf_counter() - res1 = 0 - for a, b in zip(bytes_a, bytes_b): - if 
matches(a, b, AA_MASK): - res1 += 1 - t_func = time.perf_counter() - t0 - - # naive Python: zip + compare - mask_val = AA_MASK - t0 = time.perf_counter() - res2 = 0 - for a, b in zip(bytes_a, bytes_b): - if len(a) == len(b) and all( - x == y or x == mask_val or y == mask_val - for x, y in zip(a, b) - ): - res2 += 1 - t_naive = time.perf_counter() - t0 - - # naive str comparison - t0 = time.perf_counter() - res3 = 0 - for a, b in zip(strings_a, strings_b): - if len(a) == len(b) and all( - x == y or x == "X" or y == "X" - for x, y in zip(a, b) - ): - res3 += 1 - t_str = time.perf_counter() - t0 - - self.assertEqual(res1, res2) - self.assertEqual(res1, res3) - - _print_table( - f"Wildcard matching (N={N:,}, len={SEQ_LEN})", - [ - ("matches() [bytes]", t_func, N), - ("naive bytes zip+all", t_naive, N), - ("naive str zip+all", t_str, N), - ], - ) - - def test_matches_aa_reduced_vs_naive(self) -> None: - """Cross-alphabet matching: matches_aa_reduced() vs naive.""" - strings = _random_strings(N, SEQ_LEN) - reduced = [aa_to_reduced(s) for s in strings] - - bytes_aa = [s.encode() for s in strings] - - # matches_aa_reduced() - t0 = time.perf_counter() - cnt = 0 - for a, r in zip(bytes_aa, reduced): - if matches_aa_reduced(a, r): - cnt += 1 - t_func = time.perf_counter() - t0 - - # naive: translate then compare - t0 = time.perf_counter() - cnt2 = 0 - for a, r in zip(bytes_aa, reduced): - if a.translate(AA_TO_REDUCED_TABLE) == r: - cnt2 += 1 - t_naive = time.perf_counter() - t0 - - self.assertEqual(cnt, N) - self.assertEqual(cnt2, N) - - _print_table( - f"Cross-alphabet matching (N={N:,}, len={SEQ_LEN})", - [ - ("matches_aa_reduced()", t_func, N), - ("translate + bytes ==", t_naive, N), - ], - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_tokens.py b/tests/test_tokens.py index 9d7fe92..02ea0a6 100644 --- a/tests/test_tokens.py +++ b/tests/test_tokens.py @@ -1,108 +1,105 @@ -"""Unit tests for :mod:`mir.basic.tokens` functions.""" +"""Unit tests 
for ``mir.basic.tokens`` wrapper functions. + +These delegates to the ``mirseq`` C extension; focus is on the wrapper +API, input normalisation, and agreement with ``mirseq`` direct calls. + +Run with ``python -m pytest tests/test_tokens.py -v``. +""" import unittest -from mir.basic.sequence import AA_MASK, NT_MASK, REDUCED_AA_MASK, matches -from mir.basic.tokens import tokenize, tokenize_gapped, tokenize_gapped_str, tokenize_str +from mir.basic.alphabets import AA_MASK, NT_MASK +from mir.basic.tokens import ( + tokenize, + tokenize_str, + tokenize_gapped, + tokenize_gapped_str, +) +from mir.basic import mirseq + +# ── tokenize (returns list[bytes]) ──────────────────────────────── class TestTokenize(unittest.TestCase): - """Plain k-mer extraction (bytes output).""" - def test_aa_k3(self) -> None: + def test_basic_aa(self) -> None: self.assertEqual(tokenize("CASSL", 3), [b"CAS", b"ASS", b"SSL"]) - def test_nt_k4(self) -> None: - self.assertEqual(tokenize("ATCGAT", 4), [b"ATCG", b"TCGA", b"CGAT"]) + def test_basic_nt(self) -> None: + self.assertEqual(tokenize("ATCGAT", 4), + [b"ATCG", b"TCGA", b"CGAT"]) - def test_reduced_k2(self) -> None: - self.assertEqual(tokenize("slhh", 2), [b"sl", b"lh", b"hh"]) + def test_k1(self) -> None: + self.assertEqual(tokenize("ATG", 1), [b"A", b"T", b"G"]) - def test_k_equals_length(self) -> None: + def test_k_eq_len(self) -> None: self.assertEqual(tokenize("CAST", 4), [b"CAST"]) - def test_k_equals_one(self) -> None: - self.assertEqual(tokenize("ATG", 1), [b"A", b"T", b"G"]) + def test_str_input(self) -> None: + self.assertEqual(tokenize("CAST", 2), [b"CA", b"AS", b"ST"]) def test_bytes_input(self) -> None: - self.assertEqual(tokenize(b"CASSL", 3), [b"CAS", b"ASS", b"SSL"]) + self.assertEqual(tokenize(b"CAST", 2), [b"CA", b"AS", b"ST"]) - def test_bytearray_input(self) -> None: - self.assertEqual(tokenize(bytearray(b"ATG"), 1), [b"A", b"T", b"G"]) + def test_agrees_with_c(self) -> None: + for seq in ["CASSL", "ATCGATCGATCG"]: + for k 
in [1, 2, 3]: + with self.subTest(seq=seq, k=k): + self.assertEqual(tokenize(seq, k), + mirseq.tokenize_bytes(seq, k)) - def test_invalid_k(self) -> None: - with self.assertRaises(ValueError): - tokenize("CAST", 0) - with self.assertRaises(ValueError): - tokenize("CAST", 5) +# ── tokenize_str (returns list[str]) ───────────────────────────── class TestTokenizeStr(unittest.TestCase): - """Plain k-mer extraction (str output).""" def test_basic(self) -> None: self.assertEqual(tokenize_str("CASSL", 3), ["CAS", "ASS", "SSL"]) - def test_bytes_input(self) -> None: - self.assertEqual(tokenize_str(b"ATG", 1), ["A", "T", "G"]) + def test_type(self) -> None: + result = tokenize_str("CAST", 2) + self.assertIsInstance(result[0], str) + def test_agrees_with_c(self) -> None: + self.assertEqual(tokenize_str("CASSL", 3), + mirseq.tokenize_str("CASSL", 3)) + + +# ── tokenize_gapped (returns list[bytes]) ──────────────────────── class TestTokenizeGapped(unittest.TestCase): - """Gapped k-mer extraction (bytes output).""" - def test_aa_gapped_k3(self) -> None: - gapped = tokenize_gapped("CASSL", 3, AA_MASK) - self.assertEqual(len(gapped), 9) + def test_basic(self) -> None: expected = [ b"XAS", b"CXS", b"CAX", b"XSS", b"AXS", b"ASX", b"XSL", b"SXL", b"SSX", ] - self.assertEqual(gapped, expected) + self.assertEqual(tokenize_gapped("CASSL", 3, AA_MASK), expected) - def test_nt_gapped_k2(self) -> None: - gapped = tokenize_gapped("ATG", 2, NT_MASK) - self.assertEqual(gapped, [b"NT", b"AN", b"NG", b"TN"]) + def test_nt(self) -> None: + self.assertEqual(tokenize_gapped("ATG", 2, NT_MASK), + [b"NT", b"AN", b"NG", b"TN"]) - def test_reduced_gapped_k2(self) -> None: - gapped = tokenize_gapped("slh", 2, REDUCED_AA_MASK) - self.assertEqual(gapped, [b"Xl", b"sX", b"Xh", b"lX"]) + def test_agrees_with_c(self) -> None: + self.assertEqual(tokenize_gapped("CASSL", 3, AA_MASK), + mirseq.tokenize_gapped_bytes("CASSL", 3, AA_MASK)) - def test_gapped_k1(self) -> None: - gapped = tokenize_gapped("CA", 1, 
AA_MASK) - self.assertEqual(gapped, [b"X", b"X"]) - - def test_invalid_k(self) -> None: - with self.assertRaises(ValueError): - tokenize_gapped("CAST", 0, AA_MASK) - with self.assertRaises(ValueError): - tokenize_gapped("CAST", 5, AA_MASK) - - def test_bytes_input(self) -> None: - gapped = tokenize_gapped(b"ATG", 2, NT_MASK) - self.assertEqual(gapped, [b"NT", b"AN", b"NG", b"TN"]) - - def test_gapped_match_plain(self) -> None: - """Each gapped k-mer should wildcard-match its corresponding plain k-mer.""" - plain = tokenize("CASSL", 3) - gapped = tokenize_gapped("CASSL", 3, AA_MASK) - for i, kmer in enumerate(plain): - variants = gapped[i * 3 : (i + 1) * 3] - for var in variants: - self.assertTrue( - matches(kmer, var, AA_MASK), - f"{kmer} should match {var}", - ) +# ── tokenize_gapped_str (returns list[str]) ────────────────────── class TestTokenizeGappedStr(unittest.TestCase): - """Gapped k-mer extraction (str output).""" def test_basic(self) -> None: - gapped = tokenize_gapped_str("CASSL", 3, "X") - self.assertEqual(len(gapped), 9) - self.assertEqual(gapped[0], "XAS") - self.assertIsInstance(gapped[0], str) + result = tokenize_gapped_str("CASSL", 3, "X") + self.assertEqual(len(result), 9) + self.assertIsInstance(result[0], str) + self.assertEqual(result[0], "XAS") + + def test_agrees_with_c(self) -> None: + self.assertEqual( + tokenize_gapped_str("CASSL", 3, "X"), + mirseq.tokenize_gapped_str("CASSL", 3, AA_MASK)) if __name__ == "__main__": diff --git a/tests/test_tokens_benchmark.py b/tests/test_tokens_benchmark.py deleted file mode 100644 index 57f2e20..0000000 --- a/tests/test_tokens_benchmark.py +++ /dev/null @@ -1,161 +0,0 @@ -"""Speed benchmark: tokenize / tokenize_gapped vs naive Python. - -Compares bytes-based tokenisation functions against naive ``str`` slicing -for both plain and gapped k-mers. Also benchmarks ``str`` vs ``bytes`` -input to verify conversion overhead is negligible. - -Run with ``python -m pytest tests/test_tokens_benchmark.py -s``. 
-""" - -import random -import time -import unittest - -from mir.basic.sequence import AA_MASK -from mir.basic.tokens import tokenize, tokenize_gapped - -N = 10_000 -SEQ_LEN = 15 -K = 3 -MASK_STR = "X" - -_AA_LETTERS = "ACDEFGHIKLMNPQRSTVWY" - - -def _random_strings(n: int, length: int) -> list[str]: - rng = random.Random(42) - return ["".join(rng.choices(_AA_LETTERS, k=length)) for _ in range(n)] - - -def _print_table(title: str, rows: list[tuple[str, float, int]]) -> None: - print( - f"\n{title}\n" - f"{'Method':<36} {'Time (s)':>10} {'items/s':>14}\n" - f"{'-' * 62}" - ) - for label, elapsed, count in rows: - rate = count / elapsed if elapsed > 0 else float("inf") - print(f"{label:<36} {elapsed:>10.4f} {rate:>14,.0f}") - - -class TestTokenizeBenchmark(unittest.TestCase): - - def test_plain_kmers(self) -> None: - """Plain k-mers: tokenize(bytes) vs naive str slicing.""" - strings = _random_strings(N, SEQ_LEN) - byte_strings = [s.encode() for s in strings] - expected = N * (SEQ_LEN - K + 1) - - # naive str slicing - t0 = time.perf_counter() - cnt = 0 - for s in strings: - for i in range(len(s) - K + 1): - _ = s[i : i + K] - cnt += 1 - t_naive_str = time.perf_counter() - t0 - - # naive bytes slicing - t0 = time.perf_counter() - cnt2 = 0 - for b in byte_strings: - for i in range(len(b) - K + 1): - _ = b[i : i + K] - cnt2 += 1 - t_naive_bytes = time.perf_counter() - t0 - - # tokenize(str input) - t0 = time.perf_counter() - cnt3 = 0 - for s in strings: - cnt3 += len(tokenize(s, K)) - t_tok_str = time.perf_counter() - t0 - - # tokenize(bytes input) - t0 = time.perf_counter() - cnt4 = 0 - for b in byte_strings: - cnt4 += len(tokenize(b, K)) - t_tok_bytes = time.perf_counter() - t0 - - self.assertEqual(cnt, expected) - self.assertEqual(cnt2, expected) - self.assertEqual(cnt3, expected) - self.assertEqual(cnt4, expected) - - _print_table( - f"Plain {K}-mers (N={N:,}, len={SEQ_LEN})", - [ - ("naive str slicing", t_naive_str, expected), - ("naive bytes slicing", 
t_naive_bytes, expected), - ("tokenize(str input)", t_tok_str, expected), - ("tokenize(bytes input)", t_tok_bytes, expected), - ], - ) - ratio = t_tok_bytes / t_naive_str if t_naive_str > 0 else float("inf") - print(f"\ntokenize(bytes) / naive str: {ratio:.2f}x") - - def test_gapped_kmers(self) -> None: - """Gapped k-mers: tokenize_gapped vs naive str concatenation.""" - strings = _random_strings(N, SEQ_LEN) - byte_strings = [s.encode() for s in strings] - n_windows = SEQ_LEN - K + 1 - expected = N * n_windows * K - - # naive str: slice + replace - t0 = time.perf_counter() - cnt = 0 - for s in strings: - for i in range(len(s) - K + 1): - w = s[i : i + K] - for j in range(K): - _ = w[:j] + MASK_STR + w[j + 1 :] - cnt += 1 - t_naive_str = time.perf_counter() - t0 - - # naive bytes: slice + replace - mask_b = bytes([AA_MASK]) - t0 = time.perf_counter() - cnt2 = 0 - for b in byte_strings: - for i in range(len(b) - K + 1): - w = b[i : i + K] - for j in range(K): - _ = w[:j] + mask_b + w[j + 1 :] - cnt2 += 1 - t_naive_bytes = time.perf_counter() - t0 - - # tokenize_gapped(str input) - t0 = time.perf_counter() - cnt3 = 0 - for s in strings: - cnt3 += len(tokenize_gapped(s, K, AA_MASK)) - t_tok_str = time.perf_counter() - t0 - - # tokenize_gapped(bytes input) - t0 = time.perf_counter() - cnt4 = 0 - for b in byte_strings: - cnt4 += len(tokenize_gapped(b, K, AA_MASK)) - t_tok_bytes = time.perf_counter() - t0 - - self.assertEqual(cnt, expected) - self.assertEqual(cnt2, expected) - self.assertEqual(cnt3, expected) - self.assertEqual(cnt4, expected) - - _print_table( - f"Gapped {K}-mers (N={N:,}, len={SEQ_LEN})", - [ - ("naive str slice+replace", t_naive_str, expected), - ("naive bytes slice+replace", t_naive_bytes, expected), - ("tokenize_gapped(str input)", t_tok_str, expected), - ("tokenize_gapped(bytes input)", t_tok_bytes, expected), - ], - ) - ratio = t_tok_bytes / t_naive_str if t_naive_str > 0 else float("inf") - print(f"\ntokenize_gapped(bytes) / naive str: 
{ratio:.2f}x") - - -if __name__ == "__main__": - unittest.main() From 7e244270885e0fdf2cd8b72c403c90604af86f46 Mon Sep 17 00:00:00 2001 From: mikessh Date: Sat, 11 Apr 2026 19:12:49 +0300 Subject: [PATCH 11/24] wip C impl --- CMakeLists.txt | 13 +++ mir/basic/mirseq.cpp | 129 +++++++++----------------- mir/basic/token_tables.py | 31 ++++--- mir/distances/seqdist.cpp | 105 +++++++++++++++++++++ mir/distances/seqdist.py | 4 +- tests/test_mirseq.py | 162 +++++++++++++++++---------------- tests/test_mirseq_benchmark.py | 7 +- tests/test_seqdist.py | 80 ++++++++++++++++ 8 files changed, 355 insertions(+), 176 deletions(-) create mode 100644 mir/distances/seqdist.cpp create mode 100644 tests/test_seqdist.py diff --git a/CMakeLists.txt b/CMakeLists.txt index e2377a1..e660a05 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,19 @@ set_target_properties(cdrscore PROPERTIES ) install(TARGETS cdrscore LIBRARY DESTINATION mir/distances) +# --- seqdist_c (mir.distances) --- +pybind11_add_module(seqdist_c MODULE mir/distances/seqdist.cpp) +target_compile_features(seqdist_c PRIVATE cxx_std_17) +if (MSVC) + target_compile_options(seqdist_c PRIVATE /O2 /DNOMINMAX) +else() + target_compile_options(seqdist_c PRIVATE -O3) +endif() +set_target_properties(seqdist_c PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/mir/distances" +) +install(TARGETS seqdist_c LIBRARY DESTINATION mir/distances) + # --- mirseq (mir.basic) --- pybind11_add_module(mirseq MODULE mir/basic/mirseq.cpp) target_compile_features(mirseq PRIVATE cxx_std_17) diff --git a/mir/basic/mirseq.cpp b/mir/basic/mirseq.cpp index 682d4ed..32cf126 100644 --- a/mir/basic/mirseq.cpp +++ b/mir/basic/mirseq.cpp @@ -1,5 +1,5 @@ /* - * mirseq — C-native sequence translation, tokenization, and distances. + * mirseq — C-native sequence translation and tokenization. * * Compiled as a pybind11 module. 
All functions accept Python str or bytes * via py::bytes / std::string_view and return Python list[str] or list[bytes]. @@ -15,8 +15,6 @@ * tokenize_str(seq, k) → list[str] sliding window k-mers * tokenize_gapped_bytes(seq,k,m) → list[bytes] gapped k-mers * tokenize_gapped_str(seq,k,m) → list[str] gapped k-mers - * hamming(a, b) → int hamming distance - * levenshtein(a, b) → int levenshtein distance */ #include @@ -135,6 +133,16 @@ static py::str translate_linear(const py::object& obj) { /* ================================================================ * Translation: bidirectional + * + * Inserts a full gap codon (NNN, which translates to '_') into the + * nucleotide sequence so that the total length becomes divisible by 3, + * then performs a standard linear translation. + * + * Gap position: + * - Short sequences (< 27 nt): gap inserted in the middle + * (after the first half of complete codons). + * - Long sequences (>= 27 nt): gap inserted after the 4th codon + * (position 12). * ================================================================ */ static py::str translate_bidi(const py::object& obj) { @@ -144,7 +152,7 @@ static py::str translate_bidi(const py::object& obj) { size_t remainder = n % 3; if (remainder == 0) { - // Exact multiple of 3: just translate linearly + // Exact multiple of 3: translate linearly, no gap needed size_t n_codons = n / 3; std::string result(n_codons, '\0'); const char* s = sv.data; @@ -155,42 +163,46 @@ static py::str translate_bidi(const py::object& obj) { return py::str(result); } - // Not multiple of 3: bidirectional with gap - size_t n_codons = n / 3; // full codons available - // Determine forward and reverse codon counts - // For long sequences (>= 9 codons worth = 27 nt): gap after 4th codon from start - // For shorter sequences: gap in the middle - size_t fwd_codons, rev_codons; - if (n >= 9 * 3) { + // Insert a gap codon (NNN) to make length divisible by 3. 
+ // Gap codon length = 3 - remainder + size_t gap_len = 3 - remainder; + size_t n_codons = n / 3; + + // Determine insertion point (in nucleotide space) + size_t fwd_codons; + if (n >= 27) { fwd_codons = 4; - rev_codons = n_codons - 4; } else { fwd_codons = n_codons / 2; - rev_codons = n_codons - fwd_codons; } + size_t insert_pos = fwd_codons * 3; - // out_len = fwd + 1 (gap) + rev - size_t out_len = fwd_codons + 1 + rev_codons; - std::string result(out_len, '\0'); + // Build padded sequence: [0..insert_pos) + NNN.. + [insert_pos..n) + size_t padded_len = n + gap_len; + // padded_len is now divisible by 3 + size_t total_codons = padded_len / 3; + std::string result(total_codons, '\0'); const char* s = sv.data; - // Forward codons from start - for (size_t i = 0; i < fwd_codons; ++i) - result[i] = translate_codon((unsigned char)s[i*3], - (unsigned char)s[i*3+1], - (unsigned char)s[i*3+2]); - - // Gap - result[fwd_codons] = '_'; - - // Reverse codons from end - for (size_t i = 0; i < rev_codons; ++i) { - size_t nt_pos = n - (rev_codons - i) * 3; - result[fwd_codons + 1 + i] = translate_codon( - (unsigned char)s[nt_pos], - (unsigned char)s[nt_pos+1], - (unsigned char)s[nt_pos+2]); + // Translate codons from the padded sequence + for (size_t i = 0; i < total_codons; ++i) { + size_t nt_base = i * 3; + unsigned char c[3]; + for (int b = 0; b < 3; ++b) { + size_t pos = nt_base + b; + if (pos < insert_pos) + c[b] = (unsigned char)s[pos]; + else if (pos < insert_pos + gap_len) + c[b] = 'N'; // gap filler + else + c[b] = (unsigned char)s[pos - gap_len]; + } + result[i] = translate_codon(c[0], c[1], c[2]); } + // The gap codon (any codon containing N) will produce 'X'. 
+ // We want '_' for the gap, so fix it up: + // The gap codon is at index fwd_codons (the one that starts at insert_pos) + result[fwd_codons] = '_'; return py::str(result); } @@ -282,55 +294,12 @@ static py::list c_tokenize_gapped_str(const py::object& obj, int k, int mask_byt return result; } -/* ================================================================ - * Hamming distance - * ================================================================ */ - -static int c_hamming(const py::object& a, const py::object& b) { - auto sa = to_view(a); - auto sb = to_view(b); - if (sa.len != sb.len) - throw std::invalid_argument("sequences must have equal length for hamming distance"); - int d = 0; - for (size_t i = 0; i < sa.len; ++i) - d += (sa.data[i] != sb.data[i]); - return d; -} - -/* ================================================================ - * Levenshtein distance (classic DP, two-row, O(min(m,n)) space) - * ================================================================ */ - -static int c_levenshtein(const py::object& a, const py::object& b) { - auto sa = to_view(a); - auto sb = to_view(b); - size_t m = sa.len, n = sb.len; - // Ensure m <= n for space optimisation - const char* s = sa.data; - const char* t = sb.data; - if (m > n) { std::swap(s, t); std::swap(m, n); } - std::vector prev(m + 1), curr(m + 1); - for (size_t i = 0; i <= m; ++i) prev[i] = (int)i; - for (size_t j = 1; j <= n; ++j) { - curr[0] = (int)j; - for (size_t i = 1; i <= m; ++i) { - int cost = (s[i-1] != t[j-1]) ? 
1 : 0; - int del_ = prev[i] + 1; - int ins = curr[i-1] + 1; - int sub = prev[i-1] + cost; - curr[i] = std::min({del_, ins, sub}); - } - std::swap(prev, curr); - } - return prev[m]; -} - /* ================================================================ * Module definition * ================================================================ */ PYBIND11_MODULE(mirseq, m) { - m.doc() = "C-native sequence translation, tokenization, and distances"; + m.doc() = "C-native sequence translation and tokenization"; // Translation m.def("translate_linear", &translate_linear, @@ -356,12 +325,4 @@ PYBIND11_MODULE(mirseq, m) { m.def("tokenize_gapped_str", &c_tokenize_gapped_str, py::arg("seq"), py::arg("k"), py::arg("mask_byte"), "Gapped k-mers (each position masked) as list[str]"); - - // Distances - m.def("hamming", &c_hamming, - py::arg("a"), py::arg("b"), - "Hamming distance between two equal-length sequences"); - m.def("levenshtein", &c_levenshtein, - py::arg("a"), py::arg("b"), - "Levenshtein (edit) distance between two sequences"); } diff --git a/mir/basic/token_tables.py b/mir/basic/token_tables.py index 269897e..e5b8f3d 100644 --- a/mir/basic/token_tables.py +++ b/mir/basic/token_tables.py @@ -22,6 +22,10 @@ from typing import NamedTuple from mir.basic.alphabets import Seq, _to_bytes +from mir.basic.mirseq import ( + tokenize_bytes as _c_tokenize_bytes, + tokenize_gapped_bytes as _c_tokenize_gapped_bytes, +) # --------------------------------------------------------------------------- @@ -113,22 +117,27 @@ class KmerStats(NamedTuple): # --------------------------------------------------------------------------- def _plain_kmers(raw: bytes, k: int) -> list[tuple[bytes, int]]: - """Overlapping k-mers from *raw* with their start positions.""" - return [(raw[i : i + k], i) for i in range(len(raw) - k + 1)] + """Overlapping k-mers from *raw* with their start positions. + + Delegates to the C extension for the k-mer extraction. 
+ """ + kmers = _c_tokenize_bytes(raw, k) + return [(kmer, i) for i, kmer in enumerate(kmers)] def _gapped_kmers(raw: bytes, k: int, mask_byte: int) -> list[tuple[bytes, int]]: - """Gapped k-mers (each position masked once) with window start positions.""" - n = len(raw) - n_windows = n - k + 1 - buf = bytearray(k) + """Gapped k-mers (each position masked once) with window start positions. + + Delegates to the C extension for the gapped k-mer extraction. + """ + gapped = _c_tokenize_gapped_bytes(raw, k, mask_byte) + n_windows = len(raw) - k + 1 result: list[tuple[bytes, int]] = [] + idx = 0 for i in range(n_windows): - window = raw[i : i + k] - for j in range(k): - buf[:] = window - buf[j] = mask_byte - result.append((bytes(buf), i)) + for _ in range(k): + result.append((gapped[idx], i)) + idx += 1 return result diff --git a/mir/distances/seqdist.cpp b/mir/distances/seqdist.cpp new file mode 100644 index 0000000..8220bda --- /dev/null +++ b/mir/distances/seqdist.cpp @@ -0,0 +1,105 @@ +/* + * seqdist — C-native sequence distance functions. + * + * Compiled as a pybind11 module. Accepts Python str, bytes, or bytearray. 
+ * + * Functions: + * hamming(a, b) → int Hamming distance (equal-length sequences) + * levenshtein(a, b) → int Levenshtein (edit) distance + */ + +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; + +/* ================================================================ + * Helper: extract raw pointer + length from str or bytes + * ================================================================ */ + +struct SeqView { + const char* data; + size_t len; +}; + +static SeqView to_view(const py::object& obj) { + if (py::isinstance(obj)) { + Py_ssize_t sz = 0; + const char* p = PyUnicode_AsUTF8AndSize(obj.ptr(), &sz); + if (!p) throw py::error_already_set(); + return {p, (size_t)sz}; + } + if (py::isinstance(obj)) { + char* buf = nullptr; + Py_ssize_t sz = 0; + PyBytes_AsStringAndSize(obj.ptr(), &buf, &sz); + return {buf, (size_t)sz}; + } + if (py::isinstance(obj)) { + const char* buf = PyByteArray_AS_STRING(obj.ptr()); + size_t sz = (size_t)PyByteArray_GET_SIZE(obj.ptr()); + return {buf, sz}; + } + throw py::type_error("expected str, bytes, or bytearray"); +} + +/* ================================================================ + * Hamming distance + * ================================================================ */ + +static int c_hamming(const py::object& a, const py::object& b) { + auto sa = to_view(a); + auto sb = to_view(b); + if (sa.len != sb.len) + throw std::invalid_argument("sequences must have equal length for hamming distance"); + int d = 0; + for (size_t i = 0; i < sa.len; ++i) + d += (sa.data[i] != sb.data[i]); + return d; +} + +/* ================================================================ + * Levenshtein distance (classic DP, two-row, O(min(m,n)) space) + * ================================================================ */ + +static int c_levenshtein(const py::object& a, const py::object& b) { + auto sa = to_view(a); + auto sb = to_view(b); + size_t m = sa.len, n = sb.len; + const char* s = sa.data; + const 
char* t = sb.data; + if (m > n) { std::swap(s, t); std::swap(m, n); } + std::vector prev(m + 1), curr(m + 1); + for (size_t i = 0; i <= m; ++i) prev[i] = (int)i; + for (size_t j = 1; j <= n; ++j) { + curr[0] = (int)j; + for (size_t i = 1; i <= m; ++i) { + int cost = (s[i-1] != t[j-1]) ? 1 : 0; + int del_ = prev[i] + 1; + int ins = curr[i-1] + 1; + int sub = prev[i-1] + cost; + curr[i] = std::min({del_, ins, sub}); + } + std::swap(prev, curr); + } + return prev[m]; +} + +/* ================================================================ + * Module definition + * ================================================================ */ + +PYBIND11_MODULE(seqdist_c, m) { + m.doc() = "C-native sequence distance functions (Hamming, Levenshtein)"; + + m.def("hamming", &c_hamming, + py::arg("a"), py::arg("b"), + "Hamming distance between two equal-length sequences"); + m.def("levenshtein", &c_levenshtein, + py::arg("a"), py::arg("b"), + "Levenshtein (edit) distance between two sequences"); +} diff --git a/mir/distances/seqdist.py b/mir/distances/seqdist.py index c7dd40b..2306085 100644 --- a/mir/distances/seqdist.py +++ b/mir/distances/seqdist.py @@ -1,4 +1,4 @@ -"""Thin Python wrappers around the C-native distance functions in ``mirseq``. +"""Thin Python wrappers around the C-native distance functions in ``seqdist_c``. Functions --------- @@ -9,7 +9,7 @@ from __future__ import annotations from mir.basic.alphabets import Seq -from mir.basic.mirseq import hamming as _c_hamming, levenshtein as _c_levenshtein +from mir.distances.seqdist_c import hamming as _c_hamming, levenshtein as _c_levenshtein def hamming(a: Seq, b: Seq) -> int: diff --git a/tests/test_mirseq.py b/tests/test_mirseq.py index fc4df68..3f93ffb 100644 --- a/tests/test_mirseq.py +++ b/tests/test_mirseq.py @@ -1,10 +1,10 @@ -"""Unit tests for the ``mirseq`` C extension and ``alphabets`` module. +"""Unit tests for the ``mirseq`` C extension. 
Covers: - Codon translation: linear and bidirectional (comprehensive) + - BioPython cross-checks for translation - AA → reduced alphabet (C and Python paths) - Tokenization: plain bytes/str, gapped bytes/str - - Distances: Hamming, Levenshtein - Cross-checking against tokens.py wrappers Run with ``python -m pytest tests/test_mirseq.py -v``. @@ -12,6 +12,8 @@ import unittest +from Bio.Seq import Seq as BioSeq + from mir.basic import mirseq from mir.basic.alphabets import ( AA_MASK, @@ -25,7 +27,6 @@ tokenize_str as py_tokenize_str, tokenize_gapped_str as py_tokenize_gapped_str, ) -from mir.distances.seqdist import hamming, levenshtein # ── helpers ──────────────────────────────────────────────────────── @@ -61,23 +62,30 @@ def _py_translate_linear(nt: str) -> str: def _py_translate_bidi(nt: str) -> str: + """Reference: insert gap nucleotides then translate linearly. + + Gap length = 3 - (len % 3) nucleotides (N's), inserted after + fwd_codons * 3 position. The gap codon translates to '_'. 
+ """ n = len(nt) if n == 0: return "" if n % 3 == 0: return _py_translate_linear(nt) + remainder = n % 3 + gap_len = 3 - remainder n_codons = n // 3 fwd_codons = 4 if n >= 27 else n_codons // 2 - rev_codons = n_codons - fwd_codons + insert_pos = fwd_codons * 3 + # Build padded sequence + padded = nt[:insert_pos] + "N" * gap_len + nt[insert_pos:] + # Translate linearly — all codons are now complete result = [] - for i in range(fwd_codons): - codon = nt[i * 3:i * 3 + 3] - result.append("X" if "N" in codon else _CODON_MAP[codon]) - result.append("_") - for i in range(rev_codons): - pos = n - (rev_codons - i) * 3 - codon = nt[pos:pos + 3] + for i in range(0, len(padded), 3): + codon = padded[i:i + 3] result.append("X" if "N" in codon else _CODON_MAP[codon]) + # Replace the gap codon (at fwd_codons) with '_' + result[fwd_codons] = "_" return "".join(result) @@ -128,6 +136,30 @@ def test_cross_check_reference(self) -> None: self.assertEqual( mirseq.translate_linear(seq), _py_translate_linear(seq)) + def test_cross_check_biopython(self) -> None: + """Verify translate_linear matches BioPython Seq.translate() for + complete-codon sequences (BioPython doesn't produce '_' for + incomplete trailing codons).""" + seqs = ["ATGGCTTGA", "TTTTTCTTATTG", "GCAGCCGCGGCG", + "TAATGATAG", "ATGATGATGATGATGATGATG", + "TGTTGCTGATGG"] + for seq in seqs: + with self.subTest(seq=seq): + # Only compare the full-codon portion + n_full = (len(seq) // 3) * 3 + full_seq = seq[:n_full] + bio_result = str(BioSeq(full_seq).translate()) + mirseq_result = mirseq.translate_linear(full_seq) + self.assertEqual(mirseq_result, bio_result) + + def test_cross_check_biopython_all_codons(self) -> None: + """Verify every codon matches BioPython.""" + for codon in _CODON_MAP: + with self.subTest(codon=codon): + self.assertEqual( + mirseq.translate_linear(codon), + str(BioSeq(codon).translate())) + # ── Translation: bidirectional (comprehensive) ──────────────────── @@ -255,6 +287,49 @@ def 
test_cross_check_long(self) -> None: self.assertEqual( mirseq.translate_bidi(nt), _py_translate_bidi(nt)) + # -- cross-check bidi against BioPython ----------------------------- + + def test_bidi_vs_biopython(self) -> None: + """For divisible-by-3 sequences, bidi == linear == BioPython.""" + seqs = ["ATG" * 3, "ATG" * 9, "ATGGCTTGA", "TTTTTCTTATTG"] + for seq in seqs: + with self.subTest(seq=seq): + self.assertEqual( + mirseq.translate_bidi(seq), + str(BioSeq(seq).translate())) + + def test_bidi_flanks_vs_biopython(self) -> None: + """Non-gap codons in bidi output match BioPython translation of + those same nucleotide regions.""" + nt_base = "ATGGCTTGAAACTAAGTTTTTCATA" * 3 + for length in range(4, 50): + nt = nt_base[:length] + if length % 3 == 0: + continue + remainder = length % 3 + gap_len = 3 - remainder + n_codons = length // 3 + fwd_codons = 4 if length >= 27 else n_codons // 2 + with self.subTest(length=length): + result = mirseq.translate_bidi(nt) + # Check forward flanking codons against BioPython + fwd_nt = nt[:fwd_codons * 3] + if fwd_nt: + self.assertEqual( + result[:fwd_codons], + str(BioSeq(fwd_nt).translate())) + # Check reverse flanking codons against BioPython + rev_start = fwd_codons * 3 + rev_nt = nt[rev_start:] + # After gap insertion, the reverse portion starts at a new + # codon boundary; extract the codons from end + rev_codons = n_codons - fwd_codons + rev_nt_from_end = nt[length - rev_codons * 3:] + if rev_nt_from_end: + self.assertEqual( + result[fwd_codons + 1:], + str(BioSeq(rev_nt_from_end).translate())) + # -- structural properties ------------------------------------------ def test_gap_count(self) -> None: @@ -418,70 +493,5 @@ def test_cross_check_wrapper(self) -> None: py_tokenize_gapped_str("CASSL", 3, "X")) -# ── Hamming distance ────────────────────────────────────────────── - -class TestHamming(unittest.TestCase): - - def test_identical(self) -> None: - self.assertEqual(mirseq.hamming("CAST", "CAST"), 0) - - def 
test_one_mismatch(self) -> None: - self.assertEqual(mirseq.hamming("CAST", "CAAT"), 1) - - def test_all_mismatch(self) -> None: - self.assertEqual(mirseq.hamming("AAAA", "TTTT"), 4) - - def test_empty(self) -> None: - self.assertEqual(mirseq.hamming("", ""), 0) - - def test_length_mismatch_raises(self) -> None: - with self.assertRaises(Exception): - mirseq.hamming("ABC", "AB") - - def test_bytes_input(self) -> None: - self.assertEqual(mirseq.hamming(b"CAST", b"CAAT"), 1) - - def test_wrapper(self) -> None: - self.assertEqual(hamming("CAST", "CAAT"), 1) - - -# ── Levenshtein distance ───────────────────────────────────────── - -class TestLevenshtein(unittest.TestCase): - - def test_classic(self) -> None: - self.assertEqual(mirseq.levenshtein("kitten", "sitting"), 3) - - def test_identical(self) -> None: - self.assertEqual(mirseq.levenshtein("CAST", "CAST"), 0) - - def test_insertion(self) -> None: - self.assertEqual(mirseq.levenshtein("ABC", "ABCD"), 1) - - def test_deletion(self) -> None: - self.assertEqual(mirseq.levenshtein("ABCD", "ABC"), 1) - - def test_substitution(self) -> None: - self.assertEqual(mirseq.levenshtein("ABC", "AXC"), 1) - - def test_empty_vs_nonempty(self) -> None: - self.assertEqual(mirseq.levenshtein("", "ABC"), 3) - self.assertEqual(mirseq.levenshtein("ABC", ""), 3) - - def test_both_empty(self) -> None: - self.assertEqual(mirseq.levenshtein("", ""), 0) - - def test_bytes_input(self) -> None: - self.assertEqual(mirseq.levenshtein(b"kitten", b"sitting"), 3) - - def test_wrapper(self) -> None: - self.assertEqual(levenshtein("kitten", "sitting"), 3) - - def test_symmetric(self) -> None: - self.assertEqual( - mirseq.levenshtein("CASSL", "CASSQL"), - mirseq.levenshtein("CASSQL", "CASSL")) - - if __name__ == "__main__": unittest.main() diff --git a/tests/test_mirseq_benchmark.py b/tests/test_mirseq_benchmark.py index 5e10b4d..783a39a 100644 --- a/tests/test_mirseq_benchmark.py +++ b/tests/test_mirseq_benchmark.py @@ -1,4 +1,4 @@ -"""Benchmarks: C 
(mirseq) vs pure-Python for key operations. +"""Benchmarks: C extensions vs pure-Python for key operations. Run with ``python -m pytest tests/test_mirseq_benchmark.py -v -s``. """ @@ -7,6 +7,7 @@ import unittest from mir.basic import mirseq +from mir.distances import seqdist_c from mir.basic.alphabets import ( AA_MASK, AA_TO_REDUCED_TABLE, @@ -118,7 +119,7 @@ def py_hamming(s1: str, s2: str) -> int: return sum(c1 != c2 for c1, c2 in zip(s1, s2)) py_t = _time_fn(py_hamming, a, b, n=n) - c_t = _time_fn(mirseq.hamming, a, b, n=n) + c_t = _time_fn(seqdist_c.hamming, a, b, n=n) self._report("hamming", py_t, c_t) # ── levenshtein ─────────────────────────────────────────────── @@ -140,7 +141,7 @@ def py_levenshtein(s1: str, s2: str) -> int: return prev[n_] py_t = _time_fn(py_levenshtein, a, b, n=n) - c_t = _time_fn(mirseq.levenshtein, a, b, n=n) + c_t = _time_fn(seqdist_c.levenshtein, a, b, n=n) self._report("levenshtein", py_t, c_t) diff --git a/tests/test_seqdist.py b/tests/test_seqdist.py new file mode 100644 index 0000000..e940000 --- /dev/null +++ b/tests/test_seqdist.py @@ -0,0 +1,80 @@ +"""Unit tests for the ``seqdist_c`` C extension and ``seqdist`` wrapper. + +Covers: Hamming distance, Levenshtein distance. + +Run with ``python -m pytest tests/test_seqdist.py -v``. 
+""" + +import unittest + +from mir.distances import seqdist_c +from mir.distances.seqdist import hamming, levenshtein + + +# ── Hamming distance ────────────────────────────────────────────── + +class TestHamming(unittest.TestCase): + + def test_identical(self) -> None: + self.assertEqual(seqdist_c.hamming("CAST", "CAST"), 0) + + def test_one_mismatch(self) -> None: + self.assertEqual(seqdist_c.hamming("CAST", "CAAT"), 1) + + def test_all_mismatch(self) -> None: + self.assertEqual(seqdist_c.hamming("AAAA", "TTTT"), 4) + + def test_empty(self) -> None: + self.assertEqual(seqdist_c.hamming("", ""), 0) + + def test_length_mismatch_raises(self) -> None: + with self.assertRaises(Exception): + seqdist_c.hamming("ABC", "AB") + + def test_bytes_input(self) -> None: + self.assertEqual(seqdist_c.hamming(b"CAST", b"CAAT"), 1) + + def test_wrapper(self) -> None: + self.assertEqual(hamming("CAST", "CAAT"), 1) + + +# ── Levenshtein distance ───────────────────────────────────────── + +class TestLevenshtein(unittest.TestCase): + + def test_classic(self) -> None: + self.assertEqual(seqdist_c.levenshtein("kitten", "sitting"), 3) + + def test_identical(self) -> None: + self.assertEqual(seqdist_c.levenshtein("CAST", "CAST"), 0) + + def test_insertion(self) -> None: + self.assertEqual(seqdist_c.levenshtein("ABC", "ABCD"), 1) + + def test_deletion(self) -> None: + self.assertEqual(seqdist_c.levenshtein("ABCD", "ABC"), 1) + + def test_substitution(self) -> None: + self.assertEqual(seqdist_c.levenshtein("ABC", "AXC"), 1) + + def test_empty_vs_nonempty(self) -> None: + self.assertEqual(seqdist_c.levenshtein("", "ABC"), 3) + self.assertEqual(seqdist_c.levenshtein("ABC", ""), 3) + + def test_both_empty(self) -> None: + self.assertEqual(seqdist_c.levenshtein("", ""), 0) + + def test_bytes_input(self) -> None: + self.assertEqual(seqdist_c.levenshtein(b"kitten", b"sitting"), 3) + + def test_wrapper(self) -> None: + self.assertEqual(levenshtein("kitten", "sitting"), 3) + + def 
test_symmetric(self) -> None: + self.assertEqual( + seqdist_c.levenshtein("CASSL", "CASSQL"), + seqdist_c.levenshtein("CASSQL", "CASSL")) + + +if __name__ == "__main__": + unittest.main() From 845ec00f3d11d0c59fac80067c836ac7e0161ca4 Mon Sep 17 00:00:00 2001 From: mikessh Date: Sat, 11 Apr 2026 19:35:24 +0300 Subject: [PATCH 12/24] move cdrscore to seqdist, tests for aligner --- mir/distances/aligner.py | 99 ++++++-- mir/distances/seqdist.cpp | 115 ++++++++- tests/test_aligner.py | 516 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 700 insertions(+), 30 deletions(-) create mode 100644 tests/test_aligner.py diff --git a/mir/distances/aligner.py b/mir/distances/aligner.py index 518e64e..b2801de 100644 --- a/mir/distances/aligner.py +++ b/mir/distances/aligner.py @@ -1,3 +1,20 @@ +"""CDR3, germline and clonotype alignment scoring. + +This module provides scoring classes for comparing TCR/BCR sequences: + +* :class:`CDRAligner` — CDR3 amino-acid alignment with gap model and + BLOSUM62 substitution scoring. Delegates to the C extension + ``seqdist_c`` (``score_max`` / ``selfscore``) when available, with a + pure-Python fallback. +* :class:`BioAlignerWrapper` — thin wrapper around BioPython's + ``PairwiseAligner``. +* :class:`GermlineAligner` — dict-based germline gene scoring built from + pairwise sequence alignment. +* :class:`ClonotypeAligner` — composite scorer combining V/J germline + aligners with a CDR3 aligner. +* :class:`ClonotypeScore` / :class:`PairedCloneScore` — score containers. 
+""" + import time from abc import abstractmethod from itertools import starmap @@ -6,30 +23,45 @@ from Bio.Align import substitution_matrices import typing as t -import importlib import numpy as np from functools import lru_cache from mir.common.clonotype import ClonotypeAA, PairedChainClone from mir.common.segments import Segment, SegmentLibrary -_cdrscore_mod = None -def _get_cdrscore(): - global _cdrscore_mod - if _cdrscore_mod is None: - _cdrscore_mod = importlib.import_module('mir.distances.cdrscore') - return _cdrscore_mod +# --------------------------------------------------------------------------- +# Lazy-load C acceleration from seqdist_c (score_max, selfscore) +# --------------------------------------------------------------------------- +_seqdist_mod = None + +def _get_seqdist(): + """Return the ``seqdist_c`` C module, or *None* if unavailable.""" + global _seqdist_mod + if _seqdist_mod is None: + try: + from mir.distances import seqdist_c as _mod + # Verify the CDR3 scoring functions are present + if hasattr(_mod, 'score_max') and hasattr(_mod, 'selfscore'): + _seqdist_mod = _mod + except ImportError: + pass + return _seqdist_mod class Scoring: + """Abstract base for pairwise sequence scoring.""" + @abstractmethod def score(self, s1: str, s2: str) -> float: - pass + """Raw alignment score between *s1* and *s2*.""" def score_norm(self, s1: str, s2: str) -> float: + """Normalised score: ``score(s1,s2) - max(score(s1,s1), score(s2,s2))``.""" return self.score(s1, s2) - max(self.score(s1, s1), self.score(s2, s2)) class BioAlignerWrapper(Scoring): + """Wrapper around :class:`Bio.Align.PairwiseAligner`.""" + def __init__(self, scoring: str = "blastp"): self.aligner = Align.PairwiseAligner(scoring) @@ -37,21 +69,32 @@ def score(self, s1, s2) -> float: return self.aligner.align(s1, s2).score -# TODO substitution matrix wrapper to load from dict class CDRAligner(Scoring): - _factor = 10.0 - _cdr_mod = None + """CDR3 amino-acid aligner with a simplified gap 
model. + + Scores are computed over the interior of the CDR3 (skipping + *v_offset* positions from the start and *j_offset* from the end) + using a substitution matrix (BLOSUM62 by default). When sequences + differ in length the shorter sequence is padded with a gap block + placed at each of the *gap_positions* and the best score is kept. + + The heavy lifting is done in C (``seqdist_c.score_max`` / + ``seqdist_c.selfscore``) when available; a pure-Python fallback + is used otherwise. + + Parameters + ---------- + gap_positions : iterable of int + Candidate gap-insertion positions (negative = from end). + mat : substitution_matrices.Array or None + Amino-acid substitution matrix (e.g. BLOSUM62). + gap_penalty : float + Per-position gap penalty (typically negative). + v_offset, j_offset : int + Number of positions to skip at the V/J ends. + """ - @staticmethod - def _get_cdrscore(): - if CDRAligner._cdr_mod is not None: - return CDRAligner._cdr_mod - try: - import importlib - CDRAligner._cdr_mod = importlib.import_module('mir.distances.cdrscore') - except Exception: - CDRAligner._cdr_mod = None - return CDRAligner._cdr_mod + _factor = 10.0 def __init__(self, gap_positions: t.Iterable[int] = (3, 4, -4, -3), @@ -177,7 +220,7 @@ def _selfscore_cached(self, s: str) -> float: val = self._self_cache.get(s) if val is not None: return val - cdr = self._get_cdrscore() + cdr = _get_seqdist() if cdr is not None: val = cdr.selfscore(s, self._mat256, self._factor, self._use_mat) else: @@ -195,7 +238,7 @@ def _selfscore_cached(self, s: str) -> float: return val def score(self, s1, s2) -> float: - cdr = self._get_cdrscore() + cdr = _get_seqdist() if cdr is not None: return cdr.score_max( s1, s2, @@ -241,7 +284,7 @@ def pad(self, s1, s2) -> tuple[tuple[str, str]]: return tuple(res) def alns(self, s1, s2) -> tuple[tuple[str, str, float]]: - cdr = self._get_cdrscore() + cdr = _get_seqdist() if len(s1) == len(s2): if cdr is not None: sc = cdr.score_max( @@ -274,6 +317,8 @@ def 
__call__(self, gs1: tuple[str, str], gs2: tuple[str, str]): class GermlineAligner: + """Dict-based gene-level aligner built from pre-computed pairwise scores.""" + def __init__(self, dist: dict[tuple[str, str], float]): self.dist = dist self.dist.update(dict(((g2, g1), score) for ((g1, g2), score) in dist.items())) @@ -313,6 +358,8 @@ def from_seqs(cls, class ClonotypeScore: + """Container for V / J / CDR3 component scores.""" + __scores__ = ['v_score', 'j_score', 'cdr3_score'] def __init__(self, v_score: float, j_score: float, cdr3_score: float): @@ -330,6 +377,8 @@ def get_flatten_score(self): class PairedCloneScore: + """Score container for paired alpha/beta chain clonotypes.""" + def __init__(self, alpha_chain_score: ClonotypeScore, beta_chain_score: ClonotypeScore): self.alpha_chain_score = alpha_chain_score self.beta_chain_score = beta_chain_score @@ -340,6 +389,8 @@ def get_flatten_score(self): class ClonotypeAligner: + """Composite aligner combining V gene, J gene, and CDR3 scoring.""" + def __init__(self, v_aligner: GermlineAligner, j_aligner: GermlineAligner, diff --git a/mir/distances/seqdist.cpp b/mir/distances/seqdist.cpp index 8220bda..6b590f8 100644 --- a/mir/distances/seqdist.cpp +++ b/mir/distances/seqdist.cpp @@ -1,14 +1,19 @@ /* - * seqdist — C-native sequence distance functions. + * seqdist — C-native sequence distance and scoring functions. * - * Compiled as a pybind11 module. Accepts Python str, bytes, or bytearray. + * Compiled as pybind11 module ``seqdist_c``. Accepts Python str, bytes, or bytearray. 
* - * Functions: - * hamming(a, b) → int Hamming distance (equal-length sequences) - * levenshtein(a, b) → int Levenshtein (edit) distance + * Distance functions: + * hamming(a, b) → int Hamming distance (equal-length sequences) + * levenshtein(a, b) → int Levenshtein (edit) distance + * + * CDR3 alignment scoring (from former cdrscore module): + * score_max(s1, s2, mat256, gaps, gap_pen, v_off, j_off, factor, use_mat) → double + * selfscore(s, mat256, factor, use_mat) → double */ #include +#include #include #include #include @@ -89,12 +94,106 @@ static int c_levenshtein(const py::object& a, const py::object& b) { return prev[m]; } +/* ================================================================ + * CDR3 alignment scoring (merged from cdrscore) + * ================================================================ */ + +static inline int norm_pos(int p, int m) { + if (p >= 0) return p > m ? m : p; + int q = m + p; + return q < 0 ? 0 : q; +} + +static inline double seg_equal(const char* s1, const char* s2, int start, int end, + const double* mat, bool use_mat) { + double x = 0.0; + for (int i = start; i < end; ++i) { + unsigned char c1 = (unsigned char)s1[i], c2 = (unsigned char)s2[i]; + x += use_mat ? mat[(size_t)c1 * 256 + c2] : (c1 == c2 ? 0.0 : 1.0); + } + return x; +} + +static inline double score_with_gap(const char* s1, int n1, const char* s2, int n2, + int p_raw, int start, int end, + double gap_pen, const double* mat, bool use_mat) { + if (n1 == n2) return seg_equal(s1, s2, start, end, mat, use_mat); + + if (n1 < n2) { + int gap_len = n2 - n1, p = norm_pos(p_raw, n1); + int g0 = std::max(start, p), g1 = std::min(end, p + gap_len); + double x = 0.0; + x += seg_equal(s1, s2, start, g0, mat, use_mat); + if (g1 > g0) x += (g1 - g0) * gap_pen; + for (int i = g1; i < end; ++i) { + int j = i - gap_len; + unsigned char c1 = (unsigned char)s1[j], c2 = (unsigned char)s2[i]; + x += use_mat ? mat[(size_t)c1 * 256 + c2] : (c1 == c2 ? 
0.0 : 1.0); + } + return x; + } else { + int gap_len = n1 - n2, p = norm_pos(p_raw, n2); + int g0 = std::max(start, p), g1 = std::min(end, p + gap_len); + double x = 0.0; + x += seg_equal(s1, s2, start, g0, mat, use_mat); + if (g1 > g0) x += (g1 - g0) * gap_pen; + for (int i = g1; i < end; ++i) { + int j = i - gap_len; + unsigned char c1 = (unsigned char)s1[i], c2 = (unsigned char)s2[j]; + x += use_mat ? mat[(size_t)c1 * 256 + c2] : (c1 == c2 ? 0.0 : 1.0); + } + return x; + } +} + +static double c_score_max(const std::string& s1, const std::string& s2, + py::array_t mat256, + py::array_t gaps, + double gap_pen, int v_off, int j_off, double factor, bool use_mat) { + const double* mat = nullptr; + if (use_mat) { + auto mbuf = mat256.request(); + if (mbuf.ndim != 2 || mbuf.shape[0] != 256 || mbuf.shape[1] != 256) + throw std::runtime_error("mat must be 256x256"); + mat = static_cast(mbuf.ptr); + } + auto gb = gaps.request(); + const int* gp = static_cast(gb.ptr); + int ng = (int)gb.shape[0]; + + int L = std::max((int)s1.size(), (int)s2.size()); + int start = v_off, end = L - j_off; + if (end <= start) return 0.0; + + double best = -1e300; + { + py::gil_scoped_release release; + for (int k = 0; k < ng; ++k) { + double sc = score_with_gap(s1.data(), (int)s1.size(), s2.data(), (int)s2.size(), + gp[k], start, end, gap_pen, mat, use_mat); + if (sc > best) best = sc; + } + } + return factor * best; +} + +static double c_selfscore(const std::string& s, + py::array_t mat256, + double factor, bool use_mat) { + if (!use_mat) return 0.0; + auto mbuf = mat256.request(); + const double* mat = static_cast(mbuf.ptr); + double x = 0.0; + for (unsigned char c : s) x += mat[(size_t)c * 256 + c]; + return factor * x; +} + /* ================================================================ * Module definition * ================================================================ */ PYBIND11_MODULE(seqdist_c, m) { - m.doc() = "C-native sequence distance functions (Hamming, Levenshtein)"; + 
m.doc() = "C-native sequence distance and CDR3 scoring functions"; m.def("hamming", &c_hamming, py::arg("a"), py::arg("b"), @@ -102,4 +201,8 @@ PYBIND11_MODULE(seqdist_c, m) { m.def("levenshtein", &c_levenshtein, py::arg("a"), py::arg("b"), "Levenshtein (edit) distance between two sequences"); + m.def("score_max", &c_score_max, + "Best CDR3 alignment score over a set of gap positions"); + m.def("selfscore", &c_selfscore, + "Self-alignment score (diagonal of substitution matrix)"); } diff --git a/tests/test_aligner.py b/tests/test_aligner.py new file mode 100644 index 0000000..0003033 --- /dev/null +++ b/tests/test_aligner.py @@ -0,0 +1,516 @@ +"""Correctness and speed benchmark tests for CDR3 / clonotype alignment. + +CDR3 sequences generated from human TRB using OLGA +(Sethna et al., 2019, *Bioinformatics*). +""" + +import time +import math +import pytest +import numpy as np +from Bio import Align +from Bio.Align import substitution_matrices + +from mir.distances.aligner import ( + CDRAligner, + BioAlignerWrapper, + Scoring, + GermlineAligner, + ClonotypeScore, + PairedCloneScore, +) + +# --------------------------------------------------------------------------- +# OLGA-generated human TRB CDR3 amino-acid sequences (50 sequences) +# --------------------------------------------------------------------------- +OLGA_CDR3S = [ + "CASSLETGEACNQPQHF", + "CATSGHRDRQVQPQHF", + "CASSLGRDRGMNTEAFF", + "CASSRGNTIYF", + "CASSRGPQGPYRYGYTF", + "CATSDLEGQGDKNTEAFF", + "CASSRGTSRYPYEQYF", + "CASSCHEVGTQHF", + "CASSEGDRDEQYF", + "CASIPGTSGSSTDTQYF", + "CASSGKGLAGGSLENEQYF", + "CASSRGHGNTIYF", + "CASSWSKGITGELFF", + "CASSPPIQGSGDWRIMVGNYEQYF", + "CARGSRGGFSGANVLTF", + "CELQQETQYF", + "CASNRRGRDEAFF", + "CASSKRGQGVLYGYTF", + "CASSQVVQGAIETQYF", + "CASSGCNRGYSNQPQHF", + "CASSQVTESPDYEQYF", + "CSGGTGTEAFF", + "CASSYKVGSYGYTF", + "CASSGPGCHAGELFF", + "CASSPPPIGTLTDTQYF", + "CASSTGPTLGNQPQHF", + "CAWSGRGGRANAEKLFF", + "CASSTRGINEKLFF", + "CASSFWGHRGVEKLFF", + "CASSYRGGRSYNSPLHF", + 
"CASSGKRSCTTEAFF", + "CASKTGRTGELFF", + "CASSGVLAKNIQYF", + "CASSSGKRNYGYTF", + "CASSYLYTAKNIQYF", + "CSDGTAYNEQFF", + "CASSQVIPGQAYEGRAGAFF", + "CASSERRQFGPRYEQYF", + "CASTESGTSGGATGNVSSYEQYF", + "CASFVRRSESYEQYF", + "CASSFRNEQYF", + "CASSPRTGPDQHF", + "CASCLQGEESQHNEQFF", + "CASRLGTRTGGTGANVLTF", + "CASGLSLAVVSDEQFF", + "CNIYIPGLAGGQAFHRGYEQYF", + "CASSKLGPEASTDTQYF", + "CASTFPSILGGGTMLTDTQYF", + "CAIRAGQGFRVAKNIQYF", + "CSVFPRAFRMNTEAFF", +] + + +# =================================================================== +# Correctness tests +# =================================================================== + + +class TestCDRAlignerBasic: + """Unit tests for CDRAligner scoring logic.""" + + def setup_method(self): + self.aligner = CDRAligner() + + # -- Self-score properties ------------------------------------------- + + def test_self_score_positive(self): + """Self-score with BLOSUM62 must be positive for valid AA seqs.""" + for s in OLGA_CDR3S[:20]: + sc = self.aligner.score(s, s) + assert sc > 0, f"self-score for {s} should be > 0, got {sc}" + + def test_score_norm_self_is_nonpositive(self): + """Normalised self-score must be <= 0. + + Note: CDRAligner.score_norm uses ``_selfscore_cached`` (full-length + diagonal) while ``score(s, s)`` only covers [v_offset..L-j_offset), + so score_norm(s, s) is typically negative, not zero. 
+ """ + for s in OLGA_CDR3S[:10]: + sn = self.aligner.score_norm(s, s) + assert sn <= 1e-9, f"norm(self) for {s} = {sn}" + + def test_score_dist_self_is_zero(self): + """Distance to self must be zero.""" + for s in OLGA_CDR3S[:10]: + d = self.aligner.score_dist(s, s) + assert d == pytest.approx(0.0, abs=1e-9) + + # -- Symmetry -------------------------------------------------------- + + def test_score_symmetry(self): + """score(a,b) == score(b,a).""" + pairs = list(zip(OLGA_CDR3S[:15], OLGA_CDR3S[15:30])) + for a, b in pairs: + assert self.aligner.score(a, b) == pytest.approx( + self.aligner.score(b, a), abs=1e-9 + ), f"asymmetric score for {a}, {b}" + + def test_score_norm_symmetry(self): + for a, b in zip(OLGA_CDR3S[:10], OLGA_CDR3S[10:20]): + assert self.aligner.score_norm(a, b) == pytest.approx( + self.aligner.score_norm(b, a), abs=1e-9 + ) + + # -- Normalised score is non-positive -------------------------------- + + def test_score_norm_nonpositive(self): + """Normalised score must be <= 0 (similarity relative to self).""" + for a, b in zip(OLGA_CDR3S[:15], OLGA_CDR3S[15:30]): + sn = self.aligner.score_norm(a, b) + assert sn <= 1e-9, f"score_norm({a},{b}) = {sn} > 0" + + # -- Equal-length alignment ------------------------------------------ + + def test_equal_length_no_gap(self): + """Equal-length sequences should not need gaps.""" + s1 = "CASSLETGE" + s2 = "CASSRGTGE" + a = CDRAligner(gap_positions=(3,), v_offset=3, j_offset=3) + pads = a.pad(s1, s2) + assert pads == ((s1, s2),) + + def test_equal_length_deterministic(self): + """Same input → same output.""" + s1, s2 = OLGA_CDR3S[0], OLGA_CDR3S[1] + sc1 = self.aligner.score(s1, s2) + sc2 = self.aligner.score(s1, s2) + assert sc1 == sc2 + + # -- Gap padding ----------------------------------------------------- + + def test_pad_length_consistency(self): + """Padded sequences must have equal length (= max of originals).""" + s1 = OLGA_CDR3S[0] + s2 = OLGA_CDR3S[3] # different length + for p1, p2 in 
self.aligner.pad(s1, s2): + assert len(p1) == len(p2), f"pad mismatch: {len(p1)} vs {len(p2)}" + + def test_alns_returns_score_per_gap(self): + """alns() returns one (s1pad, s2pad, score) per gap position.""" + s1, s2 = OLGA_CDR3S[0], OLGA_CDR3S[3] + result = self.aligner.alns(s1, s2) + assert len(result) == len(self.aligner.gap_positions) + for padded1, padded2, sc in result: + assert isinstance(sc, float) + assert len(padded1) == len(padded2) + + # -- score_dist triangle inequality ---------------------------------- + + def test_triangle_inequality(self): + """score_dist should satisfy the triangle inequality for simple cases.""" + seqs = OLGA_CDR3S[:5] + d = {} + for i, si in enumerate(seqs): + for j, sj in enumerate(seqs): + d[(i, j)] = self.aligner.score_dist(si, sj) + + for i in range(len(seqs)): + for j in range(len(seqs)): + for k in range(len(seqs)): + # Note: score_dist isn't a proper metric for gapped seqs, + # but for equal-length it should hold. + if len(seqs[i]) == len(seqs[j]) == len(seqs[k]): + assert d[(i, k)] <= d[(i, j)] + d[(j, k)] + 1e-6 + + # -- Identity matrix scoring ----------------------------------------- + + def test_identity_matrix_equal_len(self): + """With identity matrix (no BLOSUM), equal-length mismatch count.""" + a = CDRAligner(mat=None, gap_penalty=0, v_offset=0, j_offset=0) + s1 = "CASSLETGE" + s2 = "CASSRXTGE" + # mat=None → mismatch=1.0, match=0.0 + expected = sum(1.0 for c1, c2 in zip(s1, s2) if c1 != c2) * CDRAligner._factor + assert a.score(s1, s2) == pytest.approx(expected) + + +# =================================================================== +# C vs Python fallback consistency +# =================================================================== + + +class TestCvsPythonFallback: + """Verify that C-accelerated and pure-Python paths give identical results.""" + + def setup_method(self): + self.aligner = CDRAligner() + + def _py_score(self, s1, s2): + """Force the pure-Python path.""" + if len(s1) == len(s2): + return 
self.aligner._score_equal_len_py(s1, s2) + best = -math.inf + for p in self.aligner.gap_positions: + sc = self.aligner._score_with_gap_py(s1, s2, int(p)) + if sc > best: + best = sc + return best + + def _py_selfscore(self, s): + """Force the pure-Python selfscore path.""" + if self.aligner.mat is None: + return 0.0 + x = 0.0 + m = self.aligner.mat + for c in s: + x += m[c, c] + return self.aligner._factor * x + + def test_equal_len_c_vs_py(self): + same_len_pairs = [ + (a, b) for a, b in zip(OLGA_CDR3S, OLGA_CDR3S[1:]) + if len(a) == len(b) + ] + if not same_len_pairs: + # Make some equal-length pairs by truncation + same_len_pairs = [(s[:10], s[:10].replace('S', 'T', 1)) for s in OLGA_CDR3S[:5]] + for a, b in same_len_pairs: + c_score = self.aligner.score(a, b) + py_score = self._py_score(a, b) + assert c_score == pytest.approx(py_score, abs=1e-6), \ + f"C vs Py mismatch for same-len {a}, {b}: {c_score} vs {py_score}" + + def test_diff_len_c_vs_py(self): + diff_len_pairs = [ + (a, b) for a, b in zip(OLGA_CDR3S, OLGA_CDR3S[1:]) + if len(a) != len(b) + ] + for a, b in diff_len_pairs[:15]: + c_score = self.aligner.score(a, b) + py_score = self._py_score(a, b) + assert c_score == pytest.approx(py_score, abs=1e-6), \ + f"C vs Py mismatch for diff-len {a}, {b}: {c_score} vs {py_score}" + + def test_selfscore_c_vs_py(self): + for s in OLGA_CDR3S[:20]: + from mir.distances.aligner import _get_seqdist + cdr = _get_seqdist() + if cdr is None: + pytest.skip("C extension not available") + c_val = cdr.selfscore(s, self.aligner._mat256, self.aligner._factor, self.aligner._use_mat) + py_val = self._py_selfscore(s) + assert c_val == pytest.approx(py_val, abs=1e-6), \ + f"selfscore mismatch for {s}: {c_val} vs {py_val}" + + +# =================================================================== +# BioPython cross-check +# =================================================================== + + +class TestBioPythonCrossCheck: + """Cross-check CDRAligner against BioPython 
PairwiseAligner. + + For *equal-length* ungapped alignment with the same substitution + matrix and no gap penalty the scores should agree (up to the offset + trimming and the factor scaling that CDRAligner applies). + """ + + def test_ungapped_equal_length_vs_biopython(self): + """Compare BLOSUM62 scores for equal-length CDR3 pairs. + + CDRAligner computes score over positions [v_offset .. L-j_offset), + scaled by _factor. When v_offset = j_offset = 0 and sequences + have the same length (so no gap is needed), the raw score should + equal BioPython's *ungapped* global alignment score × _factor. + """ + mat = substitution_matrices.load("BLOSUM62") + # CDRAligner with zero offsets to compare full-length + cdr = CDRAligner(mat=mat, gap_penalty=-1000.0, v_offset=0, j_offset=0) + + bio = Align.PairwiseAligner() + bio.mode = "global" + bio.substitution_matrix = mat + bio.open_gap_score = -1000.0 # effectively disable gaps + bio.extend_gap_score = -1000.0 + + # Gather equal-length pairs from OLGA set + pairs = [] + for i, s1 in enumerate(OLGA_CDR3S): + for s2 in OLGA_CDR3S[i + 1 :]: + if len(s1) == len(s2): + pairs.append((s1, s2)) + if len(pairs) >= 20: + break + if len(pairs) >= 20: + break + + # Also create some by trimming to a common length + for s1, s2 in zip(OLGA_CDR3S[:10], OLGA_CDR3S[10:20]): + L = min(len(s1), len(s2)) + pairs.append((s1[:L], s2[:L])) + + assert len(pairs) > 0, "No pairs generated" + + for s1, s2 in pairs: + cdr_score = cdr.score(s1, s2) / CDRAligner._factor + bio_score = bio.align(s1, s2).score + assert cdr_score == pytest.approx(bio_score, abs=1e-4), ( + f"CDRAligner vs BioPython mismatch for " + f"{s1}/{s2}: {cdr_score} vs {bio_score}" + ) + + def test_ungapped_selfscore_vs_biopython(self): + """Self-score from CDRAligner must match BioPython diagonal sum.""" + mat = substitution_matrices.load("BLOSUM62") + cdr = CDRAligner(mat=mat, v_offset=0, j_offset=0) + + bio = Align.PairwiseAligner() + bio.mode = "global" + bio.substitution_matrix = mat 
+ bio.open_gap_score = -1000.0 + bio.extend_gap_score = -1000.0 + + for s in OLGA_CDR3S[:15]: + cdr_ss = cdr.score(s, s) / CDRAligner._factor + bio_ss = bio.align(s, s).score + assert cdr_ss == pytest.approx(bio_ss, abs=1e-4), \ + f"self-score mismatch for {s}: {cdr_ss} vs {bio_ss}" + + +# =================================================================== +# Backward compatibility +# =================================================================== + + +class TestBackwardCompat: + """Ensure public API has not changed.""" + + def test_scoring_abc(self): + assert hasattr(Scoring, "score") + assert hasattr(Scoring, "score_norm") + + def test_cdraligner_interface(self): + a = CDRAligner() + assert callable(a.score) + assert callable(a.score_norm) + assert callable(a.score_dist) + assert callable(a.pad) + assert callable(a.alns) + + def test_bioaligner_wrapper(self): + w = BioAlignerWrapper() + sc = w.score("CASS", "CASS") + assert isinstance(sc, float) + + def test_clonotype_score_attrs(self): + cs = ClonotypeScore(1.0, 2.0, 3.0) + assert cs.v_score == 1.0 + assert cs.j_score == 2.0 + assert cs.cdr3_score == 3.0 + assert cs.get_flatten_score() == [1.0, 2.0, 3.0] + + def test_paired_clone_score(self): + a = ClonotypeScore(1, 2, 3) + b = ClonotypeScore(4, 5, 6) + p = PairedCloneScore(a, b) + assert p.get_flatten_score() == [1, 2, 3, 4, 5, 6] + + def test_distances_init_exports(self): + from mir.distances import GermlineAligner, ClonotypeAligner, ClonotypeScore + assert GermlineAligner is not None + assert ClonotypeAligner is not None + assert ClonotypeScore is not None + + def test_seqdist_c_has_all_functions(self): + """The merged seqdist_c module must expose all original functions.""" + from mir.distances import seqdist_c + assert hasattr(seqdist_c, "hamming") + assert hasattr(seqdist_c, "levenshtein") + assert hasattr(seqdist_c, "score_max") + assert hasattr(seqdist_c, "selfscore") + + +# =================================================================== +# Speed 
benchmarks (pytest-benchmark style, manual timing) +# =================================================================== + + +class TestAlignmentBenchmarks: + """Speed benchmarks for CDR3 alignment. + + Compares C-accelerated scoring against the Python fallback and + BioPython's PairwiseAligner. Results are printed to stdout — run + with ``pytest -s`` to see them. + """ + + N_PAIRS = 200 # number of pairs to score in each benchmark + + def _make_pairs(self): + pairs = [] + n = len(OLGA_CDR3S) + for i in range(self.N_PAIRS): + a = OLGA_CDR3S[i % n] + b = OLGA_CDR3S[(i * 7 + 3) % n] + pairs.append((a, b)) + return pairs + + def test_benchmark_c_scoring(self): + aligner = CDRAligner() + pairs = self._make_pairs() + + # Warm up + for a, b in pairs[:5]: + aligner.score(a, b) + + t0 = time.perf_counter() + for a, b in pairs: + aligner.score(a, b) + elapsed = time.perf_counter() - t0 + + rate = self.N_PAIRS / elapsed + print(f"\n CDRAligner C : {self.N_PAIRS} pairs in {elapsed*1000:.1f} ms " + f"({rate:.0f} pairs/s)") + + def test_benchmark_python_fallback(self): + aligner = CDRAligner() + pairs = self._make_pairs() + + def py_score(s1, s2): + if len(s1) == len(s2): + return aligner._score_equal_len_py(s1, s2) + best = -math.inf + for p in aligner.gap_positions: + sc = aligner._score_with_gap_py(s1, s2, int(p)) + if sc > best: + best = sc + return best + + # Warm up + for a, b in pairs[:5]: + py_score(a, b) + + t0 = time.perf_counter() + for a, b in pairs: + py_score(a, b) + elapsed = time.perf_counter() - t0 + + rate = self.N_PAIRS / elapsed + print(f"\n CDRAligner Py : {self.N_PAIRS} pairs in {elapsed*1000:.1f} ms " + f"({rate:.0f} pairs/s)") + + def test_benchmark_biopython(self): + bio = BioAlignerWrapper() + pairs = self._make_pairs() + + # Warm up + for a, b in pairs[:5]: + bio.score(a, b) + + t0 = time.perf_counter() + for a, b in pairs: + bio.score(a, b) + elapsed = time.perf_counter() - t0 + + rate = self.N_PAIRS / elapsed + print(f"\n BioPython : {self.N_PAIRS} 
pairs in {elapsed*1000:.1f} ms " + f"({rate:.0f} pairs/s)") + + def test_c_faster_than_python(self): + """C extension should be significantly faster than Python fallback.""" + aligner = CDRAligner() + pairs = self._make_pairs() + + def py_score(s1, s2): + if len(s1) == len(s2): + return aligner._score_equal_len_py(s1, s2) + best = -math.inf + for p in aligner.gap_positions: + sc = aligner._score_with_gap_py(s1, s2, int(p)) + if sc > best: + best = sc + return best + + # Time C path + t0 = time.perf_counter() + for a, b in pairs: + aligner.score(a, b) + t_c = time.perf_counter() - t0 + + # Time Python path + t0 = time.perf_counter() + for a, b in pairs: + py_score(a, b) + t_py = time.perf_counter() - t0 + + speedup = t_py / t_c if t_c > 0 else float("inf") + print(f"\n C/Py speedup : {speedup:.1f}x") + assert speedup > 2.0, f"C extension only {speedup:.1f}x faster than Python" From 17a79e46112a80781da6c9b98a7296653b3eb26a Mon Sep 17 00:00:00 2001 From: mikessh Date: Sat, 11 Apr 2026 19:35:37 +0300 Subject: [PATCH 13/24] 2prev --- CMakeLists.txt | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e660a05..41c0f15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,20 +4,7 @@ project(mir_native LANGUAGES CXX) find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) find_package(pybind11 CONFIG REQUIRED) -# --- cdrscore (mir.distances) --- -pybind11_add_module(cdrscore MODULE mir/distances/cdrscore.cpp) -target_compile_features(cdrscore PRIVATE cxx_std_17) -if (MSVC) - target_compile_options(cdrscore PRIVATE /O2 /DNOMINMAX) -else() - target_compile_options(cdrscore PRIVATE -O3) -endif() -set_target_properties(cdrscore PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/mir/distances" -) -install(TARGETS cdrscore LIBRARY DESTINATION mir/distances) - -# --- seqdist_c (mir.distances) --- +# --- seqdist_c (mir.distances) — distances + CDR3 scoring --- 
pybind11_add_module(seqdist_c MODULE mir/distances/seqdist.cpp) target_compile_features(seqdist_c PRIVATE cxx_std_17) if (MSVC) From 235af3628cf3cfd7731ee5e5a8cb9ba8ac6101fc Mon Sep 17 00:00:00 2001 From: mikessh Date: Sat, 11 Apr 2026 19:53:50 +0300 Subject: [PATCH 14/24] finished alignment refactoring for now --- mir/distances/aligner.py | 110 ++++++++++++++++++- mir/distances/cdrscore.cpp | 97 ----------------- mir/distances/seqdist.cpp | 165 +++++++++++++++++++++++++--- tests/test_aligner.py | 218 ++++++++++++++++++++++++++++++++++++- 4 files changed, 471 insertions(+), 119 deletions(-) delete mode 100644 mir/distances/cdrscore.cpp diff --git a/mir/distances/aligner.py b/mir/distances/aligner.py index b2801de..2694a4e 100644 --- a/mir/distances/aligner.py +++ b/mir/distances/aligner.py @@ -4,8 +4,18 @@ * :class:`CDRAligner` — CDR3 amino-acid alignment with gap model and BLOSUM62 substitution scoring. Delegates to the C extension - ``seqdist_c`` (``score_max`` / ``selfscore``) when available, with a - pure-Python fallback. + ``seqdist_c`` when available, with a pure-Python fallback. + + Key methods: + + - :meth:`~CDRAligner.score` — best alignment score across gap positions. + - :meth:`~CDRAligner.score_norm` / :meth:`~CDRAligner.score_dist` — + normalised / distance variants. + - :meth:`~CDRAligner.align` — best alignment with visualization strings: + gapped sequences and a midline showing matches (``|``), conservative + substitutions (``:``) and mismatches (``.``). Implemented in the C + extension for speed. + * :class:`BioAlignerWrapper` — thin wrapper around BioPython's ``PairwiseAligner``. * :class:`GermlineAligner` — dict-based germline gene scoring built from @@ -13,6 +23,26 @@ * :class:`ClonotypeAligner` — composite scorer combining V/J germline aligners with a CDR3 aligner. * :class:`ClonotypeScore` / :class:`PairedCloneScore` — score containers. 
+ +Performance +----------- +Benchmarked on 200 pairs of OLGA-generated human TRB CDR3 sequences +(lengths 10–24 aa), measured on Apple M-series, single thread: + ++-----------------------+-----------+--------------+ +| Method | Time (ms) | Throughput | ++=======================+===========+==============+ +| CDRAligner (C) | 0.2 | ~1 M pairs/s | ++-----------------------+-----------+--------------+ +| BioPython PairwiseAl. | 0.7 | ~270 k p/s | ++-----------------------+-----------+--------------+ +| CDRAligner (Python) | 3.9 | ~50 k p/s | ++-----------------------+-----------+--------------+ + +The C extension is **~20× faster** than the pure-Python fallback and +**~4× faster** than BioPython for the simplified CDR3 gap model. +For ungapped equal-length alignment the CDRAligner C scores match +BioPython exactly (same BLOSUM62 matrix, verified in test suite). """ import time @@ -308,6 +338,82 @@ def alns(self, s1, s2) -> tuple[tuple[str, str, float]]: scores = tuple(self._score_with_gap_py(s1, s2, int(p)) for p in self.gap_positions) return tuple((sp1, sp2, sc) for (sp1, sp2), sc in zip(self.pad(s1, s2), scores)) + def align(self, s1: str, s2: str) -> tuple[str, str, str, float]: + """Best alignment with visualization strings. + + Finds the gap position that maximises the score and returns + three equal-length strings plus the score: + + * ``s1_gapped`` — first sequence with ``'-'`` at gap positions + * ``midline`` — ``'|'`` exact match, ``':'`` positive + substitution score, ``'.'`` non-positive, ``' '`` gap + * ``s2_gapped`` — second sequence with ``'-'`` at gap positions + * ``score`` — the best alignment score (scaled by ``_factor``) + + Uses the C extension (``seqdist_c.best_alignment``) when + available, otherwise falls back to pure Python. 
+ + Returns + ------- + tuple[str, str, str, float] + ``(s1_gapped, midline, s2_gapped, score)`` + """ + cdr = _get_seqdist() + if cdr is not None and hasattr(cdr, 'best_alignment'): + return cdr.best_alignment( + s1, s2, + self._mat256, + np.asarray(self.gap_positions, dtype=np.int32), + self.gap_penalty, self.v_offset, self.j_offset, + self._factor, self._use_mat + ) + return self._align_py(s1, s2) + + def _align_py(self, s1: str, s2: str) -> tuple[str, str, str, float]: + """Pure-Python fallback for :meth:`align`.""" + n1, n2 = len(s1), len(s2) + mat = self.mat + + def _mid(c1: str, c2: str) -> str: + if c1 == c2: + return '|' + if mat is not None and mat[c1, c2] > 0: + return ':' + return '.' + + if n1 == n2: + sc = self._score_equal_len_py(s1, s2) + mid = ''.join(_mid(a, b) for a, b in zip(s1, s2)) + return (s1, mid, s2, sc) + + # Find best gap position + best_sc = float('-inf') + best_p = int(self.gap_positions[0]) + for p in self.gap_positions: + sc = self._score_with_gap_py(s1, s2, int(p)) + if sc > best_sc: + best_sc = sc + best_p = int(p) + + if n1 < n2: + gap_len = n2 - n1 + k = self._norm_pos(best_p, n1) + gs1 = s1[:k] + '-' * gap_len + s1[k:] + gs2 = s2 + else: + gap_len = n1 - n2 + k = self._norm_pos(best_p, n2) + gs1 = s1 + gs2 = s2[:k] + '-' * gap_len + s2[k:] + + mid_chars = [] + for a, b in zip(gs1, gs2): + if a == '-' or b == '-': + mid_chars.append(' ') + else: + mid_chars.append(_mid(a, b)) + return (gs1, ''.join(mid_chars), gs2, best_sc) + class _Scoring_Wrapper: def __init__(self, scoring: Scoring): self.scoring = scoring diff --git a/mir/distances/cdrscore.cpp b/mir/distances/cdrscore.cpp deleted file mode 100644 index 9124507..0000000 --- a/mir/distances/cdrscore.cpp +++ /dev/null @@ -1,97 +0,0 @@ -#include -#include -#include -#include - -namespace py = pybind11; - -static inline int norm_pos(int p, int m) { if (p >= 0) return p > m ? m : p; int q = m + p; return q < 0 ? 
0 : q; } - -static inline double seg_equal(const char* s1, const char* s2, int start, int end, - const double* mat, bool use_mat) { - double x = 0.0; - for (int i = start; i < end; ++i) { - unsigned char c1 = (unsigned char)s1[i], c2 = (unsigned char)s2[i]; - x += use_mat ? mat[(size_t)c1 * 256 + c2] : (c1 == c2 ? 0.0 : 1.0); - } - return x; -} - -static inline double score_with_gap(const char* s1, int n1, const char* s2, int n2, - int p_raw, int start, int end, - double gap_pen, const double* mat, bool use_mat) { - if (n1 == n2) return seg_equal(s1, s2, start, end, mat, use_mat); - - if (n1 < n2) { - int gap_len = n2 - n1, p = norm_pos(p_raw, n1); - int g0 = std::max(start, p), g1 = std::min(end, p + gap_len); - double x = 0.0; - x += seg_equal(s1, s2, start, g0, mat, use_mat); - if (g1 > g0) x += (g1 - g0) * gap_pen; - for (int i = g1; i < end; ++i) { - int j = i - gap_len; - unsigned char c1 = (unsigned char)s1[j], c2 = (unsigned char)s2[i]; - x += use_mat ? mat[(size_t)c1 * 256 + c2] : (c1 == c2 ? 0.0 : 1.0); - } - return x; - } else { - int gap_len = n1 - n2, p = norm_pos(p_raw, n2); - int g0 = std::max(start, p), g1 = std::min(end, p + gap_len); - double x = 0.0; - x += seg_equal(s1, s2, start, g0, mat, use_mat); - if (g1 > g0) x += (g1 - g0) * gap_pen; - for (int i = g1; i < end; ++i) { - int j = i - gap_len; - unsigned char c1 = (unsigned char)s1[i], c2 = (unsigned char)s2[j]; - x += use_mat ? mat[(size_t)c1 * 256 + c2] : (c1 == c2 ? 
0.0 : 1.0); - } - return x; - } -} - -double score_max(const std::string& s1, const std::string& s2, - py::array_t mat256, - py::array_t gaps, - double gap_pen, int v_off, int j_off, double factor, bool use_mat) { - const double* mat = nullptr; - if (use_mat) { - auto mbuf = mat256.request(); - if (mbuf.ndim != 2 || mbuf.shape[0] != 256 || mbuf.shape[1] != 256) - throw std::runtime_error("mat must be 256x256"); - mat = static_cast(mbuf.ptr); - } - auto gb = gaps.request(); - const int* gp = static_cast(gb.ptr); - int ng = (int)gb.shape[0]; - - int L = std::max((int)s1.size(), (int)s2.size()); - int start = v_off, end = L - j_off; - if (end <= start) return 0.0; - - double best = -1e300; - { - py::gil_scoped_release release; - for (int k = 0; k < ng; ++k) { - double sc = score_with_gap(s1.data(), (int)s1.size(), s2.data(), (int)s2.size(), - gp[k], start, end, gap_pen, mat, use_mat); - if (sc > best) best = sc; - } - } - return factor * best; -} - -double selfscore(const std::string& s, - py::array_t mat256, - double factor, bool use_mat) { - if (!use_mat) return 0.0; - auto mbuf = mat256.request(); - const double* mat = static_cast(mbuf.ptr); - double x = 0.0; - for (unsigned char c : s) x += mat[(size_t)c * 256 + c]; - return factor * x; -} - -PYBIND11_MODULE(cdrscore, m) { - m.def("score_max", &score_max); - m.def("selfscore", &selfscore); -} diff --git a/mir/distances/seqdist.cpp b/mir/distances/seqdist.cpp index 6b590f8..5edf77a 100644 --- a/mir/distances/seqdist.cpp +++ b/mir/distances/seqdist.cpp @@ -7,13 +7,16 @@ * hamming(a, b) → int Hamming distance (equal-length sequences) * levenshtein(a, b) → int Levenshtein (edit) distance * - * CDR3 alignment scoring (from former cdrscore module): + * CDR3 alignment scoring: * score_max(s1, s2, mat256, gaps, gap_pen, v_off, j_off, factor, use_mat) → double * selfscore(s, mat256, factor, use_mat) → double + * best_alignment(s1, s2, mat256, gaps, gap_pen, v_off, j_off, factor, use_mat) + * → (s1_gapped, midline, 
s2_gapped, score) */ #include #include +#include #include #include #include @@ -95,7 +98,7 @@ static int c_levenshtein(const py::object& a, const py::object& b) { } /* ================================================================ - * CDR3 alignment scoring (merged from cdrscore) + * CDR3 alignment scoring * ================================================================ */ static inline int norm_pos(int p, int m) { @@ -146,17 +149,42 @@ static inline double score_with_gap(const char* s1, int n1, const char* s2, int } } +/* --- shared helper: find best gap position among candidates ---------- */ + +struct GapResult { double score; int gap_idx; }; + +static GapResult find_best_gap(const char* s1, int n1, const char* s2, int n2, + const int* gp, int ng, int start, int end, + double gap_pen, const double* mat, bool use_mat) { + double best = -1e300; + int best_k = 0; + for (int k = 0; k < ng; ++k) { + double sc = score_with_gap(s1, n1, s2, n2, + gp[k], start, end, gap_pen, mat, use_mat); + if (sc > best) { best = sc; best_k = k; } + } + return {best, best_k}; +} + +/* --- extract mat pointer and validate -------------------------------- */ + +static const double* extract_mat( + py::array_t& mat256, + bool use_mat) { + if (!use_mat) return nullptr; + auto mbuf = mat256.request(); + if (mbuf.ndim != 2 || mbuf.shape[0] != 256 || mbuf.shape[1] != 256) + throw std::runtime_error("mat must be 256x256"); + return static_cast(mbuf.ptr); +} + +/* --- score_max: best score across gap positions ---------------------- */ + static double c_score_max(const std::string& s1, const std::string& s2, py::array_t mat256, py::array_t gaps, double gap_pen, int v_off, int j_off, double factor, bool use_mat) { - const double* mat = nullptr; - if (use_mat) { - auto mbuf = mat256.request(); - if (mbuf.ndim != 2 || mbuf.shape[0] != 256 || mbuf.shape[1] != 256) - throw std::runtime_error("mat must be 256x256"); - mat = static_cast(mbuf.ptr); - } + const double* mat = extract_mat(mat256, 
use_mat); auto gb = gaps.request(); const int* gp = static_cast(gb.ptr); int ng = (int)gb.shape[0]; @@ -165,18 +193,16 @@ static double c_score_max(const std::string& s1, const std::string& s2, int start = v_off, end = L - j_off; if (end <= start) return 0.0; - double best = -1e300; - { - py::gil_scoped_release release; - for (int k = 0; k < ng; ++k) { - double sc = score_with_gap(s1.data(), (int)s1.size(), s2.data(), (int)s2.size(), - gp[k], start, end, gap_pen, mat, use_mat); - if (sc > best) best = sc; - } + GapResult r; + { py::gil_scoped_release release; + r = find_best_gap(s1.data(), (int)s1.size(), s2.data(), (int)s2.size(), + gp, ng, start, end, gap_pen, mat, use_mat); } - return factor * best; + return factor * r.score; } +/* --- selfscore ------------------------------------------------------- */ + static double c_selfscore(const std::string& s, py::array_t mat256, double factor, bool use_mat) { @@ -188,6 +214,104 @@ static double c_selfscore(const std::string& s, return factor * x; } +/* ================================================================ + * Best alignment with visualization strings + * + * Finds the best gap position (via find_best_gap), then builds three + * equal-length strings: + * s1_gapped — first sequence with '-' inserted for gaps + * midline — '|' exact match, ':' positive substitution score, + * '.' non-positive score, ' ' gap position + * s2_gapped — second sequence with '-' inserted for gaps + * Returns (s1_gapped, midline, s2_gapped, score). + * ================================================================ */ + +static inline char mid_char(unsigned char c1, unsigned char c2, + const double* mat, bool use_mat) { + if (c1 == c2) return '|'; + if (use_mat && mat[(size_t)c1 * 256 + c2] > 0) return ':'; + return '.'; +} + +/* Build midline for a gapped alignment where the gap of `gap_len` + characters starts at position `p` in the output (longer) string. 
*/ +static std::string build_midline(const char* shorter, int ns, + const char* longer, int nl, + int p, int gap_len, bool gap_in_first, + const double* mat, bool use_mat) { + std::string mid(nl, ' '); + for (int i = 0; i < p; ++i) + mid[i] = mid_char((unsigned char)shorter[i], + (unsigned char)longer[i], mat, use_mat); + /* positions [p .. p+gap_len) stay ' ' (gap) */ + for (int i = p + gap_len; i < nl; ++i) { + int j = i - gap_len; + unsigned char cs = (unsigned char)shorter[j]; + unsigned char cl = (unsigned char)longer[i]; + mid[i] = gap_in_first + ? mid_char(cs, cl, mat, use_mat) /* gap in s1 → shorter is s1 */ + : mid_char(cl, cs, mat, use_mat); /* gap in s2 → shorter is s2 */ + } + return mid; +} + +static py::tuple c_best_alignment( + const std::string& s1, const std::string& s2, + py::array_t mat256, + py::array_t gaps, + double gap_pen, int v_off, int j_off, double factor, bool use_mat) { + + const double* mat = extract_mat(mat256, use_mat); + auto gb = gaps.request(); + const int* gp = static_cast(gb.ptr); + int ng = (int)gb.shape[0]; + + int n1 = (int)s1.size(), n2 = (int)s2.size(); + int L = std::max(n1, n2); + int start = v_off, end = L - j_off; + + GapResult r; + if (end > start) { + py::gil_scoped_release release; + r = find_best_gap(s1.data(), n1, s2.data(), n2, + gp, ng, start, end, gap_pen, mat, use_mat); + } else { + r = {0.0, 0}; + } + + /* --- build alignment strings --- */ + std::string gs1, gs2, mid; + + if (n1 == n2) { + gs1 = s1; gs2 = s2; + mid.resize(n1); + for (int i = 0; i < n1; ++i) + mid[i] = mid_char((unsigned char)s1[i], (unsigned char)s2[i], mat, use_mat); + } else if (n1 < n2) { + int gap_len = n2 - n1; + int p = norm_pos(gp[r.gap_idx], n1); + gs1.reserve(n2); + gs1.append(s1, 0, p); + gs1.append(gap_len, '-'); + gs1.append(s1, p, n1 - p); + gs2 = s2; + mid = build_midline(s1.data(), n1, s2.data(), n2, + p, gap_len, true, mat, use_mat); + } else { + int gap_len = n1 - n2; + int p = norm_pos(gp[r.gap_idx], n2); + gs1 = s1; + 
gs2.reserve(n1); + gs2.append(s2, 0, p); + gs2.append(gap_len, '-'); + gs2.append(s2, p, n2 - p); + mid = build_midline(s2.data(), n2, s1.data(), n1, + p, gap_len, false, mat, use_mat); + } + + return py::make_tuple(gs1, mid, gs2, factor * r.score); +} + /* ================================================================ * Module definition * ================================================================ */ @@ -205,4 +329,9 @@ PYBIND11_MODULE(seqdist_c, m) { "Best CDR3 alignment score over a set of gap positions"); m.def("selfscore", &c_selfscore, "Self-alignment score (diagonal of substitution matrix)"); + m.def("best_alignment", &c_best_alignment, + "Best alignment with gapped strings and midline visualization.\n" + "Returns (s1_gapped, midline, s2_gapped, score).\n" + "Midline chars: '|' exact match, ':' positive sub. score, " + "'.' non-positive, ' ' gap."); } diff --git a/tests/test_aligner.py b/tests/test_aligner.py index 0003033..a22697a 100644 --- a/tests/test_aligner.py +++ b/tests/test_aligner.py @@ -365,6 +365,7 @@ def test_cdraligner_interface(self): assert callable(a.score_dist) assert callable(a.pad) assert callable(a.alns) + assert callable(a.align) def test_bioaligner_wrapper(self): w = BioAlignerWrapper() @@ -397,6 +398,156 @@ def test_seqdist_c_has_all_functions(self): assert hasattr(seqdist_c, "levenshtein") assert hasattr(seqdist_c, "score_max") assert hasattr(seqdist_c, "selfscore") + assert hasattr(seqdist_c, "best_alignment") + + +# =================================================================== +# Alignment visualization tests +# =================================================================== + + +class TestAlignVisualization: + """Tests for CDRAligner.align() — gapped alignment strings.""" + + def setup_method(self): + self.aligner = CDRAligner() + + # -- Basic structure ------------------------------------------------- + + def test_align_returns_four_tuple(self): + s1, s2 = OLGA_CDR3S[0], OLGA_CDR3S[3] + result = 
self.aligner.align(s1, s2) + assert len(result) == 4 + gs1, mid, gs2, score = result + assert isinstance(gs1, str) + assert isinstance(mid, str) + assert isinstance(gs2, str) + assert isinstance(score, float) + + def test_align_equal_lengths(self): + """Equal-length strings: no gaps, all three strings same length.""" + gs1, mid, gs2, sc = self.aligner.align( + OLGA_CDR3S[0], OLGA_CDR3S[0] + ) + assert gs1 == OLGA_CDR3S[0] + assert gs2 == OLGA_CDR3S[0] + assert len(mid) == len(gs1) + assert all(c == '|' for c in mid) # self-alignment: all match + + def test_align_strings_equal_length(self): + """All three output strings must have the same length.""" + for a, b in zip(OLGA_CDR3S[:15], OLGA_CDR3S[15:30]): + gs1, mid, gs2, _ = self.aligner.align(a, b) + assert len(gs1) == len(mid) == len(gs2), ( + f"length mismatch: {len(gs1)}, {len(mid)}, {len(gs2)}" + ) + + def test_align_output_length_is_max(self): + """Output length equals max(len(s1), len(s2)).""" + for a, b in zip(OLGA_CDR3S[:10], OLGA_CDR3S[10:20]): + gs1, mid, gs2, _ = self.aligner.align(a, b) + expected_len = max(len(a), len(b)) + assert len(gs1) == expected_len + + # -- Gap characters -------------------------------------------------- + + def test_gaps_in_shorter_sequence(self): + """Gaps ('-') must appear only in the shorter sequence.""" + for a, b in zip(OLGA_CDR3S[:10], OLGA_CDR3S[10:20]): + if len(a) == len(b): + continue + gs1, mid, gs2, _ = self.aligner.align(a, b) + if len(a) < len(b): + assert '-' in gs1 + assert '-' not in gs2 + else: + assert '-' not in gs1 + assert '-' in gs2 + + def test_gap_count_matches_length_diff(self): + """Number of '-' chars equals the length difference.""" + for a, b in zip(OLGA_CDR3S[:10], OLGA_CDR3S[10:20]): + gs1, mid, gs2, _ = self.aligner.align(a, b) + diff = abs(len(a) - len(b)) + assert gs1.count('-') + gs2.count('-') == diff + + # -- Midline characters ---------------------------------------------- + + def test_midline_chars_valid(self): + """Midline only contains 
|, :, ., or space.""" + for a, b in zip(OLGA_CDR3S[:15], OLGA_CDR3S[15:30]): + _, mid, _, _ = self.aligner.align(a, b) + for c in mid: + assert c in '|:. ', f"unexpected midline char: {c!r}" + + def test_midline_pipe_means_match(self): + """'|' in midline ↔ same residue at that position.""" + for a, b in zip(OLGA_CDR3S[:10], OLGA_CDR3S[10:20]): + gs1, mid, gs2, _ = self.aligner.align(a, b) + for i, c in enumerate(mid): + if c == '|': + assert gs1[i] == gs2[i], ( + f"'|' at {i} but {gs1[i]} != {gs2[i]}" + ) + + def test_midline_space_means_gap(self): + """' ' in midline ↔ gap character in one of the sequences.""" + for a, b in zip(OLGA_CDR3S[:10], OLGA_CDR3S[10:20]): + gs1, mid, gs2, _ = self.aligner.align(a, b) + for i, c in enumerate(mid): + if c == ' ': + assert gs1[i] == '-' or gs2[i] == '-' + + # -- Score consistency ----------------------------------------------- + + def test_align_score_matches_score(self): + """align() score must match score().""" + for a, b in zip(OLGA_CDR3S[:15], OLGA_CDR3S[15:30]): + _, _, _, aln_sc = self.aligner.align(a, b) + sc = self.aligner.score(a, b) + assert aln_sc == pytest.approx(sc, abs=1e-6), ( + f"align score {aln_sc} != score {sc} for {a}, {b}" + ) + + def test_align_symmetry(self): + """align(a,b) score == align(b,a) score; gaps swap sides.""" + for a, b in zip(OLGA_CDR3S[:10], OLGA_CDR3S[10:20]): + gs1_ab, _, gs2_ab, sc_ab = self.aligner.align(a, b) + gs1_ba, _, gs2_ba, sc_ba = self.aligner.align(b, a) + assert sc_ab == pytest.approx(sc_ba, abs=1e-6) + + # -- C vs Python fallback ------------------------------------------- + + def test_align_c_vs_python(self): + """C and Python fallback align() must produce identical output.""" + aligner = CDRAligner() + for a, b in zip(OLGA_CDR3S[:15], OLGA_CDR3S[15:30]): + c_result = aligner.align(a, b) + py_result = aligner._align_py(a, b) + assert c_result[0] == py_result[0], f"gs1 mismatch for {a}, {b}" + assert c_result[1] == py_result[1], f"mid mismatch for {a}, {b}" + assert 
c_result[2] == py_result[2], f"gs2 mismatch for {a}, {b}" + assert c_result[3] == pytest.approx(py_result[3], abs=1e-6) + + # -- Visual output (run with pytest -s) ------------------------------ + + def test_visualize_sample_alignments(self): + """Print a few representative alignments for visual inspection.""" + pairs = [ + (OLGA_CDR3S[0], OLGA_CDR3S[3]), # 17 vs 11 aa + (OLGA_CDR3S[4], OLGA_CDR3S[5]), # 17 vs 18 aa + (OLGA_CDR3S[0], OLGA_CDR3S[0]), # self + (OLGA_CDR3S[13], OLGA_CDR3S[15]), # 24 vs 10 aa + ] + print("\n") + for s1, s2 in pairs: + gs1, mid, gs2, sc = self.aligner.align(s1, s2) + norm = self.aligner.score_norm(s1, s2) + print(f" {gs1}") + print(f" {mid}") + print(f" {gs2}") + print(f" score={sc:.1f} norm={norm:.1f} " + f"len1={len(s1)} len2={len(s2)}\n") # =================================================================== @@ -413,6 +564,7 @@ class TestAlignmentBenchmarks: """ N_PAIRS = 200 # number of pairs to score in each benchmark + N_GAPS = len(CDRAligner().gap_positions) # gap positions tested per pair def _make_pairs(self): pairs = [] @@ -438,7 +590,7 @@ def test_benchmark_c_scoring(self): rate = self.N_PAIRS / elapsed print(f"\n CDRAligner C : {self.N_PAIRS} pairs in {elapsed*1000:.1f} ms " - f"({rate:.0f} pairs/s)") + f"({rate:.0f} pairs/s, {self.N_GAPS} gap positions/pair)") def test_benchmark_python_fallback(self): aligner = CDRAligner() @@ -465,7 +617,7 @@ def py_score(s1, s2): rate = self.N_PAIRS / elapsed print(f"\n CDRAligner Py : {self.N_PAIRS} pairs in {elapsed*1000:.1f} ms " - f"({rate:.0f} pairs/s)") + f"({rate:.0f} pairs/s, {self.N_GAPS} gap positions/pair)") def test_benchmark_biopython(self): bio = BioAlignerWrapper() @@ -483,6 +635,7 @@ def test_benchmark_biopython(self): rate = self.N_PAIRS / elapsed print(f"\n BioPython : {self.N_PAIRS} pairs in {elapsed*1000:.1f} ms " f"({rate:.0f} pairs/s)") + # BioPython uses its own gap model, not our gap_positions def test_c_faster_than_python(self): """C extension should be 
significantly faster than Python fallback.""" @@ -514,3 +667,64 @@ def py_score(s1, s2): speedup = t_py / t_c if t_c > 0 else float("inf") print(f"\n C/Py speedup : {speedup:.1f}x") assert speedup > 2.0, f"C extension only {speedup:.1f}x faster than Python" + + def test_benchmark_align_visualization(self): + """Benchmark align() (C) that also builds visualization strings.""" + aligner = CDRAligner() + pairs = self._make_pairs() + + # Warm up + for a, b in pairs[:5]: + aligner.align(a, b) + + t0 = time.perf_counter() + for a, b in pairs: + aligner.align(a, b) + elapsed = time.perf_counter() - t0 + + rate = self.N_PAIRS / elapsed + print(f"\n CDRAligner C align(): {self.N_PAIRS} pairs in " + f"{elapsed*1000:.1f} ms ({rate:.0f} pairs/s, {self.N_GAPS} gap positions/pair)") + + def test_benchmark_summary(self): + """Print a combined benchmark summary table.""" + aligner = CDRAligner() + bio = BioAlignerWrapper() + pairs = self._make_pairs() + + def py_score(s1, s2): + if len(s1) == len(s2): + return aligner._score_equal_len_py(s1, s2) + best = -math.inf + for p in aligner.gap_positions: + sc = aligner._score_with_gap_py(s1, s2, int(p)) + if sc > best: + best = sc + return best + + # Warm-up all paths + for a, b in pairs[:5]: + aligner.score(a, b) + aligner.align(a, b) + py_score(a, b) + bio.score(a, b) + + results = {} + for label, fn in [ + ("CDRAligner C score()", lambda a, b: aligner.score(a, b)), + ("CDRAligner C align()", lambda a, b: aligner.align(a, b)), + ("BioPython PairwiseAl.", lambda a, b: bio.score(a, b)), + ("CDRAligner Python", py_score), + ]: + t0 = time.perf_counter() + for a, b in pairs: + fn(a, b) + elapsed = time.perf_counter() - t0 + results[label] = elapsed + + print(f"\n Pairs: {self.N_PAIRS}, Gap positions: {self.N_GAPS}") + print(f" {'Method':<25} {'Time (ms)':>10} {'Throughput':>15}") + print(f" {'-'*25} {'-'*10} {'-'*15}") + for label, elapsed in results.items(): + rate = self.N_PAIRS / elapsed + print(f" {label:<25} {elapsed*1000:>9.1f} 
{rate:>12,.0f} p/s") From 9ddcb7679fe96637416b0b961def2d22e547b821 Mon Sep 17 00:00:00 2001 From: mikessh Date: Sat, 11 Apr 2026 20:00:11 +0300 Subject: [PATCH 15/24] Rewrite old kmer impl by Danya --- mir/basic/kmers.py | 192 ----------------------- mir/biomarkers/__init__.py | 4 +- mir/biomarkers/kmer_stats.py | 268 +++++++++++++++++++++++++++++++++ notebooks/kmer_generator.ipynb | 4 +- 4 files changed, 271 insertions(+), 197 deletions(-) delete mode 100644 mir/basic/kmers.py create mode 100644 mir/biomarkers/kmer_stats.py diff --git a/mir/basic/kmers.py b/mir/basic/kmers.py deleted file mode 100644 index b5725bb..0000000 --- a/mir/basic/kmers.py +++ /dev/null @@ -1,192 +0,0 @@ -from itertools import islice -from scipy.stats import chi2_contingency -import numpy as np -import pandas as pd -from statsmodels.stats.multitest import multipletests - -from mir.common.repertoire import Repertoire -import matplotlib.pyplot as plt - - -def comparison_plotter(res: pd.DataFrame, plot_type=None, ax=None): - if plot_type == 'line': - if not ax: - fig, ax = plt.subplots(figsize=(10, 10)) - ax.scatter(np.log2(res.freq_1), np.log2(res.freq_2), s=1) - ax.plot([np.log2(res[res.freq_1 != 0].freq_1.min()), np.log2(res.freq_1.max())], - [np.log2(res[res.freq_1 != 0].freq_1.min()), np.log2(res.freq_1.max())], '--', c='red') - ax.set_xlabel('log2(freq_1)') - ax.set_ylabel('log2(freq_2)') - - if plot_type == 'volcano': - if not ax: - fig, ax = plt.subplots(figsize=(10, 6)) - ax.scatter(np.log2(res.freq_fc), -np.log2(res.p_val.apply(lambda x: 2 ** (-100) if x < 2 ** (-100) else x)), - s=1) - ax.set_xlabel('logFC') - ax.set_ylabel('-log(p)') - - top_results = res[res.freq_fc.apply(lambda x: not np.isinf(np.log2(x)))].sort_values(by='p_val').head(10) - top_results.p_val = top_results.p_val.apply(lambda x: 2 ** (-100) if x < 2 ** (-100) else x) - for index, row in top_results.iterrows(): - x = np.log2(row['freq_fc']) - y = -np.log2(row['p_val']) - ax.annotate(index, (x, y), 
textcoords="offset points", ha='center', - xytext=(0, 10), arrowprops=dict(arrowstyle='-', lw=0.5)) - - -def generate_comparison_from_counts(count_res, p_adj_func): - count_res['p_val'] = np.nan - n = count_res.count_1.sum() - m = count_res.count_2.sum() - count_res['freq_1'] = count_res.count_1 / n - count_res['freq_2'] = count_res.count_2 / m - - for kmer in count_res.index: - count_res.loc[kmer, 'p_val'] = chi2_contingency([[count_res['count_1'][kmer], n - count_res['count_1'][kmer]], - [count_res['count_2'][kmer], m - count_res['count_2'][kmer]]])[ - 1] - - count_res['freq_fc'] = (count_res.count_1 / n) / (count_res.count_2 / m) - - if not p_adj_func: - count_res['p_val_adj'] = multipletests(count_res.p_val, method='holm')[1] - else: - count_res['p_val_adj'] = p_adj_func(count_res.p_val) - - return count_res - - -class KmersTable: - """ - Class to generate series of subsequnces of length k from Repertoire - """ - - def __init__(self, k: int, repertoire: Repertoire): - """ - Creating a new KmeresTable object - - :param k: length of subsequnces to be generated - :param repertoire: Repertoire class object - """ - self.k = k - self.repertoire = repertoire - self.vc_k_mers_list_generator = np.vectorize(self._k_mers_list_generator, otypes=[np.ndarray]) - self.kmer_table = list() - self.count_table = dict() - - def _over_slice(self, cdr3): - """ - Generator of kmers - :param cdr3: cdr3 aa sequence - """ - iterator = iter(cdr3) - res = tuple(islice(iterator, self.k)) - if len(res) == self.k: - yield res - for elem in iterator: - res = res[1:] + (elem,) - yield res - - def _k_mers_list_generator(self, clonotype) -> list: - """ - Form a list from generator - :param cdr3: cdr3 aa sequence - :return: list of kmers for cdr3 - """ - cdr3 = clonotype.cdr3aa - res = ["".join(elem) for elem in self._over_slice(cdr3)] - return res - - def generate_kmers_table(self) -> np.ndarray: - """ - Generate the table with kmers for repertoire - :return: Series of kmers arrays for each cdr3 
in repertoire - """ - if len(self.kmer_table): - return self.kmer_table - else: - self.kmer_table = self.vc_k_mers_list_generator(self.repertoire.clonotypes) - return self.kmer_table - - def generate_kmers_count_table(self) -> dict[str:int]: - """ - Generate the dict with numbers of occurrence of each kmer in repertoire - :return: dict with amount of each kmer - """ - if len(self.count_table): - return self.count_table - else: - kmers_array = self.generate_kmers_table() - unique_kmers, kmers_counts = np.unique((np.concatenate(kmers_array)), return_counts=True) - self.count_table = dict(zip(unique_kmers, kmers_counts)) - return self.count_table - - def compare_with_another_KmersTable(self, - kmers_table: 'KmersTable', - plot_comparison=None, - ax=None, - p_adj_func=None - ): - """ - Function to compare self with another KmersTable - :param kmers_table: another KmersTable object to be compared with self - :param plot_comparison: - None - do not plot comparison - 'line' - plot log2(frequences) of kmers from two sets - 'volcano' - plot Volcanoplot of kmers comparison - :param ax: ax to plot comparison - :param p_adj_func: function to adjust p_values array, which returns array of adjusted ps - :return: pd.DataFrame with comparison results - """ - - if self.k != kmers_table.k: - raise ValueError('K should be equal for comparison') - - table_1 = pd.DataFrame.from_dict(self.generate_kmers_count_table(), orient='index') - table_2 = pd.DataFrame.from_dict(kmers_table.generate_kmers_count_table(), orient='index') - - table_1.columns = ['count_1'] - table_2.columns = ['count_2'] - - count_res = table_1.join(table_2, how='outer').fillna(0) - res = generate_comparison_from_counts(count_res, p_adj_func) - - if plot_comparison: - comparison_plotter(res=res, plot_type=plot_comparison, ax=ax) - return res - - -def compare_two_repertoire_kmers(repertoire_1: Repertoire, - repertoire_2: Repertoire, - k: int, - plot_comparison=None, - ax=None, - p_adj_func=None - ) -> pd.DataFrame: - 
""" - Function to compare 2 Repertoires on the kmers level - :param repertoire_1: first repertoire - :param repertoire_2: first repertoire - :param k: length of subsequnces to be compared - :param plot_comparison: - None - do not plot comparison - 'line' - plot log2(frequences) of kmers from two sets - 'volcano' - plot Volcanoplot of kmers comparison - :param ax: ax to plot comparison - :param p_adj_func: function to adjust p_values array, which returns array of adjusted ps - :return: pd.DataFrame with comparison results - """ - - table_1 = pd.DataFrame.from_dict(KmersTable(k, repertoire_1).generate_kmers_count_table(), orient='index') - table_2 = pd.DataFrame.from_dict(KmersTable(k, repertoire_2).generate_kmers_count_table(), orient='index') - - table_1.columns = ['count_1'] - table_2.columns = ['count_2'] - - res = table_1.join(table_2, how='outer').fillna(0) - generate_comparison_from_counts(res, p_adj_func) - - if plot_comparison: - comparison_plotter(res=res, plot_type=plot_comparison, ax=ax) - return res diff --git a/mir/biomarkers/__init__.py b/mir/biomarkers/__init__.py index 95b85c1..b01871d 100644 --- a/mir/biomarkers/__init__.py +++ b/mir/biomarkers/__init__.py @@ -1,3 +1 @@ -""" -a module which contains functions for biomarkers selection and postprocessing -""" \ No newline at end of file +"""Biomarker detection and k-mer differential analysis for immune repertoires.""" \ No newline at end of file diff --git a/mir/biomarkers/kmer_stats.py b/mir/biomarkers/kmer_stats.py new file mode 100644 index 0000000..170955a --- /dev/null +++ b/mir/biomarkers/kmer_stats.py @@ -0,0 +1,268 @@ +"""K-mer frequency analysis and differential comparison of immune repertoires. + +Extracts overlapping k-mers from CDR3 amino-acid sequences, counts their +occurrences across a :class:`~mir.common.repertoire.Repertoire`, and performs +chi-squared differential analysis between two repertoires. 
+ +K-mer tokenisation is delegated to the C-accelerated +:func:`mir.basic.tokens.tokenize_str` for speed. + +Classes +------- +* :class:`KmerCounter` — Extract and count k-mers from a single repertoire. + +Functions +--------- +* :func:`compare_kmer_counts` — Chi-squared comparison of two count tables + with multiple-testing correction. +* :func:`compare_repertoire_kmers` — End-to-end comparison of two + :class:`~mir.common.repertoire.Repertoire` objects. +* :func:`plot_comparison` — Scatter / volcano visualisation of comparison + results. +""" + +from __future__ import annotations + +from collections import Counter +from typing import Callable + +import numpy as np +import pandas as pd +from scipy.stats import chi2_contingency +from statsmodels.stats.multitest import multipletests + +from mir.basic.tokens import tokenize_str +from mir.common.repertoire import Repertoire + + +# --------------------------------------------------------------------------- +# K-mer counting +# --------------------------------------------------------------------------- + +class KmerCounter: + """Count overlapping k-mers across all CDR3 sequences in a repertoire. + + Uses the C-accelerated :func:`~mir.basic.tokens.tokenize_str` to extract + k-mers from each clonotype's ``cdr3aa`` field. + + Parameters + ---------- + k : int + K-mer length (must be >= 1). + repertoire : Repertoire + Source repertoire whose clonotypes will be scanned. + + Examples + -------- + >>> counter = KmerCounter(k=3, repertoire=rep) + >>> counts = counter.counts() # dict[str, int] + >>> df = counter.counts_dataframe() # single-column DataFrame + """ + + def __init__(self, k: int, repertoire: Repertoire) -> None: + if k < 1: + raise ValueError(f"k must be >= 1, got {k}") + self.k = k + self.repertoire = repertoire + self._counts: Counter[str] | None = None + + def counts(self) -> dict[str, int]: + """Return k-mer counts for the repertoire. + + Results are cached after the first call. 
+ + Returns + ------- + dict[str, int] + Mapping from k-mer string to occurrence count. + """ + if self._counts is None: + self._counts = Counter() + for cl in self.repertoire.clonotypes: + self._counts.update(tokenize_str(cl.cdr3aa, self.k)) + return dict(self._counts) + + def counts_dataframe(self, column: str = "count") -> pd.DataFrame: + """Return counts as a single-column :class:`~pandas.DataFrame`. + + Parameters + ---------- + column : str + Name of the count column (default ``"count"``). + + Returns + ------- + pandas.DataFrame + Indexed by k-mer string. + """ + return pd.DataFrame.from_dict(self.counts(), orient="index", columns=[column]) + + +# --------------------------------------------------------------------------- +# Statistical comparison +# --------------------------------------------------------------------------- + +def compare_kmer_counts( + counts_1: dict[str, int], + counts_2: dict[str, int], + p_adj_method: str = "holm", + p_adj_func: Callable[[np.ndarray], np.ndarray] | None = None, +) -> pd.DataFrame: + """Chi-squared comparison of two k-mer count dictionaries. + + For every k-mer observed in either repertoire a 2 × 2 contingency test + is performed. P-values are corrected for multiple testing. + + Parameters + ---------- + counts_1, counts_2 : dict[str, int] + K-mer occurrence counts (e.g. from :meth:`KmerCounter.counts`). + p_adj_method : str + Method for :func:`statsmodels.stats.multitest.multipletests` + (default ``"holm"``). Ignored when *p_adj_func* is given. + p_adj_func : callable, optional + Custom function that accepts and returns an array of p-values. + When provided, *p_adj_method* is ignored. + + Returns + ------- + pandas.DataFrame + Columns: ``count_1``, ``count_2``, ``freq_1``, ``freq_2``, + ``p_val``, ``freq_fc``, ``p_val_adj``. Indexed by k-mer. 
+ """ + df1 = pd.DataFrame.from_dict(counts_1, orient="index", columns=["count_1"]) + df2 = pd.DataFrame.from_dict(counts_2, orient="index", columns=["count_2"]) + df = df1.join(df2, how="outer").fillna(0).astype({"count_1": int, "count_2": int}) + + n1 = df["count_1"].sum() + n2 = df["count_2"].sum() + if n1 == 0 or n2 == 0: + raise ValueError("Both count tables must be non-empty") + + df["freq_1"] = df["count_1"] / n1 + df["freq_2"] = df["count_2"] / n2 + + # One 2 × 2 contingency table per k-mer → chi-squared p-value (explicit loop) + pvals = np.empty(len(df)) + c1 = df["count_1"].values + c2 = df["count_2"].values + for i in range(len(df)): + table = [[c1[i], n1 - c1[i]], [c2[i], n2 - c2[i]]] + pvals[i] = chi2_contingency(table)[1] + df["p_val"] = pvals + + # Fold change (freq_1 / freq_2); k-mers with freq_2 == 0 give inf/NaN, not an error + with np.errstate(divide="ignore", invalid="ignore"): + df["freq_fc"] = df["freq_1"] / df["freq_2"] + + # Multiple testing correction + if p_adj_func is not None: + df["p_val_adj"] = p_adj_func(df["p_val"].values) + else: + df["p_val_adj"] = multipletests(df["p_val"].values, method=p_adj_method)[1] + + return df + + +def compare_repertoire_kmers( + repertoire_1: Repertoire, + repertoire_2: Repertoire, + k: int, + p_adj_method: str = "holm", + p_adj_func: Callable[[np.ndarray], np.ndarray] | None = None, +) -> pd.DataFrame: + """Compare two repertoires by k-mer frequency using chi-squared tests. + + Convenience wrapper that builds :class:`KmerCounter` instances, + extracts counts, and delegates to :func:`compare_kmer_counts`. + + Parameters + ---------- + repertoire_1, repertoire_2 : Repertoire + Repertoires to compare. + k : int + K-mer length. + p_adj_method : str + Multiple-testing correction method (default ``"holm"``). + p_adj_func : callable, optional + Custom p-value adjustment function. + + Returns + ------- + pandas.DataFrame + Same format as :func:`compare_kmer_counts`. 
+ """ + c1 = KmerCounter(k, repertoire_1).counts() + c2 = KmerCounter(k, repertoire_2).counts() + return compare_kmer_counts(c1, c2, p_adj_method=p_adj_method, p_adj_func=p_adj_func) + + +# --------------------------------------------------------------------------- +# Visualisation helpers +# --------------------------------------------------------------------------- + +def plot_comparison( + df: pd.DataFrame, + kind: str = "volcano", + ax=None, + top_n: int = 10, +): + """Plot the results of a k-mer comparison. + + Parameters + ---------- + df : pandas.DataFrame + Output of :func:`compare_kmer_counts` or + :func:`compare_repertoire_kmers`. + kind : ``"volcano"`` | ``"scatter"`` + Plot type. ``"scatter"`` shows log₂ frequencies; ``"volcano"`` + shows log₂ fold-change vs. −log₂ p-value. + ax : matplotlib.axes.Axes, optional + Axes to draw on. Created automatically if *None*. + top_n : int + Number of top hits to label on volcano plots (default 10). + + Returns + ------- + matplotlib.axes.Axes + """ + import matplotlib.pyplot as plt # lazy import to avoid hard dep at module level + + if ax is None: + _, ax = plt.subplots(figsize=(10, 6) if kind == "volcano" else (10, 10)) + + if kind == "scatter": + # Guard against log2(0) + nonzero = df[(df["freq_1"] > 0) & (df["freq_2"] > 0)] + ax.scatter(np.log2(nonzero["freq_1"]), np.log2(nonzero["freq_2"]), s=1) + lo = np.log2(nonzero[["freq_1", "freq_2"]].min().min()) + hi = np.log2(nonzero[["freq_1", "freq_2"]].max().max()) + ax.plot([lo, hi], [lo, hi], "--", c="red") + ax.set_xlabel("log₂(freq_1)") + ax.set_ylabel("log₂(freq_2)") + + elif kind == "volcano": + floor = 2.0 ** -100 + pv = df["p_val"].clip(lower=floor) + fc = df["freq_fc"].replace([np.inf, -np.inf], np.nan).dropna() + log_fc = np.log2(fc) + neg_log_p = -np.log2(pv.loc[log_fc.index]) + + ax.scatter(log_fc, neg_log_p, s=1) + ax.set_xlabel("log₂ FC") + ax.set_ylabel("−log₂(p)") + + # Label top hits + top = df.loc[log_fc.index].sort_values("p_val").head(top_n) + for 
kmer, row in top.iterrows(): + x = np.log2(row["freq_fc"]) if np.isfinite(np.log2(row["freq_fc"])) else 0 + y = -np.log2(max(row["p_val"], floor)) + ax.annotate( + kmer, (x, y), + textcoords="offset points", ha="center", + xytext=(0, 10), arrowprops=dict(arrowstyle="-", lw=0.5), + ) + else: + raise ValueError(f"Unknown plot kind {kind!r}; use 'scatter' or 'volcano'") + + return ax diff --git a/notebooks/kmer_generator.ipynb b/notebooks/kmer_generator.ipynb index 3fd5097..343b6ef 100644 --- a/notebooks/kmer_generator.ipynb +++ b/notebooks/kmer_generator.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "ad00cfd1-f70f-4b81-aad8-e76668c3ce62", "metadata": {}, "outputs": [], @@ -18,7 +18,7 @@ "sys.path.append(\"../\")\n", "\n", "from mir.common.repertoire import Repertoire\n", - "from mir.basic.kmers import KmersTable, compare_two_repertoire_kmers\n", + "from mir.biomarkers.kmer_stats import KmerCounter, compare_repertoire_kmers\n", "from mir.common.parser import *\n", "from scipy.stats import chi2_contingency\n", "import numpy as np\n", From 74b20dc00cfc3864b98b7a4747365aef48a7d621 Mon Sep 17 00:00:00 2001 From: mikessh Date: Sat, 11 Apr 2026 20:00:20 +0300 Subject: [PATCH 16/24] Upd docs --- README.md | 23 +++++++++++++++++++++++ docs/mir.basic.rst | 24 ++++++++++++++++-------- docs/mir.biomarkers.rst | 8 ++++++++ docs/mir.distances.rst | 16 ++++++++++++++++ 4 files changed, 63 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 89840f7..fd00b2c 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,29 @@ repertoire = Repertoire.load( ) ``` +### Mask and match sequences + +```python +from mir.basic.alphabets import ( + aa_to_reduced, + mask, + matches, + matches_aa_reduced, + NT_MASK, + AA_MASK, +) + +nt_masked = mask("ATCGAT", (2, 5), NT_MASK) +assert nt_masked == b"ATNNNT" + +aa = "CASTIV" +reduced = aa_to_reduced(aa) + +# Matching ignores mask symbols: N for nucleotides, X for amino-acid 
alphabets. +assert matches(mask(aa, 0, AA_MASK), aa, AA_MASK) +assert matches_aa_reduced(aa, mask(reduced, 3, AA_MASK)) +``` + ## Resources - Example notebooks are available in [notebooks/](https://github.com/antigenomics/mirpy/tree/main/notebooks). diff --git a/docs/mir.basic.rst b/docs/mir.basic.rst index e7295d5..7967f53 100644 --- a/docs/mir.basic.rst +++ b/docs/mir.basic.rst @@ -28,14 +28,6 @@ mir.basic.fast_clust module :undoc-members: :show-inheritance: -mir.basic.kmers module ----------------------- - -.. automodule:: mir.basic.kmers - :members: - :undoc-members: - :show-inheritance: - mir.basic.pgen module --------------------- @@ -52,6 +44,22 @@ mir.basic.sampling module :undoc-members: :show-inheritance: +mir.basic.alphabets module +-------------------------- + +.. automodule:: mir.basic.alphabets + :members: + :undoc-members: + :show-inheritance: + +mir.basic.mirseq module +----------------------- + +.. automodule:: mir.basic.mirseq + :members: + :undoc-members: + :show-inheritance: + mir.basic.segment_usage module ------------------------------ diff --git a/docs/mir.biomarkers.rst b/docs/mir.biomarkers.rst index f78feab..d3d2e39 100644 --- a/docs/mir.biomarkers.rst +++ b/docs/mir.biomarkers.rst @@ -12,6 +12,14 @@ mir.biomarkers.fisher_biomarkers_detector module :undoc-members: :show-inheritance: +mir.biomarkers.kmer_stats module +-------------------------------- + +.. automodule:: mir.biomarkers.kmer_stats + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- diff --git a/docs/mir.distances.rst b/docs/mir.distances.rst index baa2aa1..f3c800d 100644 --- a/docs/mir.distances.rst +++ b/docs/mir.distances.rst @@ -28,6 +28,22 @@ mir.distances.search module :undoc-members: :show-inheritance: +mir.distances.seqdist module +---------------------------- + +.. automodule:: mir.distances.seqdist + :members: + :undoc-members: + :show-inheritance: + +mir.distances.seqdist\_c module +------------------------------- + +.. 
automodule:: mir.distances.seqdist_c + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- From 0f3cc2c316f140666fb98af7b2202d31446232f6 Mon Sep 17 00:00:00 2001 From: mikessh Date: Sat, 11 Apr 2026 20:13:01 +0300 Subject: [PATCH 17/24] Test for kmer biomarkers --- tests/assets/fetch_vdjdb_gilgfvftl.sh | 67 + tests/assets/gilgfvftl_trb_cdr3.txt | 5233 +++++++++++++++++++++++++ tests/test_kmer_stats.py | 200 + 3 files changed, 5500 insertions(+) create mode 100644 tests/assets/fetch_vdjdb_gilgfvftl.sh create mode 100644 tests/assets/gilgfvftl_trb_cdr3.txt create mode 100644 tests/test_kmer_stats.py diff --git a/tests/assets/fetch_vdjdb_gilgfvftl.sh b/tests/assets/fetch_vdjdb_gilgfvftl.sh new file mode 100644 index 0000000..10b6c9d --- /dev/null +++ b/tests/assets/fetch_vdjdb_gilgfvftl.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# Fetch the latest VDJdb release and extract human TRB CDR3 sequences +# specific for HLA-A*02:01 restricted GILGFVFTL epitope (Influenza M1). +# +# Output: tests/assets/gilgfvftl_trb_cdr3.txt (one CDR3aa per line, deduplicated) +# +# Usage: bash tests/assets/fetch_vdjdb_gilgfvftl.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +OUTFILE="$SCRIPT_DIR/gilgfvftl_trb_cdr3.txt" + +# Skip download if output already exists +if [ -f "$OUTFILE" ]; then + n=$(wc -l < "$OUTFILE") + echo "Already have $OUTFILE ($n sequences), skipping download." + exit 0 +fi + +TMPDIR=$(mktemp -d) +trap 'rm -rf "$TMPDIR"' EXIT + +# --- Fetch latest VDJdb release URL via GitHub API --- +echo "Querying GitHub for latest VDJdb release..." +ASSET_URL=$(curl -sL https://api.github.com/repos/antigenomics/vdjdb-db/releases/latest \ + | grep -o '"browser_download_url": *"[^"]*"' \ + | head -1 \ + | sed 's/"browser_download_url": *"//;s/"//') + +if [ -z "$ASSET_URL" ]; then + echo "ERROR: Could not determine download URL" >&2 + exit 1 +fi + +echo "Downloading $ASSET_URL ..." 
+curl -sL "$ASSET_URL" -o "$TMPDIR/vdjdb.zip" +unzip -q "$TMPDIR/vdjdb.zip" -d "$TMPDIR/vdjdb" + +# --- Find the main database file --- +DBFILE=$(find "$TMPDIR/vdjdb" -name 'vdjdb.txt' -not -name 'vdjdb.slim.txt' | head -1) +if [ -z "$DBFILE" ]; then + # Fall back to slim + DBFILE=$(find "$TMPDIR/vdjdb" -name 'vdjdb.slim.txt' | head -1) +fi +if [ -z "$DBFILE" ]; then + echo "ERROR: Could not find vdjdb.txt in archive" >&2 + exit 1 +fi +echo "Using database file: $DBFILE" + +# --- Filter: human TRB, HLA-A*02, GILGFVFTL epitope --- +# VDJdb columns (tab-separated, 0-based; awk fields below are 1-based, i.e. index + 1): +# 0:complex.id 1:Gene 2:CDR3 3:V 4:J 5:Species 6:MHC A 7:MHC B +# 8:MHC class 9:Epitope 10:Epitope gene 11:Epitope species ... +# +# Filter criteria: +# Gene == TRB +# Species == HomoSapiens +# Epitope == GILGFVFTL +# MHC A contains A*02 +# CDR3 starts with C and ends with F (canonical) +awk -F'\t' 'NR > 1 && $2 == "TRB" && $6 == "HomoSapiens" && $10 == "GILGFVFTL" && $7 ~ /A\*02/ && $3 ~ /^C[A-Z]+F$/ { print $3 }' "$DBFILE" \ + | sort -u > "$OUTFILE" + +n=$(wc -l < "$OUTFILE") +echo "Wrote $n unique CDR3 sequences to $OUTFILE" diff --git a/tests/assets/gilgfvftl_trb_cdr3.txt b/tests/assets/gilgfvftl_trb_cdr3.txt new file mode 100644 index 0000000..3fdf79d --- /dev/null +++ b/tests/assets/gilgfvftl_trb_cdr3.txt @@ -0,0 +1,5233 @@ +CAAATGLYGYTF +CAAPLGSNQPQHF +CAASGRASETQYF +CAASSRSTDTQYF +CAASTGIYGYTF +CAASTGNYGYTF +CAASTGSYGYTF +CAASTGVLGYTF +CAASTRSTDTQYF +CACSFSGGRRDGYTF +CACSGRSGETQYF +CACSGTEDQPQHF +CACSINGGRRDGYTF +CACSIRSTDTQYF +CACSQGSYGYTF +CACSQRSATQEYF +CACTLVSTEAFF +CAGGGAGYGYTF +CAGGGMGTEAFF +CAGRGTDDYGYTF +CAGRITNTEAFF +CAGSIRSSYEQYF +CAGSIRSTETQYF +CAGSIYSNQPQHF +CAGSSRSGHEQYF +CAGSTRSSYEQYF +CAHQCVLLLSRGNDNEQFF +CAIAGQGNTGELFF +CAIDRNTEAFF +CAIGTGHPRTF +CAIKLRGSEGDTQYF +CAILGYGSTEAFF +CAINGEGYGYTF +CAIPGRGTANQPQHF +CAIPQDGGNTIYF +CAIQTGWDFWDGYTF +CAIRPTGISYEQYF +CAIRSQGAVSHEQFF +CAISDGDSTYGYTF +CAISDRSGNTIYF +CAISDSGQMNTEAFF 
+CAISDSPNTGELFF +CAISDTTGVREQYF +CAISDTYEQYF +CAISEDRETQYF 
+CAISEGGINTEAFF +CAISEGQGSYNSPLHF +CAISEPPQGPRLAGVFNEQFF +CAISESSEQFF +CAISESTAHSYNEQFF +CAISEWSSEKLFF +CAISGAARDTQYF +CAISGGDRANYGYTF +CAISGGQGRREQFF +CAISGSETQYF +CAISLIYPGELFF +CAISPGQGIGYGYTF +CAISQNIDEQFF +CAISRGNEQYF +CAISTSVSTDTQYF +CAITDSSYNEQFF +CAITGGGEQFF +CAITHPPIGETQYF +CAITREGNTGELFF +CALEGNGELFF +CALERANEQFF +CALQGEVLNTEAFF +CALVVSGPTGELFF +CANSARDHSDPNTGELFF +CANSFRSGETQYF +CANSIGQVDTIYF +CANSIRSSYEQYF +CANSIRSTDTQYF +CANSIRSTETQYF +CANSLRAGETQYF +CANSLRGSGEQFF +CANSQQGSSYEQYF +CANSQRASYEQYF +CANSSRSGETQYF +CANSVRRGELFF +CAPLKGSSYEQYF +CAPRRGPTNEKLFF +CARALFGNTIYF +CARGQEDLYGYTF +CARLGIWYPAGGHEQFF +CARRGGSYEQYF +CARRINGGRRDGYTF +CARRIRRSYEQYF +CARRPLLAPQGYAQYF +CARRSRQVLPNEQFF +CARRTRRPDAQYF +CARSFFSGSYNEQFF +CARSIFAADTQYF +CARSIRSASAQYF +CARSSLSLATQSF +CARSSRAATEQYF +CARSTGSYGYTF +CARSYGANTAAFF +CARVIGDTQYF +CASAAGTYGYTF +CASAFYSNQPQHF +CASAGAGVEKLFF +CASAGFSVNEQFF +CASAGGSSYEQYF +CASAGISTDTQYF +CASAGTGVEKLFF +CASAGTGVEKLLF +CASAGTGVNIQYF +CASAHPRASGSTDTQYF +CASAKGYGYTF +CASALAGGPGNEQFF +CASALGGYNEQFF +CASANGGITYNEQFF +CASANVGLNTEAFF +CASAPDGFFYGYTF +CASAPGTGIYEQYF +CASARDRDNTQYF +CASASEQGTDIQYF +CASASISNQPQHF +CASATFGSSYNEQFF +CASATFSYNEQFF +CASATFTNEKLFF +CASATGADTEAFF +CASATGAYGYTF +CASATGTYGYTF +CASAVGNEQFF +CASAWIGTGELFF +CASAWLSTDTQYF +CASAWTLNTEAFF +CASAWTVNTEAFF +CASAYGLNTGELFF +CASCLRGGAHEQFF +CASCMDIDNSPLHF +CASCSLSSQPQHF +CASCVGAGETRYF +CASDTGGGYGYTF +CASDVTGAGEKLFF +CASEGETQYF +CASEPQENTEAFF +CASERANTGELFF +CASEREWGTQYF +CASETRRTAYDTEAFF +CASETSGRQSDTQYF +CASETTSTDTQYF +CASGAGGPLNEQFF +CASGDRPYGYTF +CASGFFVDTAKNIQYF +CASGFGDRGYNEQFF +CASGFYSNQPQHF +CASGGAGSYEQYF +CASGGGLYEQYF +CASGGISTDTQYF +CASGGLSTDTQYF +CASGGLVHEQFF +CASGGPGSYEQYF +CASGGPPKSYEQYF +CASGGQYQETQYF +CASGGTGPWNEQFF +CASGGTSTDTQYF +CASGGVSGVFF +CASGIIAGGYNEQFF +CASGIPWEQYF +CASGIRSSYEQYF +CASGIRSVAEAFF +CASGLAGATGELFF +CASGLAGEQYF +CASGLAGSYNEQFF +CASGLDPTVTYNEQFF +CASGLGGESYEQYF +CASGLIYPGELFF +CASGLPYEQYF +CASGLRRTPEVEQYF +CASGLRSSYEQYF +CASGLVPEGLVYEQYF 
+CASGLVPGGLVYEQYF +CASGLVPGGVVYEQYF +CASGMTGLTSEQYF +CASGNKDIHNEQFF +CASGNYGYTF +CASGPAYGYTF +CASGPGSSYNEQFF +CASGPHGGGNNEQFF +CASGPYSLYEQYF +CASGQGAFF +CASGQGAGEQYF +CASGQGATEAFF +CASGQLAGGLNEQFF +CASGQYSNQPQHF +CASGREEGNEQFF +CASGRGTRTYEQYF +CASGRLSYNEQFF +CASGSGAEAFF +CASGSGAFF +CASGSGGRSYNEQFF +CASGSSGANVLTF +CASGTEGNTIYF +CASGTGAGELFF +CASGTGGFYGYTF +CASGTPWEQYF +CASGTRSADTQYF +CASGTSYTDTQYF +CASGTWGLDEQYF +CASGTYSNQPQHF +CASGVGQGYEQYF +CASGVLAEDTIYF +CASGVRSAYEQYF +CASGVRSGEVEQFF +CASGVVGGGAANEQFF +CASGVYSNQPQHF +CASGWGEQFF +CASGWVGQPQHF +CASGYGPDEAFF +CASGYRDRGLIEQFF +CASGYSPLHF +CASGYSTDTQYF +CASGYTGELFF +CASHTSGVPVGYEQYF +CASIAGLVGDTQYF +CASIDRPATNEKLFF +CASIGGGGWAQPQHF +CASIIRSSYEQYF +CASILAGEPDTQYF +CASILLRIVGQLNEKLFF +CASIPDRGSGNTIYF +CASIPLAGPFNEQFF +CASIQWVRRRNPSYEQYF +CASIRGGGSTDTQYF +CASIRGLAGVRTDTQYF +CASIRGNTEAFF +CASISDREGTEAFF +CASISLDPLYF +CASITGTATANYGYTF +CASKAFGGSYEQYF +CASKARGSPLHF +CASKDNISFRGYTF +CASKGGGGYTF +CASKGLSYEQYF +CASKGMASGGLRGTGELFF +CASKGRWGRGRAARDTEAFF +CASKGTSGGVNEQFF +CASKGYEQFF +CASKHASGELFF +CASKPSQALIEGYGYTF +CASKQNNEQFF +CASKRGMSNQPQHF +CASKSEKGAGGSPLHF +CASKSGAEAFF +CASKSGLGGTTEAFF +CASKTLAGARETQYF +CASKWGGRAVLNTEAFF +CASLISGTDTQYF +CASLKKRELAVNYEQYF +CASLLYKGGANTEAFF +CASLPWAGGPYNEQFF +CASLREQDYGYTF +CASLRESPGADGYTF +CASLSDRVEKLFF +CASLSGEGGPTGELFF +CASLSRRRRQYF +CASLTPGRQTEAFF +CASMARGWEETQYF +CASMELAGLNEQFF +CASMGAFF +CASMGGLYEQYF +CASMGLAGLNEQFF +CASMVGQGTGELFF +CASNAGSTNYGYTF +CASNAPDTEAFF +CASNDLDYSYNEQFF +CASNDMGGNANEQFF +CASNDRGNGYTF +CASNEGPYNEQFF +CASNERWEGTYGYTF +CASNGGSYEQYF +CASNGGYSTDTQYF +CASNGLSTGELFF +CASNIAGDNEQFF +CASNIFGGGSEQFF +CASNIFRGGNEQFF +CASNIGAGEQFF +CASNIGTGGHGYTF +CASNIRSSWELFF +CASNIRSSYEQYF +CASNIYSNQPQHF +CASNLAGHTDTQYF +CASNLLGDDNEQFF +CASNMLSTDTQYF +CASNNKDRGYGYTF +CASNNRASNEQFF +CASNPASGGELFF +CASNPFGASYNEQFF +CASNPGRLRSPLHF +CASNPSSTDTQYF +CASNQDPGGQETQYF +CASNQGNEQFF +CASNQRGGRNEQFF +CASNRFLGSYNEQFF +CASNRREHDEQFF +CASNSGLAIEEQFF +CASNSGLLEAFF +CASNSRQGRQFF +CASNTGTGLGEQFF 
+CASNTGTTQETQYF +CASNTRATYEQYF +CASNTRSTDTQYF +CASNVYSGANVLTF +CASNWGTGELFF +CASPAGNTEAFF +CASPGGWIWIPQYSNQPQHF +CASPGSGGREETQYF +CASPPLSGTGHVRGYTF +CASPPRDEQYF +CASPQGGCYEQYF +CASPRQSPLHF +CASQGAYSNQPQHF +CASQGDWREIDTGELFF +CASQGLAGLEEQFF +CASQGLAGLNEQFF +CASQGVSYNSPLHF +CASQIYGGGDEQFF +CASQSGFSGANVLTF +CASQVAASYEQYF +CASRAFSGSYEQYF +CASRAGQGIHEQYF +CASRAGRAYEQYF +CASRAGREAFF +CASRAGVGNEQFF +CASRATGDLYEQYF +CASRDGGTEAFF +CASRDGLKLFF +CASRDLREGDYGYTF +CASRDRLAGGVETQYF +CASRDSNTYEQYF +CASRDTGQRQPQHF +CASREFQGLAYEQYF +CASREFSDRGYEQYF +CASRELEGATDTQYF +CASRELGLAKNIQYF +CASRERSYNGEQFF +CASREVGWNEQFF +CASRFGFSEAFF +CASRFGTEGSGANVLTF +CASRFLAGLYNEQFF +CASRFTSGSSYEQYF +CASRGAGELFF +CASRGDGSFRELFF +CASRGDREGTEAFF +CASRGEEFYLHF +CASRGGEAFF +CASRGGIEEQYF +CASRGGTEAFF +CASRGGTIAFF +CASRGLDYYNEQFF +CASRGLGGSSYNEQFF +CASRGLSAFF +CASRGNEQFF +CASRGRAGGTQYF +CASRGREAFF +CASRGRSASYGYTF +CASRGSLNQPQHF +CASRGTSLNEQFF +CASRHFTGGGSTDTQYF +CASRIDSNQPQHF +CASRIHGGAINEQFF +CASRILDRANLNTEAFF +CASRINPGGADEAFF +CASRIRAAETQYF +CASRKDNGVAGELFF +CASRKTGGGETQYF +CASRLAEETQYF +CASRLAGSHEQYF +CASRLAGVSYNEQFF +CASRLGRVSNTEAFF +CASRLGSYNEQFF +CASRLHPGHMSYTF +CASRLKDSDSQPQHF +CASRLQVGTEAFF +CASRLVQGRAFYEQYF +CASRMASGETQYF +CASRMGGNTEAFF +CASRPAGGSNQPQHF +CASRPFGGPKNIQYF +CASRPFHPGALFF +CASRPGDNEQFF +CASRPGGGYEQYF +CASRPGLATDTQYF +CASRPHVPPLAGPANTGELFF +CASRPIRADRVTTGELFF +CASRPKQGGGGNEQFF +CASRPLLGSNQPQHF +CASRPNEGPYEQYF +CASRPPPAGQKETQYF +CASRPQDSYEQYF +CASRPSGPQETQYF +CASRPSGSELIYEQYF +CASRPSPGVFNEQFF +CASRPVPYNEQFF +CASRQALQETQYF +CASRQEMNTEAFF +CASRQGDNSPLHF +CASRQGGTEAFF +CASRQGNTEAFF +CASRQGPTEAFF +CASRQGTGSGAFF +CASRQIGTSTDTQYF +CASRQSFNEQFF +CASRQTSGSYNEQFF +CASRQWTGGTTGELFF +CASRRDGPSYEQYF +CASRRDSTSSYEQYF +CASRRGTDTLTDTQYF +CASRSAYEQYF +CASRSDAEAFF +CASRSFSNQPQHF +CASRSGAEAFF +CASRSGAEALF +CASRSGAETFF +CASRSGAEVFF +CASRSGAKAFF +CASRSGNYGEQYF +CASRSGREAFF +CASRSGTEAFF +CASRSLAGVWDTQYF +CASRSLTDTQYF +CASRSPQETQYF +CASRSQEGSNYGYTF +CASRSQGARGYTF +CASRSSAEAFF +CASRSSSASEAFF +CASRSVAEAFF 
+CASRTEKTVLLAFF +CASRTFGGSSYNEQFF +CASRTGARGYTF +CASRTGEWPETQYF +CASRTGGFEAFF +CASRTGGYEQYF +CASRTGSMNTEAFF +CASRTGTGAFYEQYF +CASRTIAGALNEQFF +CASRTLRLAGGLFTDTQYF +CASRTRSSYEQCF +CASRTSGSYEQFF +CASRTSPTDTQYF +CASRTVNPPLHF +CASRTVNSPLHF +CASRTYRGQSYEQYF +CASRVDRLPSNEKLFF +CASRVGDSTEAFF +CASRVMGAGGEAFF +CASRVRDRGRLDYGYTF +CASRVTGTQYF +CASRVTSGGRDEQFF +CASRWGAEAFF +CASRWGQGPSYEQYF +CASRYSGGDTGELFF +CASRYSRFAEYEQYF +CASSAALSEAFF +CASSADGILNTEAFF +CASSADRAYEQYF +CASSADRGLDPHQAQHF +CASSAFGADTQYF +CASSAFPTAHNQPQHF +CASSAGAYGYTF +CASSAGLYGYTF +CASSAGNYGYTF +CASSAGPYNEQFF +CASSAGQGGSPLHF +CASSAGQGLPYEQYF +CASSAGQGRDTEAFF +CASSAGQGVKNIQYF +CASSAGQISRVTQYF +CASSAGQPQHF +CASSAGRTSTDTQYF +CASSAGSSGYTF +CASSAGSYGYTF +CASSAGTENTEAFF +CASSAGTSGGSTDTQYF +CASSAHGADTQYF +CASSAIAGAYNEQFF +CASSAIAGGNYEQYF +CASSAIGSSYNEQFF +CASSALAGGYNEQFF +CASSALAGPGNEQFF +CASSALAGRYNEQFF +CASSALGAGGDTQYF +CASSALLNTEAFF +CASSALPEAFF +CASSALQGPSQPQHF +CASSALSGIYNEQFF +CASSALSGRAIDEQYF +CASSALSNQPQHF +CASSALVLYATDTQYF +CASSALVSRETQYF +CASSAMTGWGGYTF +CASSANSINEQFF +CASSAPDTQYF +CASSAPHTGELFF +CASSAPPGTTPYEQYF +CASSAPPPSYEQYF +CASSAPSHSTDTQYF +CASSAQANQPQHF +CASSAQDHSDPNTGELFF +CASSAQSNQPQHF +CASSARAGITQYF +CASSARAGSGELFF +CASSARASDTQYF +CASSARASEGAFF +CASSARASGELFF +CASSARATDTQYF +CASSARDDYDEQYF +CASSARDHSGPNTGELFF +CASSARFGTKQFF +CASSARGADGYTF +CASSARGAYEQYF +CASSARGSGEQYF +CASSARLAGGGNEQFF +CASSARQGLPYEQYF +CASSARSADTQYF +CASSARSAETQYF +CASSARSAYEQYF +CASSARSDEKLFF +CASSARSGEPQHF +CASSARSGNEQFF +CASSARSGTEAFF +CASSARSGYEQYF +CASSARSMDIQYF +CASSARSNQPQHF +CASSARSNYEQYF +CASSARSQETQYF +CASSARSSDTQYF +CASSARSSGELFF +CASSARSSGEQFF +CASSARSSNEQFF +CASSARSSSEAFF +CASSARSSSEQYF +CASSARSSTEQYF +CASSARSSYEQYF +CASSARSTDTPYF +CASSARSTDTQYF +CASSARSTGELFF +CASSARVGNEQYF +CASSASEDYEQYF +CASSASGAYNEQFF +CASSASGDTGELFF +CASSASGGAGNEQFF +CASSASGRYNEQFF +CASSASGTTDTQYF +CASSASRGEVGSPLHF +CASSATGSTYEQYF +CASSATGVAGELFF +CASSATSGGYNEQFF +CASSATSIYEQFF +CASSATVNTEAFF +CASSAVGVPHVEKLFF +CASSAVLGNTEAFF +CASSAVQGDTQYF 
+CASSAVSSTDTQYF +CASSAVWGYEQYF +CASSAYDTDTQYF +CASSAYDWDEQFF +CASSAYGGNTEAFF +CASSAYHSYEQYF +CASSAYLGAGELFF +CASSCTGTGETPF +CASSDAGGGMAEAFF +CASSDAGGSGNTIYF +CASSDALGVSNYGYTF +CASSDAYGYKLFF +CASSDDPGFGQPQHF +CASSDELETLKLYGYTF +CASSDFAPDLTSGDYNEQFF +CASSDFGLLGTEAFF +CASSDGTGGILYEQYF +CASSDHPGSRELWKHHIF +CASSDLAGDPTDTQYF +CASSDLDRAKGTDTQYF +CASSDLDSNQPQHF +CASSDLGEQFF +CASSDLSTDTQYF +CASSDLTVEYF +CASSDPAGNTIYF +CASSDPGNTQYF +CASSDPPGSSYNEQFF +CASSDPSFFLTNTEAFF +CASSDPTGRLGTEAFF +CASSDRAAGGYTF +CASSDRAGETQYF +CASSDRAGNEQFF +CASSDRAGNTIYF +CASSDRANTEAFF +CASSDRASNEQYF +CASSDRATGELFF +CASSDRATQPQHF +CASSDRDHRRFF +CASSDRDSYNEQFF +CASSDRELAVHTQYF +CASSDRGGGDEQFF +CASSDRGGYEQYF +CASSDRSGNEQFF +CASSDRSGNTIYF +CASSDRSGTEQYF +CASSDRSNQPQHF +CASSDRSNTEAFF +CASSDRSSQPQHF +CASSDRTSGINEQFF +CASSDSAGGTDTQYF +CASSDSASSYNEQFF +CASSDSDNNEQFF +CASSDSEGDTDTQYF +CASSDSGAREGTGELFF +CASSDSGTTNEKLFF +CASSDSIQSTDTQYF +CASSDSPRVDEQYF +CASSDSRQGAFRRETQYF +CASSDSRTPFQRPLFEAFF +CASSDSSGANVLTF +CASSDSSGGLYNEQFF +CASSDSSHEGQGAYEQYF +CASSDSSVSEQFF +CASSDSTSGGRDTQYF +CASSDSVETGELFF +CASSDSYLEGGYTF +CASSDTGVDEQFF +CASSDWDGINLAGAFPWETQYF +CASSDWGTEAFF +CASSDWRTSPWNEQFF +CASSDWSGNTIYF +CASSDYAANTEAFF +CASSEAEWGSAQPQHF +CASSEAHYEQYF +CASSEAISGAWDEQFF +CASSEALAGGGTDTQYF +CASSEALLGNQPQHF +CASSEALVLEQFF +CASSEALWDSTHKNIQYF +CASSEAWNTEAFF +CASSEDPATNEKLFF +CASSEDPTEADTQYF +CASSEDTGAQETQYF +CASSEDTGLLETQYF +CASSEEAGEYNEQFF +CASSEEGQGRLSEQFF +CASSEEGRLYNSPLHF +CASSEEKYEQYF +CASSEEYNEQFF +CASSEFAGDNNEQFF +CASSEFGTSGSGEQFF +CASSEGEAVYNSPLHF +CASSEGFTSTDTQYF +CASSEGGGPYNEQFF +CASSEGGTGGFNSHEQYF +CASSEGGVETQYF +CASSEGQGPSYEQYF +CASSEGQVSPGELFF +CASSEGSYGYTF +CASSEGTNEQFF +CASSEGTSSYNEQFF +CASSEGTVSNEKLFF +CASSEGTVTEAFF +CASSEGVTPPNSPLHF +CASSEGWEGNYNEQFF +CASSEHRTGSDYGYTF +CASSEIGAPGNTIYF +CASSEISGTPQHF +CASSEKDRTVGTDTQYF +CASSELAGGAFGNQPQHF +CASSELDSYTEAFF +CASSELGATQYF +CASSELGDTQYF +CASSELGGALGPQHF +CASSELGRSSSGNTIYF +CASSELPGYGNTIYF +CASSELREQFF +CASSELRYEQYF +CASSELSGSGQEQYF +CASSELTGQGSEQYF 
+CASSENTGTGGNYGYTF +CASSEPHGLNQPQHF +CASSEPLGGTSYNEQFF +CASSEQGANEKLFF +CASSEQSNQPQHF +CASSERADRAAYEQYF +CASSERDRTTNEKLFF +CASSEREQFF +CASSEREQYF +CASSERGAGDTIYF +CASSERGPDQFPHTEAFF +CASSERRQGLGNQPQHF +CASSERSNQPQHF +CASSERSTDTQYF +CASSESDYNEQFF +CASSESEGLGEKLFF +CASSESTAGRVPSTDTQYF +CASSESTDTEAFF +CASSESTGVEKLFF +CASSESTLSLTEAFF +CASSESTNTEAFF +CASSESWDTQYF +CASSESYRGEKLFF +CASSETENHYEQFF +CASSETGGTLSPLHF +CASSETGLFGGEAFF +CASSETGLNTEAFF +CASSETGSNTEAFF +CASSETGSTYEQYF +CASSETGSVYEQYF +CASSETGVGGYTF +CASSETGVGSYEQYF +CASSETSGGTDTQYF +CASSETSINEQFF +CASSETYTSGPNTGELFF +CASSEVDWVGGADGYTF +CASSEVGAYSNQPQHF +CASSEVGQGARYNEQFF +CASSEVSSGDNEQFF +CASSEWAGGGTDTQYF +CASSEWARQNNSPLHF +CASSEWSHNTGELFF +CASSEWYLMNTEAFF +CASSEYAGRSYNEQFF +CASSEYDVNEQFF +CASSEYLGTAYEQYF +CASSFAGTANAKNIQYF +CASSFARAYEQYF +CASSFASEAFF +CASSFATKQYF +CASSFAVGPQHF +CASSFAVRSLGGGRWADTQYF +CASSFDGVYEQYF +CASSFDHSNEQFF +CASSFDRGTETQYF +CASSFDSSQPRLF +CASSFEAGELFF +CASSFESSYNEQFF +CASSFETFGSGELFF +CASSFETVMSSYEQYF +CASSFFARGQGTYEQYF +CASSFFDGSYEQYF +CASSFFGGASPDTQYF +CASSFFGGREAFF +CASSFFQGVGEQYF +CASSFFSGMISDTQYF +CASSFFSSGNTIYF +CASSFGAGADTQYF +CASSFGAGAGSYNEQFF +CASSFGAGELFF +CASSFGAGGNEQFF +CASSFGAGNNEQFF +CASSFGASSTEAFF +CASSFGDEQFF +CASSFGENTGELFF +CASSFGEQFF +CASSFGESYEQYF +CASSFGGGAWTDTQYF +CASSFGGGQGTQYF +CASSFGGGYREAFF +CASSFGGQPQHF +CASSFGGRGLGFSGNTIYF +CASSFGGSEQYF +CASSFGGTEAFF +CASSFGGTQYF +CASSFGHPGTQYF +CASSFGIYEQYF +CASSFGIYGYTF +CASSFGLGADTQYF +CASSFGLGSYEQYF +CASSFGLHDYTF +CASSFGLHGYTF +CASSFGLTSSYNEQFF +CASSFGLYEQYF +CASSFGLYGYTF +CASSFGMGSEKLFF +CASSFGMHGYTF +CASSFGPRDTQYF +CASSFGQGADEAFF +CASSFGQGADEQYF +CASSFGQGADIQYF +CASSFGQGADTQYF +CASSFGQGAEKLFF +CASSFGQGAVGNTIYF +CASSFGQGDEKLFF +CASSFGQGGDTQYF +CASSFGQGGEKLFF +CASSFGQGGYQPQHF +CASSFGQGLEQYF +CASSFGQGLGAFF +CASSFGQGSDTQYF +CASSFGQGSEKLFF +CASSFGQGSETQYF +CASSFGQGSTEAFF +CASSFGQGSYEQYF +CASSFGQGTEAFF +CASSFGQGTGELFF +CASSFGQGYGYTF +CASSFGQNTGELFF +CASSFGRGSYEQYF +CASSFGSDYNEQFF +CASSFGSGANVLTF +CASSFGSGGYEQYF +CASSFGSGNNEQFF 
+CASSFGSGRWLNEQFF +CASSFGSHGYTF +CASSFGSTDTQYF +CASSFGSVGYTF +CASSFGSYGYTF +CASSFGTGELFF +CASSFGTGENIQYF +CASSFGTGGFYTF +CASSFGTGGGQPQHF +CASSFGTGQSSYNEQFF +CASSFGTLYGYTF +CASSFGTSRGRETQYF +CASSFGTSSDTQYF +CASSFGTSSSGANVLTF +CASSFGTWGYTF +CASSFGVGLAGVGLDTQYF +CASSFGVHGYTF +CASSFGVYEQYF +CASSFHDRGDQPQHF +CASSFHGLAGEETQYF +CASSFIFPGELFF +CASSFIHREEGYTF +CASSFISTDTQYF +CASSFKDNGETQYF +CASSFKNAEAFF +CASSFLAGEGTDTQYF +CASSFLAGGYNEQFF +CASSFLAGPYNEQFF +CASSFLATDEQFF +CASSFLDRGPNTEAFF +CASSFLDRVFGGELFF +CASSFLFSRTGRQETQYF +CASSFLGANEQFF +CASSFLGGEGSTLNTGELFF +CASSFLGGQYNEQFF +CASSFLGKNEKLFF +CASSFLGPGLNEQYF +CASSFLGTGLNEQYF +CASSFLIGELFF +CASSFLNLNNEQFF +CASSFLQGLNTEAFF +CASSFLRFYGYTF +CASSFLRGSTDTQYF +CASSFLSNQPQHF +CASSFLSTDTQYF +CASSFLTGAYEQYF +CASSFLTGLNEQFF +CASSFLTGNTEAFF +CASSFLVETQYF +CASSFMGADYGYTF +CASSFMGASTDTQYF +CASSFMGQGGSYTF +CASSFMVGAGELFF +CASSFNPRQVLSYEQYF +CASSFNRGPPLSGANVLTF +CASSFNTEAFF +CASSFPAGNEQFF +CASSFPPGQGPGYNSPLHF +CASSFPPRLYANYGYTF +CASSFPTSGGAEDTQYF +CASSFQGTTGANVLTF +CASSFQMEGGQDTQYF +CASSFQPSPYNEQFF +CASSFQRAGELFF +CASSFQSDNEQFF +CASSFRAALEQYF +CASSFRAAYEQYF +CASSFRAGETQYF +CASSFRAGNEQFF +CASSFRAPYGYTF +CASSFRASYEQYF +CASSFRATDTQYF +CASSFRDPHQPQHF +CASSFRDRGLGEQFF +CASSFRFGGEAFF +CASSFRGETGFTGELFF +CASSFRGTYNEQFF +CASSFRGVFSPLHF +CASSFRLAGGDEQFF +CASSFRLAGVSDTQYF +CASSFRLDLTDTQYF +CASSFRRGKGETQYF +CASSFRRPGLAKNIQYF +CASSFRSADTQYF +CASSFRSANEQFF +CASSFRSGETQYF +CASSFRSGIEQYF +CASSFRSGTEAFF +CASSFRSGVEQYF +CASSFRSQETQYF +CASSFRSSEKLFF +CASSFRSSETQYF +CASSFRSSQPQHF +CASSFRSSTEAFF +CASSFRSSYEQYF +CASSFRSTDTQYF +CASSFRSTGELFF +CASSFSAGELFF +CASSFSEEQFF +CASSFSFILTPSQPINIQYF +CASSFSGDTEAFF +CASSFSGLSNTEAFF +CASSFSGNTGELFF +CASSFSGTANTEAFF +CASSFSLGGAIDTQYF +CASSFSPSSYNEQFF +CASSFSQGEQETQYF +CASSFSRGHEKLFF +CASSFSSSYNEQFF +CASSFSSTDTQYF +CASSFSSYEQYF +CASSFSTAGRETQYF +CASSFSTDTQYF +CASSFSTGANTEAFF +CASSFSYGYTF +CASSFTDGPYEQYF +CASSFTGNSPLHF +CASSFTGPGELFF +CASSFTGSTDTQYF +CASSFTGSTEAFF +CASSFTLNTEAFF +CASSFTTGYEQYF +CASSFTVFTGYTF +CASSFTVNTEAFF 
+CASSFTVNYGYTF +CASSFVAGGYNEQFF +CASSFVDRDTEAFF +CASSFWSDNEQFF +CASSFYGTGNTEAFF +CASSFYPGGNYGYTF +CASSFYPITRGGKTGELFF +CASSFYSNQPQHF +CASSFYSTTEQFF +CASSGAGAYNEQFF +CASSGDFATSYKQYF +CASSGDFPNEQFF +CASSGDSSGNTIYF +CASSGDSSNEQFF +CASSGDTGTTLYNEQFF +CASSGDTYNEQFF +CASSGEGGTTNTGELFF +CASSGETDTQYF +CASSGFDGSNQPQHF +CASSGFLYSPLHF +CASSGGDTYNEQFF +CASSGGGGNTEAFF +CASSGGGVDEKLFF +CASSGGHQNNEKLFF +CASSGGITVYGYTF +CASSGGLAGSTDTQYF +CASSGGRNQPQHF +CASSGGTSSDTQYF +CASSGGVSGGAGETQYF +CASSGHGADTQYF +CASSGISTDTQYF +CASSGLAAPSYNEQFF +CASSGLAGARNEQYF +CASSGLAGINEQFF +CASSGLAGLNEQFF +CASSGLAGTYNEQFF +CASSGLATDTQYF +CASSGLEGAGDTQYF +CASSGLNEQFF +CASSGLSNQPQHF +CASSGLSSGNTIYF +CASSGNEPPQYGETQYF +CASSGNSGNTIYF +CASSGNWEVGEAFF +CASSGPGGVDEQYF +CASSGPGTGYNEQFF +CASSGPTNTGELFF +CASSGPVAEQFF +CASSGQAGVEQFF +CASSGQGAQPQHF +CASSGQGDTGELFF +CASSGQGGTEAFF +CASSGQGNQPQHF +CASSGQLNTEAFF +CASSGQSNQPQHF +CASSGRAADTQYF +CASSGRAAGELFF +CASSGRAAHEQYF +CASSGRAAYEQYF +CASSGRAGDTQYF +CASSGRAGGELFF +CASSGRAGNEQFF +CASSGRAGNTIYF +CASSGRAGTEAFF +CASSGRAGVEQFF +CASSGRASDEQYF +CASSGRASDTQYF +CASSGRASGELFF +CASSGRASTEAFF +CASSGRATDTQYF +CASSGRATGELFF +CASSGRATGKLFF +CASSGRATYEQYF +CASSGRGTDTQYF +CASSGRRTGELFF +CASSGRSADTQYF +CASSGRSADTRYF +CASSGRSAETQYF +CASSGRSAGELFF +CASSGRSAGEQYF +CASSGRSAVEAFF +CASSGRSAVEQFF +CASSGRSDGELFF +CASSGRSGDTQYF +CASSGRSGETQYF +CASSGRSGIEQYF +CASSGRSGNTIYF +CASSGRSGVEQFF +CASSGRSGVEQYF +CASSGRSGYEQYF +CASSGRSGYNEQFF +CASSGRSLEEQYF +CASSGRSNQPQHF +CASSGRSNRPQHF +CASSGRSNTEAFF +CASSGRSQETQYF +CASSGRSSDEQFF +CASSGRSSDTQYF +CASSGRSSEPQHF +CASSGRSSGELFF +CASSGRSSGTQYF +CASSGRSSQPQHF +CASSGRSSYEQYF +CASSGRSTDEQYF +CASSGRSTDTQYF +CASSGRSTGELFF +CASSGRSTGEQYF +CASSGRSVGELFF +CASSGRTADTQYF +CASSGRTAETQYF +CASSGRTGSSYEQYF +CASSGSDTQYF +CASSGSGTSAKNIQYF +CASSGSGVGTGELFF +CASSGSGYEQFF +CASSGTATNEKLFF +CASSGTAVEKLFF +CASSGTEFF +CASSGTGAGEQYF +CASSGTGGPGELFF +CASSGTGGSPLHF +CASSGTGVDIQYF +CASSGTKETQYF +CASSGTSFTEQYF +CASSGTSGGYEQYF +CASSGTSGNYEQYF +CASSGTSGSTDTQYF +CASSGTSGTEQYF +CASSGTSLAEQYF 
+CASSGTSNQPQHF +CASSGTSVHEQYF +CASSGVGGGELFF +CASSGVSLQPQHF +CASSGVSNQPQHF +CASSGWLVQGNQPQHF +CASSGWSLGPQHF +CASSGYSINEQFF +CASSGYSNQPQHF +CASSGYSVDTQYF +CASSHAAGLAGLQETQYF +CASSHAGGGDEQFF +CASSHFMTGGQPQHF +CASSHFPPPYEQYF +CASSHFQGASYGYTF +CASSHGAEAFF +CASSHGGDSGNTIYF +CASSHGQGGQETQYF +CASSHGSYGYTF +CASSHGTAGSGNTIYF +CASSHGTGMTIYF +CASSHGTVETEAFF +CASSHKQATGRKTYEQYF +CASSHLAGGYNEQFF +CASSHLDRTAGADTQYF +CASSHLDSTTNEKLFF +CASSHLMSTDTQYF +CASSHLQGGQYNEQFF +CASSHLYEQYF +CASSHNSGLAGRTYEQYF +CASSHPGDEQYF +CASSHPGHIGQPQHF +CASSHPGQGITNEKLFF +CASSHPNPGLATGELFF +CASSHPRRGGEEKLFF +CASSHQGFNDYTF +CASSHQGFNGYTF +CASSHRATGELFF +CASSHREGRRLANEQYF +CASSHRGTGELFF +CASSHRQWEGLAGELFF +CASSHRSGQPQHF +CASSHRSNQPQHF +CASSHRSSYEQYF +CASSHRSTDTQYF +CASSHRSTGELFF +CASSHRTGDEKLFF +CASSHSSGGMSTEAFF +CASSHTDYSNQPQHF +CASSHTGPGYGYTF +CASSHTGSVNTEAFF +CASSHTGVITEAFF +CASSHTRTLGAKNIQYF +CASSHVWAGTGFVYGYTF +CASSHYSITEAFF +CASSHYSVNEQFF +CASSIAAWGTDNTGELFF +CASSIAAYNEQFF +CASSIADMSNQPQHF +CASSIAGDEQFF +CASSIAGDTQYF +CASSIAGETEAFF +CASSIAGFEQYF +CASSIAGGEQYF +CASSIAGGLREQFF +CASSIAGNSNQPQHF +CASSIAGNTEAFF +CASSIAPTDGEAFF +CASSIAPTDTQYF +CASSIAPTGELFF +CASSIAQGGYTF +CASSIARDEQFF +CASSIAREQYF +CASSIASHNEQFF +CASSIASNSPLHF +CASSIASVEGGQFF +CASSIATVDTQYF +CASSIAVRNEQFF +CASSIAVTNYGYTF +CASSIDGGNEQFF +CASSIDGNVAPLHF +CASSIDHGGRDGYTF +CASSIDHSTYYEQYF +CASSIDSNQPQHF +CASSIDTSSGANVLTF +CASSIEDLTNYGYTF +CASSIEGLAEYNEQFF +CASSIEGRTEAFF +CASSIEGTSGYEQYF +CASSIEQYYEQYF +CASSIESGEQFF +CASSIFAGAPYEQYF +CASSIFAGGDTQYF +CASSIFAGGFTEAFF +CASSIFAGGLFYNEQFF +CASSIFAGGTEQYF +CASSIFAGSNQPQHF +CASSIFAGVNQPQHF +CASSIFAHSSQPQHF +CASSIFANTEAFF +CASSIFAVSGNTIYF +CASSIFDGSYEQYF +CASSIFDGYYEQYF +CASSIFDSAREQFF +CASSIFDSRNQPQHF +CASSIFDSSNQPQHF +CASSIFDTANQPQHF +CASSIFFTGELFF +CASSIFFTSSGGYTF +CASSIFGAGELFF +CASSIFGAGNQPQHF +CASSIFGANVLTF +CASSIFGGAEAFF +CASSIFGGAETQYF +CASSIFGGAGNQPQHF +CASSIFGGAGSYNEQFF +CASSIFGGAKNIQYF +CASSIFGGAREQFF +CASSIFGGDYNEQFF +CASSIFGGEQLFF +CASSIFGGFHNEQFF +CASSIFGGGLNEQFF +CASSIFGGGRNEQFF 
+CASSIFGGGYNEQFF +CASSIFGGPKPQHF +CASSIFGGPNTEAFF +CASSIFGGPYNEQFF +CASSIFGGQGSYNEQFF +CASSIFGGQPQHF +CASSIFGGSAYNEQFF +CASSIFGGSGNTIYF +CASSIFGGSNEKLFF +CASSIFGGSNQPQHF +CASSIFGGSSYEQYF +CASSIFGGSSYNEQFF +CASSIFGGSTYNEQFF +CASSIFGGSYNEQFF +CASSIFGGTSNQPQHF +CASSIFGGYEQYF +CASSIFGHQNTEAFF +CASSIFGINEQFF +CASSIFGLNEQFF +CASSIFGNGYTF +CASSIFGNLPYEQYF +CASSIFGNQPQHF +CASSIFGNTIYF +CASSIFGQGEQYF +CASSIFGQGSQPQHF +CASSIFGQGTQYF +CASSIFGQREQYF +CASSIFGQVDAFF +CASSIFGRGEQFF +CASSIFGSDYNEQFF +CASSIFGSEGNEQFF +CASSIFGSPFYEQYF +CASSIFGSPRGEQYF +CASSIFGSSGNTIYF +CASSIFGSSNQPQHF +CASSIFGSYEQYF +CASSIFGTDTQYF +CASSIFGTDTYF +CASSIFGTGGTEAFF +CASSIFGTSLQFF +CASSIFHQKEAFF +CASSIFHSPKNQPQHF +CASSIFHTAGEGAFF +CASSIFHTDTQYF +CASSIFHTETQYF +CASSIFHTGELFF +CASSIFHTHYEQYF +CASSIFIYNEQFF +CASSIFKGSNQPQHF +CASSIFLGAEAFF +CASSIFLGQDQYF +CASSIFLGSNYGYTF +CASSIFMSSGNTIYF +CASSIFNGQEAFF +CASSIFNNQPQHF +CASSIFQAGELFF +CASSIFQAPSYEQYF +CASSIFQFNEQFF +CASSIFQGGGNTEAFF +CASSIFQGQPQHF +CASSIFQGSNQPQHF +CASSIFQNEKLFF +CASSIFQSGANVLTF +CASSIFQSGNQPQHF +CASSIFQTDTQYF +CASSIFSAALRPATF +CASSIFSAGGKDEQFF +CASSIFSAGNEQFF +CASSIFSANNQPQHF +CASSIFSAYNEQFF +CASSIFSGANVLTF +CASSIFSGGADTQYF +CASSIFSGGANEQFF +CASSIFSGGAYNEQFF +CASSIFSGGQNIQYF +CASSIFSGGYNEQFF +CASSIFSGKPQHF +CASSIFSGLNEQFF +CASSIFSGNTIYF +CASSIFSGPGIEAFF +CASSIFSGRENEQYF +CASSIFSGSGDGYTF +CASSIFSGSMNEQFF +CASSIFSGSYNEQFF +CASSIFSINEQFF +CASSIFSNQPQHF +CASSIFSTPNQPQHF +CASSIFSVNEQFF +CASSIFSVSNEQYF +CASSIFSVTELFF +CASSIFTGANVLTF +CASSIFTGGTEAFF +CASSIFTIEPQHF +CASSIFVGAHEAFF +CASSIFVGALSDEQYF +CASSIFVGNEQFF +CASSIFVGRNEQFF +CASSIFWGSDYGYTF +CASSIFWQSNTEAFF +CASSIFWRGEQFF +CASSIFYGTDYGYTF +CASSIFYRAEAFF +CASSIFYSSTDTQYF +CASSIFYTSGNTIYF +CASSIGADTQYF +CASSIGAFGYTF +CASSIGAGEQFF +CASSIGAGVKFF +CASSIGAHGYTF +CASSIGASGRDEQFF +CASSIGAWGYTF +CASSIGAYGYTF +CASSIGAYRYTF +CASSIGEGNGQFF +CASSIGEYGYTF +CASSIGFWGYTF +CASSIGFYGYTF +CASSIGGASYNEQFF +CASSIGGDEKLFF +CASSIGGELFF +CASSIGGGNQPQHF +CASSIGGPDTQYF +CASSIGGRDIQYF +CASSIGGREAFF +CASSIGGRQQYF 
+CASSIGGSGNQPQHF +CASSIGGSNEQFF +CASSIGGSSYNEQFF +CASSIGGTEAFF +CASSIGGWNEQYF +CASSIGHNEQFF +CASSIGHYGYTF +CASSIGHYSGNTIYF +CASSIGIEGYTF +CASSIGIESYEQYF +CASSIGIGEAFF +CASSIGIHGYTF +CASSIGIYGYTF +CASSIGLAGYTF +CASSIGLFGYTF +CASSIGLGEQFF +CASSIGLGEQYF +CASSIGLHGYTF +CASSIGLIGYTF +CASSIGLLSWQPQHF +CASSIGLNMGNQPQHF +CASSIGLPGYTF +CASSIGLWGYTF +CASSIGLWRTEAFF +CASSIGLYGYTF +CASSIGMGEQYF +CASSIGMGGTYNEQFF +CASSIGMHGYTF +CASSIGMSNTGELFF +CASSIGMYGYTF +CASSIGNAPVF +CASSIGNSPLHF +CASSIGNYGYTF +CASSIGPYTEAFF +CASSIGQGLDTQYF +CASSIGQGSGYEQYF +CASSIGQRAFF +CASSIGRSDADTQYF +CASSIGSAFEQYF +CASSIGSATEAFF +CASSIGSFGYTF +CASSIGSGEQFF +CASSIGSGQAYEQYF +CASSIGSHGYTF +CASSIGSTDTQYF +CASSIGSYGSTF +CASSIGSYGYTF +CASSIGTDTQYF +CASSIGTGEAFF +CASSIGTGELFF +CASSIGTGEQFF +CASSIGTGFTEQFF +CASSIGTGIYEQYF +CASSIGTGTQYF +CASSIGTHGYTF +CASSIGTSGDYNEQFF +CASSIGTSYNEQFF +CASSIGTTREGAFF +CASSIGTVQYF +CASSIGTYGYTF +CASSIGVAGYTF +CASSIGVEGTQYF +CASSIGVGEAFF +CASSIGVHGYTF +CASSIGVNGYTF +CASSIGVWGYTF +CASSIGVYGYTF +CASSIGWAGYTF +CASSIGWHGYTF +CASSIGWNGYTF +CASSIGWQTQYF +CASSIGWSGYTF +CASSIGWYGYTF +CASSIGYHGYTF +CASSIGYYGYTF +CASSIHAGLTSGNTIYF +CASSIHAGNQPQHF +CASSIHFSAGQPQHF +CASSIHGADTQYF +CASSIHGAETQYF +CASSIHGAGTEAFF +CASSIHGLETQYF +CASSIHHGNEQFF +CASSIHKNQPQHF +CASSIHQDQPQHF +CASSIHSADTQYF +CASSIHSANQPQHF +CASSIHSGANNEQFF +CASSIHSGGAVEQFF +CASSIHSGGNNEQFF +CASSIHSGGNTEAFF +CASSIHSGGVEAFF +CASSIHSGNQPQHF +CASSIHSGQNTEAFF +CASSIHSGQPNTQYF +CASSIHSGSITEAFF +CASSIHSGSNNEQFF +CASSIHSLNEQFF +CASSIHSNQPQHF +CASSIHSTDTQYF +CASSIHTANQPQHF +CASSIHWGNTEAFF +CASSIIAGAYNEQFF +CASSIIAGENIQYF +CASSIIAGGYNEQFF +CASSIIAGMDEQFF +CASSIIALDTQYF +CASSIIATSTEQFF +CASSIIDRTLGTDTQYF +CASSIIDRTRGTDTQYF +CASSIIGGAYNEQFF +CASSIIGGRRDGYTF +CASSIIGGSYNEQFF +CASSIIGSFDNEQFF +CASSIIGSGNTIYF +CASSIIGSSYNEQFF +CASSIIGVNEQFF +CASSIIHSSYNEQFF +CASSIISGDTQYF +CASSIISGGAPAF +CASSIISGGDYTF +CASSIISGNEQFF +CASSIISGPYNEQFF +CASSIISGSYNEQFF +CASSIISGVEQFF +CASSIISGYEQYF +CASSIISLDTQYF +CASSIISLVHNEQFF +CASSIISNQPQHF +CASSIISQETQYF 
+CASSIISQNEQFF +CASSIISSYEQYF +CASSIISTDTQYF +CASSIISVDGYTF +CASSIISVDTQYF +CASSIISVGEQFF +CASSIKSGGELFF +CASSIKSSYNEQFF +CASSILAGADTQYF +CASSILAGAYNEQFF +CASSILAGDEQFF +CASSILAGGYNEQFF +CASSILAGIEQFF +CASSILAGNEQFF +CASSILAGPYNEQFF +CASSILAGSYEQYF +CASSILAGSYNEQFF +CASSILAGVGRNEQFF +CASSILAQSEAFF +CASSILASSYNEQFF +CASSILASYEQYF +CASSILATDTQYF +CASSILAVAGNEQFF +CASSILAVDEQFF +CASSILAYYGYTF +CASSILDEQFF +CASSILDYGYTF +CASSILGAGANVLTF +CASSILGAQPQHF +CASSILGASYNEQFF +CASSILGGADVQFF +CASSILGKDTQYF +CASSILGLHEQFF +CASSILGLPGPYEQYF +CASSILGSSNQPQHF +CASSILGSSRNEQFF +CASSILITGELFF +CASSILITSYYEQYF +CASSILLGETQYF +CASSILPSGGYNEQFF +CASSILSGAYNEQFF +CASSILSGPRTEAFF +CASSILSGRAQYF +CASSILSLDTQYF +CASSILSMNTEAFF +CASSILSNQPQHF +CASSILSNTEAFF +CASSILSQDEQFF +CASSILSQQPQHF +CASSILSRDRPSTDTQYF +CASSILSRQPQHF +CASSILSTDTQYF +CASSILSYNEQFF +CASSILTGPQPQHF +CASSILTGPRTEAFF +CASSILTGQEQFF +CASSILVTSEQYF +CASSILVYNEQFF +CASSIMAAGGTEQFF +CASSIMAGAYNEQFF +CASSIMAGEYNEQFF +CASSIMAGSYNEQFF +CASSIMALGRSEAFF +CASSIMGASPQHF +CASSIMGGPYNEQFF +CASSIMGPGELFF +CASSIMGSGNTIYF +CASSIMGTDTQYF +CASSIMIRSEAFF +CASSIMRGPYNEQFF +CASSIMSGPSYNEQFF +CASSIMSNQPQHF +CASSIMSTDTQYF +CASSIMVTDTQYF +CASSINAYNEQFF +CASSINGGAYNEQFF +CASSINGGRRDGYTF +CASSINGGSRNGYTF +CASSINGGTNTEAFF +CASSINSGGYNEQFF +CASSINSGWGEQFF +CASSINSNQPQHF +CASSINSRAPLHF +CASSINSYSGNTIYF +CASSIPATEAFF +CASSIPGGGEQFF +CASSIPSTGELFF +CASSIPTGAEQYF +CASSIPTGGADTQYF +CASSIPTVSVGAQHF +CASSIQGAGELFF +CASSIQGGGELFF +CASSIQGITEAFF +CASSIQGNQPQHF +CASSIQGQAGNQPQHF +CASSIQHNEQFF +CASSIQSTDTQYF +CASSIRAAAEAFF +CASSIRAADEQYF +CASSIRAADTQYF +CASSIRAAEGYTF +CASSIRAAEPQHF +CASSIRAAETQYF +CASSIRAAGELFF +CASSIRAASEAFF +CASSIRAASPQHF +CASSIRAASYNEQFF +CASSIRAATEAFF +CASSIRAAVEQFF +CASSIRAAWEQYF +CASSIRAAYEQYF +CASSIRADDEQYF +CASSIRADDTQYF +CASSIRADGELFF +CASSIRADNEQFF +CASSIRADSPLHF +CASSIRADYEQYF +CASSIRAEGKLFF +CASSIRAFNQPQHF +CASSIRAGDEQFF +CASSIRAGDEQYF +CASSIRAGDNTEAFF +CASSIRAGDTQYF +CASSIRAGEKLFF +CASSIRAGETQYF +CASSIRAGFEQYF +CASSIRAGGEQYF 
+CASSIRAGGNEQFF +CASSIRAGGPMEQYF +CASSIRAGGRNEQFF +CASSIRAGGSPLHF +CASSIRAGIEQFF +CASSIRAGNEKLFF +CASSIRAGNEQFF +CASSIRAGTEQFF +CASSIRAGVEQFF +CASSIRAGVEQYF +CASSIRAGWEQYF +CASSIRAGYEQFF +CASSIRAGYEQYF +CASSIRAGYNEQFF +CASSIRAGYTEAFF +CASSIRAHYGYTF +CASSIRANTEAFF +CASSIRAPETQYF +CASSIRAQETQYF +CASSIRASDTQYF +CASSIRASETQYF +CASSIRASGVEQFF +CASSIRASHEQFF +CASSIRASNEQFF +CASSIRASQPQHF +CASSIRASVEQYF +CASSIRASWEQYF +CASSIRASYEQYF +CASSIRATDTQYF +CASSIRATGELFF +CASSIRATNEQFF +CASSIRATWEQYF +CASSIRATYEQYF +CASSIRCSYEQYF +CASSIRDGVNTEAFF +CASSIREGTEAFF +CASSIRFGGNAIYF +CASSIRFGTEAFF +CASSIRFSGANVLTF +CASSIRGAETQYF +CASSIRGAGEQFF +CASSIRGAHEQYF +CASSIRGAQEQYF +CASSIRGAYEQYF +CASSIRGFYNEQFF +CASSIRGGGEQFF +CASSIRGGIEQFF +CASSIRGGNEQFF +CASSIRGGSSYNEQFF +CASSIRGGVEQFF +CASSIRGGYEQYF +CASSIRGLASYNEQFF +CASSIRGLSNQPQHF +CASSIRGLYEQYF +CASSIRGRAPLPQHF +CASSIRGSDEQYF +CASSIRGSDTQYF +CASSIRGSETQYF +CASSIRGSGELFF +CASSIRGSGEQYF +CASSIRGSQPQHF +CASSIRGSTEAFF +CASSIRGSYEQYF +CASSIRHLYEQYF +CASSIRHSGSYNEQFF +CASSIRLGYNEQFF +CASSIRNSYEQYF +CASSIRNSYNEQFF +CASSIRNTGELFF +CASSIRPSYEQYF +CASSIRPTGELFF +CASSIRSADEQFF +CASSIRSADEQYF +CASSIRSADTQYF +CASSIRSAEKLFF +CASSIRSAETQYF +CASSIRSAGEAFF +CASSIRSAGELFF +CASSIRSAGEQYF +CASSIRSAGPLFF +CASSIRSAGRAFF +CASSIRSAHEAFF +CASSIRSAHEQFF +CASSIRSAHEQYF +CASSIRSAIEQYF +CASSIRSANVLTF +CASSIRSAQPQHF +CASSIRSASEQYF +CASSIRSATEAFF +CASSIRSAWAQHF +CASSIRSAWAQYF +CASSIRSAWEQFF +CASSIRSAYEQFF +CASSIRSAYEQYF +CASSIRSAYERYF +CASSIRSDAEAFF +CASSIRSDDIQYF +CASSIRSDEKLFF +CASSIRSDGELFF +CASSIRSDNEQFF +CASSIRSDSEAFF +CASSIRSDYEQYF +CASSIRSEDTQYF +CASSIRSEETQYF +CASSIRSEYEQYF +CASSIRSFYEQYF +CASSIRSGAEQFF +CASSIRSGAEQYF +CASSIRSGATQYF +CASSIRSGAYTEAFF +CASSIRSGDKQFF +CASSIRSGDTQYF +CASSIRSGEEQYF +CASSIRSGEPQHF +CASSIRSGESRAFF +CASSIRSGESTQYF +CASSIRSGETQYF +CASSIRSGFEQFF +CASSIRSGGEQFF +CASSIRSGGEQYF +CASSIRSGGPQHF +CASSIRSGGTEAFF +CASSIRSGHEQFF +CASSIRSGHEQYF +CASSIRSGIEAFF +CASSIRSGIEQFF +CASSIRSGNEKLFF +CASSIRSGNEQFF +CASSIRSGNEQYF +CASSIRSGNIQYF 
+CASSIRSGNTEAFF +CASSIRSGPEAFF +CASSIRSGPYTEAFF +CASSIRSGQAYTEAFF +CASSIRSGQPQHF +CASSIRSGSEQFF +CASSIRSGTEAFF +CASSIRSGTEQYF +CASSIRSGVEQFF +CASSIRSGVEQYF +CASSIRSGVIQYF +CASSIRSGWELFF +CASSIRSGWEQFF +CASSIRSGWEQYF +CASSIRSGYEQFF +CASSIRSGYEQYF +CASSIRSGYGYTF +CASSIRSHDEQFF +CASSIRSHYEQYF +CASSIRSLDIQYF +CASSIRSLDTQYF +CASSIRSLEPQHF +CASSIRSLETQYF +CASSIRSLGELFF +CASSIRSMDIQYF +CASSIRSNGEQFF +CASSIRSNNEQFF +CASSIRSNQPQHF +CASSIRSNTEAFF +CASSIRSNYEQYF +CASSIRSNYNEQFF +CASSIRSPYEQYF +CASSIRSQDTQYF +CASSIRSQETQYF +CASSIRSQTEQFF +CASSIRSQVEQFF +CASSIRSSAEAFF +CASSIRSSAEQYF +CASSIRSSCEQYF +CASSIRSSDEQFF +CASSIRSSDEQYF +CASSIRSSDTQYF +CASSIRSSEEQYF +CASSIRSSEPQHF +CASSIRSSETQYF +CASSIRSSFEAFF +CASSIRSSGELFF +CASSIRSSGEQFF +CASSIRSSGPQFF +CASSIRSSHEQFF +CASSIRSSHEQYF +CASSIRSSHNEQFF +CASSIRSSHTQYF +CASSIRSSLEQYF +CASSIRSSNEQFF +CASSIRSSNEQYF +CASSIRSSQPQHF +CASSIRSSSEQFF +CASSIRSSSEQYF +CASSIRSSTEAFF +CASSIRSSVAQYF +CASSIRSSVEAFF +CASSIRSSVEFF +CASSIRSSVEQFF +CASSIRSSVEQYF +CASSIRSSVMQYF +CASSIRSSWEAFF +CASSIRSSWELFF +CASSIRSSWEQYF +CASSIRSSWKLFF +CASSIRSSYEAFF +CASSIRSSYEQFF +CASSIRSSYEQYF +CASSIRSSYEYF +CASSIRSSYGQYF +CASSIRSSYKQYF +CASSIRSSYNEQFF +CASSIRSTAEQYF +CASSIRSTDAQYF +CASSIRSTDIQYF +CASSIRSTDTQYF +CASSIRSTDTRYF +CASSIRSTEGAFF +CASSIRSTEKLFF +CASSIRSTEPQHF +CASSIRSTETQYF +CASSIRSTGELFF +CASSIRSTGELSF +CASSIRSTGGLFF +CASSIRSTHEQFF +CASSIRSTHTQYF +CASSIRSTKTQYF +CASSIRSTNEQFF +CASSIRSTQPQHF +CASSIRSTSEQYF +CASSIRSTSPLHF +CASSIRSTTEAFF +CASSIRSTVEQFF +CASSIRSTYEQYF +CASSIRSVAEAFF +CASSIRSVAEQYF +CASSIRSVDTQYF +CASSIRSVGELFF +CASSIRSVGEQYF +CASSIRSVHEQFF +CASSIRSVTEAFF +CASSIRSVYEQFF +CASSIRSVYEQYF +CASSIRSYYEQFF +CASSIRTAANYGYTF +CASSIRTAGDEAFF +CASSIRTANEQYF +CASSIRTSYEQYF +CASSIRVGYEQYF +CASSIRWAYEQYF +CASSIRWGYEQYF +CASSIRYMGTSLAF +CASSISAGPYNEQFF +CASSISATGELFF +CASSISDRAQETQYF +CASSISESCGYTF +CASSISESYGYTF +CASSISGANQPQHF +CASSISGASNYGYTF +CASSISGGPGETQYF +CASSISGGSEQFF +CASSISGGTEAFF +CASSISGLEAFF +CASSISGPGEQYF +CASSISGSGNEQFF +CASSISGTGNEQFF 
+CASSISGTSGSTQYF +CASSISGTSPEQYF +CASSISGTSYNEQFF +CASSISITTYNEQFF +CASSISLAAEQFF +CASSISPTGELFF +CASSISPTPRQGQFF +CASSISRRDRPNEKLFF +CASSISSGQPQHF +CASSISSGSYNEQFF +CASSISSGTEQFF +CASSISSINEQFF +CASSISSTDTQYF +CASSISSTGELFF +CASSISSTGEQYF +CASSISTAKNIQYF +CASSISTTGEQFF +CASSITANYGYTF +CASSITASGGHEQFF +CASSITGAFGTEAFF +CASSITGDSSYNEQFF +CASSITGGEGGELFF +CASSITGGNNEQFF +CASSITGNSNQPQHF +CASSITGPDTQYF +CASSITGTYNEQFF +CASSITGVNEQFF +CASSITGVYGYTF +CASSITGWGSPLHF +CASSITGYEQYF +CASSITHTGELFF +CASSITIGSTEAFF +CASSITISEQFF +CASSITPNTEAFF +CASSITPTGELFF +CASSITRSYEQYF +CASSITSGGLNEQFF +CASSITSGGYNEQFF +CASSITSGNEQFF +CASSITSGSYNEQFF +CASSITSLGELFF +CASSITSSYNEQFF +CASSITSTDTQYF +CASSITSTGELFF +CASSITSVNEQFF +CASSITSVNGYTF +CASSIVAGASEAFF +CASSIVAGGLNEQFF +CASSIVAGNYNEQFF +CASSIVAGSSYNEQFF +CASSIVAGTDTQYF +CASSIVAGVEQFF +CASSIVAGVYNEQFF +CASSIVALDEQFF +CASSIVALDTQYF +CASSIVAPSYEQYF +CASSIVAPTYEQYF +CASSIVAQDEQYF +CASSIVAQYEQYF +CASSIVASGAGGELFF +CASSIVGAAKNIQYF +CASSIVGFNEQFF +CASSIVGGAGNEQFF +CASSIVGGAYNEQFF +CASSIVGGIAYNEQFF +CASSIVGGNEQFF +CASSIVGGNYNEQFF +CASSIVGGSYNEQFF +CASSIVGGTDTQYF +CASSIVGGTEEAFF +CASSIVGGVEAFF +CASSIVGIGNQPQHF +CASSIVGSAYNEQFF +CASSIVGSGANVLTF +CASSIVGSGYNEQFF +CASSIVGSSYNEQFF +CASSIVGTGTYEQYF +CASSIVGVSEQYF +CASSIVHAADTQYF +CASSIVHVSYNEQFF +CASSIVIVAGANVLTF +CASSIVNSAPTYEQYF +CASSIVPGTGELFF +CASSIVPLDTQYF +CASSIVPWGGADGYTF +CASSIVQGAYNEQFF +CASSIVSADTQYF +CASSIVSGDEQFF +CASSIVSGDTQYF +CASSIVSGDYNEQFF +CASSIVSGGYNEQFF +CASSIVSGNEQFF +CASSIVSGSEQYF +CASSIVSGSYNEQFF +CASSIVSIAGELFF +CASSIVSINEQFF +CASSIVSLDEQFF +CASSIVSNQPQHF +CASSIVSQETQYF +CASSIVSQGELFF +CASSIVSRAEQYF +CASSIVSRDTQYF +CASSIVSRHEQFF +CASSIVSSETQYF +CASSIVSTDTQYF +CASSIVSTGELFF +CASSIVSTNEAFF +CASSIVSVADTQYF +CASSIVSVDEQFF +CASSIVSVNEQFF +CASSIVTATEAFF +CASSIVTDTQYF +CASSIVTGYGYTF +CASSIVVHGYTF +CASSIVVLGGELFF +CASSIVVPAEAFF +CASSIVVQGNEQFF +CASSIWAQDTQYF +CASSIWGTYEQYF +CASSIWQGSNQPQHF +CASSIWTDQETQYF +CASSIWTGDGYTF +CASSIYAALGTEAFF +CASSIYAGGGNTIYF +CASSIYAGVIYEQYF 
+CASSIYALGELFF +CASSIYDTVNTEAFF +CASSIYEGGYNEQFF +CASSIYFGEKLFF +CASSIYGAGELFF +CASSIYGANVLTF +CASSIYGAYEQYF +CASSIYGGAGDGYTF +CASSIYGGAGNEQFF +CASSIYGGEGPNTEAFF +CASSIYGGEVEQYF +CASSIYGGGNTIYF +CASSIYGGGYNEQFF +CASSIYGGSGEAFF +CASSIYGGSYNEQFF +CASSIYGMGNQPQHF +CASSIYGNTEAFF +CASSIYGNTGELFF +CASSIYGQSEQYF +CASSIYGSDEQFF +CASSIYGSPTNTQYF +CASSIYGSSNQPQHF +CASSIYGSSYNEQFF +CASSIYGSVGTEAFF +CASSIYGSYEQYF +CASSIYGTDTQYF +CASSIYGTGAYEQYF +CASSIYGTGTKGTEAFF +CASSIYGYGEQYF +CASSIYHLNPEAFF +CASSIYHRGGFYEQYF +CASSIYHTDTQYF +CASSIYIWTEAFF +CASSIYLLGTEAFF +CASSIYLMENQPQHF +CASSIYLSGNEQFF +CASSIYNNQPQHF +CASSIYNTDTQYF +CASSIYQGSSYNEQFF +CASSIYQGVYNEQFF +CASSIYRFTEAFF +CASSIYRGSYNEQFF +CASSIYRLGYEQYF +CASSIYRTSYNEQFF +CASSIYRWNTIYF +CASSIYSAGIGYTF +CASSIYSANEQFF +CASSIYSGAYNEQFF +CASSIYSGDTQYF +CASSIYSGGKNIQYF +CASSIYSGGNNEQFF +CASSIYSGGYNEQFF +CASSIYSGNEQYF +CASSIYSGNQPQHF +CASSIYSGNTIYF +CASSIYSGPGNTIYF +CASSIYSGPYNEQFF +CASSIYSGQPQHF +CASSIYSGSYNEQFF +CASSIYSGVGNEQFF +CASSIYSGVWNEQFF +CASSIYSIEEQFF +CASSIYSINEQFF +CASSIYSISEAFF +CASSIYSKDTQYF +CASSIYSKETQYF +CASSIYSLNEQFF +CASSIYSLNTEAFF +CASSIYSNGEQYF +CASSIYSNQPQHF +CASSIYSNSNQPQHF +CASSIYSNTEAFF +CASSIYSRAYGYTF +CASSIYSRGDEQFF +CASSIYSSGNTIYF +CASSIYSSNEQFF +CASSIYSSSYNEQFF +CASSIYSSTDTQYF +CASSIYSSYEQYF +CASSIYSTDTQYF +CASSIYSTPDQPQHF +CASSIYSVNEQFF +CASSIYSYNEQFF +CASSKAHRVSRRPQHF +CASSKANGGSNQPQHF +CASSKARSLGNRGNEQFF +CASSKDRGWTEAFF +CASSKEFGQGQCGNQPQHF +CASSKFSINEQFF +CASSKGERESELGQETQYF +CASSKGGDEQFF +CASSKGGGSSYNSPLHF +CASSKGGISGRTNTGELFF +CASSKGGLNEQFF +CASSKGGRGGPETQYF +CASSKGGRPYYGYTF +CASSKGGTRGNEQFF +CASSKGKLPPGYGYTF +CASSKGTGEMIQPQHF +CASSKGTSGSSLGNEQFF +CASSKGTTGALQETQYF +CASSKGVDSEADTQYF +CASSKIAGGPYTQYF +CASSKLSRDEQFF +CASSKLSTDTQYF +CASSKNGQGANPFGELFF +CASSKNSGNTGELFF +CASSKPGRIRPLHF +CASSKQTGTGNYGYTF +CASSKRASVEQYF +CASSKRASYEQYF +CASSKRATSTDTQYF +CASSKRGAETQYF +CASSKRGGYEQYF +CASSKRGSYEQYF +CASSKRSGAEAFF +CASSKRSGIEQYF +CASSKRSGVEQYF +CASSKRSNQPQHF +CASSKRSQEPQHF +CASSKRSQETQYF +CASSKRSSDTQYF 
+CASSKRSSGELFF +CASSKRSSYEQYF +CASSKRSSYNEQFF +CASSKRSTDEQFF +CASSKRSTDTQYF +CASSKRSTGELFF +CASSKSASYEQYF +CASSKTIGNTEAFF +CASSKVGTVDQPQHF +CASSKVHYNEQFF +CASSKYSVNEQFF +CASSLAAGAEQYF +CASSLAAGGPNTEAFF +CASSLAAQGAAKNIQYF +CASSLAARGGSYEQYF +CASSLAASNQPQHF +CASSLADPLPSGQYF +CASSLAESVYEQYF +CASSLAEVHNEQFF +CASSLAFRQRETDTQYF +CASSLAGAGGYTF +CASSLAGDGSYNEQFF +CASSLAGDSYEQYF +CASSLAGEVNEQFF +CASSLAGGAYGYTF +CASSLAGGPGNEQFF +CASSLAGGPLDTQYF +CASSLAGGRNEQFF +CASSLAGGSNQPQHF +CASSLAGNYGYTF +CASSLAGSPNEQFF +CASSLAGSSDEQFF +CASSLAGSYNEQFF +CASSLAGTMGQFF +CASSLAGVGEQFF +CASSLALAGGFMGNTEAFF +CASSLALAGTDTQYF +CASSLALASWDEQFF +CASSLALHSDGYTF +CASSLAMTDTTDTQYF +CASSLAPDRGETQYF +CASSLAPGGVKDTQYF +CASSLAPLSTDTQYF +CASSLAQGGALRGYTF +CASSLAQPTGGYYEQYF +CASSLAQRSEQYF +CASSLASGASSSYNEQFF +CASSLASGLYEQYF +CASSLATVVSGANVLTF +CASSLAVDRSSYNSPLHF +CASSLAVTDGAEAFF +CASSLAWSSYEQYF +CASSLAYEQFF +CASSLAYEQYF +CASSLAYKRQGPSMGTEAFF +CASSLDAGWFETQYF +CASSLDENYEQYF +CASSLDFAYEQYF +CASSLDGATYNEQFF +CASSLDGRPHLEQYF +CASSLDGTGALSNQPQHF +CASSLDPGQGWTNTEAFF +CASSLDPVYRSTEAFF +CASSLDRAGGSYEQYF +CASSLDRGVNTEAFF +CASSLDSANYGYTF +CASSLDSTYGYTF +CASSLDTEAYEQYF +CASSLDTGIYTEAFF +CASSLDTQYF +CASSLDTTGELFF +CASSLEAGNTEAFF +CASSLEAGSYNEQFF +CASSLEDRVIGSPLHF +CASSLEEGGGAYEQYF +CASSLEGAGGADTQYF +CASSLEGANYGYTF +CASSLEGGAKNGYTF +CASSLEGGGTDNEQFF +CASSLEGGTTPFYEQYF +CASSLEGLADTDTQYF +CASSLEGTATDTQYF +CASSLEGYNEQFF +CASSLEGYTEAFF +CASSLELAVTDTQYF +CASSLEPGTGETQYF +CASSLERGGKLFF +CASSLERGLTRETQYF +CASSLERGNTEAFF +CASSLERGTDTQYF +CASSLERLAGPDGSYEQYF +CASSLERNTYEQYF +CASSLERSTDTQYF +CASSLERTSGTSTDTQYF +CASSLESGTGANYGYTF +CASSLETGGGNEKLFF +CASSLETSDGETQYF +CASSLETSFTHGDTQYF +CASSLEVGGQETQYF +CASSLEVGLGGYTF +CASSLEVGQETQYF +CASSLEVTTNEKLFF +CASSLEWGGETQYF +CASSLFAGGPVEQYF +CASSLFAGTGELFF +CASSLFATNTDTQYF +CASSLFEETQYF +CASSLFFYSNQPQHF +CASSLFGGTDTQYF +CASSLFGHGGTEAFF +CASSLFGIGNQPQHF +CASSLFGRDLGGYTF +CASSLFGSPLHF +CASSLFGTHQETQYF +CASSLFPEDSYNEQFF +CASSLFPGFGEQYF +CASSLFPGLYGYTF +CASSLFPYRPEAFF +CASSLFQETQYF 
+CASSLFQGDEQFF +CASSLFQGYEQFF +CASSLFRGPETQYF +CASSLFRGQETQYF +CASSLFSETQYF +CASSLFSIGEQFF +CASSLFSISEQFF +CASSLFTGAFYEQYF +CASSLGADNEQFF +CASSLGAGASDEQYF +CASSLGAGAYEQFF +CASSLGAGDSSYEQYF +CASSLGAGFLQETQYF +CASSLGAGGLTGELFF +CASSLGAGGTVEQYF +CASSLGAGIGYTF +CASSLGAYGYTF +CASSLGAYSGANVLTF +CASSLGDEQFF +CASSLGDFTYEQYF +CASSLGDNTEAFF +CASSLGDPVWNTQYF +CASSLGDQPQHF +CASSLGDRAGGYTF +CASSLGDRPSTDTQYF +CASSLGDSFTGELFF +CASSLGDTEAFF +CASSLGEASYVGQPQHF +CASSLGELTDTQYF +CASSLGESSRDSNQPQHF +CASSLGETQYF +CASSLGFGNTIYF +CASSLGFNEQFF +CASSLGFTDTQYF +CASSLGGEQYF +CASSLGGFNGYTF +CASSLGGGFAQPQHF +CASSLGGGHTDTQYF +CASSLGGGMGYTF +CASSLGGGNEAFF +CASSLGGGSGGYTGELFF +CASSLGGGVSEAFF +CASSLGGLAGIRSEQFF +CASSLGGLRTEAFF +CASSLGGNTGELFF +CASSLGGQAAGYGYTF +CASSLGGSAYNEQFF +CASSLGGTAQGAVEAFF +CASSLGGTEAFF +CASSLGGVGYTEAFF +CASSLGGVSDTQYF +CASSLGHQETQYF +CASSLGIVEQFF +CASSLGIYGYTF +CASSLGKTSETEETQYF +CASSLGLAGDEQYF +CASSLGLANEQFF +CASSLGLDRAYATNEKLFF +CASSLGLGGSTDTQYF +CASSLGLGTGELFF +CASSLGLGYEQFF +CASSLGLHYEQYF +CASSLGLYDIEETQYF +CASSLGLYGYTF +CASSLGMGLMAFF +CASSLGMGLPYEQYF +CASSLGMRGGWDEQFF +CASSLGNEQFF +CASSLGPGVALEYF +CASSLGPITGLPYNEQFF +CASSLGPNRGAEQYF +CASSLGPSDFVTASGSITGGPDTQYF +CASSLGPSDTQYF +CASSLGPSGQETQYF +CASSLGPTGELFF +CASSLGQDTDTQYF +CASSLGQGADEQYF +CASSLGQGATEAFF +CASSLGQGATF +CASSLGQGRETQYF +CASSLGQGRRDEQFF +CASSLGQGSYGYTF +CASSLGQGTDTQYF +CASSLGQGTEAFF +CASSLGQGTGKLFF +CASSLGQMGNQPQHF +CASSLGQPQHF +CASSLGQQAKNIQYF +CASSLGQSANYGYTF +CASSLGQVAFYEQYF +CASSLGQVSYNEQFF +CASSLGRAVNYGYTF +CASSLGRNSGNTIYF +CASSLGRTNQPQHF +CASSLGRVSEQYF +CASSLGSFGYTF +CASSLGSGGPDEQYF +CASSLGSGLVYEQYF +CASSLGSQGGKQPQHF +CASSLGSRNEQFF +CASSLGSSNQPQHF +CASSLGSYGYTF +CASSLGTANYGYTF +CASSLGTDSGYGYTF +CASSLGTGAPANTGELFF +CASSLGTGENEQYF +CASSLGTGGGYTF +CASSLGTGGYEQYF +CASSLGTGHNEQFF +CASSLGTGLYGYTF +CASSLGTGMVTDTQYF +CASSLGTGNTIYF +CASSLGTGPYNEQFF +CASSLGTSLYF +CASSLGTSYEQYF +CASSLGTVASYEQYF +CASSLGTVYEQYF +CASSLGTYEQYF +CASSLGVAGTNTGELFF +CASSLGVAVYEQYF +CASSLGVGTEAFF +CASSLGVGWEAFF +CASSLGVTDTQYF 
+CASSLGVYGYTF +CASSLGWGFGEQYF +CASSLHEGNEQFF +CASSLHLLRGLNTGELFF +CASSLHRGKLFF +CASSLIAGGYNEQFF +CASSLIDRENTEAFF +CASSLIFEETQYF +CASSLIFPDTQYF +CASSLIFPGELFF +CASSLIFPGRAFF +CASSLIFPSGEQYF +CASSLIGGQGANGYTF +CASSLIQGAGSPLHF +CASSLISASGGIKNIQYF +CASSLISYNEQFF +CASSLIYPGELFF +CASSLKTHYSNQPQHF +CASSLKVYEQYF +CASSLLAGAGELFF +CASSLLAGGADWNEQFF +CASSLLAGGSGTDTQYF +CASSLLAGPYNEQFF +CASSLLAGVEQYF +CASSLLAGVYNEQFF +CASSLLASGFGTDTQYF +CASSLLATDTQYF +CASSLLFGGGDSYNEQFF +CASSLLGFSDGGTGELFF +CASSLLGFSTETQYF +CASSLLGGAGTDTQYF +CASSLLGGWSEAFF +CASSLLGHSNQPQHF +CASSLLGPSYNEQFF +CASSLLGRAGTDTQYF +CASSLLGTGPEAFF +CASSLLGTSGPRETQYF +CASSLLGVGEKLFF +CASSLLLARETQYF +CASSLLNPLTEAFF +CASSLLQAQRETTDTQYF +CASSLLQGALSSTDTQYF +CASSLLQGGSSPLHF +CASSLLQGPSYEQYF +CASSLLQGTEAFF +CASSLLSGTSHEQYF +CASSLLSGVLNTEAFF +CASSLLSNQPQHF +CASSLLSSAGEAFF +CASSLLVISHQPQHF +CASSLLVLDTQYF +CASSLLVSGVSSTDTQYF +CASSLLVTDTQYF +CASSLLWAEYNEQFF +CASSLMADTQYF +CASSLMAGSYEQYF +CASSLMGNQPQHF +CASSLMRGGTYNSPLHF +CASSLNGGSSYNEQFF +CASSLNGQETQYF +CASSLNLGQGYTF +CASSLNLKSSYNSPLHF +CASSLNLRDREAYEQYF +CASSLNRGGSYEQYF +CASSLNRGPADTQYF +CASSLNRGPSDTQYF +CASSLNSSSYEQYF +CASSLNTEAFF +CASSLPDSNEQFF +CASSLPGAINSPLHF +CASSLPGGYNEQFF +CASSLPGMSNSPLHF +CASSLPGRNYEQYF +CASSLPGTGGSPLHF +CASSLPGVGYNEQFF +CASSLPIRDRGYATNEKLFF +CASSLPKTINSEQYF +CASSLPRTPGRNTEAFF +CASSLPYKRDTQYF +CASSLQDRALSNDEQYF +CASSLQGANEKLFF +CASSLQGGDLDRRRPKSSPLHF +CASSLQGGPKNIQYF +CASSLQGKETQYF +CASSLQGNQPQHF +CASSLQGNYGYTF +CASSLQGTDTQYF +CASSLQGWEPQHF +CASSLQISIAGVSYNEQFF +CASSLQNTEAFF +CASSLQRGKVYEQYF +CASSLQSSTEAFF +CASSLQSSYEQYF +CASSLQSSYNEQFF +CASSLQSVDANGYTF +CASSLQVTGTNYGYTF +CASSLRAAGELFF +CASSLRAGDNEQFF +CASSLRAGEVEQFF +CASSLRAGGERYF +CASSLRAGIEQYF +CASSLRAGKTQYF +CASSLRAGWGYGYTF +CASSLRALAGFTYEQYF +CASSLRALSSYNSPLHF +CASSLRASETQYF +CASSLRASGAQYF +CASSLRASGEQYF +CASSLRASYEQYF +CASSLRDGSEAFF +CASSLRDGTSRTGWETQYF +CASSLRDRGARPNTEAFF +CASSLRDRGLSTEAFF +CASSLRDRGRGGTEAFF +CASSLRDSSGQETQYF +CASSLRGGFSVGYTF +CASSLRGLEGYTF +CASSLRGSGEQFF +CASSLRGVVSYEQFF 
+CASSLRKGFDEQYF +CASSLRLLQETQYF +CASSLRQDYGYTF +CASSLRQGLRNEQFF +CASSLRREGEVGQPQHF +CASSLRSADTQYF +CASSLRSAGEQYF +CASSLRSATEAFF +CASSLRSAYEQYF +CASSLRSGAVEQYF +CASSLRSGDTQYF +CASSLRSGGLEYF +CASSLRSGTEAFF +CASSLRSGTEDTDTQYF +CASSLRSNTEAFF +CASSLRSQETQYF +CASSLRSSDTQYF +CASSLRSSGELFF +CASSLRSSIEAFF +CASSLRSSNEQFF +CASSLRSSYEQYF +CASSLRSSYNEQFF +CASSLRSTDTQYF +CASSLRSTGELFF +CASSLRSTGTQYF +CASSLRSTQPQHF +CASSLRSTVEQFF +CASSLRSVEEQYF +CASSLRTGPGYEQYF +CASSLRTGRWEKLFF +CASSLRTRSSGANVLTF +CASSLRTSGGHVQETQYF +CASSLRVSEQFF +CASSLRVSYNEQFF +CASSLSAGETQYF +CASSLSAGVETQYF +CASSLSDGTDTQYF +CASSLSDRATERSYEQYF +CASSLSGAAEQYF +CASSLSGASEAFF +CASSLSGGEYGYTF +CASSLSGGPAEQYF +CASSLSGLGEQFF +CASSLSGNRQETQYF +CASSLSGNTEAFF +CASSLSGPYNEQFF +CASSLSGQGKFF +CASSLSGRVVYEQYF +CASSLSGSSGTNEQFF +CASSLSGSTYNEQFF +CASSLSGSYEQYF +CASSLSGSYNEQFF +CASSLSGVTEAFF +CASSLSLAEIYEQYF +CASSLSLAGGLSEDYEQYF +CASSLSLGVRAHEQYF +CASSLSLTSGGAGTDTQYF +CASSLSPAGTASYGYTF +CASSLSPEGFSEQYF +CASSLSPTSGGLPPYEQYF +CASSLSRGRTYEQYF +CASSLSRPYPETQYF +CASSLSRRGPNYGYTF +CASSLSSGFYNEQFF +CASSLSSGTYEQYF +CASSLSSVWQDLSYEQYF +CASSLSSYNEQFF +CASSLSTGEFSYEQYF +CASSLSTGGGLGETQYF +CASSLSVFKLYEQYF +CASSLTAGGGETQYF +CASSLTAGGGNSPLHF +CASSLTAGPTDTQYF +CASSLTATDTQYF +CASSLTEVSYEQYF +CASSLTFGGALETQYF +CASSLTFTGDLNQPQHF +CASSLTGAEEQFF +CASSLTGAGTEAFF +CASSLTGAGYEQYF +CASSLTGAPLDTQYF +CASSLTGAQETQYF +CASSLTGGGHEQYF +CASSLTGGGNIQYF +CASSLTGGLGTEAFF +CASSLTGGTDTQYF +CASSLTGGTEAFF +CASSLTGLTPLSTDTQYF +CASSLTGSGTEAFF +CASSLTGSPYEQYF +CASSLTGSPYNEQFF +CASSLTGSTPTDTQYF +CASSLTGTGFNYGYTF +CASSLTGTLVDEQYF +CASSLTGTTYEQYF +CASSLTGTVYEQYF +CASSLTGVDGYTF +CASSLTGVGEQYF +CASSLTGVPGNTIYF +CASSLTGVVREQYF +CASSLTHTYEQYF +CASSLTIGYNEQFF +CASSLTLVFEAFF +CASSLTPGHQPQHF +CASSLTQNSNQPQHF +CASSLTSGGLNEQFF +CASSLTSGGPGDTQYF +CASSLTSGNEQFF +CASSLTSGNNEQFF +CASSLTSGTVKNIQYF +CASSLTSGVADTQYF +CASSLTSLANEQFF +CASSLTSSSYNEQFF +CASSLTSVNEQFF +CASSLTSVSEQFF +CASSLTTGGRNEQFF +CASSLTVAGVYPGYEQYF +CASSLTVLYGYTF +CASSLVAGGGETQYF +CASSLVDRSITDTQYF +CASSLVDSGLMDEQYF 
+CASSLVEEEPTRGWPQHF +CASSLVEGGLAGSYNEQFF +CASSLVELQVEAFF +CASSLVETGELFF +CASSLVGDTYEQYF +CASSLVGEWEQFF +CASSLVGGGPQHF +CASSLVGGLVGGYTF +CASSLVGGPADTQYF +CASSLVGGTDEKLFF +CASSLVGLSYEQYF +CASSLVGLYSNQPQHF +CASSLVGRLTVDEQYF +CASSLVGSASGYNSPLHF +CASSLVGTGELFF +CASSLVGTGFLDEQFF +CASSLVGVDEQFF +CASSLVGVGEQYF +CASSLVGVGNTEAFF +CASSLVIGEQFF +CASSLVLGQVYEQYF +CASSLVPGQASYEQYF +CASSLVPPGLAGSTDTQYF +CASSLVPSGGPVSTDTQYF +CASSLVPTGQETQYF +CASSLVQASENEQYF +CASSLVRASGSMEQYF +CASSLVRDGPYEQYF +CASSLVRGDTEAFF +CASSLVSAHEQYF +CASSLVSASTDTQYF +CASSLVSGMLYF +CASSLVSSRHEQFF +CASSLVSSYNEQFF +CASSLVTEAFF +CASSLVTSGSYNEQFF +CASSLVTYPTDTQYF +CASSLVVPGTDTQYF +CASSLVVTSGFNEQFF +CASSLVWQGYGYTF +CASSLWDRAKANYGYTF +CASSLWEDYGYTF +CASSLWGDYEQYF +CASSLWLGGDTHYGYTF +CASSLWMNTEAFF +CASSLWQGASGNTIYF +CASSLWRGAGEAFF +CASSLWSNEQFF +CASSLWTGGGEQYF +CASSLWTGNTEAFF +CASSLWWRGGTDTQYF +CASSLYAGQNEQFF +CASSLYASGDSYNEQFF +CASSLYASGGATDTQYF +CASSLYASYEQYF +CASSLYDIAGGVQFF +CASSLYDSYNEQFF +CASSLYDTNTEAFF +CASSLYEQYF +CASSLYGGDNNEQFF +CASSLYGGGTGELFF +CASSLYGLAGDGEQYF +CASSLYGPLSTDTQYF +CASSLYITEPQHF +CASSLYKGIQETQYF +CASSLYPGPSYEQYF +CASSLYPGSYEQYF +CASSLYQTYEQYF +CASSLYRGEKLFF +CASSLYRGGTEAFF +CASSLYRGNTEAFF +CASSLYRQGWEAFF +CASSLYSATGELFF +CASSLYSDGGQGYTF +CASSLYSGGRNTGELFF +CASSLYSNQPQHF +CASSLYSPLAYSWNEKLFF +CASSLYSQDEQFF +CASSLYSSNEQFF +CASSLYYLAPKTYEQYF +CASSMAAGGELFF +CASSMAGGPGNEQFF +CASSMAGNTEAFF +CASSMAPTDTQYF +CASSMASLSEQFF +CASSMDRFTNQPQHF +CASSMDRGSADTQYF +CASSMDSNQPQHF +CASSMERTGSDTGELFF +CASSMFAGGPREQYF +CASSMFALDTQYF +CASSMFASNYGYTF +CASSMFATDTQYF +CASSMFDLPYNEQFF +CASSMFDSSYEQYF +CASSMFGALGTEAFF +CASSMFGAPNQPQHF +CASSMFGGGYNEQFF +CASSMFGGNEQFF +CASSMFGGNNQPQHF +CASSMFGGQPQHF +CASSMFGKETQYF +CASSMFGQLGYTF +CASSMFGTDTQYF +CASSMFGTEEQYF +CASSMFGVAGYTF +CASSMFHSGNTIYF +CASSMFLGQPQHF +CASSMFMISGNTIYF +CASSMFMPGELFF +CASSMFPGRNEQFF +CASSMFPNYQPQHF +CASSMFQGWNQPQHF +CASSMFSSLNTEAFF +CASSMFVGQPQHF +CASSMGAHGYTF +CASSMGAYGYTF +CASSMGDRSMGNIQYF +CASSMGGGATVLTF +CASSMGGSPLHF +CASSMGGVNTEAFF +CASSMGIYGYTF 
+CASSMGLVSFGQFF +CASSMGLYGYTF +CASSMGLYNEQFF +CASSMGNYGYTF +CASSMGQGTYEQYF +CASSMGQPQHF +CASSMGSGGTQYF +CASSMGSGSYGEAFF +CASSMGSHGYTF +CASSMGSVGYTF +CASSMGSYGYTF +CASSMGTGNTIYF +CASSMGVTEAFF +CASSMGVYGYTF +CASSMGYYGYTF +CASSMHGASELYF +CASSMIGETQYF +CASSMIGTGALNEQFF +CASSMIPQDTQYF +CASSMIQGSINEQYF +CASSMISINEQFF +CASSMLAAEAFF +CASSMLALDTQYF +CASSMLALGPQHF +CASSMLATDTQYF +CASSMLRIVYDTQYF +CASSMLSQDIQYF +CASSMMGSGYNEQFF +CASSMMSGTEAFF +CASSMPGPINEQFF +CASSMRAAAEAFF +CASSMRAAGELFF +CASSMRAALEQYF +CASSMRAAYEQYF +CASSMRAGETQYF +CASSMRAGGEQYF +CASSMRAGVEQFF +CASSMRAGYEQYF +CASSMRASHEQYF +CASSMRASNEQFF +CASSMRASSEQFF +CASSMRASSPLHF +CASSMRASVEQFF +CASSMRASVEQYF +CASSMRASYEQYF +CASSMRATGELFF +CASSMRGADTQYF +CASSMRGSDTQYF +CASSMRHLEEQYF +CASSMRSADTQYF +CASSMRSAGELFF +CASSMRSAHEQYF +CASSMRSAVGQFF +CASSMRSAWEQFF +CASSMRSAYEQYF +CASSMRSDEKLFF +CASSMRSDGELFF +CASSMRSEDTQYF +CASSMRSGANVLTF +CASSMRSGAVTQYF +CASSMRSGDTQYF +CASSMRSGETQYF +CASSMRSGIEQYF +CASSMRSGNTIYF +CASSMRSGPEQYF +CASSMRSGQPQHF +CASSMRSGSEHYF +CASSMRSGSEQFF +CASSMRSGSEQYF +CASSMRSGSGQYF +CASSMRSGSKQYF +CASSMRSGTMAFF +CASSMRSGVEQFF +CASSMRSGVEQYF +CASSMRSGYEQYF +CASSMRSNQPQHF +CASSMRSNTEAFF +CASSMRSQETQYF +CASSMRSSDIQYF +CASSMRSSDTQYF +CASSMRSSEQHF +CASSMRSSGELFF +CASSMRSSGEQYF +CASSMRSSGTQYF +CASSMRSSHEQFF +CASSMRSSHEQYF +CASSMRSSSEQYF +CASSMRSSSPLHF +CASSMRSSWEQYF +CASSMRSSYEQYF +CASSMRSSYNEQFF +CASSMRSTDTQYF +CASSMRSTGELFF +CASSMRSTNEQFF +CASSMRWAYEQYF +CASSMSAANQPQHF +CASSMSGVPTDTQYF +CASSMSGYNEQFF +CASSMSSGTEQYF +CASSMSSLDEQFF +CASSMSSSGYNEQFF +CASSMSTGELFF +CASSMSTSRNEAFF +CASSMTATHGYTF +CASSMTGGNEQFF +CASSMTGGTYGYTF +CASSMTGTGGTEAFF +CASSMTGVYEQYF +CASSMTGWGEQYF +CASSMTGYNEQFF +CASSMTSGNEQFF +CASSMTSGPYNEQFF +CASSMTSGSEQYF +CASSMTSGSLNEQFF +CASSMTSINEQYF +CASSMTSLNEQFF +CASSMTSRGEQFF +CASSMTSTDTQYF +CASSMVAALGEQFF +CASSMVAGAYNEQFF +CASSMVATDTQYF +CASSMVAVSEAFF +CASSMVAVTTDTQYF +CASSMVGGAKNIQYF +CASSMVGGLLYGYTF +CASSMVGGPGNEQFF +CASSMVGGPYNEQFF +CASSMVGGSYNEQFF +CASSMVGGVYGYTF +CASSMVGIHEQYF +CASSMVGSGYNEQFF 
+CASSMVMNTEAFF +CASSMVSIDEQFF +CASSMVSQGEQFF +CASSMVSTDTQYF +CASSMVVIGTEAFF +CASSMVVSAEAFF +CASSMWGQWGNQPQHF +CASSMWSAHEQYF +CASSMYAGNVLTF +CASSMYAGYNEQFF +CASSMYDTVNTEAFF +CASSMYGGDTQYF +CASSMYGGQPQHF +CASSMYGGSANEQFF +CASSMYGGSNQPQHF +CASSMYGYTEAFF +CASSMYLVSLNEQYF +CASSMYNSGANVLTF +CASSMYSGSANEQFF +CASSMYSINIQYF +CASSMYSNQPQHF +CASSMYSTDTQYF +CASSMYSVNEQFF +CASSMYYLSYNEQFF +CASSNFAGNTIYF +CASSNFDSSYEQYF +CASSNGAYGYTF +CASSNGLAGPSYNEQFF +CASSNGQYSNQPQHF +CASSNHAGAAQYF +CASSNHGAYEQYF +CASSNIATDTQYF +CASSNLAGGYNEQFF +CASSNLAGLNEQFF +CASSNLMLQGTGKLFF +CASSNLPRDEQYF +CASSNPDRTERTEAFF +CASSNPGTSTEDTQYF +CASSNQGPVGYTF +CASSNRAADTQYF +CASSNRAAYEQYF +CASSNRAGVEQYF +CASSNRASEKLFF +CASSNRASNEQFF +CASSNRASYEQYF +CASSNRSGETQYF +CASSNSGRYNEQFF +CASSNTVNYGYTF +CASSNVAGINEQFF +CASSNYRPFSTDTQYF +CASSPAEGQRGEKLFF +CASSPAGASNGYTF +CASSPAGHTGELFF +CASSPAGPGSPLHF +CASSPASGNEQYF +CASSPASGSYEQYF +CASSPATGPREQYF +CASSPDGGSQPQHF +CASSPDGPYNEQFF +CASSPDGSNTEAFF +CASSPDLGYEQYF +CASSPDNTGELFF +CASSPDQETSYTDTQYF +CASSPDRAEGDTQYF +CASSPDRAITEAFF +CASSPDRESNQPQHF +CASSPDRGLLNQPQHF +CASSPDRLNQPQHF +CASSPDSDAYEQYF +CASSPDSDSYEQYF +CASSPDSYNQPQHF +CASSPDTEDTQYF +CASSPDTPNYGYTF +CASSPEAGVSNTEAFF +CASSPEDRVGDNTEAFF +CASSPEEGGWSYEQYF +CASSPEGQSSNTGELFF +CASSPEGVSYEQYF +CASSPETSEPHNEQFF +CASSPETYYNEQFF +CASSPFETAYEQYF +CASSPFGANVLTF +CASSPFGGSFNEQYF +CASSPFGGSIAKNIQYF +CASSPFGSDEQFF +CASSPFGSSLAFF +CASSPFGSSYNEQFF +CASSPFGTGYNEQFF +CASSPFGTRELFF +CASSPFGVGNSPLHF +CASSPFHTGELFF +CASSPFLGGAYGYTF +CASSPFLWGAQYF +CASSPFQGSYEQYF +CASSPFWGGEQYF +CASSPGAGGDTQYF +CASSPGAVSSTDTQYF +CASSPGEGDYGYTF +CASSPGGAAGYTF +CASSPGGGGQQYF +CASSPGGLAGADTQYF +CASSPGGMNTEAFF +CASSPGGSPEQYF +CASSPGHPSSYNEQFF +CASSPGISVNYGYTF +CASSPGKGAYEQYF +CASSPGLAGDYEQYF +CASSPGLAGGFSGTDTQYF +CASSPGLAGNNEQYF +CASSPGLGLDTQYF +CASSPGLTSTDTQYF +CASSPGPGLNQPQHF +CASSPGPTGDNSPLHF +CASSPGQDYQPQHF +CASSPGQGATGELFF +CASSPGQGAWEQYF +CASSPGQGSYEQYF +CASSPGQGTEAFF +CASSPGQGVSYEQYF +CASSPGQGWNGYTF +CASSPGQRTEAFF +CASSPGRVYGYTF +CASSPGSEAGSYGYTF 
+CASSPGTADYNEQFF +CASSPGTGNQPQHF +CASSPGTGPNQPQHF +CASSPGTGPRTLHF +CASSPGTGTYEQYF +CASSPGTGVGQFF +CASSPGTGYYEQYF +CASSPGTSGGPFTDTQYF +CASSPGTSGIYEQYF +CASSPGTSSIYEQYF +CASSPGTSVNEQFF +CASSPGVSVEQYF +CASSPGWEQYF +CASSPHEAAEQYF +CASSPHGAAEQYF +CASSPHGAAERYF +CASSPHGAAKQYF +CASSPHGGGINEQFF +CASSPHGGGNTEAFF +CASSPHGGQPRTGQHF +CASSPHGGSNTEAFF +CASSPHHGNTEAFF +CASSPHQASYEQYF +CASSPHRDFYTDTQYF +CASSPHSAFYEQYF +CASSPHSGGNNEQFF +CASSPHWSSYEQYF +CASSPIAGSSYEQYF +CASSPIAGVSDEQFF +CASSPIAQETQYF +CASSPIGGAGELFF +CASSPIGGGGNEQFF +CASSPIGIGTLPF +CASSPIGQGAGSNQPQHF +CASSPIGSTDTQYF +CASSPIPGELFF +CASSPITQTGPLDTEAFF +CASSPIWRDRVNTEAFF +CASSPKDIRVYNEQFF +CASSPKDTPYEQYF +CASSPKDTQYF +CASSPKKEKNEQFF +CASSPKTYSNQPQHF +CASSPLAGFYNEQFF +CASSPLAGGATDTQYF +CASSPLAGGFNEQFF +CASSPLAGGLNEQFF +CASSPLAGLYNEQFF +CASSPLAGPANEQFF +CASSPLAGPYNEQFF +CASSPLAGRGYNEQFF +CASSPLAGTYNEQFF +CASSPLAGWDIQYF +CASSPLAGYYNEQFF +CASSPLAVTYNEQFF +CASSPLDRGDETQYF +CASSPLDRTIGTDTQYF +CASSPLDTKQQFF +CASSPLDVTENTEAFF +CASSPLEGGYNEQFF +CASSPLFGGHQPQHF +CASSPLFGNTIYF +CASSPLGDADTQYF +CASSPLGGNQPQHF +CASSPLGVAGELFF +CASSPLIGEQYF +CASSPLLAGGTGELFF +CASSPLLEGHQPQHF +CASSPLLGGHQPQHF +CASSPLLSESYNEQFF +CASSPLPGGETQYF +CASSPLQGANTGELFF +CASSPLQGLVGYTF +CASSPLRETGISGANVLTF +CASSPLSASEEFF +CASSPLSEKLFF +CASSPLSGNEQFF +CASSPLSGRVTDTQYF +CASSPLSLGSPYNEQFF +CASSPLSNQPQHF +CASSPLSTDTQYF +CASSPLTDTEAFF +CASSPLTGSEAFF +CASSPLVGPQPQHF +CASSPLWNTGELFF +CASSPLWVAGGRETQYF +CASSPLYEQYF +CASSPNGGAGGPGNEQFF +CASSPNGGSSYNEQFF +CASSPNIASNNEQFF +CASSPNSGSYNEQFF +CASSPNSNQPQHF +CASSPPAGGKAGELFF +CASSPPAVGTEAFF +CASSPPEAFF +CASSPPFVGGHGYTF +CASSPPGENYNEQFF +CASSPPGTSTDTQYF +CASSPPGVREQFF +CASSPPLGVDGYTF +CASSPPNGRAEQYF +CASSPPPGAVETQYF +CASSPPSAGYNEQFF +CASSPPTFSYEQYF +CASSPPTLGQNYEQYF +CASSPPVPPWSEQYF +CASSPQADYEQYF +CASSPQAGASYEQYF +CASSPQDRARGNEQFF +CASSPQDRDFYEQYF +CASSPQGFYGYTF +CASSPQGINYGYTF +CASSPQGSHEQYF +CASSPQMNTEAFF +CASSPQRTAHEQYF +CASSPQSNQPQHF +CASSPQTGAYGYTF +CASSPQVEAFF +CASSPQVGSGASYNEQFF +CASSPQYSRHYEQYF +CASSPRAADTQYF 
+CASSPRAAGELFF +CASSPRAGDTQYF +CASSPRAGETQYF +CASSPRAGNEKLFF +CASSPRAGQPQHF +CASSPRAGWETQYF +CASSPRANTGELFF +CASSPRASLEQYF +CASSPRDGQETQYF +CASSPRDREGNEQFF +CASSPRDRERGEQYF +CASSPRDRGHSYNEQFF +CASSPRDRRDEQFF +CASSPRDSGVTGELFF +CASSPRGDGSYTF +CASSPRGGDTQYF +CASSPRGGGEQFF +CASSPRGGIEQYF +CASSPRGGMTQHF +CASSPRGGYEQYF +CASSPRGPNTEAFF +CASSPRGPSTEAFF +CASSPRGQSYEQYF +CASSPRGRLNEQFF +CASSPRLGSTDTQYF +CASSPRLRGEGLNEQFF +CASSPRPDKEAFF +CASSPRQGIETQYF +CASSPRQGPVEKLFF +CASSPRQGWSQETQYF +CASSPRQGWTNEKLFF +CASSPRRAGAANTGELFF +CASSPRSADTQYF +CASSPRSAEPQHF +CASSPRSAGELFF +CASSPRSATAQHF +CASSPRSATGAFF +CASSPRSAVEQYF +CASSPRSAYEQYF +CASSPRSDGELFF +CASSPRSDSPLHF +CASSPRSGDTQYF +CASSPRSGEPQHF +CASSPRSGETQYF +CASSPRSGGEQFF +CASSPRSGIEAFF +CASSPRSGIEQFF +CASSPRSGNEQFF +CASSPRSGNEQYF +CASSPRSGNTIYF +CASSPRSGQPQHF +CASSPRSGSEQYF +CASSPRSGSTEAFF +CASSPRSGTEQYF +CASSPRSGVEQFF +CASSPRSGVEQYF +CASSPRSGVGAFF +CASSPRSNIEQFF +CASSPRSNQPQHF +CASSPRSSDTQYF +CASSPRSSEPQHF +CASSPRSSGPQHF +CASSPRSSQPQHF +CASSPRSSYEQYF +CASSPRSTDTQCF +CASSPRSTDTQYF +CASSPRSTNTQYF +CASSPRTGGRWQYF +CASSPRTGNSGANVLTF +CASSPRTGSDTQYF +CASSPRTSGGATDTQYF +CASSPRTVRLRETQYF +CASSPRVNRNTEAFF +CASSPSDFGTDTQYF +CASSPSFWDMGPYTF +CASSPSGATQPQHF +CASSPSGGATGELFF +CASSPSGLAGLTDTQYF +CASSPSGNEQFF +CASSPSGPWPQYF +CASSPSGSEQFF +CASSPSGTGGNEQFF +CASSPSGVDQYF +CASSPSGVYEQFF +CASSPSHGGLYEQYF +CASSPSLAGYGNEQFF +CASSPSPGVGLYEQYF +CASSPSRDLAWGFEKLFF +CASSPSRILTGETEKLFF +CASSPSSNQPQHF +CASSPSSSYEQYF +CASSPSSVADTQYF +CASSPSTTLSSGASGETQYF +CASSPSTTTGNIQYF +CASSPSVNPGGYGYTF +CASSPTADNQPQHF +CASSPTATGELFF +CASSPTATHGYTF +CASSPTGGAGSTDTQYF +CASSPTGGAGTEAFF +CASSPTGGEGGYTF +CASSPTGGGSPLHF +CASSPTGGLETQYF +CASSPTGGNTEAFF +CASSPTGGPADTQYF +CASSPTGGPGDTQYF +CASSPTGGSYNEQFF +CASSPTGSDTGELFF +CASSPTGSLNEQFF +CASSPTGSVAEQYF +CASSPTGVGEQFF +CASSPTGVGPQHF +CASSPTGVGSPLHF +CASSPTGVGTQYF +CASSPTGVNTEAFF +CASSPTGVYNSPLHF +CASSPTGWVTREAFF +CASSPTGYYNEQFF +CASSPTLQGPLGETQYF +CASSPTPGQGVTGELFF +CASSPTRPVEQYF +CASSPTSGGYNEQFF +CASSPTSNTEAFF +CASSPTSTGYGYTF 
+CASSPTTGPYGYTF +CASSPTVAGAPETQYF +CASSPTVNYGYTF +CASSPTVRTGETDTQYF +CASSPTYPPEAFF +CASSPVAGATTDTQYF +CASSPVAGWYNEQFF +CASSPVGGEQFF +CASSPVGIDEQYF +CASSPVGIGEAFF +CASSPVGLNTEAFF +CASSPVGTEAFF +CASSPVGTGELFF +CASSPVGTLYEQYF +CASSPVGVGEAFF +CASSPVGVGELFF +CASSPVGVGEQFF +CASSPVGVGEQYF +CASSPVLGIGEQYF +CASSPVSESTYEQYF +CASSPVSTDTQYF +CASSPVSYNEQFF +CASSPVTGAEQYF +CASSPVTGGGSGANVLTF +CASSPWDRGLAEAFF +CASSPWDSRFYGYTF +CASSPWDTDTQYF +CASSPWGGNTEAFF +CASSPWGIGGMNTEAFF +CASSPWGQTASSYNEQFF +CASSPWILYNEQFF +CASSPWSGSQETQYF +CASSPWTAGRTGELFF +CASSPWTTNTGELFF +CASSPYAGANTEAFF +CASSPYDLGSNQPQHF +CASSPYGGAINEQFF +CASSPYGGWTEAFF +CASSPYHTNEQFF +CASSPYNEQFF +CASSPYRTTGYEQYF +CASSPYSGGNSPLHF +CASSPYWTAAGELFF +CASSQACRGRHRNTIYF +CASSQADRDTYEQYF +CASSQAGGRDEAFF +CASSQAHRGINQPQHF +CASSQAWLAKDTQYF +CASSQAWRDTQYF +CASSQCRGFWTQPQHF +CASSQDADTQYF +CASSQDAGGWDEQFF +CASSQDAGLAKNIQYF +CASSQDAGTSGSLQGEQYF +CASSQDDGTFSYNEQFF +CASSQDDLSLGANVLTF +CASSQDEDGFMNTEAFF +CASSQDEIGLASDTQYF +CASSQDEPYEQYF +CASSQDEYGLIHYGYTF +CASSQDGFAYEQYF +CASSQDGFGLFNEQFF +CASSQDGGGAPGNEQFF +CASSQDGGTSGADTQYF +CASSQDGIGLFTEAFF +CASSQDGKGATYNSPLHF +CASSQDGLAPYEQYF +CASSQDGQGETGELFF +CASSQDHGSGELFF +CASSQDHLGLDTQYF +CASSQDHRGQGQPQHF +CASSQDITVLYGYTF +CASSQDKGRAYEQYF +CASSQDLAGEGTDTQYF +CASSQDLAGGPDTQYF +CASSQDLAGVSTDTQYF +CASSQDLEGEGYNEQFF +CASSQDLGAGTEAFF +CASSQDLGGWEQFF +CASSQDLGSGNEQFF +CASSQDLGTVYGYTF +CASSQDLLAGNEQFF +CASSQDLLGVYNEQFF +CASSQDLPGPNTEAFF +CASSQDLTFIKTQYF +CASSQDLTSEAFF +CASSQDPAGGGVEQFF +CASSQDPAGSITDTQYF +CASSQDPAGVGYEQYF +CASSQDPAREQYF +CASSQDPGDEQFF +CASSQDPGGQETQYF +CASSQDPGYEQYF +CASSQDPHGFVANVLTF +CASSQDPLGTTDTQYF +CASSQDPLWRRTGALRGTGELFF +CASSQDPSGGFAYEQYF +CASSQDPSPSTDTQYF +CASSQDPSRTYEQYF +CASSQDQGAYEQYF +CASSQDRGDAYNEQFF +CASSQDRGTGYNEQFF +CASSQDRGTSTDTQYF +CASSQDRLAGNYEQYF +CASSQDRLFVGPKNTEAFF +CASSQDRREQYF +CASSQDRTGGGYEQYF +CASSQDRVRETQYF +CASSQDRVTEAFF +CASSQDSGEPYNEQFF +CASSQDSGQAMGYTF +CASSQDSGRYNEQFF +CASSQDSSGGFTYEQYF +CASSQDSSGQETQYF +CASSQDTGMGNQPQHF +CASSQDTGMNTEAFF 
+CASSQDTGSTEAFF +CASSQDVGSWNQPQHF +CASSQDVHRGIPGELFF +CASSQDWATVYEQYF +CASSQDWGAGTQYF +CASSQDWGLNELFF +CASSQDWGQPNTEAFF +CASSQDWPGGAGELFF +CASSQDWRNSPLHF +CASSQDYGLEAFF +CASSQEAGALGETQYF +CASSQEASGGYEQYF +CASSQEDAAYGVNEQFF +CASSQEDAGAPYNEQFF +CASSQEDGTGSAGELFF +CASSQEDLTSGTNEQFF +CASSQEDRAQDKADYGYTF +CASSQEEVTGDKPYEQYF +CASSQEGAEAPYNEQFF +CASSQEGAGELFF +CASSQEGGDTEAFF +CASSQEGGLAWETQYF +CASSQEGGLDYGYTF +CASSQEGIGLSTTNSPLHF +CASSQEGQGAGELFF +CASSQEGVEAFF +CASSQEGVGGFDIQYF +CASSQEGVLSGYGYTF +CASSQEGWYNEQFF +CASSQEHNSGPYEQYF +CASSQEIGFSPLHF +CASSQEMVGTEAFF +CASSQEPPGQGLAYNEQFF +CASSQERAGQETQYF +CASSQERGGKWAYEQYF +CASSQERGGRYTEAFF +CASSQERGGYNEQFF +CASSQERQTILEAFF +CASSQERWGGGNTIYF +CASSQESLAGRGAYNEQFF +CASSQETDRGYNSPLHF +CASSQETGASNTEAFF +CASSQETGATNEKLFF +CASSQETGLGNQPQHF +CASSQETGVGTEAFF +CASSQEVFRLAETSGEYEQYF +CASSQEVGATSGNTIYF +CASSQEVLRTDTQYF +CASSQEVQGGEKLFF +CASSQEWAQGYEQYF +CASSQEWLAVSTDTQYF +CASSQEYLAAYEQYF +CASSQFFGNTIYF +CASSQFLGGTEPYTF +CASSQFMGDTNYGYTF +CASSQFNEKLFF +CASSQFPLTGGNTEAFF +CASSQFQYFYGYTF +CASSQGAFGYTF +CASSQGAGELFF +CASSQGAHGYTF +CASSQGASGSFEETQYF +CASSQGAYGSTF +CASSQGAYGYTF +CASSQGDHSPLHF +CASSQGDYTF +CASSQGEGNEQFF +CASSQGEGTEAFF +CASSQGFYGYTF +CASSQGGAGELFF +CASSQGGDYGYTF +CASSQGGLGDSEQFF +CASSQGGQGYEQYF +CASSQGGSGEDTEAFF +CASSQGGTELTYSPLHF +CASSQGGTGYGYTF +CASSQGHGLPYEQYF +CASSQGIRLAGDQETQYF +CASSQGLAGDLQETQYF +CASSQGLAGEIGELFF +CASSQGLAGSEQYF +CASSQGLAPGELFF +CASSQGLGASFGFIGYTF +CASSQGLGVGNTIYF +CASSQGLNTGELFF +CASSQGLSFPRPREAFF +CASSQGNEQFF +CASSQGNYGYTF +CASSQGPGARQPQHF +CASSQGPGLAGDFDLYNEQFF +CASSQGPGLLSYEQYF +CASSQGPGNTEAFF +CASSQGPGTGNTEAFF +CASSQGPPGQGMKKQYF +CASSQGPTGGYVSEAFF +CASSQGQGAWGYTF +CASSQGQNTGELFF +CASSQGRGPGKYF +CASSQGRLAGNQETQYF +CASSQGRQETQYF +CASSQGRYNEQFF +CASSQGSEAFF +CASSQGSGLAGYEQYF +CASSQGSMGEQYF +CASSQGSPGTGIRRETQYF +CASSQGSWGYTF +CASSQGSYGYTF +CASSQGTEHSNQPQHF +CASSQGTGELFF +CASSQGTGGEQYF +CASSQGTSGFWQFF +CASSQGVGGPATNEKLFF +CASSQGVTGELFF +CASSQGWDSPLHF +CASSQGWSDSFYGYTF +CASSQGWSGSFHFEQFF +CASSQGYSTDTQYF 
+CASSQHGTSNEQFF +CASSQHYTGENTEAFF +CASSQIDTDEKLFF +CASSQIGTESRNQYF +CASSQISNQPQHF +CASSQISTDTQYF +CASSQLAGGQGEQYF +CASSQLFGEDTQYF +CASSQLGGDSSGANVLTF +CASSQLGGISNQPQHF +CASSQLGRGDNEQFF +CASSQLGRSWETQYF +CASSQLSNTEAFF +CASSQMTGLNTEAFF +CASSQNPGGQETQYF +CASSQPDSNQPQHF +CASSQPKGPNEKLFF +CASSQPLTGGRNTGELFF +CASSQPSLLGGNQPQHF +CASSQQGEANSPLHF +CASSQQWEETQYF +CASSQRAAHIQYF +CASSQRASYEQYF +CASSQRGGGIAFF +CASSQRGGIEQFF +CASSQRPSEVGELFF +CASSQRSAGELFF +CASSQRSATREYF +CASSQRSAYEQYF +CASSQRSGETQYF +CASSQRSNTEAFF +CASSQRSQETQYF +CASSQRSSDTQYF +CASSQRSSEPQHF +CASSQRSSHEQYF +CASSQRSSTEAFF +CASSQRSSYEQYF +CASSQRSSYNEQFF +CASSQRSTDTQYF +CASSQRSTGELFF +CASSQRSTNTQYF +CASSQRSTSEAFF +CASSQRSVGELFF +CASSQSREGSPLHF +CASSQSTGANVLTF +CASSQSTGSSYEQYF +CASSQSTSGSYEQYF +CASSQTGAQGEAFF +CASSQTGLNTEAFF +CASSQTGPTGELFF +CASSQTGSTDTQYF +CASSQTSFYGEQYF +CASSQTSGAYNEQFF +CASSQTYEQYF +CASSQVDGAVKVETQYF +CASSQVEAFYEQYF +CASSQVEDSNQPQHF +CASSQVEEFHGELFF +CASSQVFRDRGNTEAFF +CASSQVGEAGNTIYF +CASSQVGGPDTQYF +CASSQVGRPDTQYF +CASSQVGTGYNEQFF +CASSQVHRASGANVLTF +CASSQVLGGGDTQYF +CASSQVLGLSVTGELFF +CASSQVLSAGDSYNEQFF +CASSQVPIENF +CASSQVQGTYEQYF +CASSQVRGGPTEAFF +CASSQVRKGQGAGNQPQHF +CASSQVRTETQYF +CASSQVSGPDTQYF +CASSQVSQGDTGELFF +CASSQVSSSAEQFF +CASSQVVGNTIYF +CASSQVWAGGLGDTQYF +CASSQVWKREDTEAFF +CASSQWDLQETQYF +CASSQWDRFSYEQYF +CASSQWGNNEKLFF +CASSQWTGGGGQPQHF +CASSQWTQHF +CASSQYAGQQEFF +CASSQYMTTNTEAFF +CASSQYNQPQHF +CASSQYRGGGETQYF +CASSQYSGGYEQYF +CASSQYSNQPQHF +CASSQYSVGEQYF +CASSRADTSGYYGYTF +CASSRAGLPHEQYF +CASSRASTDTQYF +CASSRDPDGDTEAFF +CASSRDPVTGPTDTQYF +CASSRDSTGYGYTF +CASSREAGGYEQYF +CASSRELTGTDSYEQYF +CASSREPNPNYGYTF +CASSRGATQETQYF +CASSRGGLSYNEQFF +CASSRGGYNEQFF +CASSRGLAANTGELFF +CASSRGLSSGNTIYF +CASSRGPSGPDTQYF +CASSRGQLYEQYF +CASSRGTAILETQYF +CASSRGVYEQYF +CASSRHPGGTDTQYF +CASSRIAGTYNEQFF +CASSRLAGEQYF +CASSRLAGVETQYF +CASSRLATSGELFF +CASSRLSNQPQHF +CASSRLSTDTQYF +CASSRNYGYTF +CASSRPGTSAFPYEQYF +CASSRQGVGQPQHF +CASSRQSNQPQHF +CASSRQTEAFF +CASSRRASYEQYF +CASSRRDSGANVLTF +CASSRRGLREKLFF 
+CASSRRGSDEQFF +CASSRRHDRPGNYGYTF +CASSRRSADEQFF +CASSRRSADTQYF +CASSRRSGGDEQFF +CASSRRSGGEQYF +CASSRRSNQPQHF +CASSRRSSDEQYF +CASSRRSTDTQYF +CASSRRSTGELFF +CASSRRSTQPQHF +CASSRSGRETQYF +CASSRSTGDTEAFF +CASSRTASSYNEQFF +CASSRTGGAGNTEAFF +CASSRTGGEQYF +CASSRTGGWDTEAFF +CASSRTGIDEQFF +CASSRTGLNTEAFF +CASSRTGNQPQHF +CASSRTGNSNQPQHF +CASSRTGSGELFF +CASSRTGTSYEQYF +CASSRTGTYEQYF +CASSRTSDTQYF +CASSRTSGDTQYF +CASSRTSGGLNEQFF +CASSRTSGGVKDTQYF +CASSRTSGISLTDTQYF +CASSRTSGNEQFF +CASSRTSTDTQYF +CASSRTVQETQYF +CASSRVGTTYEQYF +CASSRVLGFRGYGYTF +CASSRWGLNNEQFF +CASSRYGNQPQHF +CASSRYHTDTQYF +CASSRYLGGGETQYF +CASSRYPVDTQYF +CASSRYSLDTQYF +CASSRYSNQPQHF +CASSRYSVDTQHF +CASSRYSVDTQYF +CASSSAAAYEQYF +CASSSAATGLEQFF +CASSSADSLYEQYF +CASSSAGGRDEQFF +CASSSAGPYGYTF +CASSSAGTEGFF +CASSSANYGYTF +CASSSAPTDTQYF +CASSSCSGWTQYF +CASSSDDWTVAKNIQYF +CASSSDPRYGYTF +CASSSDRDNTGELFF +CASSSDTGYEQYF +CASSSEASGENF +CASSSELRGLNYGYTF +CASSSESPGELFF +CASSSFDLDITGELFF +CASSSFGGRELFF +CASSSFGGRPQHF +CASSSFGSSGGMDTQYF +CASSSFGTDTQYF +CASSSFGVNGYTF +CASSSFLGAEAFF +CASSSFLLSEQYF +CASSSFRGSEAFF +CASSSFSANEKLFF +CASSSFSINEQFF +CASSSFWTGLDEKLFF +CASSSFYTDTQYF +CASSSGAYGYTF +CASSSGDSNTEAFF +CASSSGFVGPYQPQHF +CASSSGFYEQYF +CASSSGGGGNTIYF +CASSSGGPFYEQYF +CASSSGGPYNEQFF +CASSSGGQASSYEQYF +CASSSGGSNTGELFF +CASSSGKGSTDTQYF +CASSSGLAGGNEQFF +CASSSGLAGVNEQFF +CASSSGLFSF +CASSSGLGVGYTF +CASSSGLPTDTQYF +CASSSGLVAPGELFF +CASSSGLVSNTGELFF +CASSSGNTEAFF +CASSSGPDSGNTIYF +CASSSGQDGGYTF +CASSSGQGAYEQYF +CASSSGQGERETQYF +CASSSGQGFTDTQYF +CASSSGQGNQPQHF +CASSSGSFYEQYF +CASSSGSNNEQFF +CASSSGSSGGSYNEQFF +CASSSGSYGYTF +CASSSGTAYNEQFF +CASSSGTGGGYEQYF +CASSSGTGLYGYTF +CASSSGTRRDNEQFF +CASSSGTSGGDEQFF +CASSSGTSQETQYF +CASSSGTTNEKLFF +CASSSGVYGYTF +CASSSHAGGNTEAFF +CASSSHEGSGGYTF +CASSSHGADTQYF +CASSSHGGGEQYF +CASSSHGGGYGYTF +CASSSHHGQEQYF +CASSSHQRTGVDEQFF +CASSSHRGEGAFF +CASSSIDETGSNEKLFF +CASSSIFPGELFF +CASSSIRGPLSGNTIYF +CASSSIRSEAFF +CASSSISNQPQHF +CASSSKLADQETQYF +CASSSKRGPYNEQFF +CASSSLDVAGYEQYF +CASSSLDVTENTEAFF 
+CASSSLGAREQFF +CASSSLGGHGEAFF +CASSSLGGRGDTQYF +CASSSLGGWTEAFF +CASSSLGGYTF +CASSSLGSDTQYF +CASSSLITDTQYF +CASSSLITGELFF +CASSSLLNTEAFF +CASSSLSLDRGGGQAFF +CASSSLTGTGDGYTF +CASSSLVMVAGQPYEQYF +CASSSNIIRGSSYEQYF +CASSSNQPQHF +CASSSNRNTGELFF +CASSSNSIQPQHF +CASSSPGGMETQYF +CASSSPGHTYEQYF +CASSSPKGAKYEQYF +CASSSPLDWETQYF +CASSSPLIAGATDTQYF +CASSSPLIRQGANYGYTF +CASSSPNIIRDPTGELFF +CASSSPPDRGPNTGELFF +CASSSPQGRLRGGYNEQFF +CASSSPQGVSNTEAFF +CASSSPRRGIQETQYF +CASSSPSGGTQYF +CASSSPTSGDYEQYF +CASSSPWGQGPHEQYF +CASSSPWTSGTYEQYF +CASSSPYRRSYNEQFF +CASSSQDLTSYEQYF +CASSSQGDQGADTQYF +CASSSQGGGSGYTF +CASSSQGSLNYGYTF +CASSSQSNQPQHF +CASSSQSQDGASYNSPLHF +CASSSRAAAEAFF +CASSSRAAETQYF +CASSSRAAGEQFF +CASSSRAAVEQYF +CASSSRAAYEQYF +CASSSRADEKLFF +CASSSRAEAFF +CASSSRAGDTQYF +CASSSRAGETQYF +CASSSRAGFAQYF +CASSSRAGGDTQYF +CASSSRAGGEQYF +CASSSRAGNEQYF +CASSSRAGNTIYF +CASSSRAGTEAFF +CASSSRAGWDEQFF +CASSSRAGWELFF +CASSSRAGYEQYF +CASSSRASGEQYF +CASSSRASHEQYF +CASSSRASYEQYF +CASSSRATDTQYF +CASSSRATGELFF +CASSSRAVDEQYF +CASSSRAYYEQYF +CASSSRDGGTDTQYF +CASSSRDRASVFSQETQYF +CASSSRDRGGEGTDTQYF +CASSSRGANEQFF +CASSSRGDSWAETQYF +CASSSRGGHEQYF +CASSSRGSDTQYF +CASSSRGSSPQHF +CASSSRIGTYGYTF +CASSSRLSDFGTDTQYF +CASSSRPFWDRAMEGANVLTF +CASSSRPGELFF +CASSSRQGAANEKLFF +CASSSRQGPTGELFF +CASSSRSADTQYF +CASSSRSAGELFF +CASSSRSAIEQFF +CASSSRSANEQFF +CASSSRSAYEQYF +CASSSRSFYEQYF +CASSSRSGDTQYF +CASSSRSGEKLFF +CASSSRSGETQYF +CASSSRSGHEQFF +CASSSRSGHEQYF +CASSSRSGIEQFF +CASSSRSGIEQYF +CASSSRSGNEKLFF +CASSSRSGNEQFF +CASSSRSGNIQYF +CASSSRSGSEQFF +CASSSRSGTEAFF +CASSSRSGTEQFF +CASSSRSGVEQFF +CASSSRSGVEQYF +CASSSRSGWEQYF +CASSSRSGWTQYF +CASSSRSGYEQYF +CASSSRSHQPQHF +CASSSRSKYEQYF +CASSSRSQDTQYF +CASSSRSQETQYF +CASSSRSQHEQYF +CASSSRSQVEQYF +CASSSRSQYEQYF +CASSSRSRETQYF +CASSSRSRQYRGSSYEQYF +CASSSRSSAEAFF +CASSSRSSAEQFF +CASSSRSSDTQYF +CASSSRSSEEQYF +CASSSRSSEPQHF +CASSSRSSETQYF +CASSSRSSGELFF +CASSSRSSGEQFF +CASSSRSSGEQYF +CASSSRSSGTQYF +CASSSRSSIGQFF +CASSSRSSNEQFF +CASSSRSSTEAFF +CASSSRSSYEQYF +CASSSRSSYNEQFF 
+CASSSRSTDTQYF +CASSSRSTEKLFF +CASSSRSTGELFF +CASSSRSTGEQFF +CASSSRSVGELFF +CASSSRTGGEQYF +CASSSRWGYEQYF +CASSSSAGERETQYF +CASSSSAPTPDTQYF +CASSSSDGDNEQFF +CASSSSGGAGELFF +CASSSSGGEYNEQFF +CASSSSGTGAYEQYF +CASSSSLAGGREQYF +CASSSSLSTIYF +CASSSSNQPQHF +CASSSSRSTDTQYF +CASSSSSGETQYF +CASSSSSRTSNNEQFF +CASSSSSVNEQFF +CASSSSTDTQYF +CASSSSTGGVSNQPQHF +CASSSTAGPPGEQFF +CASSSTDRGFVNTEAFF +CASSSTGAGELFF +CASSSTGASNSPLHF +CASSSTGAYNEQFF +CASSSTGETQYF +CASSSTGGDGYTF +CASSSTGGGEKDQPQHF +CASSSTGGGITEAFF +CASSSTGGGVTEAFF +CASSSTGGPKNEQFF +CASSSTGGTGELFF +CASSSTGGTNTEAFF +CASSSTGIGSQPQHF +CASSSTGLLNEQFF +CASSSTGTGAWDTQYF +CASSSTGTSGSTDTQYF +CASSSTGTTNYEQYF +CASSSTGVDEQFF +CASSSTGVPNEKLFF +CASSSTIGATEAFF +CASSSTIGFSEAFF +CASSSTLYNEQFF +CASSSTRASYNEQFF +CASSSTSSTNEKLFF +CASSSTSTDTQYF +CASSSVELGGTGELFF +CASSSVFVTYSNQPQHF +CASSSVGSPYEQYF +CASSSVRSTDTQYF +CASSSVSGGAFNEQFF +CASSSVSVDTQYF +CASSSWERADTQYF +CASSSWTGDEQFF +CASSSWTGDTQYF +CASSSYGSYEQYF +CASSSYGTDTQYF +CASSSYLQDRGTGELFF +CASSSYPWDSTYNEQFF +CASSSYRDRESGANVLTF +CASSSYRGGETQYF +CASSSYSGNTIYF +CASSSYSMDEQYF +CASSSYSNQPQHF +CASSSYSVDTQYF +CASSSYSYEQYF +CASSSYSYNEQFF +CASSTAQPQHF +CASSTASTDTQYF +CASSTDGSNQPQHF +CASSTDLGEKLFF +CASSTDRQFGYGYTF +CASSTDSYEQYF +CASSTDTNEQFF +CASSTETVNTEAFF +CASSTFAGDYEQYF +CASSTFDGSYEQYF +CASSTFDSNQPQHF +CASSTFDTSNQPQHF +CASSTFGANVLTF +CASSTFGGQPQHF +CASSTFGSSYNEQFF +CASSTFGTGGNEQFF +CASSTFLGAEAFF +CASSTFLGGRSPLHF +CASSTFLGTEAFF +CASSTFLLDEQYF +CASSTFSNPYNEQFF +CASSTFSNTEAFF +CASSTFSRDIQYF +CASSTFSSGRTEAFF +CASSTFSSNQPQHF +CASSTFTGVHYGYTF +CASSTFWGTEVEETQYF +CASSTGAWGYTF +CASSTGAYGYTF +CASSTGAYNEQFF +CASSTGFYGYTF +CASSTGGEVETQYF +CASSTGGLLYEQYF +CASSTGGTDTQYF +CASSTGGTEAFF +CASSTGGYYGYTF +CASSTGIIGYTF +CASSTGIYGYTF +CASSTGLNGRAQYF +CASSTGLWGYTF +CASSTGLYGYTF +CASSTGNTEAFF +CASSTGNYGYTF +CASSTGQETQYF +CASSTGQGMDTEAFF +CASSTGQPQHF +CASSTGRNYGYTF +CASSTGRTGTSGTNEQFF +CASSTGSGNQPQHF +CASSTGSTDTQYF +CASSTGSYGCTF +CASSTGSYGYTF +CASSTGTASGQETQYF +CASSTGTASNQPQHF +CASSTGTLYGYTF +CASSTGTPQFF +CASSTGTPQHF 
+CASSTGTPVFF +CASSTGTSWEQYF +CASSTGTTDTQYF +CASSTGVAEQYF +CASSTGVHGYTF +CASSTGVPLSEQYF +CASSTGVPQHF +CASSTGVYGYTF +CASSTGWGAGQPQHF +CASSTGYHGYTF +CASSTGYYGYTF +CASSTHGADEQYF +CASSTHGADTQYF +CASSTHGGAWQYF +CASSTHGGGNTEAFF +CASSTHGGNEQFF +CASSTHGGNEQYF +CASSTHHGAEAFF +CASSTHHLGIQYF +CASSTHHNNEQFF +CASSTHLGSGNTIYF +CASSTHLLDTQYF +CASSTHPGNEQYF +CASSTHSDRETEAFF +CASSTHSGGNNEQFF +CASSTHSGNEQFF +CASSTHSGNQPQHF +CASSTHSTYEQYF +CASSTHTGANYGYTF +CASSTIAGGRNEQFF +CASSTIAGGTDTQYF +CASSTIEGQGGRHTQYF +CASSTISLDIQYF +CASSTISVDTQYF +CASSTKDGGTGELFF +CASSTKEGGTGELFF +CASSTKEGGTGEQFF +CASSTKVGGTGELFF +CASSTLAGAWNEQFF +CASSTLAGGQETQYF +CASSTLAGPYNEQFF +CASSTLAGTYNEQFF +CASSTLASYEQYF +CASSTLATANTEAFF +CASSTLATDTQYF +CASSTLDRATTGELFF +CASSTLDRTSGFDTQYF +CASSTLDSYNEQFF +CASSTLGLAGGSFF +CASSTLLAVSSYNEQFF +CASSTLPGGPDEQFF +CASSTLRGPYNEQFF +CASSTLSTDTQYF +CASSTLTGGHQPQHF +CASSTMGNAPLDTQYF +CASSTMNTEAFF +CASSTPDRADPNYGYTF +CASSTPDRGTISGNTIYF +CASSTPGYTYEQYF +CASSTPLTARNGYTF +CASSTPRQGSNTGELFF +CASSTQGLYYGYTF +CASSTRAADTQYF +CASSTRAAGELFF +CASSTRAANEQFF +CASSTRAAYEQYF +CASSTRAFYEQYF +CASSTRAGDTQYF +CASSTRAGEKLFF +CASSTRAGETQYF +CASSTRAGGVGQFF +CASSTRAGNEKLFF +CASSTRAGVEQYF +CASSTRAGYEQYF +CASSTRASEGQYF +CASSTRASNEQFF +CASSTRASQPQHF +CASSTRASYEQYF +CASSTRATETQYF +CASSTRATGELFF +CASSTRATSEQYF +CASSTRDLAKSSYNSPLHF +CASSTRDLLYQETQYF +CASSTRDRREKTQYF +CASSTRFGNTIYF +CASSTRGAYEQYF +CASSTRGGDEQYF +CASSTRGGNQPQHF +CASSTRGGQPQHF +CASSTRGGTEAFF +CASSTRGGYEQYF +CASSTRGSHEQYF +CASSTRGSNEQFF +CASSTRGSTGELFF +CASSTRRTDTQYF +CASSTRSAAPLHF +CASSTRSADTQYF +CASSTRSAETQYF +CASSTRSAHEQFF +CASSTRSAIEQFF +CASSTRSANEQFF +CASSTRSAYEQYF +CASSTRSDEKLFF +CASSTRSDNEQFF +CASSTRSDSEAFF +CASSTRSDSPLHF +CASSTRSDYEQYF +CASSTRSGDTQYF +CASSTRSGETQYF +CASSTRSGGTEAFF +CASSTRSGIEQFF +CASSTRSGNTIYF +CASSTRSGQPQHF +CASSTRSGSEQYF +CASSTRSGSPLHF +CASSTRSGTEAFF +CASSTRSGTEQFF +CASSTRSGTEQYF +CASSTRSGTGQYF +CASSTRSGWELFF +CASSTRSGWEQYF +CASSTRSGWIQYF +CASSTRSGYEQYF +CASSTRSHDIQYF +CASSTRSHEPQHF +CASSTRSLDTQYF +CASSTRSNQPQHF 
+CASSTRSNTEAFF +CASSTRSQETQYF +CASSTRSSAEAFF +CASSTRSSDEQFF +CASSTRSSETQYF +CASSTRSSGELFF +CASSTRSSNEQFF +CASSTRSSSEQYF +CASSTRSSSPQHF +CASSTRSSTEAFF +CASSTRSSTEQYF +CASSTRSSVEQYF +CASSTRSSWEQFF +CASSTRSSWGQYF +CASSTRSSYEQYF +CASSTRSSYNEQFF +CASSTRSTDTLYF +CASSTRSTDTQHF +CASSTRSTDTQYF +CASSTRSTEAQHF +CASSTRSTGELCF +CASSTRSTGELFF +CASSTRSTGEQYF +CASSTRSTHEQYF +CASSTRSTNTQYF +CASSTRSTSPGHF +CASSTRSTVEQYF +CASSTRSTYEQYF +CASSTSAANQPQHF +CASSTSEGGLFYEQYF +CASSTSGANQPQHF +CASSTSGGGTEAFF +CASSTSHVGTEAFF +CASSTSNEQPQHF +CASSTSSGQTQETQYF +CASSTSSTDTQYF +CASSTSSTGELFF +CASSTSSVDTQYF +CASSTSTGTGYGYTF +CASSTTGAYNEQFF +CASSTTGIDGYTF +CASSTTGTPGNTIYF +CASSTTGYEQYF +CASSTTIGDTEAFF +CASSTTPETQYF +CASSTTSGEETQYF +CASSTTSGGEQYF +CASSTTSGGLNEQFF +CASSTTSGGVETQYF +CASSTTSGTEQFF +CASSTTVSGELFF +CASSTTWTRWDHNEQFF +CASSTVDPMLSEAFF +CASSTVDRSEHMNTEAFF +CASSTVEGDTQYF +CASSTVSLDTQYF +CASSTWADTEAFF +CASSTWGGGAYNEQFF +CASSTWIRTNTEAFF +CASSTWSTDTQYF +CASSTWTRDEQFF +CASSTYANTEAFF +CASSTYGTDTQYF +CASSTYGTTGELFF +CASSTYHSDEQFF +CASSTYHSTDTQYF +CASSTYITTEAFF +CASSTYNEQFF +CASSTYREDGTEAFF +CASSTYRVANEKLFF +CASSTYSGSYNEQFF +CASSTYSIDTQYF +CASSTYSNQPQHF +CASSTYSQDTQYF +CASSTYSRDTQYF +CASSTYSRQPQHF +CASSTYSTDTQYF +CASSTYSTGSGYTF +CASSVADRGFADTQYF +CASSVAGGDEQFF +CASSVAGGDTGELFF +CASSVAGGGSDEQYF +CASSVALAGGSTDTQYF +CASSVALGMNTEAFF +CASSVAPDTQYF +CASSVAPELNTEAFF +CASSVAPTDTQYF +CASSVAQYGYTF +CASSVAVGTGSGANVLTF +CASSVDGDTSTDTQYF +CASSVDGGGLDEQFF +CASSVDGGGLDEQYF +CASSVDLDSSYNEQFF +CASSVDPAGGSSYEQYF +CASSVDPAQETQYF +CASSVDSGSDYEQYF +CASSVDSPGTEAFF +CASSVDTLNTEAFF +CASSVEAAGAGELFF +CASSVEDYGSVSYGYTF +CASSVEGGLAQETQYF +CASSVEGTVTDTQYF +CASSVEKGHSYNSPLHF +CASSVELAPGELFF +CASSVEMGQITDTQYF +CASSVETGTGVAFF +CASSVFATGGSGYTF +CASSVFFRSKLFF +CASSVFGGQPQHF +CASSVFGGWDTEAFF +CASSVFGTDTQYF +CASSVFSADTQYF +CASSVFSGADTQYF +CASSVFSGAYNEQFF +CASSVFSTQPQHF +CASSVFSVGELFF +CASSVFSVPGNTIYF +CASSVFVGEGYTF +CASSVGADIIMNTEAFF +CASSVGAGEAQCF +CASSVGDLLTGELFF +CASSVGDNSGNTIYF +CASSVGGGVTDTEAFF +CASSVGGNSNEKLFF +CASSVGGTSTDTQYF 
+CASSVGIYGYTF +CASSVGKTSTDTQYF +CASSVGLDSAREAQHF +CASSVGLGQPQHF +CASSVGLGRGYEQYF +CASSVGLQGDYNEQFF +CASSVGLYGYTF +CASSVGMYGYTF +CASSVGQALGETQYF +CASSVGQPQHF +CASSVGRTSQETQYF +CASSVGRTSTDTQYF +CASSVGRVGYTF +CASSVGSGTEAFF +CASSVGSYGYTF +CASSVGTGGVGPLYEQYF +CASSVGTVKNQPQHF +CASSVGTYGYTF +CASSVGVAEAFF +CASSVGVAGFNEKLFF +CASSVGVEQFF +CASSVGVHGTF +CASSVGVHGYTF +CASSVGVYGYTF +CASSVGWTGTGETQYF +CASSVGWYGYTF +CASSVGYPAFF +CASSVHGGANTEAFF +CASSVHGVSEQYF +CASSVIAGGETQYF +CASSVKAGETQYF +CASSVLAGAYNEQFF +CASSVLAGGYNEQFF +CASSVLAGPYNEQFF +CASSVLDGPRGKLFF +CASSVLGGGEQFF +CASSVLGSSYNEQFF +CASSVLGVADTQYF +CASSVLNQETQYF +CASSVLQGSGNQPQHF +CASSVNSNQPQHF +CASSVPGQSNTEAFF +CASSVQANTEAFF +CASSVQEGPTYEQYF +CASSVQSSYEQYF +CASSVRAADTQYF +CASSVRAAWEQYF +CASSVRAAYEQYF +CASSVRADGELFF +CASSVRAGETQYF +CASSVRARETQYF +CASSVRASDEQYF +CASSVRASTEAFF +CASSVRASYEQYF +CASSVRATDTQYF +CASSVRGADTQYF +CASSVRGGTEAFF +CASSVRHGTEAFF +CASSVRSADTQYF +CASSVRSAGELFF +CASSVRSAHEQFF +CASSVRSATEAFF +CASSVRSAVEQFF +CASSVRSAYEQYF +CASSVRSGDTQYF +CASSVRSGEMDTQYF +CASSVRSGETQYF +CASSVRSGHEQYF +CASSVRSGWQPQHF +CASSVRSGYEQFF +CASSVRSPYEQYF +CASSVRSQETQYF +CASSVRSRYEQYF +CASSVRSSDTQYF +CASSVRSSGEQFF +CASSVRSSMNTEAFF +CASSVRSSTEQFF +CASSVRSSYEQHF +CASSVRSSYEQYF +CASSVRSSYKQYF +CASSVRSTDMQYF +CASSVRSTDTQYF +CASSVRSTGALFF +CASSVRSTGELFF +CASSVRSTGGLFF +CASSVRSTYEQYF +CASSVRTGYTEAFF +CASSVSATGELFF +CASSVSGATADTQYF +CASSVSGSTDTQYF +CASSVSSYNEQFF +CASSVTGSVGELFF +CASSVTGTSAGELFF +CASSVTSGDYNEQFF +CASSVTSGSYNEQFF +CASSVTVPHEQFF +CASSVVASSYNEQFF +CASSVVEGFTDTQYF +CASSVVGGGAANEQFF +CASSVVPSSYEQYF +CASSVVSIDTQYF +CASSVVSTDTQYF +CASSVVSTTEAFF +CASSVWAGEAQYF +CASSVWAGGSPNEQFF +CASSVWDGQETQYF +CASSVWTGGYNEQFF +CASSVYGFGANVLTF +CASSVYGGNYEQYF +CASSVYGGVSYNEQFF +CASSVYSNQPQHF +CASSWDKTYEQYF +CASSWDPTGDYGYTF +CASSWDREGGSYNEQFF +CASSWDRNNEQFF +CASSWDTDSPLHF +CASSWEGINYGYTF +CASSWFGSGANVLTF +CASSWFLGNSPLHF +CASSWFQGNEQFF +CASSWFSGANVLTF +CASSWFTRSEKLFF +CASSWGAYGYTF +CASSWGGGSHYGYTF +CASSWGGRPIETQYF +CASSWGGSNEQFF +CASSWGISYNEQFF 
+CASSWGLDNEQFF +CASSWGQGARAFF +CASSWGQGSTDTQYF +CASSWGSEVVADTQYF +CASSWGSHGYTF +CASSWGSWGYTF +CASSWGSYGYTF +CASSWGTAEQFF +CASSWGTGNYEQYF +CASSWGWGGEEQYF +CASSWHLKETQYF +CASSWIGGSSYNEQFF +CASSWISSGANVLTF +CASSWLAGDNEQFF +CASSWLAGPYNEQFF +CASSWLSDTQYF +CASSWLSNQPQHF +CASSWNSGSYEQYF +CASSWNTGELFF +CASSWQGNQPQHF +CASSWRAADTQYF +CASSWRGNQPQHF +CASSWRSDTEAFF +CASSWRSNQPQHF +CASSWRSTDTQYF +CASSWRSTGELFF +CASSWSGAGEQFF +CASSWSSYEQYF +CASSWTAAYTDTQYF +CASSWTANTEAFF +CASSWTANYGYTF +CASSWTGGEQYF +CASSWTGGGSGEQYF +CASSWTGGGVYGYTF +CASSWTGGYEQYF +CASSWTGNTEAFF +CASSWTGTEAFF +CASSWTLNTEAFF +CASSWTMNTEAFF +CASSWTQETQYF +CASSWTSGAYNEQFF +CASSWTSGGADTQYF +CASSWTSGGGNEQFF +CASSWTSGMYEQYF +CASSWTVLYGYTF +CASSWTVNTEAFF +CASSWTVNYGYTF +CASSWVGYEQYF +CASSWVSLDTQYF +CASSWVVPPNEQFF +CASSWWTGGMETQYF +CASSWYGGTDTQYF +CASSWYSNQPQHF +CASSYAAGELFF +CASSYAETQYF +CASSYAGGGNTIYF +CASSYAGTVEKLFF +CASSYAMPGQASGNTIYF +CASSYARTPLHF +CASSYCPELAKNIQYF +CASSYELTGTSLETQYF +CASSYFAGTSYNEQFF +CASSYFGGALSEQYF +CASSYFGLAFQETQYF +CASSYFGTDTQYF +CASSYFRDTNSPLHF +CASSYFRRLPGQVFYEQYF +CASSYFSGRADDTQYF +CASSYGAGMGYTF +CASSYGAGTGELFF +CASSYGANTGELFF +CASSYGARPEWVYEQYF +CASSYGASGAYNEQFF +CASSYGDEAFF +CASSYGFPTSGGNTGELFF +CASSYGFYEQYF +CASSYGGGSYEQYF +CASSYGGGYEQYF +CASSYGGYEQYF +CASSYGIAGKETQYF +CASSYGKGMYEQYF +CASSYGKPLYSNQPQHF +CASSYGLEQFF +CASSYGLNTEAFF +CASSYGMGTGELFF +CASSYGQFISYEQYF +CASSYGQGADEQFF +CASSYGQGAGRAFF +CASSYGQGAHTEAFF +CASSYGQGGTIQYF +CASSYGQGIGEQFF +CASSYGQGIIYGYTF +CASSYGQGLGKLFF +CASSYGQGLPLHF +CASSYGQGLYEQYF +CASSYGQGMDIQYF +CASSYGQGSGTQYF +CASSYGQGTGELFF +CASSYGQGTGIQYF +CASSYGQNTGELFF +CASSYGQNYEQYF +CASSYGSGLGEQYF +CASSYGSLAGEDTQYF +CASSYGSNTGELFF +CASSYGSYGYTF +CASSYGTGFNEQFF +CASSYGTGGDAHGYTF +CASSYGTGMNEQFF +CASSYGTGSSEQYF +CASSYGTGSYEQYF +CASSYGTNTGELFF +CASSYGTRGEQYF +CASSYGTSTDTQYF +CASSYGVGTEAFF +CASSYGVYGYTF +CASSYGWNTGELFF +CASSYGYNEQFF +CASSYHGLDEQFF +CASSYHNTNKNIQYF +CASSYHSNQPQHF +CASSYHSQETQYF +CASSYIDRVYNSPLHF +CASSYIGGGYNEQFF +CASSYIGVNQPQHF +CASSYIHEQYF +CASSYISTDTQYF 
+CASSYLAGEITDTQYF +CASSYLAGGLINEQFF +CASSYLEGEKLFF +CASSYLEGSETQETQYF +CASSYLGVEGYTF +CASSYLQGANGYTF +CASSYLRTGSVRYNEQFF +CASSYLSTDTQYF +CASSYLTGMNQPQHF +CASSYLVAEAFF +CASSYLYRIGQLEAFF +CASSYMGDRGYEQYF +CASSYMNTEAFF +CASSYNLAGTSLDEQFF +CASSYNPETFHSNQPQHF +CASSYNPRTEVSNQPQHF +CASSYNRDRETSNEKLFF +CASSYNRTPGQGYTQYF +CASSYNSRTHSSNQPQHF +CASSYNTEAFF +CASSYNVRDGDYGYTF +CASSYPAAGHSNQPQHF +CASSYPDRGHSNQPQHF +CASSYPDRGTSTDTQYF +CASSYPEQGTSNEKLFF +CASSYPETGTSLDEQFF +CASSYPGHYGYTF +CASSYPGQYNEQFF +CASSYPGTGTSNQPQHF +CASSYPIAGHSYNEQFF +CASSYPIQGTSNQPQHF +CASSYPLLYSGANTGELFF +CASSYPPSGVSTDTQYF +CASSYPRDAGTSLDIQYF +CASSYPREGHSNQPQHF +CASSYPSRTGTSRDEQFF +CASSYPVEGISTGELFF +CASSYQGEAFF +CASSYQGRTGELFF +CASSYRAPGHTEAFF +CASSYRASETQYF +CASSYRATDTQYF +CASSYRDHNEQFF +CASSYRDKSAGNTIYF +CASSYRFF +CASSYRGPSGNTIYF +CASSYRLAGASQETQYF +CASSYRSAYEQYF +CASSYRSGAGELFF +CASSYRSQETQYF +CASSYRSSNEQFF +CASSYRSSYEQYF +CASSYRSTDTQYF +CASSYRSTGELFF +CASSYRSTLEQFF +CASSYRSTQPQHF +CASSYRVDSNQPQHF +CASSYRVGPLHF +CASSYRVTEAFF +CASSYSAAGEQFF +CASSYSAEADTQYF +CASSYSAVQGNNQPQHF +CASSYSDRVHEQYF +CASSYSEGSYEQYF +CASSYSEVTEAFF +CASSYSGDYNEQFF +CASSYSGGDEQFF +CASSYSGHVNTEAFF +CASSYSGPAAGQETQYF +CASSYSGTGFQVRQYF +CASSYSGTGNEQFF +CASSYSHDGHSNQPQHF +CASSYSINAAVNYGYTF +CASSYSLDSHSNQPQHF +CASSYSLQGGLDEQYF +CASSYSMDGHSNQPQHF +CASSYSMMQDVSNQPQHF +CASSYSNRGWDTEAFF +CASSYSNTQNEAFF +CASSYSQGVGDTQYF +CASSYSQRTGVSNQPQHF +CASSYSQTSGHSYNEQFF +CASSYSRPGLSNQPQHF +CASSYSSGGADTQYF +CASSYSSGGAETQYF +CASSYSSGLVSSEQYF +CASSYSSGQWGWNTEAFF +CASSYSSSSAGEQFF +CASSYSSSYNEQFF +CASSYSSTRATSNQPQHF +CASSYSTGTVDYEQYF +CASSYSTGVVNSPLHF +CASSYSTPLGIYEQYF +CASSYSTSGNTIYF +CASSYSTVRDESFATDTQYF +CASSYSYEQYF +CASSYTAKTDTSNQPQHF +CASSYTAQPDTSNQPQHF +CASSYTGGLMAFF +CASSYTGGLRYGYTF +CASSYTLNTEAFF +CASSYTLPGTVPQRNQPQHF +CASSYTSGNTGELFF +CASSYTTGGSGELFF +CASSYTTNYGYTF +CASSYTVNYGYTF +CASSYTYGYTF +CASSYVASGSSNYGYTF +CASSYVGPESTDTQYF +CASSYVPGRNNNQPQHF +CASSYVTGWDSPLHF +CASSYVTTSGGHANTGELFF +CASSYYGSYNSPLHF +CASSYYNEQFF +CASSYYRNQPQHF +CASSYYSNQPQHF 
+CASSYYSRQPQHF +CASTAGLNTGELFF +CASTAGRGTGELFF +CASTARSSYNEQFF +CASTDGTAFTEAFF +CASTEMGREGYNEQFF +CASTEREGSSGELFF +CASTETGPYGYTF +CASTEWTALSYNEQFF +CASTFGQAPPAFF +CASTFMGGGEQPQHF +CASTGAGELFF +CASTGGYGYTF +CASTGLWVGTDTQYF +CASTGTANLNTEAFF +CASTGTPRSQYF +CASTHIFPGELFF +CASTHLRDKTFF +CASTHSAISEAFF +CASTIGETGAQHYNEQFF +CASTIGLARGAGTGELFF +CASTIHYGNTEAFF +CASTIMGSGNTIYF +CASTIYQGAPTEAFF +CASTLAGGLPEQFF +CASTLAGVGYEQFF +CASTLFARQETQYF +CASTLGCWGSVETQYF +CASTLGERNPRGYTF +CASTLGGGYEQYF +CASTLGNTEAFF +CASTLPGGRGPEGYTF +CASTLPGLAGSYNEQFF +CASTLPRRQAEQFF +CASTLSSSYNEQFF +CASTMTMGWTEAFF +CASTPAGGRDEQYF +CASTPDSEIYEQYF +CASTPEPGLGPTNEKLFF +CASTPEYGYTF +CASTPFGGAGYGYTF +CASTPGIYNSPLHF +CASTPGKREAPPYNEQFF +CASTPGPQVDTQYF +CASTPGQGFRNEQFF +CASTPGTGMSYGYTF +CASTPGTGVSGEQYF +CASTPHRGPSYEQYF +CASTPLAGDTDTQYF +CASTPLAGGLNEQFF +CASTPLGAPYGYTF +CASTPLGGNTGELFF +CASTPLGSSYNEQFF +CASTPMSGGNQPQHF +CASTPPGGRGYTF +CASTPQGARTEAFF +CASTPQGAYEQYF +CASTPSSGSAGNTIYF +CASTPSWAGGPTDTQYF +CASTPTANYGYTF +CASTPTGGNTEAFF +CASTPTGVNEQFF +CASTPVAGVNRNEQFF +CASTPYAGGHNEQFF +CASTPYEGGHNEQFF +CASTQRIGVEQYF +CASTQRSATQEYF +CASTRDSYNSPLHF +CASTRGNEQFF +CASTRLAGGFNEQFF +CASTSGGSPLHF +CASTSGNTEAFF +CASTSGQGETGELFF +CASTSMSGGWDTQYF +CASTSRLRTGTEETQYF +CASTSWGETQYF +CASTSWLANNEQFF +CASTTAVYGYTF +CASTTGTVNTEAFF +CASTTGVNTIYF +CASTTIHGGVTDYGYTF +CASTTLPGPLNEQFF +CASTTVQGGYTF +CASTVFQGGGQPQHF +CASTVGAVQPQHF +CASTWGLSSYNEQFF +CASTWTGAYGYTF +CASTYIGLSNQPQHF +CASTYRSSYNEQFF +CASTYSSGNTIYF +CASVMTGGFTEAFF +CASVPYGYEQYF +CASVWQGAYNEQFF +CASWAYGYTF +CASWGGAVNTEAFF +CASWGGQAFF +CASWGGVVNTEAFF +CASWSRRRRQYF +CASYDFTEAFF +CASYPWTGEKETQYF +CASYTSDYGYTF +CATAIGHHPYEQYF +CATAMYGASYEQYF +CATAPDRTYEQYF +CATAPHSGNQPQHF +CATAPLSINEQFF +CATASSSTDTQYF +CATDARVGNTGELFF +CATDPGEGNTIYF +CATEASSYEQYF +CATENSGTGDSYEQYF +CATERPSRHQPQHF +CATFSGLYNEQFF +CATGFGGAYNEQFF +CATGFPYEQYF +CATGGPGDPEAFF +CATGIGELFF +CATGLLYEQYF +CATGTSGFSTDTQYF +CATGWTSNSPLHF +CATILYEILLGKASSMRSTDTQYF +CATILYEILLGKATLYAVLVSALEQYF +CATILYEILLGKATLYAVLVSASSQRSATQEYF 
+CATILYEILLGKATLYAYEQYF +CATILYEIPGSGGGPMGGAVFVQSCNIVGTGDNQPQHF +CATKPGGGNTEAFF +CATKRGAFDYGYTF +CATKTTSGGYNEQFF +CATNGRGGYYNEQFF +CATNNFAGGNEQFF +CATNSGELFF +CATPADRSYEQYF +CATPKGEASTEAFF +CATQAGMGTGELFF +CATQGDNTYEQYF +CATQKDSLNEKLFF +CATRPRTGGNEAFF +CATRSGGPHEQYF +CATRSSDGGNQPQHF +CATRTSGSGAGTDTQYF +CATSAPLRQGLFNYGYTF +CATSAVGVPTEAFF +CATSDFCRPQADNSPLHF +CATSDFEVAGSSYNEQFF +CATSDFPPGVRGAPQETQYF +CATSDFTDSGANVLTF +CATSDGLAGGWANVLTF +CATSDISITEAFF +CATSDLGDGGNEQFF +CATSDLKGQGSSYNEQFF +CATSDLLSTAGVVNTEAFF +CATSDLLTGAHGYTF +CATSDLRGDYNEQFF +CATSDLRTGDLNEQFF +CATSDPGGDRETGTEAFF +CATSDPPQDTQYF +CATSDPSTSGRSGGQETQYF +CATSDPTGEDTQYF +CATSDQKTGSTPGELFF +CATSDSFNRGVTDTQYF +CATSDSGLAETKSSYNEQFF +CATSDSSGPYNEQFF +CATSDTHSGANVLTF +CATSDVTGGLVGNQPQHF +CATSDWLGGTDTQYF +CATSDWTGGANTEAFF +CATSDYPSGGPPRCTDTQYF +CATSDYVSGTLQETQYF +CATSEGDALFQPQHF +CATSERGDLNFGTDTQYF +CATSGGSFRGTDTQYF +CATSGGWTYSNEQFF +CATSGPPGPWGEQYF +CATSGQPQETQYF +CATSGRAGNEQFF +CATSGRAGVEQFF +CATSGRDMGLAKNIQYF +CATSGRSLEEQYF +CATSGRSTDTQYF +CATSGTGGSGANVLTF +CATSGTGMQYF +CATSGYLLDRSTDTQYF +CATSIYSNQPQHF +CATSKSDGNYEQYF +CATSKSYSGDTQYF +CATSKWTGETQYF +CATSLGTSGRASEQYF +CATSLRTSGIIEQYF +CATSNRAGNTIYF +CATSPGLYTGELFF +CATSPLLESNYGYTF +CATSPTGTSGILSYNEQFF +CATSPYRGADTQYF +CATSQGAYGYTF +CATSQGSYGYTF +CATSQRSTDTQYF +CATSRAAGSYNEQFF +CATSRAWDYGNPLSGNSGANVLTF +CATSRDGGDTQYF +CATSRDLASQAFF +CATSRDLGATDTQYF +CATSRDQGRGQPQHF +CATSRDRLAGGIHEQYF +CATSRDRSSGNTIYF +CATSRDRYEQYF +CATSRDWYEQYF +CATSREGANTGELFF +CATSREGLGGEQFF +CATSRPTGSSSGANVLTF +CATSRPVGEYTEAFF +CATSRRAGDTQYF +CATSRSGTGGLIAYSNQPQHF +CATSRSRAGAYEQYF +CATSRVPSSYNEQFF +CATSSDHRDRELDTQYF +CATSSGGTEAFF +CATSSGIYGYTF +CATSSGSRQGNNEQFF +CATSSQNEQFF +CATSSSGGGYNEQFF +CATSSTQSPDGRSGNTIYF +CATSSVGAQTQYF +CATSTGAYGYTF +CATSTGDSNQPQHF +CATSTGNYGYTF +CATSTGSYGYTF +CATSTGTYEQYF +CATSTPNTEAFF +CATSTTGPTGELFF +CATSTYRVGSSDEQYF +CATSVGTEETHYGYTF +CATSVRASYEQYF +CATSVRSSYEQYF +CATSYEDNSGANVLTF +CATSYRSSDTQYF +CATTIYEQYF +CATTPGGSLSGYTF +CATTSDGTANEQFF +CATTYGDRRNTEAFF 
+CATWTENMNTEAFF +CATYIGGDTEAFF +CATYLSRDRDSPLHF +CAVGRSGSYNEQFF +CAVLVDPYNEQFF +CAVNLGDNLADEQYF +CAVQMGRGYTF +CAVSGRAGETQYF +CAWDYNRGGYTF +CAWGLTGGWNEQFF +CAWGTGQGVIYNEQFF +CAWIGLGGETQYF +CAWITEMNTEAFF +CAWMGAGQRNTIYF +CAWNPGFSSNQPQHF +CAWNRGSTDTQYF +CAWRDRVGHQPQHF +CAWRGAQGYTF +CAWRGGQGQIYEQYF +CAWRYYGYTF +CAWSASPGASNQPQHF +CAWSAVGGGYGYTF +CAWSAVGGGYTF +CAWSAVRTSANDEQFF +CAWSDIWENYNSPLHF +CAWSEIGGGYGYTF +CAWSFLDVTAGTEAFF +CAWSGGTSRRTQYF +CAWSGLGPSYNEQFF +CAWSGQGYEQFF +CAWSGSDTQYF +CAWSGVGGASPLHF +CAWSGVSYEQYF +CAWSLNGDEQYF +CAWSLSMNTEAFF +CAWSLSSGELFF +CAWSNGGRAEAFF +CAWSPGQGGETQYF +CAWSPGTGAYGYTF +CAWSPGTVSEQYF +CAWSPKTSDTGELFF +CAWSPWDRAGQVNTEAFF +CAWSQGASGNQPQHF +CAWSRMGGNTIYF +CAWSRTGATEAFF +CAWSRVGGYEQYF +CAWSSADTYYEQYF +CAWSSGGLATNEKLFF +CAWSSGLGNEKLFF +CAWSVAGDTDTQYF +CAWSVFNRGTGYTF +CAWSVGGGGYGYTF +CAWSVGGGGYTF +CAWSVGSYGYTF +CAWSVHGVPGYTF +CAWSVIGGVSEQFF +CAWSVILWGRGPTDTQYF +CAWSVLGSTDTQYF +CAWSVLPEAFF +CAWSVLTDSPLHF +CAWSVPGTYSYEQYF +CAWSVQGGTEAFF +CAWSVQGPGVRTEAFF +CAWSVRTSNEQFF +CAWSVSSYEQYF +CAWSVSVQGYGYTF +CAWSWTGGTSGYTF +CAWTGDTEAFF +CAWTGTGKIGWDSPLHF +CAWTIENRNTGELFF +CAWVLGPAGDTQYF +CAYSGTAPHEQYF +CCQKRERLWAEGYNEQFF +CDSNQNEGQVDYEQHF +CPILLGADGYTF +CPRRPGQGVSYAQYF +CPSSFYRNQPQHF +CPSSIFGGKEAFF +CPSSIGVYGYTF +CPSSRTGLNTEAFF +CQARDPVSGQGALLLYEQYF +CQQQKSYEQYF +CQQQRSTGGSNQPQHF +CRAEEGERGMRSYNEQFF +CRGQGSNQPQHF +CRMRGPANTEAFF +CRPPRGTHSNQPQHF +CRVGQVMVLCTF +CRYKGQGVSGANVLTF +CSAAGLAGGSYEQYF +CSAALRRRGYTF +CSAARTGGNEKLFF +CSAASLLRLAGEYNEQFF +CSACRDREPDTQYF +CSADQGEHYNEQFF +CSADSWAGGGYEQYF +CSADTGTSGYEQYF +CSAEDSYGYTF +CSAEGGGSYEQYF +CSAEGNTEAFF +CSAEQDGYEQYF +CSAERQGGYTF +CSAETGVGQPQHF +CSAEWDRAYNEQFF +CSAFDRVYGYTF +CSAGEDWHEQYF +CSAGGGLPYEQYF +CSAGLPSREMAGELFF +CSAGPSGKNIQYF +CSAGQGPYEQYF +CSAGRDGTNEKLFF +CSAGRYQETQYF +CSAGTYNSPLHF +CSAGWRDLNTEAFF +CSAHGGQGRAYEQYF +CSAHPNSGSSYNEQFF +CSAIARELTVADTQYF +CSAIGGLYEQYF +CSAIGTYNEQFF +CSAIPGTGGRIKAFF +CSAIPTGNYEQYF +CSAISGSGGVGDTQYF +CSAISSGANVLTF +CSAITDRTYSNQPQHF +CSAKEGPTGGYGYTF +CSAKKGGGDTDTQYF 
+CSAKLGQDHTGELFF +CSAKPGTLNEGTDTQYF +CSAKQVDSNQPQHF +CSALAVYPQESSYNEQFF +CSALDRAIANTGELFF +CSALGGTQYF +CSALHGSVGLNQPQHF +CSALNRGDDSPLHF +CSALPFYGYTF +CSALPGIYNEQFF +CSALSDEQYF +CSALSF +CSANAEAYQETQYF +CSANAFSNQPQHF +CSANDRDGLREHF +CSANQRTGVRWNEQFF +CSANRPGTEHSNQPQHF +CSANRVGEQYF +CSANSDRVSYNEQFF +CSANSIKRGLNYGYTF +CSANSSPLHF +CSAPGGPNKNTGELFF +CSAPGGVNAFF +CSAPGQGFNEKLFF +CSAPGQGSYEQYF +CSAPILGGVYTEAFF +CSAPPGDHTEAFF +CSAPRGSGTIDTQYF +CSAPTPGQGSSSPLHF +CSAPTSGPYNEQFF +CSAPVVTGPTDTQYF +CSAPWTGLNEQFF +CSAPYRVEGRIVNEQFF +CSAQKGEEAEGAFF +CSAQSGPGEVEKLFF +CSAQSTDTQYF +CSAQTKWYEQYF +CSARADPGTFTFGPSGTFSTDTQYF +CSARALPVGNTIYF +CSARAPDSYYEQFF +CSARAQGSLAKNIQYF +CSARASDSYEQYF +CSARASRTFTDTQYF +CSARATSGRDYNEQFF +CSARAWDRAQATNEKLFF +CSARDAIRRRGWNEQFF +CSARDDFGQGSYNEQFF +CSARDDRGATNEKLFF +CSARDDTTDTQYF +CSARDFAEAFF +CSARDGDSSYEQYF +CSARDGRREYEQYF +CSARDGTRAHYEQFF +CSARDGTYYEQYF +CSARDGVGNGYTF +CSARDIERTGELFF +CSARDKAGEGYNEQFF +CSARDKFYRASGDSYEQYF +CSARDLRTFRTDTQYF +CSARDLTAGSYEQYF +CSARDNLAGDTDTQYF +CSARDPGAENTIYF +CSARDPGLGRSLSTDTQYF +CSARDPGQLVSGNTIYF +CSARDPGTGESDTQYF +CSARDPVSGQGALLLYDQYF +CSARDQGSEQYF +CSARDRAGEGYNEQFF +CSARDRDYGYTF +CSARDREQGGEITEAFF +CSARDRQEETQYF +CSARDRSGNTIYF +CSARDRTGNGYTF +CSARDSGTGLPATYSGNTIYF +CSARDTESRGSYNEQFF +CSARDTISRTSVTDTQYF +CSARDTSGRAMDEQFF +CSARDTSGSYNEQFF +CSARDTSNQPQHF +CSARDTTNSYYTDTQYF +CSARDWAVPYNEQFF +CSARDWDRARATNEKLFF +CSARDYAAYEQYF +CSARDYLNSYEQYF +CSARDYRGGTTYEQYF +CSAREDQRAQETQYF +CSAREEGGRDQPQHF +CSAREGGASGSPFSYEQYF +CSAREGGVQHQPQHF +CSAREGSGGSDIQYF +CSAREGSYEQYF +CSAREIRDFAYEQYF +CSARELDTQYF +CSARELGEEAYEQYF +CSAREQGTGGLSGELFF +CSARESGAGDTGELFF +CSARETGLAGWIDTQYF +CSARETGYDGPYNEQFF +CSAREVYEQFF +CSARGALADFNSYEQYF +CSARGDQIGSYEQYF +CSARGDRSYEQYF +CSARGDWNYEQYF +CSARGEDTDTQYF +CSARGELGVDEQFF +CSARGEVPTSGRWEETQYF +CSARGFAGGTDLEQFF +CSARGFTSGDSETQYF +CSARGGARDMNTEAFF +CSARGGDAEQYF +CSARGGDSVLRVGEQYF +CSARGGFLSPLHF +CSARGGVANEQYF +CSARGGVYNEQFF +CSARGLGGETQYF +CSARGLPYDHEQFF +CSARGLSNEQFF +CSARGPSGANVLTF +CSARGQQGAHEQYF 
+CSARGQSLYYSNQPQHF +CSARGRRDKAFF +CSARGRSEDTQYF +CSARGTNNNEQFF +CSARGVRVGGNTEAFF +CSARGWGLSYNEQFF +CSARGWSSYEQYF +CSARIWPYPAGGEETQYF +CSARKTSGGTYEQYF +CSARLAESSYEQYF +CSARLAGLTYNEQFF +CSARLTGGSGQPQHF +CSARNPSGTNNEQFF +CSARPEEKLFF +CSARPELAISQETQYF +CSARPRDREDNEQFF +CSARPSDRAYEQYF +CSARPSGGVSAYEQYF +CSARPTGAPLSYEQYF +CSARPWDRASATGANVLTF +CSARPWDRGRVTNEKLFF +CSARPWSTDTQYF +CSARQAGGHNEQFF +CSARRGDTYEQYF +CSARRQRSTDTQYF +CSARSGLAGGGDTQYF +CSARSWDRSELMGYTF +CSARTCF +CSARTDNSPLHF +CSARTGGMFYNEQFF +CSARTGQTYEQYF +CSARTIAGEGYNEQFF +CSARTTGGWEQYF +CSARTVYEQYF +CSARTWSSYEQYF +CSARVAGVEDTQYF +CSARVRGDVYGYTF +CSARVWDSYEQYF +CSARVWDTGELFF +CSARWRYSRGPDTQYF +CSARYPDYNEQFF +CSASAPTSYEQYF +CSASGARAGNTIYF +CSASGGRQRTEAFF +CSASGLWTGADTQYF +CSASGTLPYEQYF +CSASLAISTDTQYF +CSASPASGEETQYF +CSASPESLSYEQYF +CSASPGDYEQYF +CSASPRTGGIYEQYF +CSASPVSLSYEQYF +CSASQADMWGTQYF +CSASQGEAFF +CSASQGRYSNQPQHF +CSASQGYGNTIYF +CSASRGQYSNQPQHF +CSASSDESSYNEQFF +CSASSGGASDTQYF +CSASSGGPTDTQYF +CSASSGSYEQYF +CSASSLSGGQTPLGTEAFF +CSASSWAGGDTGELFF +CSASTQGALVLTNEQFF +CSASWGDYTEAFF +CSASYPSVNTEAFF +CSATDRGLSGPEAFF +CSATEGGLWEQYF +CSATGDPSYEQYF +CSATHRLAGQIYNEQFF +CSATRDRGNYGYTF +CSATRGGRTDTQYF +CSATSLAGDNEQFF +CSATTRSSQYF +CSAVGGAKNIQYF +CSAVGTSGGSRPTDTQYF +CSAVSDTQYF +CSAWDRTEAFF +CSAWDYGYTF +CSAWTRDSYNEQFF +CSAWTSDSYNEQFF +CSAYQERNEQFF +CSAYSETSRGEQYF +CSDTLKTGASYNEQFF +CSETGVSDEQFF +CSFGQGAETQYF +CSGAGVVAGGFNTGELFF +CSGGQARYEQFF +CSGGRGETCNQPQHF +CSGLAELGSENTEAFF +CSGQGAFGSGELFF +CSGRINTEAFF +CSLARDTKAFF +CSLQGGRYNEQFF +CSPALGIYGYTF +CSPGLYLASATDTQYF +CSPIQDSLGYTF +CSRPAGDASYEQYF +CSSRPPGGDEQFF +CSSSIRSSYEQYF +CSSTGTEGYTGELFF +CSTRDRGLLNTEAFF +CSVADYSGLVPSTDTQYF +CSVAFRDVSSYEQYF +CSVAGNTEAFF +CSVAGTADLEAFF +CSVAPSYNTGELFF +CSVAQTGGYEQYF +CSVARQGGDEQFF +CSVASGTGTNTEAFF +CSVDDRGTTEAFF +CSVDGDSVAFF +CSVDGGGAVPLDTQYF +CSVDGTGEGANVLTF +CSVDLEANYGYTF +CSVDPTSGGETQYF +CSVDQGAGGYDGYTF +CSVDRDRGLQETQYF +CSVDREISGNTIYF +CSVDTGSPGYGYTF +CSVDTLGRGGTEAFF +CSVDVAGENQETQYF +CSVEDEGLTDTQYF +CSVEDGRDTGEGTDTQYF 
+CSVEDISGYNEQFF +CSVEDISGYNEQYF +CSVEDRSYFAGYTF +CSVEDSSYGYTF +CSVEDSVQGQVDTQYF +CSVEDVASGSGEQFF +CSVEDWTSGKETQYF +CSVEENGNYGYTF +CSVEENTGTGAIYEQYF +CSVEGAGGYDEQFF +CSVEGANTGELFF +CSVEGEQFF +CSVEGGALIYNEQFF +CSVEGGYSYEQYF +CSVEGPMSYEQYF +CSVEGRGTTEAFF +CSVEGRPGHTEAFF +CSVEGTRGYNEQFF +CSVEIAGVYNEQFF +CSVEISGKEDTQYF +CSVEKGTATLEQYF +CSVELASGTYEQYF +CSVELGVSYNEQFF +CSVEQGAIGYTF +CSVERSGVQTDTQYF +CSVETGQTQETQYF +CSVEVGQVAEQYF +CSVEVGRAVNTEAFF +CSVEVPPGEQFF +CSVEVVGVPYNEQFF +CSVEWDPPRRGTEAFF +CSVEYGTKFF +CSVEYGTQYF +CSVGASYYEQYF +CSVGDGNEKLFF +CSVGEGGYTEAFF +CSVGEGNQPQHF +CSVGEGQVTEAFF +CSVGGQGARCNQPQHF +CSVGGTSGISSYEQYF +CSVGGWQGYNEQFF +CSVGLPYNEQFF +CSVGPGDNYGYTF +CSVGQTSSYEQYF +CSVGSETGAFFNYEQYF +CSVGSGDHGEQFF +CSVGSGEDSPQYF +CSVGSGGTNEKLFF +CSVGSIYGYTF +CSVGTGDWGEQYF +CSVGTGDYEQYF +CSVGTGGAGQPQHF +CSVIHRDARYNEQFF +CSVITDTQYF +CSVKGGAHTGELFF +CSVKGGSSYEQYF +CSVKQGDYNEQFF +CSVKSGDRGESPQETQYF +CSVLGGWGMNTEAFF +CSVLGRRVSYEQYF +CSVLGTQEESYEQYF +CSVLKGTEAFF +CSVLPPDRGLNQPQHF +CSVLQGSPYEQYF +CSVLRYF +CSVPGGSGYTF +CSVPPLGGIGETQYF +CSVQQGNTGELFF +CSVQRPAAYNEQFF +CSVRQSSIYGYTF +CSVRSGGAKNIQYF +CSVRTGRDEQYF +CSVRTGVAEGYTF +CSVRTIYEEQYF +CSVRTNEFYEQYF +CSVSTGGFGNQPQHF +CSVSTLLRGFNYGYTF +CSVTHGGGNQPQHF +CSVVASGNPDTQYF +CSVVDSGKTHNEQFF +CSVVGRFTDTQYF +CSVVGSERRSSYNEQFF +CSVVSGKGLNTEAFF +CSVVSRDSYEQYF +CSVVTENNYGYTF +CSVVVGDLYNEQFF +CTSSAGQGLPYEQYF +CTSSFGGTQYF +CTSSIRSSTEAFF +CTSSIRSSYEQYF +CTSSMRSADTQYF +CTSSMRSSGTQYF +CTSSQEGAGAPYNEQFF +CTSSQGSYGYTF +CTSSQVGGPDTQYF +CTSSTGASGYTF +CTVVETEAFF +CVSRSGAEAFF +CVSSIRSAYEQYF +CVSSIRSSYEQYF +CVSSLYSNQPQHF +CVSSVRSSYEQYF +CVTSAPLRQGLFNYGYTF diff --git a/tests/test_kmer_stats.py b/tests/test_kmer_stats.py new file mode 100644 index 0000000..a95a184 --- /dev/null +++ b/tests/test_kmer_stats.py @@ -0,0 +1,200 @@ +"""Benchmark and biological validation for :mod:`mir.biomarkers.kmer_stats`. 
+ +Downloads HLA-A*02 GILGFVFTL-specific TRB CDR3 sequences from VDJdb +(via ``tests/assets/fetch_vdjdb_gilgfvftl.sh``) and compares them against +1 000 random TRB CDR3 sequences generated by OLGA. + +The GILGFVFTL epitope (Influenza A matrix protein M1_58-66) is presented by +HLA-A*02 and is one of the best-characterised CD8+ T-cell epitopes. Public +TCR beta chains recognising this epitope are dominated by TRBV19 usage and a +conserved **"RS"** motif in the CDR3 loop. We expect k-mers containing "RS" +to be significantly enriched compared to the OLGA background. + +Run with ``pytest -s tests/test_kmer_stats.py`` to see benchmark output. + +Requires: + * ``tests/assets/gilgfvftl_trb_cdr3.txt`` — run + ``bash tests/assets/fetch_vdjdb_gilgfvftl.sh`` first. +""" + +from __future__ import annotations + +import os +import time +import unittest +from pathlib import Path + +import numpy as np +import pandas as pd + +from mir.biomarkers.kmer_stats import ( + KmerCounter, + compare_kmer_counts, + compare_repertoire_kmers, +) +from mir.common.clonotype import ClonotypeAA +from mir.common.repertoire import Repertoire + +ASSETS = Path(__file__).parent / "assets" +GILG_FILE = ASSETS / "gilgfvftl_trb_cdr3.txt" + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _load_gilgfvftl_cdr3s() -> list[str]: + """Load GILGFVFTL-specific CDR3 sequences from the asset file.""" + if not GILG_FILE.exists(): + raise FileNotFoundError( + f"{GILG_FILE} not found. 
Run:\n" + " bash tests/assets/fetch_vdjdb_gilgfvftl.sh" + ) + seqs = [l.strip() for l in GILG_FILE.read_text().splitlines() if l.strip()] + if len(seqs) < 20: + raise RuntimeError(f"Expected >= 20 GILGFVFTL CDR3s, got {len(seqs)}") + return seqs + + +def _make_repertoire(cdr3s: list[str]) -> Repertoire: + """Build a minimal Repertoire from a list of CDR3aa strings.""" + clonotypes = [ClonotypeAA(cdr3aa=s) for s in cdr3s] + return Repertoire(clonotypes) + + +def _generate_olga_background(n: int = 1000, seed: int = 42) -> list[str]: + """Generate *n* random TRB CDR3aa sequences using OLGA.""" + from mir.basic.pgen import OlgaModel + + np.random.seed(seed) + model = OlgaModel(chain="TRB") + return model.generate_sequences(n) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + +@unittest.skipUnless(GILG_FILE.exists(), + "VDJdb asset missing — run tests/assets/fetch_vdjdb_gilgfvftl.sh") +class TestKmerStatsBenchmark(unittest.TestCase): + """K-mer differential analysis: GILGFVFTL-specific vs OLGA background.""" + + @classmethod + def setUpClass(cls): + cls.gilg_cdr3s = _load_gilgfvftl_cdr3s() + cls.bg_cdr3s = _generate_olga_background(1000) + cls.gilg_rep = _make_repertoire(cls.gilg_cdr3s) + cls.bg_rep = _make_repertoire(cls.bg_cdr3s) + + # -- counting benchmarks ----------------------------------------------- + + def test_kmer_counter_speed(self): + """Benchmark KmerCounter on both repertoires.""" + for label, rep in [("GILGFVFTL", self.gilg_rep), + ("OLGA bg ", self.bg_rep)]: + t0 = time.perf_counter() + kc = KmerCounter(k=3, repertoire=rep) + _ = kc.counts() + elapsed = time.perf_counter() - t0 + n_kmers = len(kc.counts()) + n_clones = len(rep.clonotypes) + print(f"\n {label}: {n_clones} clonotypes → " + f"{n_kmers} unique 3-mers in {elapsed*1000:.1f} ms") + + # -- comparison benchmarks --------------------------------------------- + + def 
test_compare_repertoire_speed(self): + """Benchmark end-to-end repertoire comparison.""" + for k in (2, 3, 4): + t0 = time.perf_counter() + df = compare_repertoire_kmers(self.gilg_rep, self.bg_rep, k=k) + elapsed = time.perf_counter() - t0 + n_sig = (df["p_val_adj"] < 0.05).sum() + print(f"\n k={k}: {len(df)} kmers, {n_sig} significant " + f"(adj p < 0.05) in {elapsed*1000:.1f} ms") + + # -- biological validation: RS motif ----------------------------------- + + def test_rs_motif_enriched_k3(self): + """k=3 RS-containing k-mers should be among the top enriched.""" + df = compare_repertoire_kmers(self.gilg_rep, self.bg_rep, k=3) + # Keep only k-mers enriched in GILGFVFTL (freq_fc > 1) + enriched = df[df["freq_fc"] > 1].sort_values("p_val") + + top20 = enriched.head(20) + rs_in_top20 = [kmer for kmer in top20.index if "RS" in kmer] + + print(f"\n Top 20 enriched 3-mers (GILGFVFTL vs OLGA):") + print(f" {'k-mer':<8} {'FC':>8} {'p_val':>12} {'p_adj':>12} {'RS?':>4}") + print(f" {'-'*8} {'-'*8} {'-'*12} {'-'*12} {'-'*4}") + for kmer in top20.index: + row = top20.loc[kmer] + tag = " *" if "RS" in kmer else "" + print(f" {kmer:<8} {row['freq_fc']:>8.2f} " + f"{row['p_val']:>12.2e} {row['p_val_adj']:>12.2e}{tag}") + + self.assertGreater(len(rs_in_top20), 0, + "Expected RS-containing k-mers in top 20 enriched") + + def test_rs_motif_enriched_k4(self): + """k=4 RS-containing k-mers should be among the top enriched.""" + df = compare_repertoire_kmers(self.gilg_rep, self.bg_rep, k=4) + enriched = df[df["freq_fc"] > 1].sort_values("p_val") + + top30 = enriched.head(30) + rs_in_top30 = [kmer for kmer in top30.index if "RS" in kmer] + + print(f"\n Top 30 enriched 4-mers (GILGFVFTL vs OLGA):") + print(f" {'k-mer':<8} {'FC':>8} {'p_val':>12} {'p_adj':>12} {'RS?':>4}") + print(f" {'-'*8} {'-'*8} {'-'*12} {'-'*12} {'-'*4}") + for kmer in top30.index: + row = top30.loc[kmer] + tag = " *" if "RS" in kmer else "" + print(f" {kmer:<8} {row['freq_fc']:>8.2f} " + f"{row['p_val']:>12.2e} 
{row['p_val_adj']:>12.2e}{tag}") + + self.assertGreater(len(rs_in_top30), 0, + "Expected RS-containing k-mers in top 30 enriched") + + def test_rs_pvalue_significant(self): + """At least one RS-containing k-mer should have adjusted p < 0.05.""" + df = compare_repertoire_kmers(self.gilg_rep, self.bg_rep, k=3) + rs_kmers = df[[("RS" in kmer) for kmer in df.index]] + # Filter for enriched in GILGFVFTL + rs_enriched = rs_kmers[rs_kmers["freq_fc"] > 1] + + print(f"\n All RS-containing 3-mers:") + for kmer in rs_enriched.sort_values("p_val").index: + row = rs_enriched.loc[kmer] + sig = "***" if row["p_val_adj"] < 0.001 else ( + "**" if row["p_val_adj"] < 0.01 else ( + "*" if row["p_val_adj"] < 0.05 else "")) + print(f" {kmer} FC={row['freq_fc']:.2f} " + f"p={row['p_val']:.2e} p_adj={row['p_val_adj']:.2e} {sig}") + + min_padj = rs_enriched["p_val_adj"].min() + self.assertLess(min_padj, 0.05, + f"Best RS k-mer adj p-value = {min_padj:.2e}, expected < 0.05") + + # -- summary ----------------------------------------------------------- + + def test_benchmark_summary(self): + """Print combined summary of GILGFVFTL vs OLGA comparison.""" + print(f"\n Dataset sizes:") + print(f" GILGFVFTL CDR3s : {len(self.gilg_cdr3s)}") + print(f" OLGA background : {len(self.bg_cdr3s)}") + + for k in (2, 3, 4, 5): + df = compare_repertoire_kmers(self.gilg_rep, self.bg_rep, k=k) + enriched = df[df["freq_fc"] > 1].sort_values("p_val") + n_sig = (enriched["p_val_adj"] < 0.05).sum() + top3 = list(enriched.head(3).index) + rs_any = any("RS" in km for km in enriched.head(20).index) + print(f" k={k}: {len(df):>5} kmers, " + f"{n_sig:>3} significant, " + f"top3={top3}, " + f"RS in top20={'yes' if rs_any else 'no'}") + + +if __name__ == "__main__": + unittest.main() From 1b172f6be32e14eb1100e4b91912bf035d725448 Mon Sep 17 00:00:00 2001 From: Elizaveta Vlasova Date: Tue, 14 Apr 2026 00:04:48 +0300 Subject: [PATCH 18/24] removed needless dependencies, upd tedm etc --- pyproject.toml | 8 +++++--- 
requirements.txt | 5 +---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4da2ddc..0f59966 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,18 +25,20 @@ classifiers = [ dependencies = [ "biopython", "matplotlib", + "multipy", "numpy", "olga", "pandas", + "polars", + "pympler", "python_igraph", "scipy", "seaborn", - "umap-learn", "scikit-learn", + "statsmodels", "textdistance", - "plotnine", - "stringzilla", "tcrtrie @ git+https://github.com/MikePodsytnik/TCRtrie@0.1.2-tcrtriepy", + "tqdm", ] license = {text = "GPL-3.0 license"} diff --git a/requirements.txt b/requirements.txt index f332ade..511a25b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,9 +14,6 @@ statsmodels==0.14.1 textdistance==4.5.0 tcrtrie @ git+https://github.com/MikePodsytnik/TCRtrie@0.1.2-tcrtriepy tqdm==4.66.2 -stringzilla==4.6.0 -stringutils==1.0.6 umap-learn==0.5.3 pybind11==2.11.0 -multipy==0.16 -polars==1.39.3 \ No newline at end of file +polars==1.39.3 From b2139a619a03e4a288e1e5fa672af3fd5f93a640 Mon Sep 17 00:00:00 2001 From: Elizaveta Vlasova Date: Tue, 14 Apr 2026 00:27:47 +0300 Subject: [PATCH 19/24] resolved comments and added tests config --- .github/workflows/tests.yml | 85 +++++++++++++++++++++++++++++++++ .vscode/settings.json | 4 +- mir/basic/mirseq.cpp | 13 +++-- mir/basic/token_tables.py | 4 +- mir/distances/seqdist.cpp | 3 +- pyproject.toml | 8 +++- quick_setup.sh | 9 ++-- tests/__init__.py | 1 + tests/conftest.py | 17 +++++++ tests/test_aligner.py | 3 ++ tests/test_kmer_stats.py | 4 ++ tests/test_mirseq_benchmark.py | 5 ++ tests/test_repertoire.py | 9 ++-- tests/test_token_tables.py | 21 ++++++++ tests/test_token_tables_impl.py | 3 ++ 15 files changed, 170 insertions(+), 19 deletions(-) create mode 100644 .github/workflows/tests.yml create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 
0000000..80d7db1 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,85 @@ +name: Test Suite + +on: + push: + branches: + - main + - dev + pull_request: + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: tests-${{ github.ref }} + cancel-in-progress: true + +jobs: + test: + name: Fast tests on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + - windows-latest + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: "pip" + + - name: Upgrade pip + run: python -m pip install --upgrade pip + + - name: Install test dependencies + run: python -m pip install pytest + + - name: Install package + run: python -m pip install . + + - name: Run fast test suite + run: python -m pytest tests -m "not benchmark and not integration" -q + + test-heavy: + name: Heavy tests on ${{ matrix.os }} + if: github.event_name == 'push' && github.ref == 'refs/heads/dev' + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + - windows-latest + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: "pip" + + - name: Upgrade pip + run: python -m pip install --upgrade pip + + - name: Install test dependencies + run: python -m pip install pytest + + - name: Install package + run: python -m pip install . 
+ + - name: Run benchmark and integration suites + env: + RUN_BENCHMARKS: "1" + RUN_INTEGRATION: "1" + run: python -m pytest tests -m "benchmark or integration" -q diff --git a/.vscode/settings.json b/.vscode/settings.json index 4143014..60b89b7 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -6,5 +6,7 @@ "python.testing.pytestEnabled": true, "[python]": { "editor.defaultFormatter": "ms-python.python" - } + }, + "python-envs.defaultEnvManager": "ms-python.python:conda", + "python-envs.defaultPackageManager": "ms-python.python:conda" } \ No newline at end of file diff --git a/mir/basic/mirseq.cpp b/mir/basic/mirseq.cpp index 32cf126..3b551ec 100644 --- a/mir/basic/mirseq.cpp +++ b/mir/basic/mirseq.cpp @@ -261,14 +261,13 @@ static py::list c_tokenize_gapped_bytes(const py::object& obj, int k, int mask_b int n_windows = n - k + 1; int total = n_windows * k; py::list result(total); - // Temporary buffer for each gapped k-mer - char* buf = (char*)alloca(k); + std::string buf(k, '\0'); int idx = 0; for (int i = 0; i < n_windows; ++i) { for (int j = 0; j < k; ++j) { - std::memcpy(buf, sv.data + i, k); + std::memcpy(buf.data(), sv.data + i, k); buf[j] = (char)mask_byte; - result[idx++] = py::bytes(buf, k); + result[idx++] = py::bytes(buf.data(), k); } } return result; @@ -282,13 +281,13 @@ static py::list c_tokenize_gapped_str(const py::object& obj, int k, int mask_byt int n_windows = n - k + 1; int total = n_windows * k; py::list result(total); - char* buf = (char*)alloca(k); + std::string buf(k, '\0'); int idx = 0; for (int i = 0; i < n_windows; ++i) { for (int j = 0; j < k; ++j) { - std::memcpy(buf, sv.data + i, k); + std::memcpy(buf.data(), sv.data + i, k); buf[j] = (char)mask_byte; - result[idx++] = py::str(std::string(buf, k)); + result[idx++] = py::str(std::string(buf.data(), k)); } } return result; diff --git a/mir/basic/token_tables.py b/mir/basic/token_tables.py index e5b8f3d..0a2cd2d 100644 --- a/mir/basic/token_tables.py +++ 
b/mir/basic/token_tables.py @@ -230,7 +230,7 @@ def summarize_rearrangements( if id_set is None: ids[key] = {rid} dups[key] = dc - else: + elif rid not in id_set: id_set.add(rid) dups[key] += dc return {k: KmerStats(len(ids[k]), dups[k]) for k in ids} @@ -280,7 +280,7 @@ def summarize_annotations( if id_set is None: ids[flat_key] = {rid} dups[flat_key] = dc - else: + elif rid not in id_set: id_set.add(rid) dups[flat_key] += dc # Pivot into nested dict diff --git a/mir/distances/seqdist.cpp b/mir/distances/seqdist.cpp index 5edf77a..d7779a9 100644 --- a/mir/distances/seqdist.cpp +++ b/mir/distances/seqdist.cpp @@ -207,8 +207,7 @@ static double c_selfscore(const std::string& s, py::array_t mat256, double factor, bool use_mat) { if (!use_mat) return 0.0; - auto mbuf = mat256.request(); - const double* mat = static_cast(mbuf.ptr); + const double* mat = extract_mat(mat256, use_mat); double x = 0.0; for (unsigned char c : s) x += mat[(size_t)c * 256 + c]; return factor * x; diff --git a/pyproject.toml b/pyproject.toml index 0f59966..e8871cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ readme = "README.md" requires-python = ">=3.11" classifiers = [ "Programming Language :: Python :: 3", - "License :: OSI Approved :: GPL-3.0 license", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", ] dependencies = [ @@ -50,3 +50,9 @@ Issues = "https://github.com/antigenomics/mirpy/issues" wheel.packages = ["mir"] logging.level = "INFO" +[tool.pytest.ini_options] +markers = [ + "benchmark: performance-oriented tests excluded from the default test run", + "integration: slow or environment-dependent tests excluded from the default test run", +] + diff --git a/quick_setup.sh b/quick_setup.sh index bc0252b..76a40e2 100644 --- a/quick_setup.sh +++ b/quick_setup.sh @@ -1,4 +1,7 @@ +#!/usr/bin/env sh + python3 -m venv venv -. venv/bin/activate.fish -export CMAKE_POLICY_VERSION_MINIMUM=3.5 && pip install . 
-pip install pytest pylint \ No newline at end of file +. venv/bin/activate +export CMAKE_POLICY_VERSION_MINIMUM=3.5 +pip install . +pip install pytest pylint diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..5afdc22 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,17 @@ +import os + +import pytest + + +RUN_BENCHMARKS = os.getenv("RUN_BENCHMARKS") == "1" +RUN_INTEGRATION = os.getenv("RUN_INTEGRATION") == "1" + +skip_benchmarks = pytest.mark.skipif( + not RUN_BENCHMARKS, + reason="set RUN_BENCHMARKS=1 to run benchmark tests", +) + +skip_integration = pytest.mark.skipif( + not RUN_INTEGRATION, + reason="set RUN_INTEGRATION=1 to run integration tests", +) diff --git a/tests/test_aligner.py b/tests/test_aligner.py index a22697a..ba4680b 100644 --- a/tests/test_aligner.py +++ b/tests/test_aligner.py @@ -11,6 +11,7 @@ from Bio import Align from Bio.Align import substitution_matrices +from tests.conftest import skip_benchmarks from mir.distances.aligner import ( CDRAligner, BioAlignerWrapper, @@ -555,6 +556,8 @@ def test_visualize_sample_alignments(self): # =================================================================== +@skip_benchmarks +@pytest.mark.benchmark class TestAlignmentBenchmarks: """Speed benchmarks for CDR3 alignment. 
diff --git a/tests/test_kmer_stats.py b/tests/test_kmer_stats.py index a95a184..9c15a26 100644 --- a/tests/test_kmer_stats.py +++ b/tests/test_kmer_stats.py @@ -26,7 +26,9 @@ import numpy as np import pandas as pd +import pytest +from tests.conftest import skip_integration from mir.biomarkers.kmer_stats import ( KmerCounter, compare_kmer_counts, @@ -76,6 +78,8 @@ def _generate_olga_background(n: int = 1000, seed: int = 42) -> list[str]: @unittest.skipUnless(GILG_FILE.exists(), "VDJdb asset missing — run tests/assets/fetch_vdjdb_gilgfvftl.sh") +@skip_integration +@pytest.mark.integration class TestKmerStatsBenchmark(unittest.TestCase): """K-mer differential analysis: GILGFVFTL-specific vs OLGA background.""" diff --git a/tests/test_mirseq_benchmark.py b/tests/test_mirseq_benchmark.py index 783a39a..bf8a473 100644 --- a/tests/test_mirseq_benchmark.py +++ b/tests/test_mirseq_benchmark.py @@ -6,6 +6,9 @@ import time import unittest +import pytest + +from tests.conftest import skip_benchmarks from mir.basic import mirseq from mir.distances import seqdist_c from mir.basic.alphabets import ( @@ -23,6 +26,8 @@ def _time_fn(fn, *args, n: int = 5000) -> float: return time.perf_counter() - start +@skip_benchmarks +@pytest.mark.benchmark class TestBenchmarks(unittest.TestCase): def _report(self, name: str, py_t: float, c_t: float) -> None: diff --git a/tests/test_repertoire.py b/tests/test_repertoire.py index 574317f..f101941 100644 --- a/tests/test_repertoire.py +++ b/tests/test_repertoire.py @@ -1,5 +1,7 @@ import os import unittest +from pathlib import Path + import pandas as pd from mir.biomarkers.fisher_biomarkers_detector import FisherBiomarkersDetector @@ -10,11 +12,12 @@ class TestRepertoireDataset(unittest.TestCase): def setUp(self): - self.meta = pd.read_csv('assets/test_meta.csv') + assets_dir = Path(__file__).parent / "assets" + self.meta = pd.read_csv(assets_dir / "meta.csv") self.rd = RepertoireDataset.load(parser=VDJtoolsParser(sep=','), metadata=self.meta, 
threads=1, - paths=[f'assets/{x}' for x in self.meta.file_name]) + paths=[str(assets_dir / x) for x in self.meta.file_name]) self.ill_rd, self.healthy_rd = self.rd.split_by_metadata_function(splitting_method=lambda x: x.status == 'ill') @@ -60,4 +63,4 @@ def test_clustering(self): if __name__ == "__main__": print(os.getcwd()) - unittest.main() \ No newline at end of file + unittest.main() diff --git a/tests/test_token_tables.py b/tests/test_token_tables.py index aad6bc3..51b87ce 100644 --- a/tests/test_token_tables.py +++ b/tests/test_token_tables.py @@ -6,6 +6,7 @@ import pytest +from tests.conftest import skip_benchmarks, skip_integration from mir.basic.token_tables import ( Kmer, KmerAnnotation, @@ -169,6 +170,8 @@ def test_positions_gapped(self): # Benchmark # --------------------------------------------------------------------------- +@skip_benchmarks +@pytest.mark.benchmark class TestTokenizeRearrangementsBenchmark: N = 100_000 K = 5 @@ -333,11 +336,18 @@ def test_gapped_shared_summary(self): assert stats[shared].rearrangement_count == 2 assert stats[shared].duplicate_count == 5 + def test_repeated_kmer_counts_duplicate_once_per_rearrangement(self): + r = _make_rearrangement("CASSCAS", duplicate_count=11) + stats = summarize_rearrangements([r], k=3) + assert stats[Kmer("TRB", "TRBV5-1", "TRBC1", b"CAS")] == KmerStats(1, 11) + # --------------------------------------------------------------------------- # Benchmark — summarize # --------------------------------------------------------------------------- +@skip_benchmarks +@pytest.mark.benchmark class TestSummarizeRearrangementsBenchmark: N = 100_000 K = 5 @@ -441,6 +451,13 @@ def test_position_distinguishes_annotations(self): assert a0 in inner and inner[a0] == KmerStats(1, 1) assert a4 in inner and inner[a4] == KmerStats(1, 1) + def test_repeated_annotation_counts_duplicate_once_per_rearrangement(self): + r = _make_rearrangement("CASSCAS", duplicate_count=9) + ann = summarize_annotations([r], k=3) + ks = 
KmerSeq("TRB", b"CAS") + assert ann[ks][KmerAnnotation("TRBV5-1", "TRBC1", 0)] == KmerStats(1, 9) + assert ann[ks][KmerAnnotation("TRBV5-1", "TRBC1", 4)] == KmerStats(1, 9) + def test_gapped_annotations(self): r = _make_rearrangement("CASSLA", duplicate_count=6) ann = summarize_annotations([r], k=4, mask_byte=MASK) @@ -480,6 +497,8 @@ def test_gapped_different_genes_merge(self): # Benchmark — summarize_annotations # --------------------------------------------------------------------------- +@skip_benchmarks +@pytest.mark.benchmark class TestSummarizeAnnotationsBenchmark: N = 100_000 K = 5 @@ -511,6 +530,8 @@ def test_benchmark_annotations_plain(self, rearrangements): # OLGA-based realistic benchmark # --------------------------------------------------------------------------- +@skip_integration +@pytest.mark.integration class TestOlgaKmerSummary: """Generate 10,000 human TCR-beta rearrangements via OLGA and validate biological expectations on k-mer incidence.""" diff --git a/tests/test_token_tables_impl.py b/tests/test_token_tables_impl.py index c8d5d4f..dc26866 100644 --- a/tests/test_token_tables_impl.py +++ b/tests/test_token_tables_impl.py @@ -13,6 +13,7 @@ import polars as pl import pytest +from tests.conftest import skip_benchmarks from mir.basic import token_tables_pl as plmod from mir.basic.token_tables import ( Kmer, @@ -258,6 +259,8 @@ def _measure(func, label: str) -> dict: return {"label": label, "elapsed": elapsed, "peak_mem": peak, "result": result} +@skip_benchmarks +@pytest.mark.benchmark class TestBenchmarkImplementations: """Compare time and memory: naive (token_tables.py) vs Polars on 10,000 OLGA-generated TCR-beta rearrangements.""" From c599aaddb199aa089300d27bcbc0d015951e7649 Mon Sep 17 00:00:00 2001 From: Elizaveta Vlasova Date: Tue, 14 Apr 2026 00:31:52 +0300 Subject: [PATCH 20/24] upd test config --- .github/workflows/tests.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml 
index 80d7db1..82da72b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -2,9 +2,6 @@ name: Test Suite on: push: - branches: - - main - - dev pull_request: workflow_dispatch: From 7d84cfc706dde0732da8e176252eebf4cbd69feb Mon Sep 17 00:00:00 2001 From: Elizaveta Vlasova Date: Tue, 14 Apr 2026 00:36:15 +0300 Subject: [PATCH 21/24] fixed tests --- .github/workflows/tests.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 82da72b..d6c3f1b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -43,7 +43,8 @@ jobs: run: python -m pip install . - name: Run fast test suite - run: python -m pytest tests -m "not benchmark and not integration" -q + working-directory: ${{ runner.temp }} + run: python -m pytest "${{ github.workspace }}/tests" -m "not benchmark and not integration" -q test-heavy: name: Heavy tests on ${{ matrix.os }} @@ -79,4 +80,5 @@ jobs: env: RUN_BENCHMARKS: "1" RUN_INTEGRATION: "1" - run: python -m pytest tests -m "benchmark or integration" -q + working-directory: ${{ runner.temp }} + run: python -m pytest "${{ github.workspace }}/tests" -m "benchmark or integration" -q From c160c3eab06e76773247b257a0d965af8904b0e1 Mon Sep 17 00:00:00 2001 From: Elizaveta Vlasova Date: Tue, 14 Apr 2026 00:44:23 +0300 Subject: [PATCH 22/24] fix tests --- .github/workflows/tests.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d6c3f1b..36af54f 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -42,9 +42,12 @@ jobs: - name: Install package run: python -m pip install . 
+ - name: Copy tests to temp directory + run: python -c "import pathlib, shutil; src = pathlib.Path(r'${{ github.workspace }}') / 'tests'; dst = pathlib.Path(r'${{ runner.temp }}') / 'tests'; shutil.rmtree(dst, ignore_errors=True); shutil.copytree(src, dst)" + - name: Run fast test suite working-directory: ${{ runner.temp }} - run: python -m pytest "${{ github.workspace }}/tests" -m "not benchmark and not integration" -q + run: python -m pytest tests -m "not benchmark and not integration" -q test-heavy: name: Heavy tests on ${{ matrix.os }} @@ -76,9 +79,12 @@ jobs: - name: Install package run: python -m pip install . + - name: Copy tests to temp directory + run: python -c "import pathlib, shutil; src = pathlib.Path(r'${{ github.workspace }}') / 'tests'; dst = pathlib.Path(r'${{ runner.temp }}') / 'tests'; shutil.rmtree(dst, ignore_errors=True); shutil.copytree(src, dst)" + - name: Run benchmark and integration suites env: RUN_BENCHMARKS: "1" RUN_INTEGRATION: "1" working-directory: ${{ runner.temp }} - run: python -m pytest "${{ github.workspace }}/tests" -m "benchmark or integration" -q + run: python -m pytest tests -m "benchmark or integration" -q From 6665a7d0a32f79b8701928004717b4fed6d99d93 Mon Sep 17 00:00:00 2001 From: Elizaveta Vlasova Date: Tue, 14 Apr 2026 00:55:22 +0300 Subject: [PATCH 23/24] fixed repertoire test --- tests/test_repertoire.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_repertoire.py b/tests/test_repertoire.py index f101941..3cc9e77 100644 --- a/tests/test_repertoire.py +++ b/tests/test_repertoire.py @@ -34,9 +34,10 @@ def test_public_clonotypes_size(self): assert len(self.rd.clonotype_usage_matrix.public_clonotypes) == 5 def test_public_clonotypes(self): - assert 'CGGGF' in self.rd.clonotype_usage_matrix.public_clonotypes - assert 'CASTA' in self.rd.clonotype_usage_matrix.public_clonotypes - assert 'CFRRA' in self.rd.clonotype_usage_matrix.public_clonotypes + public_cdr3aa = [x.cdr3aa for x in 
self.rd.clonotype_usage_matrix.public_clonotypes] + assert 'CGGGF' in public_cdr3aa + assert 'CASTA' in public_cdr3aa + assert 'CFRRA' in public_cdr3aa def test_usage_full_matrix_values_for_CGGGF(self): From 34d77b29c82e69d86113bb9abe328a18826754e7 Mon Sep 17 00:00:00 2001 From: Elizaveta Vlasova Date: Tue, 14 Apr 2026 15:15:02 +0300 Subject: [PATCH 24/24] update for tcrtrie PyPI package --- README.md | 6 ++++++ pyproject.toml | 4 ++-- requirements.txt | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index fd00b2c..4daf5f3 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,12 @@ Requirements: - Python 3.11+ - a C/C++ build toolchain for compiled extensions +Install from PyPI: + +```bash +pip install mirpy-lib +``` + Install from the repository root: ```bash diff --git a/pyproject.toml b/pyproject.toml index e8871cf..11637f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ build-backend = "scikit_build_core.build" [project] name = "mirpy-lib" -version = "0.1.3" +version = "1.0.0" authors = [ { name="VEK239", email="elizaveta.k.vlasova@gmail.com" }, { name="mikessh", email="mikhail.shugay@gmail.com"}, @@ -37,7 +37,7 @@ dependencies = [ "scikit-learn", "statsmodels", "textdistance", - "tcrtrie @ git+https://github.com/MikePodsytnik/TCRtrie@0.1.2-tcrtriepy", + "tcrtrie>=0.1.2", "tqdm", ] license = {text = "GPL-3.0 license"} diff --git a/requirements.txt b/requirements.txt index 511a25b..b7246bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ seaborn==0.12.2 scikit-learn==1.3.0 statsmodels==0.14.1 textdistance==4.5.0 -tcrtrie @ git+https://github.com/MikePodsytnik/TCRtrie@0.1.2-tcrtriepy +tcrtrie>=0.1.2 tqdm==4.66.2 umap-learn==0.5.3 pybind11==2.11.0