From 09bf8157ad5d09f9fa255c678ed6c6adb4ccdf33 Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Mon, 2 Sep 2024 14:52:00 +0200
Subject: [PATCH 1/9] Add positional substitution matrices

---
 src/biotite/sequence/align/alignment.py |  19 ++-
 src/biotite/sequence/align/matrix.py    | 214 ++++++++++++++++++++----
 src/biotite/sequence/seqtypes.py        | 127 +++++++++++++-
 tests/sequence/align/test_matrix.py     |  29 ++++
 tests/sequence/test_seqtypes.py         |  21 +++
 tests/test_repr.py                      |   4 +
 6 files changed, 372 insertions(+), 42 deletions(-)

diff --git a/src/biotite/sequence/align/alignment.py b/src/biotite/sequence/align/alignment.py
index d33e3d051..cbbbe1cdd 100644
--- a/src/biotite/sequence/align/alignment.py
+++ b/src/biotite/sequence/align/alignment.py
@@ -9,7 +9,6 @@
 import textwrap
 from collections.abc import Sequence
 import numpy as np
-from biotite.sequence.alphabet import LetterAlphabet
 
 __all__ = [
     "Alignment",
@@ -111,7 +110,7 @@ def _gapped_str(self, seq_index):
         for i in range(len(self.trace)):
             j = self.trace[i][seq_index]
             if j != -1:
-                seq_str += self.sequences[seq_index][j]
+                seq_str += str(self.sequences[seq_index][j])
             else:
                 seq_str += "-"
         return seq_str
@@ -133,7 +132,7 @@ def __str__(self):
         # has an non-single letter alphabet
         all_single_letter = True
         for seq in self.sequences:
-            if not isinstance(seq.get_alphabet(), LetterAlphabet):
+            if not _is_single_letter(seq.alphabet):
                 all_single_letter = False
         if all_single_letter:
             # First dimension: sequence number,
@@ -665,3 +664,17 @@ def remove_terminal_gaps(alignment):
             "no overlap and the resulting alignment would be empty"
         )
     return alignment[start:stop]
+
+
+def _is_single_letter(alphabet):
+    """
+    More relaxed version of :func:`biotite.sequence.alphabet.is_letter_alphabet()`:
+    It is sufficient that only only the string representation of each symbol is only
+    a single character.
+    """
+    if alphabet.is_letter_alphabet():
+        return True
+    for symbol in alphabet:
+        if len(str(symbol)) != 1:
+            return False
+    return True
diff --git a/src/biotite/sequence/align/matrix.py b/src/biotite/sequence/align/matrix.py
index 2a7d23437..06308115d 100644
--- a/src/biotite/sequence/align/matrix.py
+++ b/src/biotite/sequence/align/matrix.py
@@ -7,7 +7,11 @@
 
 import os
 import numpy as np
-from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
+from biotite.sequence.seqtypes import (
+    NucleotideSequence,
+    PositionalSequence,
+    ProteinSequence,
+)
 
 __all__ = ["SubstitutionMatrix"]
 
@@ -78,6 +82,11 @@ class SubstitutionMatrix(object):
         or a dictionary mapping the symbol pairing to scores,
         or a string referencing a matrix in the internal database.
 
+    Attributes
+    ----------
+    shape : tuple
+        The shape of the substitution matrix.
+
     Raises
     ------
     KeyError
@@ -151,34 +160,18 @@ def __init__(self, alphabet1, alphabet2, score_matrix):
         # score matrix -> make the score matrix read-only
         self._matrix.setflags(write=False)
 
-    def __repr__(self):
-        """Represent SubstitutionMatrix as a string for debugging."""
-        return (
-            f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, "
-            f"np.{np.array_repr(self._matrix)})"
-        )
-
-    def __eq__(self, item):
-        if not isinstance(item, SubstitutionMatrix):
-            return False
-        if self._alph1 != item.get_alphabet1():
-            return False
-        if self._alph2 != item.get_alphabet2():
-            return False
-        if not np.array_equal(self.score_matrix(), item.score_matrix()):
-            return False
-        return True
-
-    def __ne__(self, item):
-        return not self == item
+    @property
+    def shape(self):
+        """
+        Get the shape (i.e. the length of both alphabets)
+        of the substitution matrix.
 
-    def _fill_with_matrix_dict(self, matrix_dict):
-        self._matrix = np.zeros((len(self._alph1), len(self._alph2)), dtype=np.int32)
-        for i in range(len(self._alph1)):
-            for j in range(len(self._alph2)):
-                sym1 = self._alph1.decode(i)
-                sym2 = self._alph2.decode(j)
-                self._matrix[i, j] = int(matrix_dict[sym1, sym2])
+        Returns
+        -------
+        shape : tuple
+            Matrix shape.
+        """
+        return (len(self._alph1), len(self._alph2))
 
     def get_alphabet1(self):
         """
@@ -280,26 +273,155 @@ def get_score(self, symbol1, symbol2):
         code2 = self._alph2.encode(symbol2)
         return self._matrix[code1, code2]
 
-    def shape(self):
+    def as_positional(self, sequence1, sequence2):
         """
-        Get the shape (i.e. the length of both alphabets)
-        of the subsitution matrix.
+        Transform this substitution matrix and two sequences into positional
+        equivalents.
+
+        This means the new substitution matrix is position-specific: It has the lengths
+        of the sequences instead of the lengths of their alphabets.
+        Its scores represent the same scores as the original matrix, but now mapped
+        onto the positions of the sequences.
+
+        Parameters
+        ----------
+        sequence1, sequence2 : seq.Sequence, length=n
+            The sequences to create the positional equivalents from.
 
         Returns
         -------
-        shape : tuple
-            Matrix shape.
+        pos_matrix : align.SubstitutionMatrix, shape=(n, n)
+            The position-specific substitution matrix.
+        pos_sequence1, pos_sequence2 : PositionalSequence, length=n
+            The positional sequences.
+
+        Notes
+        -----
+        After the transformation the substitution scores remain the same, i.e.
+        `substitution_matrix.get_score(sequence1[i], sequence2[j])` is equal to
+        `pos_matrix.get_score(pos_sequence1[i], pos_sequence2[j])`.
+
+        Examples
+        --------
+
+        Run an alignment with the usual substitution matrix:
+
+        >>> seq1 = ProteinSequence("BIQTITE")
+        >>> seq2 = ProteinSequence("IQLITE")
+        >>> matrix = SubstitutionMatrix.std_protein_matrix()
+        >>> print(matrix)
+            A   C   D   E   F   G   H   I   K   L   M   N   P   Q   R   S   T   V   W   Y   B   Z   X   *
+        A   4   0  -2  -1  -2   0  -2  -1  -1  -1  -1  -2  -1  -1  -1   1   0   0  -3  -2  -2  -1   0  -4
+        C   0   9  -3  -4  -2  -3  -3  -1  -3  -1  -1  -3  -3  -3  -3  -1  -1  -1  -2  -2  -3  -3  -2  -4
+        D  -2  -3   6   2  -3  -1  -1  -3  -1  -4  -3   1  -1   0  -2   0  -1  -3  -4  -3   4   1  -1  -4
+        E  -1  -4   2   5  -3  -2   0  -3   1  -3  -2   0  -1   2   0   0  -1  -2  -3  -2   1   4  -1  -4
+        F  -2  -2  -3  -3   6  -3  -1   0  -3   0   0  -3  -4  -3  -3  -2  -2  -1   1   3  -3  -3  -1  -4
+        G   0  -3  -1  -2  -3   6  -2  -4  -2  -4  -3   0  -2  -2  -2   0  -2  -3  -2  -3  -1  -2  -1  -4
+        H  -2  -3  -1   0  -1  -2   8  -3  -1  -3  -2   1  -2   0   0  -1  -2  -3  -2   2   0   0  -1  -4
+        I  -1  -1  -3  -3   0  -4  -3   4  -3   2   1  -3  -3  -3  -3  -2  -1   3  -3  -1  -3  -3  -1  -4
+        K  -1  -3  -1   1  -3  -2  -1  -3   5  -2  -1   0  -1   1   2   0  -1  -2  -3  -2   0   1  -1  -4
+        L  -1  -1  -4  -3   0  -4  -3   2  -2   4   2  -3  -3  -2  -2  -2  -1   1  -2  -1  -4  -3  -1  -4
+        M  -1  -1  -3  -2   0  -3  -2   1  -1   2   5  -2  -2   0  -1  -1  -1   1  -1  -1  -3  -1  -1  -4
+        N  -2  -3   1   0  -3   0   1  -3   0  -3  -2   6  -2   0   0   1   0  -3  -4  -2   3   0  -1  -4
+        P  -1  -3  -1  -1  -4  -2  -2  -3  -1  -3  -2  -2   7  -1  -2  -1  -1  -2  -4  -3  -2  -1  -2  -4
+        Q  -1  -3   0   2  -3  -2   0  -3   1  -2   0   0  -1   5   1   0  -1  -2  -2  -1   0   3  -1  -4
+        R  -1  -3  -2   0  -3  -2   0  -3   2  -2  -1   0  -2   1   5  -1  -1  -3  -3  -2  -1   0  -1  -4
+        S   1  -1   0   0  -2   0  -1  -2   0  -2  -1   1  -1   0  -1   4   1  -2  -3  -2   0   0   0  -4
+        T   0  -1  -1  -1  -2  -2  -2  -1  -1  -1  -1   0  -1  -1  -1   1   5   0  -2  -2  -1  -1   0  -4
+        V   0  -1  -3  -2  -1  -3  -3   3  -2   1   1  -3  -2  -2  -3  -2   0   4  -3  -1  -3  -2  -1  -4
+        W  -3  -2  -4  -3   1  -2  -2  -3  -3  -2  -1  -4  -4  -2  -3  -3  -2  -3  11   2  -4  -3  -2  -4
+        Y  -2  -2  -3  -2   3  -3   2  -1  -2  -1  -1  -2  -3  -1  -2  -2  -2  -1   2   7  -3  -2  -1  -4
+        B  -2  -3   4   1  -3  -1   0  -3   0  -4  -3   3  -2   0  -1   0  -1  -3  -4  -3   4   1  -1  -4
+        Z  -1  -3   1   4  -3  -2   0  -3   1  -3  -1   0  -1   3   0   0  -1  -2  -3  -2   1   4  -1  -4
+        X   0  -2  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -2  -1  -1   0   0  -1  -2  -1  -1  -1  -1  -4
+        *  -4  -4  -4  -4  -4  -4  -4  -4  -4  -4  -4  -4  -4  -4  -4  -4  -4  -4  -4  -4  -4  -4  -4   1
+        >>> alignment = align_optimal(seq1, seq2, matrix, gap_penalty=-10)[0]
+        >>> print(alignment)
+        BIQTITE
+        -IQLITE
+
+        Running the alignment with positional equivalents gives the same result:
+
+        >>> pos_matrix, pos_seq1, pos_seq2 = matrix.as_positional(seq1, seq2)
+        >>> print(pos_matrix)
+            I   Q   L   I   T   E
+        B  -3   0  -4  -3  -1   1
+        I   4  -3   2   4  -1  -3
+        Q  -3   5  -2  -3  -1   2
+        T  -1  -1  -1  -1   5  -1
+        I   4  -3   2   4  -1  -3
+        T  -1  -1  -1  -1   5  -1
+        E  -3   2  -3  -3  -1   5
+        >>> pos_alignment = align_optimal(pos_seq1, pos_seq2, pos_matrix, gap_penalty=-10)[0]
+        >>> print(pos_alignment)
+        BIQTITE
+        -IQLITE
+
+        Increase the substitution score for the first symbols in both sequences to align
+        to each other:
+
+        >>> score_matrix = pos_matrix.score_matrix().copy()
+        >>> score_matrix[0, 0] = 100
+        >>> biased_matrix = SubstitutionMatrix(
+        ...     pos_matrix.get_alphabet1(), pos_matrix.get_alphabet2(), score_matrix
+        ... )
+        >>> print(biased_matrix)
+            I   Q   L   I   T   E
+        B 100   0  -4  -3  -1   1
+        I   4  -3   2   4  -1  -3
+        Q  -3   5  -2  -3  -1   2
+        T  -1  -1  -1  -1   5  -1
+        I   4  -3   2   4  -1  -3
+        T  -1  -1  -1  -1   5  -1
+        E  -3   2  -3  -3  -1   5
+        >>> biased_alignment = align_optimal(pos_seq1, pos_seq2, biased_matrix, gap_penalty=-10)[0]
+        >>> print(biased_alignment)
+        BIQTITE
+        I-QLITE
         """
-        return (len(self._alph1), len(self._alph2))
+        pos_sequence1 = PositionalSequence(sequence1)
+        pos_sequence2 = PositionalSequence(sequence2)
+
+        pos_score_matrix = self._matrix[
+            tuple(_cartesian_product(sequence1.code, sequence2.code).T)
+        ].reshape(len(sequence1), len(sequence2))
+        pos_matrix = SubstitutionMatrix(
+            pos_sequence1.get_alphabet(),
+            pos_sequence2.get_alphabet(),
+            pos_score_matrix,
+        )
+
+        return pos_matrix, pos_sequence1, pos_sequence2
+
+    def __repr__(self):
+        """Represent SubstitutionMatrix as a string for debugging."""
+        return (
+            f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, "
+            f"np.{np.array_repr(self._matrix)})"
+        )
+
+    def __eq__(self, item):
+        if not isinstance(item, SubstitutionMatrix):
+            return False
+        if self._alph1 != item.get_alphabet1():
+            return False
+        if self._alph2 != item.get_alphabet2():
+            return False
+        if not np.array_equal(self.score_matrix(), item.score_matrix()):
+            return False
+        return True
+
+    def __ne__(self, item):
+        return not self == item
 
     def __str__(self):
         # Create matrix in NCBI format
         string = " "
         for symbol in self._alph2:
-            string += f" {symbol:>3}"
+            string += f" {str(symbol):>3}"
         string += "\n"
         for i, symbol in enumerate(self._alph1):
-            string += f"{symbol:>1}"
+            string += f"{str(symbol):>1}"
             for j in range(len(self._alph2)):
                 string += f" {int(self._matrix[i,j]):>3d}"
             string += "\n"
@@ -394,6 +516,14 @@ def std_nucleotide_matrix():
         """
         return _matrix_nuc
 
+    def _fill_with_matrix_dict(self, matrix_dict):
+        self._matrix = np.zeros((len(self._alph1), len(self._alph2)), dtype=np.int32)
+        for i in range(len(self._alph1)):
+            for j in range(len(self._alph2)):
+                sym1 = self._alph1.decode(i)
+                sym2 = self._alph2.decode(j)
+                self._matrix[i, j] = int(matrix_dict[sym1, sym2])
+
 
 # Preformatted BLOSUM62 and NUC substitution matrix from NCBI
 _matrix_blosum62 = SubstitutionMatrix(
@@ -402,3 +532,15 @@ def std_nucleotide_matrix():
 _matrix_nuc = SubstitutionMatrix(
     NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC"
 )
+
+
+def _cartesian_product(array1, array2):
+    """
+    Create all combinations of elements from two arrays.
+    """
+    return np.transpose(
+        [
+            np.repeat(array1, len(array2)),
+            np.tile(array2, len(array1)),
+        ]
+    )
diff --git a/src/biotite/sequence/seqtypes.py b/src/biotite/sequence/seqtypes.py
index e09527c35..0234776a1 100644
--- a/src/biotite/sequence/seqtypes.py
+++ b/src/biotite/sequence/seqtypes.py
@@ -4,10 +4,22 @@
 
 __name__ = "biotite.sequence"
 __author__ = "Patrick Kunzmann", "Thomas Nevolianis"
-__all__ = ["GeneralSequence", "NucleotideSequence", "ProteinSequence"]
-
+__all__ = [
+    "GeneralSequence",
+    "NucleotideSequence",
+    "ProteinSequence",
+    "PositionalSequence",
+    "PurePositionalSequence",
+]
+
+from dataclasses import dataclass, field
 import numpy as np
-from biotite.sequence.alphabet import AlphabetError, AlphabetMapper, LetterAlphabet
+from biotite.sequence.alphabet import (
+    Alphabet,
+    AlphabetError,
+    AlphabetMapper,
+    LetterAlphabet,
+)
 from biotite.sequence.sequence import Sequence
 
 
@@ -590,3 +602,112 @@ def get_molecular_weight(self, monoisotopic=False):
                 "Sequence contains ambiguous amino acids, " "cannot calculate weight"
             )
         return weight
+
+
+class PositionalSequence(Sequence):
+    """
+    A sequence where each symbol is associated with a position.
+
+    For each individual position the sequence contains a separate
+    :class:`PositionalSequence.Symbol`, encoded by a custom alphabet for this sequence.
+    In consequence the symbol code is the position in the sequence itself.
+    This is useful for aligning sequences based on a position-specific
+    substitution matrix.
+
+    Parameters
+    ----------
+    original_sequence : seq.Sequence
+        The original sequence to create the positional sequence from.
+    """
+
+    @dataclass(frozen=True)
+    class Symbol:
+        """
+        Combination of a symbol and its position in a sequence.
+
+        Attributes
+        ----------
+        original_alphabet : Alphabet
+            The original alphabet, where the symbol stems from.
+        original_code : int
+            The code of the original symbol in the original alphabet.
+        position : int
+            The 0-based position of the symbol in the sequence.
+        symbol : object
+            The symbol from the original alphabet.
+
+        See Also
+        --------
+        PositionalSequence
+            The sequence type containing :class:`PositionalSymbol` objects.
+        """
+
+        original_alphabet: ...
+        original_code: ...
+        position: ...
+        symbol: ... = field(init=False)
+
+        def __post_init__(self):
+            sym = self.original_alphabet.decode(self.original_code)
+            super().__setattr__("symbol", sym)
+
+        def __str__(self):
+            return str(self.symbol)
+
+    def __init__(self, original_sequence):
+        self._orig_alphabet = original_sequence.get_alphabet()
+        self._alphabet = Alphabet(
+            [
+                PositionalSequence.Symbol(self._orig_alphabet, code, pos)
+                for pos, code in enumerate(original_sequence.code)
+            ]
+        )
+        self.code = np.arange(
+            len(original_sequence), dtype=Sequence.dtype(len(self._alphabet))
+        )
+
+    def reconstruct(self):
+        """
+        Reconstruct the original sequence from the positional sequence.
+
+        Returns
+        -------
+        original_sequence : GeneralSequence
+            The original sequence.
+            Although the actual type of the returned sequence is always a
+            :class:`GeneralSequence`, the alphabet and the symbols of the returned
+            sequence are equal to the original sequence.
+        """
+        original_sequence = GeneralSequence(self._orig_alphabet)
+        original_sequence.code = np.array([sym.original_code for sym in self._alphabet])
+        return original_sequence
+
+    def get_alphabet(self):
+        return self._alphabet
+
+    def __str__(self) -> str:
+        return "".join([str(sym) for sym in self.symbols])
+
+    def __repr__(self):
+        return f"PositionalSequence({self.reconstruct()!r})"
+
+
+class PurePositionalSequence(Sequence):
+    """
+    An object of this class is a 'placeholder' sequence, where each symbol is the
+    position in the sequence itself.
+
+    This class is similar to :class:`PositionalSequence`, but the symbols are not
+    derived from an original sequence, but are the pure position.
+    Hence, there is no meaningful string representation of the sequence and its symbols.
+    """
+
+    def __init__(self, length):
+        self._alphabet = Alphabet(range(length))
+        self.code = np.arange(length, dtype=Sequence.dtype(length))
+
+    def get_alphabet(self):
+        return self._alphabet
+
+    def __repr__(self):
+        return f"PurePositionalSequence({len(self)})"
diff --git a/tests/sequence/align/test_matrix.py b/tests/sequence/align/test_matrix.py
index 570878945..a9c33d829 100644
--- a/tests/sequence/align/test_matrix.py
+++ b/tests/sequence/align/test_matrix.py
@@ -40,3 +40,32 @@ def test_matrix_str():
          "b   3   4   5",
          "c   6   7   8"]
     )  # fmt: skip
+
+
+@pytest.mark.parametrize("seed", range(10))
+def test_as_positional(seed):
+    """
+    Check if the score from a positional substitution matrix for symbols from
+    positional sequences is equal to the score from the original matrix for the
+    corresponding symbols from the original sequences.
+    """
+    MAX_SEQ_LENGTH = 10
+
+    np.random.seed(seed)
+    matrix = align.SubstitutionMatrix.std_protein_matrix()
+    sequences = []
+    for _ in range(2):
+        seq_length = np.random.randint(1, MAX_SEQ_LENGTH)
+        sequence = seq.ProteinSequence()
+        sequence.code = np.random.randint(0, len(sequence.alphabet), size=seq_length)
+        sequences.append(sequence)
+    pos_matrix, *pos_sequences = matrix.as_positional(*sequences)
+
+    # Sanity check if the positional sequences do correspond to the original sequences
+    for i in range(2):
+        assert str(pos_sequences[i]) == str(sequences[i])
+    for i in range(len(sequences[0])):
+        for j in range(len(sequences[1])):
+            ref_score = matrix.get_score(sequences[0][i], sequences[1][j])
+            test_score = pos_matrix.get_score(pos_sequences[0][i], pos_sequences[1][j])
+            assert test_score == ref_score
diff --git a/tests/sequence/test_seqtypes.py b/tests/sequence/test_seqtypes.py
index 086f972a9..a58e25c3d 100644
--- a/tests/sequence/test_seqtypes.py
+++ b/tests/sequence/test_seqtypes.py
@@ -2,6 +2,7 @@
 # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
 # information.
 
+import numpy as np
 import pytest
 import biotite.sequence as seq
 
@@ -90,3 +91,23 @@ def test_get_molecular_weight(monoisotopic, expected_mol_weight_protein):
     protein = seq.ProteinSequence("ACDEFGHIKLMNPQRSTVW")
     mol_weight_protein = protein.get_molecular_weight(monoisotopic=monoisotopic)
     assert mol_weight_protein == pytest.approx(expected_mol_weight_protein, abs=1e-2)
+
+
+def test_positional_sequence():
+    """
+    Check whether the original sequence symbols can be reconstructed from a
+    :class:`PositionalSequence`.
+    """
+    SEQ_LENGTH = 100
+
+    np.random.seed(0)
+    orig_sequence = seq.ProteinSequence()
+    orig_sequence.code = np.random.randint(
+        0, len(orig_sequence.alphabet), size=SEQ_LENGTH
+    )
+    positional_sequence = seq.PositionalSequence(orig_sequence)
+
+    assert str(positional_sequence) == str(orig_sequence)
+    reconstructed_sequence = positional_sequence.reconstruct()
+    assert np.all(reconstructed_sequence.code == orig_sequence.code)
+    assert reconstructed_sequence.alphabet == orig_sequence.alphabet
diff --git a/tests/test_repr.py b/tests/test_repr.py
index f8bf319c4..df3afaed4 100644
--- a/tests/test_repr.py
+++ b/tests/test_repr.py
@@ -15,7 +15,9 @@
     LetterAlphabet,
     Location,
     NucleotideSequence,
+    PositionalSequence,
     ProteinSequence,
+    PurePositionalSequence,
     SequenceProfile,
 )
 from biotite.sequence.align import Alignment, SubstitutionMatrix
@@ -30,6 +32,8 @@
         NucleotideSequence("AACTGCTA"),
         NucleotideSequence("AACTGCTA", ambiguous=True),
         ProteinSequence("BIQTITE"),
+        PositionalSequence(NucleotideSequence("AACTGCTA")),
+        PurePositionalSequence(123),
         Alphabet(["X", "Y", "Z"]),
         GeneralSequence(Alphabet(["X", 42, False]), ["X", 42, "X"]),
         LetterAlphabet(["X", "Y", "Z"]),

From 72ea1480c5d5ee9f4b1aeca99209d179b00a4bcf Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Sun, 8 Sep 2024 14:11:31 +0200
Subject: [PATCH 2/9] Implement more efficient implementations in subclass

---
 src/biotite/sequence/align/kmeralphabet.pyx | 17 +++++++++++++++++
 src/biotite/sequence/alphabet.py            |  3 +++
 2 files changed, 20 insertions(+)

diff --git a/src/biotite/sequence/align/kmeralphabet.pyx b/src/biotite/sequence/align/kmeralphabet.pyx
index 02a1bbcda..910c39360 100644
--- a/src/biotite/sequence/align/kmeralphabet.pyx
+++ b/src/biotite/sequence/align/kmeralphabet.pyx
@@ -568,6 +568,23 @@ class KmerAlphabet(Alphabet):
         return int(len(self._base_alph) ** self._k)
 
 
+    def __iter__(self):
+        # Creating all symbols is expensive
+        # -> Use a generator instead
+        if isinstance(self._base_alph, LetterAlphabet):
+            return ("".join(self.decode(code)) for code in range(len(self)))
+        else:
+            return (list(self.decode(code)) for code in range(len(self)))
+
+
+    def __contains__(self, symbol):
+        try:
+            self.fuse(self._base_alph.encode_multiple(symbol))
+            return True
+        except AlphabetError:
+            return False
+
+
 def _to_array_form(model_string):
     """
     Convert the the common string representation of a *k-mer* spacing
diff --git a/src/biotite/sequence/alphabet.py b/src/biotite/sequence/alphabet.py
index c08022cec..0b1f92a75 100644
--- a/src/biotite/sequence/alphabet.py
+++ b/src/biotite/sequence/alphabet.py
@@ -410,6 +410,9 @@ def decode_multiple(self, code, as_bytes=False):
             symbols = symbols.astype("U1")
         return symbols
 
+    def is_letter_alphabet(self):
+        return True
+
     def __contains__(self, symbol):
         if not isinstance(symbol, (str, bytes)):
             return False

From ed4be761dec3d943928a66abb2e7de433ed6c6c1 Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Sun, 8 Sep 2024 16:13:05 +0200
Subject: [PATCH 3/9] Add string representation for `SequenceProfile`

---
 src/biotite/sequence/profile.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/biotite/sequence/profile.py b/src/biotite/sequence/profile.py
index d208b2b3f..f44c5d6af 100644
--- a/src/biotite/sequence/profile.py
+++ b/src/biotite/sequence/profile.py
@@ -156,8 +156,23 @@ def gaps(self, new_gaps):
             )
         self._gaps = new_gaps
 
+    def __str__(self):
+        # Add an additional row and column for the position and symbol indicators
+        print_matrix = np.full(
+            (self.symbols.shape[0] + 1, self.symbols.shape[1] + 1), "", dtype=object
+        )
+        print_matrix[1:, 1:] = self.symbols.astype(str)
+        print_matrix[0, 1:] = [str(sym) for sym in self.alphabet]
+        print_matrix[1:, 0] = [str(i) for i in range(self.symbols.shape[0])]
+        max_len = len(max(print_matrix.flatten(), key=len))
+        return "\n".join(
+            [
+                " ".join([str(cell).rjust(max_len) for cell in row])
+                for row in print_matrix
+            ]
+        )
+
     def __repr__(self):
-        """Represent SequenceProfile as a string for debugging."""
         return (
             f"SequenceProfile(np.{np.array_repr(self.symbols)}, "
             f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))"

From 7ab6728c62df3a168e8292b00057d46ff5adf21d Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Sun, 8 Sep 2024 16:13:15 +0200
Subject: [PATCH 4/9] Add sequence profile tutorial

---
 doc/tutorial/sequence/index.rst    |   4 +-
 doc/tutorial/sequence/profiles.rst | 157 +++++++++++++++++++++++++++++
 2 files changed, 159 insertions(+), 2 deletions(-)
 create mode 100644 doc/tutorial/sequence/profiles.rst

diff --git a/doc/tutorial/sequence/index.rst b/doc/tutorial/sequence/index.rst
index ab1a96b83..c879c3e04 100644
--- a/doc/tutorial/sequence/index.rst
+++ b/doc/tutorial/sequence/index.rst
@@ -43,5 +43,5 @@ For an unambiguous DNA sequence, the alphabet comprises the four nucleobases.
     align_optimal
     align_heuristic
     align_multiple
-    annotations
-    profiles
\ No newline at end of file
+    profiles
+    annotations
\ No newline at end of file
diff --git a/doc/tutorial/sequence/profiles.rst b/doc/tutorial/sequence/profiles.rst
new file mode 100644
index 000000000..d82dc4a2f
--- /dev/null
+++ b/doc/tutorial/sequence/profiles.rst
@@ -0,0 +1,157 @@
+.. include:: /tutorial/preamble.rst
+
+Profiles and position-specific matrices
+=======================================
+Often sequences are not viewed in isolation:
+For example, if you investigate a protein family, you do not handle a single sequence,
+but an arbitrarily large collection of highly similar sequences.
+At some point handling those sequences as a mere collection of individual sequences
+becomes impractical for answering common questions, like which are the prevalent
+amino acids at a certain positions of the sequences.
+
+Sequence profiles
+-----------------
+
+.. currentmodule:: biotite.sequence
+
+This is where *sequence profiles* come into play:
+The condense the a collection of aligned sequences into a matrix that tracks the
+frequency of each symbol at each position of the alignment.
+Hence, asking the questions such as '*How frequent is an alanine at the n-th position?*'
+becomes a trivial indexing operation.
+
+As example for profiles, we will reuse the cyclotide sequence family from the
+:doc:`previous chapter <align_multiple>`.
+
+.. jupyter-execute::
+
+    from tempfile import NamedTemporaryFile
+    import biotite.sequence.io.fasta as fasta
+    import biotite.database.entrez as entrez
+
+    query = (
+        entrez.SimpleQuery("Cyclotide") &
+        entrez.SimpleQuery("cter") &
+        entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^
+        entrez.SimpleQuery("Precursor")
+    )
+    uids = entrez.search(query, "protein")
+    temp_file = NamedTemporaryFile(suffix=".fasta", delete=False)
+    fasta_file = fasta.FastaFile.read(
+        entrez.fetch_single_file(uids, temp_file.name, "protein", "fasta")
+    )
+    sequences = {
+        # The cyclotide variant is the last character in the header
+        header[-1]: seq for header, seq in fasta.get_sequences(fasta_file).items()
+    }
+    # Extract cyclotide N as query sequence for later
+    query = sequences.pop("N")
+
+To create a profile, we first need to align the sequences, so corresponding symbols
+appear the same position.
+Then we can create a :class:`SequenceProfile` object from the :class:`.Alignment`, which
+simply counts for each alignment column (i.e. the sequence position) the number of
+occurrences for each symbol.
+
+.. jupyter-execute::
+
+    import biotite.sequence as seq
+    import biotite.sequence.align as align
+
+    alignment, _, _, _ = seq.align.align_multiple(
+        list(sequences.values()),
+        align.SubstitutionMatrix.std_protein_matrix(),
+        gap_penalty=-5,
+    )
+    profile = seq.SequenceProfile.from_alignment(alignment)
+    print(profile)
+
+Each row in the displayed count matrix
+(accessible via :attr:`SequenceProfile.symbols`) refers to a single position, i.e. a
+column in the input MSA, and each column refers to a symbol in the underlying alphabet
+(accessible via :attr:`SequenceProfile.alphabet`).
+For completeness it should be noted that :attr:`SequenceProfile.gaps` also tracks the
+gaps for each position in the alignment, but we will not further use them in this
+tutorial.
+
+Note that the information about the individual sequences is lost in the condensation
+process: There is no way to reconstruct the original sequences from the profile.
+However, we can still extract a consensus sequence from the profile, which is a
+sequence that represents the most frequent symbol at each position.
+
+.. jupyter-execute::
+
+    print(profile.to_consensus())
+
+Profile visualization as sequence logo
+--------------------------------------
+
+.. currentmodule:: biotite.sequence.align
+
+A common way to visualize a sequence profile is a sequence logo.
+It depicts each profile position as a stack of letters:
+The degree of conversation (more precisely the
+`Shannon entropy <https://en.wikipedia.org/wiki/Entropy_(information_theory)>`_)
+is the height of a stack and each letter's height in the stack is proportional to its
+frequency at the respective position.
+
+.. jupyter-execute::
+
+    import matplotlib.pyplot as plt
+    from biotite.sequence.graphics import plot_sequence_logo
+
+    fig, ax = plt.subplots(figsize=(8.0, 2.0), constrained_layout=True)
+    plot_sequence_logo(ax, profile)
+    ax.set_xlabel("Residue position")
+    ax.set_ylabel("Bits")
+
+Position-specific scoring matrices
+----------------------------------
+
+.. currentmodule:: biotite.sequence.align
+
+Sequence profiles can achieve even more:
+The substitution matrices we have seen so far assign a score to a pair of two symbols,
+regardless of their position in a sequence.
+However, as discussed above, the position is crucial information to determine how
+conserved a certain symbol is in a family of sequences.
+
+Hence, we extend the concept of substitution matrices to *position-specific* matrices,
+which assign a score to a symbol and a position (instead of another symbols).
+
+.. jupyter-execute::
+
+    # For a real analysis each amino acid background frequencies should be provided
+    log_odds = profile.log_odds_matrix(pseudocount=1)
+
+Now, we encounter a problem:
+To create a :class:`SubstitutionMatrix` object from the log-odds matrix, we require two
+:class:`.Alphabet` objects:
+One is taken from the query sequence to be aligned to the profile, but which alphabet
+do we use for the positional axis of the matrix?
+Likewise, the alignment functions (e.g. :func:`align_optimal()`) require a two sequences
+to be aligned, but we only have one query sequence.
+
+To solve this problem :mod:`biotite.sequence` provides the :class:`.PositionalSequence`
+which acts as a placeholder for the second sequence in the alignment.
+Its alphabet contains a unique symbol for each position, i.e. the alphabet has the
+sought length.
+
+.. jupyter-execute::
+
+    positional_seq = seq.PositionalSequence(profile.to_consensus())
+    matrix = align.SubstitutionMatrix(
+        positional_seq.alphabet,
+        profile.alphabet,
+        log_odds
+    )
+    alignment = align.align_optimal(
+        positional_seq, query, matrix, gap_penalty=-5, max_number=1
+    )[0]
+    print(alignment)
+
+Only the length of the input sequence passed to :class:`.PositionalSequence` matters for
+the alignment to work.
+The consensus sequence of the profile was merely chosen for cosmetic reasons, i.e.
+to have a meaningful string representation of the positional sequence and thus the
+alignment.

From 7ddb3e4e2eafea71ca585bb519cfd619c55770b9 Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Sun, 3 Nov 2024 18:40:19 +0100
Subject: [PATCH 5/9] Check for empty response

---
 src/biotite/database/uniprot/check.py    | 35 ++++++++++++++----------
 src/biotite/database/uniprot/download.py |  2 +-
 src/biotite/database/uniprot/query.py    |  2 +-
 3 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/src/biotite/database/uniprot/check.py b/src/biotite/database/uniprot/check.py
index a1782e1ba..3bbfabc6d 100644
--- a/src/biotite/database/uniprot/check.py
+++ b/src/biotite/database/uniprot/check.py
@@ -10,7 +10,7 @@
 
 
 # Taken from https://www.uniprot.org/help/api_retrieve_entries
-def assert_valid_response(response_status_code):
+def assert_valid_response(response):
     """
     Checks whether the response is valid.
 
@@ -19,17 +19,22 @@ def assert_valid_response(response_status_code):
     response_status_code: int
         Status code of request.get.
     """
-    if response_status_code == 400:
-        raise RequestError("Bad request. There is a problem with your input.")
-    elif response_status_code == 404:
-        raise RequestError("Not found. The resource you requested doesn't exist.")
-    elif response_status_code == 410:
-        raise RequestError("Gone. The resource you requested was removed.")
-    elif response_status_code == 500:
-        raise RequestError(
-            "Internal server error. Most likely a temporary problem, but if the problem persists please contact UniProt team."
-        )
-    elif response_status_code == 503:
-        raise RequestError(
-            "Service not available. The server is being updated, try again later."
-        )
+    if len(response.content) == 0:
+        raise RequestError("No content returned")
+    match response.status_code:
+        case 400:
+            raise RequestError("Bad request. There is a problem with your input.")
+        case 404:
+            raise RequestError("Not found. The resource you requested doesn't exist.")
+        case 410:
+            raise RequestError("Gone. The resource you requested was removed.")
+        case 500:
+            raise RequestError(
+                "Internal server error. "
+                "Most likely a temporary problem, "
+                "but if the problem persists please contact UniProt team."
+            )
+        case 503:
+            raise RequestError(
+                "Service not available. The server is being updated, try again later."
+            )
diff --git a/src/biotite/database/uniprot/download.py b/src/biotite/database/uniprot/download.py
index bacb40e96..8f27c916b 100644
--- a/src/biotite/database/uniprot/download.py
+++ b/src/biotite/database/uniprot/download.py
@@ -111,7 +111,7 @@ def fetch(ids, format, target_path=None, overwrite=False, verbose=False):
             if format in ["fasta", "gff", "txt", "xml", "rdf", "tab"]:
                 r = requests.get(_fetch_url + db_name + "/" + id + "." + format)
                 content = r.text
-                assert_valid_response(r.status_code)
+                assert_valid_response(r)
             else:
                 raise ValueError(f"Format '{format}' is not supported")
             if file is None:
diff --git a/src/biotite/database/uniprot/query.py b/src/biotite/database/uniprot/query.py
index 687c61f5f..1394dc4c1 100644
--- a/src/biotite/database/uniprot/query.py
+++ b/src/biotite/database/uniprot/query.py
@@ -289,5 +289,5 @@ def search(query, number=500):
     params = {"query": str(query), "format": "list", "size": str(number)}
     r = requests.get(_base_url, params=params)
     content = r.text
-    assert_valid_response(r.status_code)
+    assert_valid_response(r)
     return content.split("\n")[:-1]

From 32fadcbcf0249e2bbc047c2bae7b76e455d9d54e Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Sun, 3 Nov 2024 18:40:37 +0100
Subject: [PATCH 6/9] fixup! Add sequence profile tutorial

---
 doc/tutorial/sequence/profiles.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/tutorial/sequence/profiles.rst b/doc/tutorial/sequence/profiles.rst
index d82dc4a2f..e5b43d5a5 100644
--- a/doc/tutorial/sequence/profiles.rst
+++ b/doc/tutorial/sequence/profiles.rst
@@ -143,7 +143,9 @@ sought length.
     matrix = align.SubstitutionMatrix(
         positional_seq.alphabet,
         profile.alphabet,
-        log_odds
+        # Substitution matrices require integer scores
+        # Multiply by 10 to increase value range and convert to integer
+        (log_odds * 10).astype(int)
     )
     alignment = align.align_optimal(
         positional_seq, query, matrix, gap_penalty=-5, max_number=1

From 782219fa15c0fcb00b23b8bef5d2b43258ba052f Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Sun, 3 Nov 2024 18:41:13 +0100
Subject: [PATCH 7/9] Check for invalid input score matrix

---
 src/biotite/sequence/align/matrix.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/biotite/sequence/align/matrix.py b/src/biotite/sequence/align/matrix.py
index 06308115d..cab851672 100644
--- a/src/biotite/sequence/align/matrix.py
+++ b/src/biotite/sequence/align/matrix.py
@@ -148,7 +148,21 @@ def __init__(self, alphabet1, alphabet2, score_matrix):
                     f"Matrix has shape {score_matrix.shape}, "
                     f"but {alph_shape} is required"
                 )
+            if not np.issubdtype(score_matrix.dtype, np.integer):
+                raise TypeError("Score matrix must be an integer ndarray")
             self._matrix = score_matrix.astype(np.int32)
+            # If the score matrix was converted from a a float matrix,
+            # inf values would be converted to 2**31,
+            # which is probably undesired and gives overflow issues in the alignment
+            # functions
+            if (
+                np.any(self._matrix == np.iinfo(np.int32).max) or
+                np.any(self._matrix == np.iinfo(np.int32).min)
+            ):  # fmt: skip
+                raise ValueError(
+                    "Score values are too large. "
+                    "Maybe it was converted from a float matrix containing inf values?"
+                )
         elif isinstance(score_matrix, str):
             matrix_dict = SubstitutionMatrix.dict_from_db(score_matrix)
             self._fill_with_matrix_dict(matrix_dict)

From 5c47b54584479e0567ef020618c481d0dcdfafc4 Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Mon, 4 Nov 2024 11:23:14 +0100
Subject: [PATCH 8/9] Add example to gallery

---
 .../scripts/sequence/profile/scop_profile.py  | 171 ++++++++++++++++++
 1 file changed, 171 insertions(+)
 create mode 100644 doc/examples/scripts/sequence/profile/scop_profile.py

diff --git a/doc/examples/scripts/sequence/profile/scop_profile.py b/doc/examples/scripts/sequence/profile/scop_profile.py
new file mode 100644
index 000000000..bf761732e
--- /dev/null
+++ b/doc/examples/scripts/sequence/profile/scop_profile.py
@@ -0,0 +1,171 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import requests
+import biotite.application.clustalo as clustalo
+import biotite.database.uniprot as uniprot
+import biotite.sequence as seq
+import biotite.sequence.align as align
+import biotite.sequence.graphics as graphics
+import biotite.sequence.io.fasta as fasta
+from biotite.database import RequestError
+
+TARGET_UNIPROT_ID = "P10145"
+TARGET_SCOP_FAMILY_ID = "4000912"
+QUERY_UNIPROT_ID = "P03212"
+
+#TARGET_UNIPROT_ID = "P09467"
+#TARGET_SCOP_FAMILY_ID = "4002766"
+#QUERY_UNIPROT_ID = "P73922"
+
+
+target_sequence = fasta.get_sequence(
+    fasta.FastaFile.read(uniprot.fetch(TARGET_UNIPROT_ID, "fasta", "."))
+)
+query_sequence = fasta.get_sequence(
+    fasta.FastaFile.read(uniprot.fetch(QUERY_UNIPROT_ID, "fasta", "."))
+)
+
+aa_matrix = align.SubstitutionMatrix.std_protein_matrix()
+alignment = align.align_optimal(
+    target_sequence,
+    query_sequence,
+    aa_matrix,
+    gap_penalty=(-10, -1),
+    local=True,
+    max_number=1,
+)[0]
+
+fig, ax = plt.subplots(figsize=(8.0, 1.0))
+graphics.plot_alignment_similarity_based(
+    ax, alignment, matrix=aa_matrix, labels=[TARGET_UNIPROT_ID, QUERY_UNIPROT_ID]
+)
+
+
+########################################################################################
+# https://www.ebi.ac.uk/pdbe/scop/download
+
+
+def get_sequence_family(scop_id):
+    scop_domains = requests.get(
+        f"https://www.ebi.ac.uk/pdbe/scop/api/domains/{scop_id}"
+    ).json()
+    sequences = {}
+    for scop_domain in scop_domains["domains"]:
+        uniprot_id = scop_domain["uniprot_id"]
+        try:
+            sequence = fasta.get_sequence(
+                fasta.FastaFile.read(uniprot.fetch(uniprot_id, "fasta", "."))
+            )
+        except RequestError:
+            # The UniProt ID is not in the database -> skip this domain
+            continue
+        for start, end in scop_domain["protein_regions"]:
+            region = sequence[start : end + 1]
+            sequences[uniprot_id] = region
+            # Most domains have only one region within the sequence
+            # For simplicity we only consider the first region
+            break
+    return sequences
+
+
+sequences = get_sequence_family(TARGET_SCOP_FAMILY_ID)
+app = clustalo.ClustalOmegaApp(list(sequences.values()))
+app.start()
+app.join()
+msa = app.get_alignment()
+order = app.get_alignment_order()
+labels = np.array(list(sequences.keys()))
+
+fig, ax = plt.subplots(figsize=(8.0, 24.0))
+graphics.plot_alignment_type_based(
+    ax, msa[:, order], labels=labels[order], show_numbers=True
+)
+
+########################################################################################
+# :footcite:`Robinson1991`
+
+# Must have the same order as `ProteinSequence.alphabet`
+AA_FREQUENCY = {
+    "A": 35155,
+    "C": 8669,
+    "D": 24161,
+    "E": 28354,
+    "F": 17367,
+    "G": 33229,
+    "H": 9906,
+    "I": 23161,
+    "K": 25872,
+    "L": 40625,
+    "M": 10101,
+    "N": 20212,
+    "P": 23435,
+    "Q": 19208,
+    "R": 23105,
+    "S": 32070,
+    "T": 26311,
+    "V": 29012,
+    "W": 5990,
+    "Y": 14488,
+    # Set ambiguous amino acid count to 1 to avoid division by zero in log odds matrix
+    "B": 1,
+    "Z": 1,
+    "X": 1,
+    "*": 1,
+}
+
+
+profile = seq.SequenceProfile.from_alignment(msa)
+background_frequencies = np.array(list(AA_FREQUENCY.values()))
+# Normalize background frequencies
+background_frequencies = background_frequencies / background_frequencies.sum()
+
+score_matrix = profile.log_odds_matrix(background_frequencies, pseudocount=1)
+score_matrix *= 10
+score_matrix = score_matrix.astype(np.int32)
+
+profile_seq = seq.PositionalSequence(profile.to_consensus())
+positional_matrix = align.SubstitutionMatrix(
+    profile_seq.alphabet, seq.ProteinSequence.alphabet, score_matrix=score_matrix
+)
+profile_alignment = align.align_optimal(
+    profile_seq,
+    query_sequence,
+    positional_matrix,
+    gap_penalty=(-10, -1),
+    local=False,
+    terminal_penalty=False,
+    max_number=1,
+)[0]
+print(profile_alignment)
+
+
+########################################################################################
+# Map the profile alignment to the original target sequence
+# Chimeric alignment from target in the MSA and the query in the profile alignment
+sequence_pos_in_msa = np.where(labels == TARGET_UNIPROT_ID)[0][0]
+target_trace = msa.trace[:, sequence_pos_in_msa]
+# Remove parts of query aligned to gaps in in the profile
+gap_mask = profile_alignment.trace[:, 0] != -1
+query_trace = profile_alignment.trace[gap_mask, 1]
+print(msa.trace.shape)
+print(profile_alignment.trace.shape)
+print(profile.symbols.shape)
+print()
+print(query_trace.shape)
+print(target_trace.shape)
+target_alignment = align.Alignment(
+    [target_sequence, query_sequence],
+    np.stack([target_trace, query_trace], axis=-1),
+)
+
+fig, ax = plt.subplots(figsize=(8.0, 4.0))
+graphics.plot_alignment_similarity_based(
+    ax, target_alignment, matrix=aa_matrix, labels=[TARGET_UNIPROT_ID, QUERY_UNIPROT_ID]
+)
+
+
+plt.show()
+
+########################################################################################
+#
+# .. footbibliography::

From b306942447976b39217c3d10ebcad95dd835d998 Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Wed, 13 Nov 2024 10:43:23 +0100
Subject: [PATCH 9/9] fixup! Add example to gallery

---
 .../scripts/sequence/profile/scop_profile.py  | 93 +++++++++++--------
 1 file changed, 54 insertions(+), 39 deletions(-)

diff --git a/doc/examples/scripts/sequence/profile/scop_profile.py b/doc/examples/scripts/sequence/profile/scop_profile.py
index bf761732e..938e22bd9 100644
--- a/doc/examples/scripts/sequence/profile/scop_profile.py
+++ b/doc/examples/scripts/sequence/profile/scop_profile.py
@@ -1,7 +1,11 @@
+"""
+Note that in order to get a representative sequence profile, it is crucial to have a
+high number of sequences the target sequence can be aligned to.
+"""
+
 import matplotlib.pyplot as plt
 import numpy as np
 import requests
-import biotite.application.clustalo as clustalo
 import biotite.database.uniprot as uniprot
 import biotite.sequence as seq
 import biotite.sequence.align as align
@@ -9,13 +13,13 @@
 import biotite.sequence.io.fasta as fasta
 from biotite.database import RequestError
 
-TARGET_UNIPROT_ID = "P10145"
-TARGET_SCOP_FAMILY_ID = "4000912"
-QUERY_UNIPROT_ID = "P03212"
+TARGET_UNIPROT_ID = "P04746"
+TARGET_SCOP_FAMILY_ID = "4003138"
+QUERY_UNIPROT_ID = "P00722"
 
-#TARGET_UNIPROT_ID = "P09467"
-#TARGET_SCOP_FAMILY_ID = "4002766"
-#QUERY_UNIPROT_ID = "P73922"
+# TARGET_UNIPROT_ID = "P09467"
+# TARGET_SCOP_FAMILY_ID = "4002766"
+# QUERY_UNIPROT_ID = "P73922"
 
 
 target_sequence = fasta.get_sequence(
@@ -27,11 +31,11 @@
 
 aa_matrix = align.SubstitutionMatrix.std_protein_matrix()
 alignment = align.align_optimal(
-    target_sequence,
     query_sequence,
+    target_sequence,
     aa_matrix,
     gap_penalty=(-10, -1),
-    local=True,
+    local=False,
     max_number=1,
 )[0]
 
@@ -68,18 +72,38 @@ def get_sequence_family(scop_id):
     return sequences
 
 
+def merge_pairwise_alignments(alignments):
+    traces = []
+    sequences = []
+    for alignment in alignments:
+        trace = alignment.trace
+        # Remove gaps in first sequence
+        trace = trace[trace[:, 0] != -1]
+        traces.append(trace[:, 1])
+        sequences.append(alignment.sequences[1])
+    return align.Alignment(sequences, np.stack(traces, axis=-1))
+
+
 sequences = get_sequence_family(TARGET_SCOP_FAMILY_ID)
-app = clustalo.ClustalOmegaApp(list(sequences.values()))
-app.start()
-app.join()
-msa = app.get_alignment()
-order = app.get_alignment_order()
+# This is not a 'true' MSA, in the sense that it only merges the pairwise alignments
+# with respect to the target sequence
+pseudo_msa = merge_pairwise_alignments(
+    [
+        align.align_optimal(
+            target_sequence,
+            sequence,
+            aa_matrix,
+            gap_penalty=(-10, -1),
+            max_number=1,
+        )[0]
+        for sequence in sequences.values()
+    ]
+)
+
 labels = np.array(list(sequences.keys()))
 
 fig, ax = plt.subplots(figsize=(8.0, 24.0))
-graphics.plot_alignment_type_based(
-    ax, msa[:, order], labels=labels[order], show_numbers=True
-)
+graphics.plot_alignment_type_based(ax, pseudo_msa, labels=labels, show_numbers=True)
 
 ########################################################################################
 # :footcite:`Robinson1991`
@@ -114,53 +138,44 @@ def get_sequence_family(scop_id):
 }
 
 
-profile = seq.SequenceProfile.from_alignment(msa)
+profile = seq.SequenceProfile.from_alignment(pseudo_msa)
 background_frequencies = np.array(list(AA_FREQUENCY.values()))
 # Normalize background frequencies
 background_frequencies = background_frequencies / background_frequencies.sum()
 
-score_matrix = profile.log_odds_matrix(background_frequencies, pseudocount=1)
+score_matrix = profile.log_odds_matrix(background_frequencies, pseudocount=1).T
 score_matrix *= 10
 score_matrix = score_matrix.astype(np.int32)
 
 profile_seq = seq.PositionalSequence(profile.to_consensus())
 positional_matrix = align.SubstitutionMatrix(
-    profile_seq.alphabet, seq.ProteinSequence.alphabet, score_matrix=score_matrix
+    seq.ProteinSequence.alphabet, profile_seq.alphabet, score_matrix=score_matrix
 )
 profile_alignment = align.align_optimal(
-    profile_seq,
     query_sequence,
+    profile_seq,
     positional_matrix,
     gap_penalty=(-10, -1),
     local=False,
     terminal_penalty=False,
     max_number=1,
 )[0]
-print(profile_alignment)
 
 
 ########################################################################################
-# Map the profile alignment to the original target sequence
-# Chimeric alignment from target in the MSA and the query in the profile alignment
-sequence_pos_in_msa = np.where(labels == TARGET_UNIPROT_ID)[0][0]
-target_trace = msa.trace[:, sequence_pos_in_msa]
-# Remove parts of query aligned to gaps in in the profile
-gap_mask = profile_alignment.trace[:, 0] != -1
-query_trace = profile_alignment.trace[gap_mask, 1]
-print(msa.trace.shape)
-print(profile_alignment.trace.shape)
-print(profile.symbols.shape)
-print()
-print(query_trace.shape)
-print(target_trace.shape)
-target_alignment = align.Alignment(
-    [target_sequence, query_sequence],
-    np.stack([target_trace, query_trace], axis=-1),
+# Map the profile alignment to the original target sequence.
+# As the pseudo-MSA was designed to have the same length as the target sequence,
+# the sequence needs to be exchanged only, the trace remains the same
+refined_alignment = align.Alignment(
+    [query_sequence, target_sequence], profile_alignment.trace
 )
 
 fig, ax = plt.subplots(figsize=(8.0, 4.0))
 graphics.plot_alignment_similarity_based(
-    ax, target_alignment, matrix=aa_matrix, labels=[TARGET_UNIPROT_ID, QUERY_UNIPROT_ID]
+    ax,
+    refined_alignment,
+    matrix=aa_matrix,
+    labels=[TARGET_UNIPROT_ID, QUERY_UNIPROT_ID],
 )