From 09bf8157ad5d09f9fa255c678ed6c6adb4ccdf33 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Mon, 2 Sep 2024 14:52:00 +0200 Subject: [PATCH 1/9] Add positional substitution matrices --- src/biotite/sequence/align/alignment.py | 19 ++- src/biotite/sequence/align/matrix.py | 214 ++++++++++++++++++++---- src/biotite/sequence/seqtypes.py | 127 +++++++++++++- tests/sequence/align/test_matrix.py | 29 ++++ tests/sequence/test_seqtypes.py | 21 +++ tests/test_repr.py | 4 + 6 files changed, 372 insertions(+), 42 deletions(-) diff --git a/src/biotite/sequence/align/alignment.py b/src/biotite/sequence/align/alignment.py index d33e3d051..cbbbe1cdd 100644 --- a/src/biotite/sequence/align/alignment.py +++ b/src/biotite/sequence/align/alignment.py @@ -9,7 +9,6 @@ import textwrap from collections.abc import Sequence import numpy as np -from biotite.sequence.alphabet import LetterAlphabet __all__ = [ "Alignment", @@ -111,7 +110,7 @@ def _gapped_str(self, seq_index): for i in range(len(self.trace)): j = self.trace[i][seq_index] if j != -1: - seq_str += self.sequences[seq_index][j] + seq_str += str(self.sequences[seq_index][j]) else: seq_str += "-" return seq_str @@ -133,7 +132,7 @@ def __str__(self): # has an non-single letter alphabet all_single_letter = True for seq in self.sequences: - if not isinstance(seq.get_alphabet(), LetterAlphabet): + if not _is_single_letter(seq.alphabet): all_single_letter = False if all_single_letter: # First dimension: sequence number, @@ -665,3 +664,17 @@ def remove_terminal_gaps(alignment): "no overlap and the resulting alignment would be empty" ) return alignment[start:stop] + + +def _is_single_letter(alphabet): + """ + More relaxed version of :func:`biotite.sequence.alphabet.is_letter_alphabet()`: + It is sufficient that only only the string representation of each symbol is only + a single character. + """ + if alphabet.is_letter_alphabet(): + return True + for symbol in alphabet: + if len(str(symbol)) != 1: + return False + return True diff --git a/src/biotite/sequence/align/matrix.py b/src/biotite/sequence/align/matrix.py index 2a7d23437..06308115d 100644 --- a/src/biotite/sequence/align/matrix.py +++ b/src/biotite/sequence/align/matrix.py @@ -7,7 +7,11 @@ import os import numpy as np -from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence +from biotite.sequence.seqtypes import ( + NucleotideSequence, + PositionalSequence, + ProteinSequence, +) __all__ = ["SubstitutionMatrix"] @@ -78,6 +82,11 @@ class SubstitutionMatrix(object): or a dictionary mapping the symbol pairing to scores, or a string referencing a matrix in the internal database. + Attributes + ---------- + shape : tuple + The shape of the substitution matrix. + Raises ------ KeyError @@ -151,34 +160,18 @@ def __init__(self, alphabet1, alphabet2, score_matrix): # score matrix -> make the score matrix read-only self._matrix.setflags(write=False) - def __repr__(self): - """Represent SubstitutionMatrix as a string for debugging.""" - return ( - f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, " - f"np.{np.array_repr(self._matrix)})" - ) - - def __eq__(self, item): - if not isinstance(item, SubstitutionMatrix): - return False - if self._alph1 != item.get_alphabet1(): - return False - if self._alph2 != item.get_alphabet2(): - return False - if not np.array_equal(self.score_matrix(), item.score_matrix()): - return False - return True - - def __ne__(self, item): - return not self == item + @property + def shape(self): + """ + Get the shape (i.e. the length of both alphabets) + of the substitution matrix. - def _fill_with_matrix_dict(self, matrix_dict): - self._matrix = np.zeros((len(self._alph1), len(self._alph2)), dtype=np.int32) - for i in range(len(self._alph1)): - for j in range(len(self._alph2)): - sym1 = self._alph1.decode(i) - sym2 = self._alph2.decode(j) - self._matrix[i, j] = int(matrix_dict[sym1, sym2]) + Returns + ------- + shape : tuple + Matrix shape. + """ + return (len(self._alph1), len(self._alph2)) def get_alphabet1(self): """ @@ -280,26 +273,155 @@ def get_score(self, symbol1, symbol2): code2 = self._alph2.encode(symbol2) return self._matrix[code1, code2] - def shape(self): + def as_positional(self, sequence1, sequence2): """ - Get the shape (i.e. the length of both alphabets) - of the subsitution matrix. + Transform this substitution matrix and two sequences into positional + equivalents. + + This means the new substitution matrix is position-specific: It has the lengths + of the sequences instead of the lengths of their alphabets. + Its scores represent the same scores as the original matrix, but now mapped + onto the positions of the sequences. + + Parameters + ---------- + sequence1, sequence2 : seq.Sequence, length=n + The sequences to create the positional equivalents from. Returns ------- - shape : tuple - Matrix shape. + pos_matrix : align.SubstitutionMatrix, shape=(n, n) + The position-specific substitution matrix. + pos_sequence1, pos_sequence2 : PositionalSequence, length=n + The positional sequences. + + Notes + ----- + After the transformation the substitution scores remain the same, i.e. + `substitution_matrix.get_score(sequence1[i], sequence2[j])` is equal to + `pos_matrix.get_score(pos_sequence1[i], pos_sequence2[j])`. + + Examples + -------- + + Run an alignment with the usual substitution matrix: + + >>> seq1 = ProteinSequence("BIQTITE") + >>> seq2 = ProteinSequence("IQLITE") + >>> matrix = SubstitutionMatrix.std_protein_matrix() + >>> print(matrix) + A C D E F G H I K L M N P Q R S T V W Y B Z X * + A 4 0 -2 -1 -2 0 -2 -1 -1 -1 -1 -2 -1 -1 -1 1 0 0 -3 -2 -2 -1 0 -4 + C 0 9 -3 -4 -2 -3 -3 -1 -3 -1 -1 -3 -3 -3 -3 -1 -1 -1 -2 -2 -3 -3 -2 -4 + D -2 -3 6 2 -3 -1 -1 -3 -1 -4 -3 1 -1 0 -2 0 -1 -3 -4 -3 4 1 -1 -4 + E -1 -4 2 5 -3 -2 0 -3 1 -3 -2 0 -1 2 0 0 -1 -2 -3 -2 1 4 -1 -4 + F -2 -2 -3 -3 6 -3 -1 0 -3 0 0 -3 -4 -3 -3 -2 -2 -1 1 3 -3 -3 -1 -4 + G 0 -3 -1 -2 -3 6 -2 -4 -2 -4 -3 0 -2 -2 -2 0 -2 -3 -2 -3 -1 -2 -1 -4 + H -2 -3 -1 0 -1 -2 8 -3 -1 -3 -2 1 -2 0 0 -1 -2 -3 -2 2 0 0 -1 -4 + I -1 -1 -3 -3 0 -4 -3 4 -3 2 1 -3 -3 -3 -3 -2 -1 3 -3 -1 -3 -3 -1 -4 + K -1 -3 -1 1 -3 -2 -1 -3 5 -2 -1 0 -1 1 2 0 -1 -2 -3 -2 0 1 -1 -4 + L -1 -1 -4 -3 0 -4 -3 2 -2 4 2 -3 -3 -2 -2 -2 -1 1 -2 -1 -4 -3 -1 -4 + M -1 -1 -3 -2 0 -3 -2 1 -1 2 5 -2 -2 0 -1 -1 -1 1 -1 -1 -3 -1 -1 -4 + N -2 -3 1 0 -3 0 1 -3 0 -3 -2 6 -2 0 0 1 0 -3 -4 -2 3 0 -1 -4 + P -1 -3 -1 -1 -4 -2 -2 -3 -1 -3 -2 -2 7 -1 -2 -1 -1 -2 -4 -3 -2 -1 -2 -4 + Q -1 -3 0 2 -3 -2 0 -3 1 -2 0 0 -1 5 1 0 -1 -2 -2 -1 0 3 -1 -4 + R -1 -3 -2 0 -3 -2 0 -3 2 -2 -1 0 -2 1 5 -1 -1 -3 -3 -2 -1 0 -1 -4 + S 1 -1 0 0 -2 0 -1 -2 0 -2 -1 1 -1 0 -1 4 1 -2 -3 -2 0 0 0 -4 + T 0 -1 -1 -1 -2 -2 -2 -1 -1 -1 -1 0 -1 -1 -1 1 5 0 -2 -2 -1 -1 0 -4 + V 0 -1 -3 -2 -1 -3 -3 3 -2 1 1 -3 -2 -2 -3 -2 0 4 -3 -1 -3 -2 -1 -4 + W -3 -2 -4 -3 1 -2 -2 -3 -3 -2 -1 -4 -4 -2 -3 -3 -2 -3 11 2 -4 -3 -2 -4 + Y -2 -2 -3 -2 3 -3 2 -1 -2 -1 -1 -2 -3 -1 -2 -2 -2 -1 2 7 -3 -2 -1 -4 + B -2 -3 4 1 -3 -1 0 -3 0 -4 -3 3 -2 0 -1 0 -1 -3 -4 -3 4 1 -1 -4 + Z -1 -3 1 4 -3 -2 0 -3 1 -3 -1 0 -1 3 0 0 -1 -2 -3 -2 1 4 -1 -4 + X 0 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 -1 -1 0 0 -1 -2 -1 -1 -1 -1 -4 + * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 + >>> alignment = align_optimal(seq1, seq2, matrix, gap_penalty=-10)[0] + >>> print(alignment) + BIQTITE + -IQLITE + + Running the alignment with positional equivalents gives the same result: + + >>> pos_matrix, pos_seq1, pos_seq2 = matrix.as_positional(seq1, seq2) + >>> print(pos_matrix) + I Q L I T E + B -3 0 -4 -3 -1 1 + I 4 -3 2 4 -1 -3 + Q -3 5 -2 -3 -1 2 + T -1 -1 -1 -1 5 -1 + I 4 -3 2 4 -1 -3 + T -1 -1 -1 -1 5 -1 + E -3 2 -3 -3 -1 5 + >>> pos_alignment = align_optimal(pos_seq1, pos_seq2, pos_matrix, gap_penalty=-10)[0] + >>> print(pos_alignment) + BIQTITE + -IQLITE + + Increase the substitution score for the first symbols in both sequences to align + to each other: + + >>> score_matrix = pos_matrix.score_matrix().copy() + >>> score_matrix[0, 0] = 100 + >>> biased_matrix = SubstitutionMatrix( + ... pos_matrix.get_alphabet1(), pos_matrix.get_alphabet2(), score_matrix + ... ) + >>> print(biased_matrix) + I Q L I T E + B 100 0 -4 -3 -1 1 + I 4 -3 2 4 -1 -3 + Q -3 5 -2 -3 -1 2 + T -1 -1 -1 -1 5 -1 + I 4 -3 2 4 -1 -3 + T -1 -1 -1 -1 5 -1 + E -3 2 -3 -3 -1 5 + >>> biased_alignment = align_optimal(pos_seq1, pos_seq2, biased_matrix, gap_penalty=-10)[0] + >>> print(biased_alignment) + BIQTITE + I-QLITE """ - return (len(self._alph1), len(self._alph2)) + pos_sequence1 = PositionalSequence(sequence1) + pos_sequence2 = PositionalSequence(sequence2) + + pos_score_matrix = self._matrix[ + tuple(_cartesian_product(sequence1.code, sequence2.code).T) + ].reshape(len(sequence1), len(sequence2)) + pos_matrix = SubstitutionMatrix( + pos_sequence1.get_alphabet(), + pos_sequence2.get_alphabet(), + pos_score_matrix, + ) + + return pos_matrix, pos_sequence1, pos_sequence2 + + def __repr__(self): + """Represent SubstitutionMatrix as a string for debugging.""" + return ( + f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, " + f"np.{np.array_repr(self._matrix)})" + ) + + def __eq__(self, item): + if not isinstance(item, SubstitutionMatrix): + return False + if self._alph1 != item.get_alphabet1(): + return False + if self._alph2 != item.get_alphabet2(): + return False + if not np.array_equal(self.score_matrix(), item.score_matrix()): + return False + return True + + def __ne__(self, item): + return not self == item def __str__(self): # Create matrix in NCBI format string = " " for symbol in self._alph2: - string += f" {symbol:>3}" + string += f" {str(symbol):>3}" string += "\n" for i, symbol in enumerate(self._alph1): - string += f"{symbol:>1}" + string += f"{str(symbol):>1}" for j in range(len(self._alph2)): string += f" {int(self._matrix[i,j]):>3d}" string += "\n" @@ -394,6 +516,14 @@ def std_nucleotide_matrix(): """ return _matrix_nuc + def _fill_with_matrix_dict(self, matrix_dict): + self._matrix = np.zeros((len(self._alph1), len(self._alph2)), dtype=np.int32) + for i in range(len(self._alph1)): + for j in range(len(self._alph2)): + sym1 = self._alph1.decode(i) + sym2 = self._alph2.decode(j) + self._matrix[i, j] = int(matrix_dict[sym1, sym2]) + # Preformatted BLOSUM62 and NUC substitution matrix from NCBI _matrix_blosum62 = SubstitutionMatrix( @@ -402,3 +532,15 @@ def std_nucleotide_matrix(): _matrix_nuc = SubstitutionMatrix( NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC" ) + + +def _cartesian_product(array1, array2): + """ + Create all combinations of elements from two arrays. + """ + return np.transpose( + [ + np.repeat(array1, len(array2)), + np.tile(array2, len(array1)), + ] + ) diff --git a/src/biotite/sequence/seqtypes.py b/src/biotite/sequence/seqtypes.py index e09527c35..0234776a1 100644 --- a/src/biotite/sequence/seqtypes.py +++ b/src/biotite/sequence/seqtypes.py @@ -4,10 +4,22 @@ __name__ = "biotite.sequence" __author__ = "Patrick Kunzmann", "Thomas Nevolianis" -__all__ = ["GeneralSequence", "NucleotideSequence", "ProteinSequence"] - +__all__ = [ + "GeneralSequence", + "NucleotideSequence", + "ProteinSequence", + "PositionalSequence", + "PurePositionalSequence", +] + +from dataclasses import dataclass, field import numpy as np -from biotite.sequence.alphabet import AlphabetError, AlphabetMapper, LetterAlphabet +from biotite.sequence.alphabet import ( + Alphabet, + AlphabetError, + AlphabetMapper, + LetterAlphabet, +) from biotite.sequence.sequence import Sequence @@ -590,3 +602,112 @@ def get_molecular_weight(self, monoisotopic=False): "Sequence contains ambiguous amino acids, " "cannot calculate weight" ) return weight + + +class PositionalSequence(Sequence): + """ + A sequence where each symbol is associated with a position. + + For each individual position the sequence contains a separate + :class:`PositionalSequence.Symbol`, encoded by a custom alphabet for this sequence. + In consequence the symbol code is the position in the sequence itself. + This is useful for aligning sequences based on a position-specific + substitution matrix. + + Parameters + ---------- + original_sequence : seq.Sequence + The original sequence to create the positional sequence from. + """ + + @dataclass(frozen=True) + class Symbol: + """ + Combination of a symbol and its position in a sequence. + + Attributes + ---------- + original_alphabet : Alphabet + The original alphabet, where the symbol stems from. + original_code : int + The code of the original symbol in the original alphabet. + position : int + The 0-based position of the symbol in the sequence. + symbol : object + The symbol from the original alphabet. + + See Also + -------- + PositionalSequence + The sequence type containing :class:`PositionalSymbol` objects. + """ + + original_alphabet: ... + original_code: ... + position: ... + symbol: ... = field(init=False) + + def __post_init__(self): + sym = self.original_alphabet.decode(self.original_code) + super().__setattr__("symbol", sym) + + def __str__(self): + return str(self.symbol) + + def __init__(self, original_sequence): + self._orig_alphabet = original_sequence.get_alphabet() + self._alphabet = Alphabet( + [ + PositionalSequence.Symbol(self._orig_alphabet, code, pos) + for pos, code in enumerate(original_sequence.code) + ] + ) + self.code = np.arange( + len(original_sequence), dtype=Sequence.dtype(len(self._alphabet)) + ) + + def reconstruct(self): + """ + Reconstruct the original sequence from the positional sequence. + + Returns + ------- + original_sequence : GeneralSequence + The original sequence. + Although the actual type of the returned sequence is always a + :class:`GeneralSequence`, the alphabet and the symbols of the returned + sequence are equal to the original sequence. + """ + original_sequence = GeneralSequence(self._orig_alphabet) + original_sequence.code = np.array([sym.original_code for sym in self._alphabet]) + return original_sequence + + def get_alphabet(self): + return self._alphabet + + def __str__(self) -> str: + return "".join([str(sym) for sym in self.symbols]) + + def __repr__(self): + return f"PositionalSequence({self.reconstruct()!r})" + + +class PurePositionalSequence(Sequence): + """ + An object of this class is a 'placeholder' sequence, where each symbol is the + position in the sequence itself. + + This class is similar to :class:`PositionalSequence`, but the symbols are not + derived from an original sequence, but are the pure position. + Hence, there is no meaningful string representation of the sequence and its symbols. + """ + + def __init__(self, length): + self._alphabet = Alphabet(range(length)) + self.code = np.arange(length, dtype=Sequence.dtype(length)) + + def get_alphabet(self): + return self._alphabet + + def __repr__(self): + return f"PurePositionalSequence({len(self)})" diff --git a/tests/sequence/align/test_matrix.py b/tests/sequence/align/test_matrix.py index 570878945..a9c33d829 100644 --- a/tests/sequence/align/test_matrix.py +++ b/tests/sequence/align/test_matrix.py @@ -40,3 +40,32 @@ def test_matrix_str(): "b 3 4 5", "c 6 7 8"] ) # fmt: skip + + +@pytest.mark.parametrize("seed", range(10)) +def test_as_positional(seed): + """ + Check if the score from a positional substitution matrix for symbols from + positional sequences is equal to the score from the original matrix for the + corresponding symbols from the original sequences. + """ + MAX_SEQ_LENGTH = 10 + + np.random.seed(seed) + matrix = align.SubstitutionMatrix.std_protein_matrix() + sequences = [] + for _ in range(2): + seq_length = np.random.randint(1, MAX_SEQ_LENGTH) + sequence = seq.ProteinSequence() + sequence.code = np.random.randint(0, len(sequence.alphabet), size=seq_length) + sequences.append(sequence) + pos_matrix, *pos_sequences = matrix.as_positional(*sequences) + + # Sanity check if the positional sequences do correspond to the original sequences + for i in range(2): + assert str(pos_sequences[i]) == str(sequences[i]) + for i in range(len(sequences[0])): + for j in range(len(sequences[1])): + ref_score = matrix.get_score(sequences[0][i], sequences[1][j]) + test_score = pos_matrix.get_score(pos_sequences[0][i], pos_sequences[1][j]) + assert test_score == ref_score diff --git a/tests/sequence/test_seqtypes.py b/tests/sequence/test_seqtypes.py index 086f972a9..a58e25c3d 100644 --- a/tests/sequence/test_seqtypes.py +++ b/tests/sequence/test_seqtypes.py @@ -2,6 +2,7 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. +import numpy as np import pytest import biotite.sequence as seq @@ -90,3 +91,23 @@ def test_get_molecular_weight(monoisotopic, expected_mol_weight_protein): protein = seq.ProteinSequence("ACDEFGHIKLMNPQRSTVW") mol_weight_protein = protein.get_molecular_weight(monoisotopic=monoisotopic) assert mol_weight_protein == pytest.approx(expected_mol_weight_protein, abs=1e-2) + + +def test_positional_sequence(): + """ + Check whether the original sequence symbols can be reconstructed from a + :class:`PositionalSequence`. + """ + SEQ_LENGTH = 100 + + np.random.seed(0) + orig_sequence = seq.ProteinSequence() + orig_sequence.code = np.random.randint( + 0, len(orig_sequence.alphabet), size=SEQ_LENGTH + ) + positional_sequence = seq.PositionalSequence(orig_sequence) + + assert str(positional_sequence) == str(orig_sequence) + reconstructed_sequence = positional_sequence.reconstruct() + assert np.all(reconstructed_sequence.code == orig_sequence.code) + assert reconstructed_sequence.alphabet == orig_sequence.alphabet diff --git a/tests/test_repr.py b/tests/test_repr.py index f8bf319c4..df3afaed4 100644 --- a/tests/test_repr.py +++ b/tests/test_repr.py @@ -15,7 +15,9 @@ LetterAlphabet, Location, NucleotideSequence, + PositionalSequence, ProteinSequence, + PurePositionalSequence, SequenceProfile, ) from biotite.sequence.align import Alignment, SubstitutionMatrix @@ -30,6 +32,8 @@ NucleotideSequence("AACTGCTA"), NucleotideSequence("AACTGCTA", ambiguous=True), ProteinSequence("BIQTITE"), + PositionalSequence(NucleotideSequence("AACTGCTA")), + PurePositionalSequence(123), Alphabet(["X", "Y", "Z"]), GeneralSequence(Alphabet(["X", 42, False]), ["X", 42, "X"]), LetterAlphabet(["X", "Y", "Z"]), From 72ea1480c5d5ee9f4b1aeca99209d179b00a4bcf Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sun, 8 Sep 2024 14:11:31 +0200 Subject: [PATCH 2/9] Implement more efficient implementations in subclass --- src/biotite/sequence/align/kmeralphabet.pyx | 17 +++++++++++++++++ src/biotite/sequence/alphabet.py | 3 +++ 2 files changed, 20 insertions(+) diff --git a/src/biotite/sequence/align/kmeralphabet.pyx b/src/biotite/sequence/align/kmeralphabet.pyx index 02a1bbcda..910c39360 100644 --- a/src/biotite/sequence/align/kmeralphabet.pyx +++ b/src/biotite/sequence/align/kmeralphabet.pyx @@ -568,6 +568,23 @@ class KmerAlphabet(Alphabet): return int(len(self._base_alph) ** self._k) + def __iter__(self): + # Creating all symbols is expensive + # -> Use a generator instead + if isinstance(self._base_alph, LetterAlphabet): + return ("".join(self.decode(code)) for code in range(len(self))) + else: + return (list(self.decode(code)) for code in range(len(self))) + + + def __contains__(self, symbol): + try: + self.fuse(self._base_alph.encode_multiple(symbol)) + return True + except AlphabetError: + return False + + def _to_array_form(model_string): """ Convert the the common string representation of a *k-mer* spacing diff --git a/src/biotite/sequence/alphabet.py b/src/biotite/sequence/alphabet.py index c08022cec..0b1f92a75 100644 --- a/src/biotite/sequence/alphabet.py +++ b/src/biotite/sequence/alphabet.py @@ -410,6 +410,9 @@ def decode_multiple(self, code, as_bytes=False): symbols = symbols.astype("U1") return symbols + def is_letter_alphabet(self): + return True + def __contains__(self, symbol): if not isinstance(symbol, (str, bytes)): return False From ed4be761dec3d943928a66abb2e7de433ed6c6c1 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sun, 8 Sep 2024 16:13:05 +0200 Subject: [PATCH 3/9] Add string representation for `SequenceProfile` --- src/biotite/sequence/profile.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/biotite/sequence/profile.py b/src/biotite/sequence/profile.py index d208b2b3f..f44c5d6af 100644 --- a/src/biotite/sequence/profile.py +++ b/src/biotite/sequence/profile.py @@ -156,8 +156,23 @@ def gaps(self, new_gaps): ) self._gaps = new_gaps + def __str__(self): + # Add an additional row and column for the position and symbol indicators + print_matrix = np.full( + (self.symbols.shape[0] + 1, self.symbols.shape[1] + 1), "", dtype=object + ) + print_matrix[1:, 1:] = self.symbols.astype(str) + print_matrix[0, 1:] = [str(sym) for sym in self.alphabet] + print_matrix[1:, 0] = [str(i) for i in range(self.symbols.shape[0])] + max_len = len(max(print_matrix.flatten(), key=len)) + return "\n".join( + [ + " ".join([str(cell).rjust(max_len) for cell in row]) + for row in print_matrix + ] + ) + def __repr__(self): - """Represent SequenceProfile as a string for debugging.""" return ( f"SequenceProfile(np.{np.array_repr(self.symbols)}, " f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))" From 7ab6728c62df3a168e8292b00057d46ff5adf21d Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sun, 8 Sep 2024 16:13:15 +0200 Subject: [PATCH 4/9] Add sequence profile tutorial --- doc/tutorial/sequence/index.rst | 4 +- doc/tutorial/sequence/profiles.rst | 157 +++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+), 2 deletions(-) create mode 100644 doc/tutorial/sequence/profiles.rst diff --git a/doc/tutorial/sequence/index.rst b/doc/tutorial/sequence/index.rst index ab1a96b83..c879c3e04 100644 --- a/doc/tutorial/sequence/index.rst +++ b/doc/tutorial/sequence/index.rst @@ -43,5 +43,5 @@ For an unambiguous DNA sequence, the alphabet comprises the four nucleobases. align_optimal align_heuristic align_multiple - annotations - profiles \ No newline at end of file + profiles + annotations \ No newline at end of file diff --git a/doc/tutorial/sequence/profiles.rst b/doc/tutorial/sequence/profiles.rst new file mode 100644 index 000000000..d82dc4a2f --- /dev/null +++ b/doc/tutorial/sequence/profiles.rst @@ -0,0 +1,157 @@ +.. include:: /tutorial/preamble.rst + +Profiles and position-specific matrices +======================================= +Often sequences are not viewed in isolation: +For example, if you investigate a protein family, you do not handle a single sequence, +but an arbitrarily large collection of highly similar sequences. +At some point handling those sequences as a mere collection of individual sequences +becomes impractical for answering common questions, like which are the prevalent +amino acids at a certain positions of the sequences. + +Sequence profiles +----------------- + +.. currentmodule:: biotite.sequence + +This is where *sequence profiles* come into play: +The condense the a collection of aligned sequences into a matrix that tracks the +frequency of each symbol at each position of the alignment. +Hence, asking the questions such as '*How frequent is an alanine at the n-th position?*' +becomes a trivial indexing operation. + +As example for profiles, we will reuse the cyclotide sequence family from the +:doc:`previous chapter `. + +.. jupyter-execute:: + + from tempfile import NamedTemporaryFile + import biotite.sequence.io.fasta as fasta + import biotite.database.entrez as entrez + + query = ( + entrez.SimpleQuery("Cyclotide") & + entrez.SimpleQuery("cter") & + entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^ + entrez.SimpleQuery("Precursor") + ) + uids = entrez.search(query, "protein") + temp_file = NamedTemporaryFile(suffix=".fasta", delete=False) + fasta_file = fasta.FastaFile.read( + entrez.fetch_single_file(uids, temp_file.name, "protein", "fasta") + ) + sequences = { + # The cyclotide variant is the last character in the header + header[-1]: seq for header, seq in fasta.get_sequences(fasta_file).items() + } + # Extract cyclotide N as query sequence for later + query = sequences.pop("N") + +To create a profile, we first need to align the sequences, so corresponding symbols +appear the same position. +Then we can create a :class:`SequenceProfile` object from the :class:`.Alignment`, which +simply counts for each alignment column (i.e. the sequence position) the number of +occurrences for each symbol. + +.. jupyter-execute:: + + import biotite.sequence as seq + import biotite.sequence.align as align + + alignment, _, _, _ = seq.align.align_multiple( + list(sequences.values()), + align.SubstitutionMatrix.std_protein_matrix(), + gap_penalty=-5, + ) + profile = seq.SequenceProfile.from_alignment(alignment) + print(profile) + +Each row in the displayed count matrix +(accessible via :attr:`SequenceProfile.symbols`) refers to a single position, i.e. a +column in the input MSA, and each column refers to a symbol in the underlying alphabet +(accessible via :attr:`SequenceProfile.alphabet`). +For completeness it should be noted that :attr:`SequenceProfile.gaps` also tracks the +gaps for each position in the alignment, but we will not further use them in this +tutorial. + +Note that the information about the individual sequences is lost in the condensation +process: There is no way to reconstruct the original sequences from the profile. +However, we can still extract a consensus sequence from the profile, which is a +sequence that represents the most frequent symbol at each position. + +.. jupyter-execute:: + + print(profile.to_consensus()) + +Profile visualization as sequence logo +-------------------------------------- + +.. currentmodule:: biotite.sequence.align + +A common way to visualize a sequence profile is a sequence logo. +It depicts each profile position as a stack of letters: +The degree of conversation (more precisely the +`Shannon entropy `_) +is the height of a stack and each letter's height in the stack is proportional to its +frequency at the respective position. + +.. jupyter-execute:: + + import matplotlib.pyplot as plt + from biotite.sequence.graphics import plot_sequence_logo + + fig, ax = plt.subplots(figsize=(8.0, 2.0), constrained_layout=True) + plot_sequence_logo(ax, profile) + ax.set_xlabel("Residue position") + ax.set_ylabel("Bits") + +Position-specific scoring matrices +---------------------------------- + +.. currentmodule:: biotite.sequence.align + +Sequence profiles can achieve even more: +The substitution matrices we have seen so far assign a score to a pair of two symbols, +regardless of their position in a sequence. +However, as discussed above, the position is crucial information to determine how +conserved a certain symbol is in a family of sequences. + +Hence, we extend the concept of substitution matrices to *position-specific* matrices, +which assign a score to a symbol and a position (instead of another symbols). + +.. jupyter-execute:: + + # For a real analysis each amino acid background frequencies should be provided + log_odds = profile.log_odds_matrix(pseudocount=1) + +Now, we encounter a problem: +To create a :class:`SubstitutionMatrix` object from the log-odds matrix, we require two +:class:`.Alphabet` objects: +One is taken from the query sequence to be aligned to the profile, but which alphabet +do we use for the positional axis of the matrix? +Likewise, the alignment functions (e.g. :func:`align_optimal()`) require a two sequences +to be aligned, but we only have one query sequence. + +To solve this problem :mod:`biotite.sequence` provides the :class:`.PositionalSequence` +which acts as a placeholder for the second sequence in the alignment. +Its alphabet contains a unique symbol for each position, i.e. the alphabet has the +sought length. + +.. jupyter-execute:: + + positional_seq = seq.PositionalSequence(profile.to_consensus()) + matrix = align.SubstitutionMatrix( + positional_seq.alphabet, + profile.alphabet, + log_odds + ) + alignment = align.align_optimal( + positional_seq, query, matrix, gap_penalty=-5, max_number=1 + )[0] + print(alignment) + +Only the length of the input sequence passed to :class:`.PositionalSequence` matters for +the alignment to work. +The consensus sequence of the profile was merely chosen for cosmetic reasons, i.e. +to have a meaningful string representation of the positional sequence and thus the +alignment. From 7ddb3e4e2eafea71ca585bb519cfd619c55770b9 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sun, 3 Nov 2024 18:40:19 +0100 Subject: [PATCH 5/9] Check for empty response --- src/biotite/database/uniprot/check.py | 35 ++++++++++++++---------- src/biotite/database/uniprot/download.py | 2 +- src/biotite/database/uniprot/query.py | 2 +- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/src/biotite/database/uniprot/check.py b/src/biotite/database/uniprot/check.py index a1782e1ba..3bbfabc6d 100644 --- a/src/biotite/database/uniprot/check.py +++ b/src/biotite/database/uniprot/check.py @@ -10,7 +10,7 @@ # Taken from https://www.uniprot.org/help/api_retrieve_entries -def assert_valid_response(response_status_code): +def assert_valid_response(response): """ Checks whether the response is valid. @@ -19,17 +19,22 @@ def assert_valid_response(response_status_code): response_status_code: int Status code of request.get. """ - if response_status_code == 400: - raise RequestError("Bad request. There is a problem with your input.") - elif response_status_code == 404: - raise RequestError("Not found. The resource you requested doesn't exist.") - elif response_status_code == 410: - raise RequestError("Gone. The resource you requested was removed.") - elif response_status_code == 500: - raise RequestError( - "Internal server error. Most likely a temporary problem, but if the problem persists please contact UniProt team." - ) - elif response_status_code == 503: - raise RequestError( - "Service not available. The server is being updated, try again later." - ) + if len(response.content) == 0: + raise RequestError("No content returned") + match response.status_code: + case 400: + raise RequestError("Bad request. There is a problem with your input.") + case 404: + raise RequestError("Not found. The resource you requested doesn't exist.") + case 410: + raise RequestError("Gone. The resource you requested was removed.") + case 500: + raise RequestError( + "Internal server error. " + "Most likely a temporary problem, " + "but if the problem persists please contact UniProt team." + ) + case 503: + raise RequestError( + "Service not available. The server is being updated, try again later." + ) diff --git a/src/biotite/database/uniprot/download.py b/src/biotite/database/uniprot/download.py index bacb40e96..8f27c916b 100644 --- a/src/biotite/database/uniprot/download.py +++ b/src/biotite/database/uniprot/download.py @@ -111,7 +111,7 @@ def fetch(ids, format, target_path=None, overwrite=False, verbose=False): if format in ["fasta", "gff", "txt", "xml", "rdf", "tab"]: r = requests.get(_fetch_url + db_name + "/" + id + "." + format) content = r.text - assert_valid_response(r.status_code) + assert_valid_response(r) else: raise ValueError(f"Format '{format}' is not supported") if file is None: diff --git a/src/biotite/database/uniprot/query.py b/src/biotite/database/uniprot/query.py index 687c61f5f..1394dc4c1 100644 --- a/src/biotite/database/uniprot/query.py +++ b/src/biotite/database/uniprot/query.py @@ -289,5 +289,5 @@ def search(query, number=500): params = {"query": str(query), "format": "list", "size": str(number)} r = requests.get(_base_url, params=params) content = r.text - assert_valid_response(r.status_code) + assert_valid_response(r) return content.split("\n")[:-1] From 32fadcbcf0249e2bbc047c2bae7b76e455d9d54e Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sun, 3 Nov 2024 18:40:37 +0100 Subject: [PATCH 6/9] fixup! Add sequence profile tutorial --- doc/tutorial/sequence/profiles.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/tutorial/sequence/profiles.rst b/doc/tutorial/sequence/profiles.rst index d82dc4a2f..e5b43d5a5 100644 --- a/doc/tutorial/sequence/profiles.rst +++ b/doc/tutorial/sequence/profiles.rst @@ -143,7 +143,9 @@ sought length. matrix = align.SubstitutionMatrix( positional_seq.alphabet, profile.alphabet, - log_odds + # Substitution matrices require integer scores + # Multiply by 10 to increase value range and convert to integer + (log_odds * 10).astype(int) ) alignment = align.align_optimal( positional_seq, query, matrix, gap_penalty=-5, max_number=1 From 782219fa15c0fcb00b23b8bef5d2b43258ba052f Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sun, 3 Nov 2024 18:41:13 +0100 Subject: [PATCH 7/9] Check for invalid input score matrix --- src/biotite/sequence/align/matrix.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/biotite/sequence/align/matrix.py b/src/biotite/sequence/align/matrix.py index 06308115d..cab851672 100644 --- a/src/biotite/sequence/align/matrix.py +++ b/src/biotite/sequence/align/matrix.py @@ -148,7 +148,21 @@ def __init__(self, alphabet1, alphabet2, score_matrix): f"Matrix has shape {score_matrix.shape}, " f"but {alph_shape} is required" ) + if not np.issubdtype(score_matrix.dtype, np.integer): + raise TypeError("Score matrix must be an integer ndarray") self._matrix = score_matrix.astype(np.int32) + # If the score matrix was converted from a a float matrix, + # inf values would be converted to 2**31, + # which is probably undesired and gives overflow issues in the alignment + # functions + if ( + np.any(self._matrix == np.iinfo(np.int32).max) or + np.any(self._matrix == np.iinfo(np.int32).min) + ): # fmt: skip + raise ValueError( + "Score values are too large. " + "Maybe it was converted from a float matrix containing inf values?" + ) elif isinstance(score_matrix, str): matrix_dict = SubstitutionMatrix.dict_from_db(score_matrix) self._fill_with_matrix_dict(matrix_dict) From 5c47b54584479e0567ef020618c481d0dcdfafc4 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Mon, 4 Nov 2024 11:23:14 +0100 Subject: [PATCH 8/9] Add example to gallery --- .../scripts/sequence/profile/scop_profile.py | 171 ++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 doc/examples/scripts/sequence/profile/scop_profile.py diff --git a/doc/examples/scripts/sequence/profile/scop_profile.py b/doc/examples/scripts/sequence/profile/scop_profile.py new file mode 100644 index 000000000..bf761732e --- /dev/null +++ b/doc/examples/scripts/sequence/profile/scop_profile.py @@ -0,0 +1,171 @@ +import matplotlib.pyplot as plt +import numpy as np +import requests +import biotite.application.clustalo as clustalo +import biotite.database.uniprot as uniprot +import biotite.sequence as seq +import biotite.sequence.align as align +import biotite.sequence.graphics as graphics +import biotite.sequence.io.fasta as fasta +from biotite.database import RequestError + +TARGET_UNIPROT_ID = "P10145" +TARGET_SCOP_FAMILY_ID = "4000912" +QUERY_UNIPROT_ID = "P03212" + +#TARGET_UNIPROT_ID = "P09467" +#TARGET_SCOP_FAMILY_ID = "4002766" +#QUERY_UNIPROT_ID = "P73922" + + +target_sequence = fasta.get_sequence( + fasta.FastaFile.read(uniprot.fetch(TARGET_UNIPROT_ID, "fasta", ".")) +) +query_sequence = fasta.get_sequence( + fasta.FastaFile.read(uniprot.fetch(QUERY_UNIPROT_ID, "fasta", ".")) +) + +aa_matrix = align.SubstitutionMatrix.std_protein_matrix() +alignment = align.align_optimal( + target_sequence, + query_sequence, + aa_matrix, + gap_penalty=(-10, -1), + local=True, + max_number=1, +)[0] + +fig, ax = plt.subplots(figsize=(8.0, 1.0)) +graphics.plot_alignment_similarity_based( + ax, alignment, matrix=aa_matrix, labels=[TARGET_UNIPROT_ID, QUERY_UNIPROT_ID] +) + + +######################################################################################## +# https://www.ebi.ac.uk/pdbe/scop/download + + +def get_sequence_family(scop_id): + scop_domains = requests.get( + f"https://www.ebi.ac.uk/pdbe/scop/api/domains/{scop_id}" + ).json() + sequences = {} + for scop_domain in scop_domains["domains"]: + uniprot_id = scop_domain["uniprot_id"] + try: + sequence = fasta.get_sequence( + fasta.FastaFile.read(uniprot.fetch(uniprot_id, "fasta", ".")) + ) + except RequestError: + # The UniProt ID is not in the database -> skip this domain + continue + for start, end in scop_domain["protein_regions"]: + region = sequence[start : end + 1] + sequences[uniprot_id] = region + # Most domains have only one region within the sequence + # For simplicity we only consider the first region + break + return sequences + + +sequences = get_sequence_family(TARGET_SCOP_FAMILY_ID) +app = clustalo.ClustalOmegaApp(list(sequences.values())) +app.start() +app.join() +msa = app.get_alignment() +order = app.get_alignment_order() +labels = np.array(list(sequences.keys())) + +fig, ax = plt.subplots(figsize=(8.0, 24.0)) +graphics.plot_alignment_type_based( + ax, msa[:, order], labels=labels[order], show_numbers=True +) + +######################################################################################## +# :footcite:`Robinson1991` + +# Must have the same order as `ProteinSequence.alphabet` +AA_FREQUENCY = { + "A": 35155, + "C": 8669, + "D": 24161, + "E": 28354, + "F": 17367, + "G": 33229, + "H": 9906, + "I": 23161, + "K": 25872, + "L": 40625, + "M": 10101, + "N": 20212, + "P": 23435, + "Q": 19208, + "R": 23105, + "S": 32070, + "T": 26311, + "V": 29012, + "W": 5990, + "Y": 14488, + # Set ambiguous amino acid count to 1 to avoid division by zero in log odds matrix + "B": 1, + "Z": 1, + "X": 1, + "*": 1, +} + + +profile = seq.SequenceProfile.from_alignment(msa) +background_frequencies = np.array(list(AA_FREQUENCY.values())) +# Normalize background frequencies +background_frequencies = background_frequencies / background_frequencies.sum() + +score_matrix = profile.log_odds_matrix(background_frequencies, pseudocount=1) +score_matrix *= 10 +score_matrix = score_matrix.astype(np.int32) + +profile_seq = seq.PositionalSequence(profile.to_consensus()) +positional_matrix = align.SubstitutionMatrix( + profile_seq.alphabet, seq.ProteinSequence.alphabet, score_matrix=score_matrix +) +profile_alignment = align.align_optimal( + profile_seq, + query_sequence, + positional_matrix, + gap_penalty=(-10, -1), + local=False, + terminal_penalty=False, + max_number=1, +)[0] +print(profile_alignment) + + +######################################################################################## +# Map the profile alignment to the original target sequence +# Chimeric alignment from target in the MSA and the query in the profile alignment +sequence_pos_in_msa = np.where(labels == TARGET_UNIPROT_ID)[0][0] +target_trace = msa.trace[:, sequence_pos_in_msa] +# Remove parts of query aligned to gaps in in the profile +gap_mask = profile_alignment.trace[:, 0] != -1 +query_trace = profile_alignment.trace[gap_mask, 1] +print(msa.trace.shape) +print(profile_alignment.trace.shape) +print(profile.symbols.shape) +print() +print(query_trace.shape) +print(target_trace.shape) +target_alignment = align.Alignment( + [target_sequence, query_sequence], + np.stack([target_trace, query_trace], axis=-1), +) + +fig, ax = plt.subplots(figsize=(8.0, 4.0)) +graphics.plot_alignment_similarity_based( + ax, target_alignment, matrix=aa_matrix, labels=[TARGET_UNIPROT_ID, QUERY_UNIPROT_ID] +) + + +plt.show() + +######################################################################################## +# +# .. footbibliography:: From b306942447976b39217c3d10ebcad95dd835d998 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Wed, 13 Nov 2024 10:43:23 +0100 Subject: [PATCH 9/9] fixup! Add example to gallery --- .../scripts/sequence/profile/scop_profile.py | 93 +++++++++++-------- 1 file changed, 54 insertions(+), 39 deletions(-) diff --git a/doc/examples/scripts/sequence/profile/scop_profile.py b/doc/examples/scripts/sequence/profile/scop_profile.py index bf761732e..938e22bd9 100644 --- a/doc/examples/scripts/sequence/profile/scop_profile.py +++ b/doc/examples/scripts/sequence/profile/scop_profile.py @@ -1,7 +1,11 @@ +""" +Note that in order to get a representative sequence profile, it is crucial to have a +high number of sequences the target sequence can be aligned to. +""" + import matplotlib.pyplot as plt import numpy as np import requests -import biotite.application.clustalo as clustalo import biotite.database.uniprot as uniprot import biotite.sequence as seq import biotite.sequence.align as align @@ -9,13 +13,13 @@ import biotite.sequence.io.fasta as fasta from biotite.database import RequestError -TARGET_UNIPROT_ID = "P10145" -TARGET_SCOP_FAMILY_ID = "4000912" -QUERY_UNIPROT_ID = "P03212" +TARGET_UNIPROT_ID = "P04746" +TARGET_SCOP_FAMILY_ID = "4003138" +QUERY_UNIPROT_ID = "P00722" -#TARGET_UNIPROT_ID = "P09467" -#TARGET_SCOP_FAMILY_ID = "4002766" -#QUERY_UNIPROT_ID = "P73922" +# TARGET_UNIPROT_ID = "P09467" +# TARGET_SCOP_FAMILY_ID = "4002766" +# QUERY_UNIPROT_ID = "P73922" target_sequence = fasta.get_sequence( @@ -27,11 +31,11 @@ aa_matrix = align.SubstitutionMatrix.std_protein_matrix() alignment = align.align_optimal( - target_sequence, query_sequence, + target_sequence, aa_matrix, gap_penalty=(-10, -1), - local=True, + local=False, max_number=1, )[0] @@ -68,18 +72,38 @@ def get_sequence_family(scop_id): return sequences +def merge_pairwise_alignments(alignments): + traces = [] + sequences = [] + for alignment in alignments: + trace = alignment.trace + # Remove gaps in first sequence + trace = trace[trace[:, 0] != -1] + traces.append(trace[:, 1]) + sequences.append(alignment.sequences[1]) + return align.Alignment(sequences, np.stack(traces, axis=-1)) + + sequences = get_sequence_family(TARGET_SCOP_FAMILY_ID) -app = clustalo.ClustalOmegaApp(list(sequences.values())) -app.start() -app.join() -msa = app.get_alignment() -order = app.get_alignment_order() +# This is not a 'true' MSA, in the sense that it only merges the pairwise alignments +# with respect to the target sequence +pseudo_msa = merge_pairwise_alignments( + [ + align.align_optimal( + target_sequence, + sequence, + aa_matrix, + gap_penalty=(-10, -1), + max_number=1, + )[0] + for sequence in sequences.values() + ] +) + labels = np.array(list(sequences.keys())) fig, ax = plt.subplots(figsize=(8.0, 24.0)) -graphics.plot_alignment_type_based( - ax, msa[:, order], labels=labels[order], show_numbers=True -) +graphics.plot_alignment_type_based(ax, pseudo_msa, labels=labels, show_numbers=True) ######################################################################################## # :footcite:`Robinson1991` @@ -114,53 +138,44 @@ def get_sequence_family(scop_id): } -profile = seq.SequenceProfile.from_alignment(msa) +profile = seq.SequenceProfile.from_alignment(pseudo_msa) background_frequencies = np.array(list(AA_FREQUENCY.values())) # Normalize background frequencies background_frequencies = background_frequencies / background_frequencies.sum() -score_matrix = profile.log_odds_matrix(background_frequencies, pseudocount=1) +score_matrix = profile.log_odds_matrix(background_frequencies, pseudocount=1).T score_matrix *= 10 score_matrix = score_matrix.astype(np.int32) profile_seq = seq.PositionalSequence(profile.to_consensus()) positional_matrix = align.SubstitutionMatrix( - profile_seq.alphabet, seq.ProteinSequence.alphabet, score_matrix=score_matrix + seq.ProteinSequence.alphabet, profile_seq.alphabet, score_matrix=score_matrix ) profile_alignment = align.align_optimal( - profile_seq, query_sequence, + profile_seq, positional_matrix, gap_penalty=(-10, -1), local=False, terminal_penalty=False, max_number=1, )[0] -print(profile_alignment) ######################################################################################## -# Map the profile alignment to the original target sequence -# Chimeric alignment from target in the MSA and the query in the profile alignment -sequence_pos_in_msa = np.where(labels == TARGET_UNIPROT_ID)[0][0] -target_trace = msa.trace[:, sequence_pos_in_msa] -# Remove parts of query aligned to gaps in in the profile -gap_mask = profile_alignment.trace[:, 0] != -1 -query_trace = profile_alignment.trace[gap_mask, 1] -print(msa.trace.shape) -print(profile_alignment.trace.shape) -print(profile.symbols.shape) -print() -print(query_trace.shape) -print(target_trace.shape) -target_alignment = align.Alignment( - [target_sequence, query_sequence], - np.stack([target_trace, query_trace], axis=-1), +# Map the profile alignment to the original target sequence. +# As the pseudo-MSA was designed to have the same length as the target sequence, +# the sequence needs to be exchanged only, the trace remains the same +refined_alignment = align.Alignment( + [query_sequence, target_sequence], profile_alignment.trace ) fig, ax = plt.subplots(figsize=(8.0, 4.0)) graphics.plot_alignment_similarity_based( - ax, target_alignment, matrix=aa_matrix, labels=[TARGET_UNIPROT_ID, QUERY_UNIPROT_ID] + ax, + refined_alignment, + matrix=aa_matrix, + labels=[TARGET_UNIPROT_ID, QUERY_UNIPROT_ID], )