From b485d5374cc1e9418b27b88a283e8c65bed5133b Mon Sep 17 00:00:00 2001
From: Nikita Vaulin <vaulin@ro.ru>
Date: Mon, 26 Feb 2024 20:50:11 +0300
Subject: [PATCH 1/2] Add RAG2.py

---
 RAG2.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 RAG2.py

diff --git a/RAG2.py b/RAG2.py
new file mode 100644
index 0000000..e756a11
--- /dev/null
+++ b/RAG2.py
@@ -0,0 +1,75 @@
+from abc import ABC, abstractmethod
+
+class BiologicalSequence(ABC):
+
+    """Abstract base for sequence wrappers; the raw string lives in ``self.sequence``."""
+
+    def __init__(self, sequence: str):
+        """Store ``sequence`` as given; nothing is validated here (see ``is_valid``)."""
+        self.sequence = sequence
+
+    def __len__(self) -> int:
+        # Read the attribute __init__ sets; `_sequence` was never assigned (AttributeError).
+        return len(self.sequence)
+
+    @abstractmethod
+    def is_valid(self) -> bool:
+        """Return True if the sequence matches the subclass's alphabet, otherwise False."""
+
+    def __getitem__(self, key):
+        """Index or slice the underlying string; the result is a plain ``str``."""
+        return self.sequence[key]
+
+    def __str__(self) -> str:
+        return self.sequence
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}('{self.sequence}')"
+
+
+
+class NucleicAcidSequence(BiologicalSequence):
+    complement_map = {}  # base -> complementary base; filled in by concrete subclasses
+
+    def is_valid(self) -> bool:  # True when every symbol is a key of complement_map
+        return set(self.sequence).issubset(self.complement_map)
+
+    def complement(self):  # plain str of complemented bases; KeyError on unknown symbols
+        return ''.join(map(self.complement_map.__getitem__, self.sequence))
+
+    def gc_content(self):  # fraction (0..1) of 'G'/'C' characters; 0 for an empty sequence
+        seq = self.sequence
+        return (seq.count('G') + seq.count('C')) / len(seq) if seq else 0
+
+class DNASequence(NucleicAcidSequence):
+    complement_map = dict(zip('ATGC', 'TACG'))
+    def transcribe(self):  # swap every 'T' for 'U' and wrap the result in an RNASequence
+        return RNASequence(self.sequence.translate(str.maketrans('T', 'U')))
+
+class RNASequence(NucleicAcidSequence):
+    complement_map = dict(zip('AUGC', 'UACG'))  # same pairing table as DNA with 'U' in place of 'T'
+
+
+class AminoAcidSequence(BiologicalSequence):
+
+    def is_valid(self) -> bool:
+        amino_acids = "ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy"  # 20 one-letter codes, both cases
+        return all(aa in amino_acids for aa in self.sequence)
+
+
+    def one_to_three_letter_code(self) -> str:
+        """
+        Convert the stored one-letter protein sequence to three-letter codes.
+
+        Each residue is upper-cased before the lookup, so lower-case input is accepted.
+        A symbol missing from the table yields None and makes the join raise TypeError.
+
+        Returns:
+            str: The three-letter codes joined with '-'.
+        """
+        AMINO_ACIDS = {'A': 'Ala', 'C': 'Cys', 'D': 'Asp', 'E': 'Glu', 'F': 'Phe', 'G': 'Gly', 'H': 'His', 'I': 'Ile',
+               'K': 'Lys', 'L': 'Leu', 'M': 'Met', 'N': 'Asn', 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg', 'S': 'Ser',
+               'T': 'Thr', 'V': 'Val',
+               'W': 'Trp', 'Y': 'Tyr'}
+        three_letter_code = [AMINO_ACIDS.get(aa.upper()) for aa in self.sequence]
+        return '-'.join(three_letter_code)
\ No newline at end of file

From fc277ad9a870ea4bd2aaeea80ef5f372e8d918c8 Mon Sep 17 00:00:00 2001
From: Nikita Vaulin <vaulin@ro.ru>
Date: Tue, 27 Feb 2024 10:01:04 +0300
Subject: [PATCH 2/2] Add RAG2.py

---
 RAG2.py | 601 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 600 insertions(+), 1 deletion(-)

diff --git a/RAG2.py b/RAG2.py
index e756a11..2d12918 100644
--- a/RAG2.py
+++ b/RAG2.py
@@ -1,4 +1,7 @@
 from abc import ABC, abstractmethod
+from Bio import SeqIO
+from Bio.SeqUtils import gc_fraction
+from Bio.Seq import Seq
 
 class BiologicalSequence(ABC):
 
@@ -72,4 +75,600 @@ def one_to_three_letter_code(self) -> str:
                'T': 'Thr', 'V': 'Val',
                'W': 'Trp', 'Y': 'Tyr'}
         three_letter_code = [AMINO_ACIDS.get(aa.upper()) for aa in self.sequence]
-        return '-'.join(three_letter_code)
\ No newline at end of file
+        return '-'.join(three_letter_code)
+
+
+def filter_fastq(input_path: str, output_filename: str = None, gc_bounds: tuple = (0, 100), length_bounds: tuple = (0, 2 ** 32),
+                 quality_threshold: int = 0) -> dict:
+    """
+    Read a FASTQ file, keep the reads that pass all three filters and save them.
+
+    The file is parsed by read_fastaq and the kept reads are written by
+    save_fastq_from_dict.
+
+    Args:
+        input_path (str): Path to the input FASTQ file.
+        output_filename (str, optional): Passed unchanged to save_fastq_from_dict,
+            which writes the kept reads even when this is None (default is None).
+        gc_bounds (tuple or float, optional): (lower, upper) GC percentage bounds, or a
+            single number used as the upper bound (default is (0, 100)).
+        length_bounds (tuple or int, optional): (lower, upper) length bounds, or a
+            single integer used as the upper bound (default is (0, 2**32)).
+        quality_threshold (int, optional): Minimum average quality (default is 0).
+
+    Returns:
+        dict: The kept reads. Keys are sequence names, values are tuples of
+        (0) sequence and (1) quality string.
+    """
+    kept = {}
+    # The checks run in the order GC -> length -> quality and stop at the first failure.
+    for read_name, read in read_fastaq(input_path).items():
+        bases, quals = read
+        passes = (
+            check_gc_content(bases, gc_bounds)
+            and check_length(bases, length_bounds)
+            and check_quality(quals, quality_threshold)
+        )
+        if passes:
+            kept[read_name] = (bases, quals)
+
+    save_fastq_from_dict(kept, output_filename)
+
+    return kept
+
+
+def filter_fastq_with_Bio(input_path: str, output_filename: str = None, gc_bounds: tuple = (0, 100), length_bounds: tuple = (0, 2 ** 32), quality_threshold: int = 0) -> dict:
+    """
+    Filter the records of a FASTQ file with Biopython; return {record.id: (sequence, phred scores)} and write them as FASTQ only when output_filename is given.
+    """
+    filtered_seqs = {}
+    # Unlike check_gc_content/check_length, the bounds here are always indexed as (lower, upper) pairs.
+    for record in SeqIO.parse(input_path, "fastq"):
+        sequence = str(record.seq)
+        quality_scores = record.letter_annotations["phred_quality"]
+        gc_content =  gc_fraction(record.seq)*100  # scaled by 100 to compare against percent bounds
+
+        if not (gc_bounds[0] <= gc_content <= gc_bounds[1]):
+            continue
+
+        if not (length_bounds[0] <= len(sequence) <= length_bounds[1]):
+            continue
+
+        if not check_quality(quality_scores, quality_threshold):
+            continue
+
+        filtered_seqs[record.id] = (sequence, quality_scores)
+    # NOTE(review): SeqRecord is reached through the Bio.SeqIO namespace below - confirm that it is exposed there.
+    if output_filename:
+        with open(output_filename, "w") as output_handle:
+            SeqIO.write((SeqIO.SeqRecord(Seq(seq), id=seq_id, description="", letter_annotations={"phred_quality": quality}) for seq_id, (seq, quality) in filtered_seqs.items()), output_handle, "fastq")
+
+    return filtered_seqs
+
+
+def run_dna_rna_tools(*arguments):
+    """
+    Run one DNA/RNA procedure on one or more sequences.
+
+    Args:
+        *arguments (tuple): The sequences followed by the procedure name as the last item:
+            "transcribe", "reverse", "complement" or "reverse_complement".
+
+    Returns:
+        str or list of str: Result of the selected procedure, or an explanatory
+        message string when the procedure name is not recognised.
+
+    Raises:
+        ValueError: If a sequence fails check_valid_sequence or the T/U check.
+    """
+    sequences, procedure = arguments[:-1], arguments[-1]
+    if not check_valid_sequence(sequences):
+        raise ValueError("At least one of your sequences does not correspond to either DNA or RNA")
+    if contains_T_and_U_at_the_same_time(sequences):
+        raise ValueError(
+            "One of your sequences contains both thymine and uracil at the same time, which is not possible((((")
+    # Compare with == (not a dict lookup) so that any kind of procedure value is tolerated.
+    for name, handler in (("transcribe", transcribe), ("reverse", reverse),
+                          ("complement", complement), ("reverse_complement", reverse_complement)):
+        if procedure == name:
+            return handler(sequences)
+    return "Something went wrong, please, verify the chosen procedure is written correctly"
+
+
+def run_amino_analyzer(sequence: str, procedure: str, *, weight_type: str = 'average', enzyme: str = 'trypsin'):
+    """
+    Entry point of the amino-analyzer tool: validate the input and run one procedure.
+
+    Args:
+        sequence (str): The input protein sequence in one-letter code.
+        procedure (str): Name of the procedure to run, one of:
+            1. aa_weight - total weight of the amino acids in the sequence (float).
+            Uses weight_type.
+            2. count_hydroaffinity - counts of hydrophobic and hydrophilic amino acids,
+            as a list in that order.
+            3. peptide_cutter - message describing the cleavage sites of enzyme.
+            4. one_to_three_letter_code - the sequence in three-letter code.
+            5. sulphur_containing_aa_counter - message with the number of
+            sulphur-containing amino acids.
+        weight_type (str): 'average' (default) or 'monoisotopic'; only used by
+            aa_weight. Keyword-only.
+        enzyme (str): 'trypsin' (default) or 'chymotrypsin'; only used by
+            peptide_cutter. Keyword-only.
+
+    Returns:
+        The result of the specified procedure.
+
+    Raises:
+        ValueError: If the procedure is not recognized or if the input sequence
+            contains non-amino acid characters.
+
+    Note:
+        Supported characters are the 20 one-letter amino acid codes in either case.
+    """
+    known = ['aa_weight', 'count_hydroaffinity', 'peptide_cutter', 'one_to_three_letter_code',
+             'sulphur_containing_aa_counter']
+    if procedure not in known:
+        raise ValueError(f"Incorrect procedure. Acceptable procedures: {', '.join(known)}")
+
+    if not is_aa(sequence):
+        raise ValueError("Incorrect sequence. Only amino acids are allowed (V, I, L, E, Q, D, N, H, W, F, Y, R, K, S, "
+                         "T, M, A, G, P, C, v, i, l, e, q, d, n, h, w, f, y, r, k, s, t, m, a, g, p, c).")
+    # Each branch returns right away; the empty string is the fallback result.
+    if procedure == 'aa_weight':
+        return aa_weight(sequence, weight_type)
+    if procedure == 'count_hydroaffinity':
+        return count_hydroaffinity(sequence)
+    if procedure == 'peptide_cutter':
+        return peptide_cutter(sequence, enzyme)
+    if procedure == 'one_to_three_letter_code':
+        return one_to_three_letter_code(sequence)
+    if procedure == 'sulphur_containing_aa_counter':
+        return sulphur_containing_aa_counter(sequence)
+    return ''
+
+
+
+import os
+
+
+def check_gc_content(sequence: str, gc_bounds: tuple or float) -> bool: # type: ignore
+    """
+    Checks if the GC content (in percent, counted case-insensitively) of a sequence is within the specified bounds.
+    Args:
+        sequence (str): The input DNA sequence.
+        gc_bounds (tuple or float): Tuple with lower and upper bounds or a single float representing the upper bound.
+    Returns:
+        bool: True if GC content is within bounds, False otherwise. An empty sequence counts as 0 % GC.
+    """
+    bases = sequence.upper()
+    gc_content = (bases.count('G') + bases.count('C')) / len(bases) * 100 if bases else 0.0
+    if isinstance(gc_bounds, tuple):
+        return gc_bounds[0] <= gc_content <= gc_bounds[1]
+    return gc_content <= gc_bounds
+
+
+def check_length(sequence: str, length_bounds: tuple or int) -> bool: # type: ignore
+    """
+    Tell whether the length of a sequence lies inside the requested bounds.
+
+    Args:
+        sequence (str): The input DNA sequence.
+        length_bounds (tuple or int): (lower, upper) bounds, or a single integer used as the upper bound.
+
+    Returns:
+        bool: True if length is within bounds, False otherwise.
+    """
+    if not isinstance(length_bounds, tuple):
+        return len(sequence) <= length_bounds
+    # Tuple form: both ends are inclusive.
+    n = len(sequence)
+    return length_bounds[0] <= n <= length_bounds[1]
+
+
+def check_quality(quality_scores, quality_threshold: int) -> bool:
+    """
+    Checks the average quality of a sequence, accepting both preprocessed numerical quality scores
+    and raw ASCII character quality scores.
+
+    String input is decoded as Phred+33 (``ord(char) - 33``); list/tuple input is averaged as is.
+    Empty input has no average, so it is scored as 0 instead of raising ZeroDivisionError; this keeps
+    zero-length reads from aborting a whole filtering run.
+
+    Args:
+        quality_scores: Numerical list of quality scores or a string of ASCII quality characters.
+        quality_threshold (int): The threshold for average quality.
+
+    Returns:
+        bool: True if the average quality is greater than or equal to the threshold, False otherwise.
+
+    Raises:
+        ValueError: If `quality_scores` is neither a string nor a list/tuple.
+    """
+    # If quality_scores is a string, assume these are ASCII characters, raw data from a FASTQ file
+    if isinstance(quality_scores, str):
+        numeric_scores = [ord(score) - 33 for score in quality_scores]
+    # If quality_scores is a list or tuple, assume these are numerical quality scores
+    elif isinstance(quality_scores, (list, tuple)):
+        numeric_scores = quality_scores
+    else:
+        raise ValueError("quality_scores must be either a string or a list/tuple")
+    avg_quality = sum(numeric_scores) / len(numeric_scores) if numeric_scores else 0.0
+    return avg_quality >= quality_threshold
+
+
+
+
+def read_fastaq(input_path: str) -> dict:
+    """
+    Parse a FASTQ file into a dictionary of records.
+
+    Args:
+        input_path (str): The path to the FASTQ file.
+
+    Returns:
+        dict: Keys are the stripped header lines (including the leading "@");
+              values are tuples of (sequence (line 2), quality (line 4)).
+
+    Example:
+        Given a FASTQ file like this:
+
+        @Sequence1
+        AGCTAGCTAGCTAGCT
+        +
+        !@#$!@#$!@#$!@#$
+        @Sequence2
+        CGATCGATCGATCGAT
+        +
+        !@#$!@#$!@#$!@#$
+
+        The function will return:
+        {'@Sequence1': ('AGCTAGCTAGCTAGCT', '!@#$!@#$!@#$!@#$'),
+         '@Sequence2': ('CGATCGATCGATCGAT', '!@#$!@#$!@#$!@#$')}
+
+    Lines that do not start with "@" where a header is expected are skipped one at a time.
+    """
+    records = {}
+    with open(input_path) as handle:
+        lines = handle.readlines()
+    idx = 0
+    total = len(lines)
+    while idx < total:
+        header = lines[idx]
+        if not header.startswith("@"):
+            idx += 1
+            continue
+        records[header.strip()] = (lines[idx + 1].strip(), lines[idx + 3].strip())
+        idx += 4
+    return records
+
+
+def save_fastq_from_dict(filtered_seqs: dict, output_filename=None) -> None:
+    """
+    Write the records of a dictionary to a FASTQ file.
+
+    The file always lands in the 'fastq_filtrator_results' directory (created if missing)
+    and always gets a '.fastq' suffix appended to the given name.
+
+    Args:
+        filtered_seqs (dict): Keys are sequence names, values are tuples of
+                              (sequence, quality).
+        output_filename (str, optional): File name without directory and extension.
+                                         If empty or None, 'filtered_data' is used.
+
+    Returns:
+        None
+
+    """
+    target = (f'fastq_filtrator_results/{output_filename}.fastq' if output_filename
+              else 'fastq_filtrator_results/filtered_data.fastq')
+    folder = os.path.dirname(target)
+    if not os.path.exists(folder):
+        os.makedirs(folder)
+    with open(target, 'w') as sink:
+        for name, record in filtered_seqs.items():
+            sequence, quality = record
+            sink.write(f'{name}\n{sequence}\n+\n{quality}\n')
+
+def check_valid_sequence(sequences: tuple) -> bool:
+    """
+    Tell whether every sequence uses only DNA/RNA letters (A, T, G, C, U in either case).
+
+    Args:
+        sequences (iterable of str): Sequences to be validated.
+    Returns:
+        bool: True if all sequences are valid (also for no sequences at all), False otherwise.
+    """
+    nucleotide_alphabet = frozenset('ATGCUatgcu')
+    # all() stops at the first offending character, like an early return would.
+    return all(
+        symbol in nucleotide_alphabet
+        for candidate in sequences for symbol in candidate
+    )
+
+
+def contains_T_and_U_at_the_same_time(sequences: tuple) -> bool:
+    """
+    Checks if any sequence in the input contains both 'T' and 'U' nucleotides (in either case).
+
+    Args:
+        sequences (iterable of str): List of sequences to be checked.
+
+    Returns:
+        bool: True if any sequence contains both 'T' and 'U', False otherwise.
+    """
+    upper_cased = (sequence.upper() for sequence in sequences)
+    # Look for both letters inside each sequence itself, not inside the tuple of sequences.
+    return any("T" in letters and "U" in letters for letters in upper_cased)
+
+
+def get_first_sequence(my_tuple: tuple or list[str]) -> str: # type: ignore
+    """
+    Unwrap the input when it stands for exactly one sequence.
+
+    Args:
+        my_tuple (str or tuple): A bare string, or a tuple/list of sequences.
+
+    Returns:
+        str or None: The string itself, or the only element converted with str();
+        None when the container holds zero or several items.
+    """
+    if isinstance(my_tuple, str):
+        return my_tuple
+    only_one = len(my_tuple) == 1
+    return str(my_tuple[0]) if only_one else None
+
+
+def is_dna(sequences: tuple) -> bool:
+    """
+    Tell whether every sequence uses only DNA letters (A, T, G, C in either case).
+
+    Args:
+        sequences (iterable of str): Sequences to be validated. A single string also
+            works, because iterating it yields one-character sequences.
+
+    Returns:
+        bool: True if all sequences are valid DNA, False otherwise.
+    """
+    dna_alphabet = frozenset('ATGCatgc')
+    for candidate in sequences:
+        if any(symbol not in dna_alphabet for symbol in candidate):
+            return False
+    return True
+
+
+def transcribe(sequences: tuple) -> str or list[str]: # type: ignore
+    """
+    Transcribe DNA into RNA by replacing T with U (and t with u).
+
+    Args:
+        sequences (iterable of str): DNA sequences to be transcribed.
+    Returns:
+        str or list of str: A single string for one non-empty sequence, otherwise a list.
+    Raises:
+        ValueError: If any sequence contains a non-DNA character.
+    """
+    if not all(is_dna(sequence) for sequence in sequences):
+        raise ValueError("At least one of your sequences is RNA instead of DNA, and RNA can not be transcribed")
+    to_rna = str.maketrans("Tt", "Uu")
+    single = get_first_sequence(sequences)
+    if single:
+        return single.translate(to_rna)
+    return [sequence.translate(to_rna) for sequence in sequences]
+
+
+def reverse(sequences: tuple) -> str or list[str]: # type: ignore
+    """
+    Return the input sequence(s) read backwards.
+
+    Args:
+        sequences (iterable of str): Sequences to be reversed.
+
+    Returns:
+        str or list of str: A single string for one non-empty sequence, otherwise a list.
+    """
+    single = get_first_sequence(sequences)
+    if not single:
+        # Several sequences (or one empty string): answer with a list, one item per input.
+        return [sequence[::-1] for sequence in sequences]
+    return single[::-1]
+
+
+def complement(sequences: tuple or list[str]) -> str or list[str]: # type: ignore
+    """
+    Finds the complement of DNA or RNA sequences, keeping the letter case.
+
+    A sequence is treated as RNA when it contains 'U' or 'u'; lower-case-only RNA used
+    to fall through to the DNA table and fail with KeyError.
+
+    Args:
+        sequences (str or iterable of str): Input sequence(s).
+
+    Returns:
+        str or list of str: Complemented sequence(s).
+    Raises:
+        KeyError: If a sequence contains a character missing from the chosen table.
+    """
+    rna_map = {"A": "U", "C": "G", "U": "A", "G": "C", 'a': 'u', 'c': 'g', 'u': 'a', 'g': 'c'}
+    dna_map = {"A": "T", "C": "G", "T": "A", "G": "C", 'a': 't', 'c': 'g', 't': 'a', 'g': 'c'}
+    if isinstance(sequences, str):
+        sequences = (sequences,)
+    complement_sequences = []
+    for sequence in sequences:
+        # Decide per sequence, ignoring case, which pairing table applies.
+        pairing = rna_map if "U" in sequence.upper() else dna_map
+        complement_sequences.append(''.join(pairing[nucleotide] for nucleotide in sequence))
+    if get_first_sequence(sequences):
+        return get_first_sequence(complement_sequences)
+    return complement_sequences
+
+
+def reverse_complement(sequences: tuple) -> str or list[str]: # type: ignore
+    """
+    Reverse the sequence(s) first, then complement the reversed result.
+
+    Args:
+        sequences (str or iterable of str): Input sequence(s).
+    Returns:
+        str or list of str: Reverse complemented sequence(s).
+    """
+    reversed_sequences = reverse(sequences)
+    return complement(reversed_sequences)
+
+# One-letter amino acid codes accepted as input, in both cases.
+AA_SET = set('VILEQDNHWFYRKSTMAGPC') | set('vileqdnhwfyrkstmagpc')
+HYDROPHOBIC_AA = list('AVLIPFWM')
+HYDROPHILIC_AA = list('RNDCQEGHKSTY')
+# One-letter code -> three-letter code.
+AMINO_ACIDS = {'A': 'Ala', 'C': 'Cys', 'D': 'Asp', 'E': 'Glu', 'F': 'Phe', 'G': 'Gly', 'H': 'His', 'I': 'Ile',
+               'K': 'Lys', 'L': 'Leu', 'M': 'Met', 'N': 'Asn', 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg', 'S': 'Ser',
+               'T': 'Thr', 'V': 'Val', 'W': 'Trp', 'Y': 'Tyr'}
+
+
+def is_aa(seq: str) -> bool:
+    """
+    Tell whether a sequence consists solely of one-letter amino acid codes.
+
+    Args:
+        seq (str): The input sequence to be checked.
+
+    Returns:
+        bool: True if every character is in AA_SET (also for an empty string), False otherwise.
+    """
+    # set.issubset is the method form of the `<=` comparison between sets.
+    return set(seq).issubset(AA_SET)
+
+
+def choose_weight(weight: str) -> dict:
+    """
+    Return the amino acid weight table for the requested weight type.
+
+    Args:
+        weight (str): The type of weight to choose, either 'average' or 'monoisotopic'.
+
+    Returns:
+        dict: Upper-case one-letter amino acid codes mapped to their weights.
+
+    Raises:
+        ValueError: If ``weight`` is neither 'average' nor 'monoisotopic'.
+    """
+    if weight == 'average':
+        return {
+            'A': 71.0788, 'R': 156.1875, 'N': 114.1038, 'D': 115.0886, 'C': 103.1388,
+            'E': 129.1155, 'Q': 128.1307, 'G': 57.0519, 'H': 137.1411, 'I': 113.1594,
+            'L': 113.1594, 'K': 128.1741, 'M': 131.1926, 'F': 147.1766, 'P': 97.1167,
+            'S': 87.0782, 'T': 101.1051, 'W': 186.2132, 'Y': 163.1760, 'V': 99.1326
+            }
+    if weight == 'monoisotopic':
+        return {
+            'A': 71.03711, 'R': 156.10111, 'N': 114.04293, 'D': 115.02694, 'C': 103.00919,
+            'E': 129.04259, 'Q': 128.05858, 'G': 57.02146, 'H': 137.05891, 'I': 113.08406,
+            'L': 113.08406, 'K': 128.09496, 'M': 131.04049, 'F': 147.06841, 'P': 97.05276,
+            'S': 87.03203, 'T': 101.04768, 'W': 186.07931, 'Y': 163.06333, 'V': 99.06841
+            }
+    raise ValueError(f"I do not know what '{weight}' is :( \n Read help or just do not write anything except your "
+                     f"sequence")
+
+
+def aa_weight(seq: str, weight: str = 'average') -> float:
+    """
+    Add up the weights of all amino acids in a protein sequence.
+
+    Args:
+        seq (str): The amino acid sequence; letter case is ignored.
+        weight (str, optional): The type of weight to use, either 'average' or 'monoisotopic'. Default is 'average'.
+
+    Returns:
+        float: The total weight rounded to 3 decimal places.
+    """
+    table = choose_weight(weight)
+    total = 0
+    for residue in seq.upper():
+        total = total + table[residue]
+    return round(total, 3)
+
+
+def count_hydroaffinity(seq: str) -> list:
+    """
+    Count the hydrophobic and the hydrophilic amino acids of a protein sequence.
+
+    Args:
+        seq (str): The protein sequence to analyse; letter case is ignored.
+
+    Returns:
+        list: Two integers, [hydrophobic count, hydrophilic count]. Characters that
+        belong to neither group are not counted.
+    """
+    residues = seq.upper()
+
+    hydrophobic_total = sum(1 for residue in residues if residue in HYDROPHOBIC_AA)
+    # elif semantics: a residue listed in both groups would count as hydrophobic only.
+    hydrophilic_total = sum(
+        1 for residue in residues
+        if residue not in HYDROPHOBIC_AA and residue in HYDROPHILIC_AA
+    )
+
+    return [hydrophobic_total, hydrophilic_total]
+
+
+def peptide_cutter(sequence: str, enzyme: str = "trypsin") -> str:
+    """
+    Locate the cleavage sites of an enzyme in a peptide sequence.
+
+    A site is reported after a target residue unless the next residue is proline; the last
+    residue is never a site. Trypsin targets K/R, chymotrypsin targets W/Y/F (either case).
+
+    Args:
+        sequence (str): The input peptide sequence.
+        enzyme (str): "trypsin" (default) or "chymotrypsin".
+
+    Returns:
+        str: A message with the number of sites and the 1-based index of each target
+        residue, or an error message if an invalid enzyme is provided.
+    """
+    if enzyme not in ("trypsin", "chymotrypsin"):
+        return "You have chosen an enzyme that is not provided. Please choose between trypsin and chymotrypsin."
+
+    # Trypsin cuts mainly at the carboxyl side of lysine or arginine; chymotrypsin
+    # preferentially cleaves at Trp, Tyr and Phe in position P1 (high specificity).
+    targets = ['K', 'R', 'k', 'r'] if enzyme == "trypsin" else ['W', 'Y', 'F', 'w', 'y', 'f']
+    cut_positions = [
+        position
+        for position, (current, following) in enumerate(zip(sequence, sequence[1:]), start=1)
+        if current in targets and following not in ['P', 'p']
+    ]
+
+    if not cut_positions:
+        return f"No {enzyme} cleavage sites were found."
+    return f"Found {len(cut_positions)} {enzyme} cleavage sites at positions {', '.join(map(str, cut_positions))}"
+
+
+def one_to_three_letter_code(sequence: str) -> str:
+    """
+    Translate a protein sequence from one-letter codes into three-letter codes.
+
+    Args:
+        sequence (str): The input protein sequence in one-letter code; case is ignored.
+
+    Returns:
+        str: The three-letter codes joined with '-'.
+    """
+    codes = (AMINO_ACIDS.get(residue.upper()) for residue in sequence)
+    return '-'.join(codes)
+
+
+def sulphur_containing_aa_counter(sequence: str) -> str:
+    """
+    This function counts sulphur-containing amino acids (Cysteine and Methionine) in a protein sequence.
+
+    Lower-case residues are counted too, in line with the other procedures of this tool.
+
+    Args:
+        sequence (str): The input protein sequence in one-letter code.
+
+    Returns:
+        str: A message stating the number of sulphur-containing amino acids.
+    """
+    # Normalise the case once so that 'c' and 'm' are not silently skipped.
+    residues = sequence.upper()
+    counter = residues.count('C') + residues.count('M')
+    return 'The number of sulphur-containing amino acids in the sequence is equal to ' + str(counter)