From 2c2e51f09761fe212b4bdd6551f0b23ca103b785 Mon Sep 17 00:00:00 2001
From: Nikita Vaulin <vaulin@ro.ru>
Date: Mon, 26 Feb 2024 20:51:00 +0300
Subject: [PATCH] Add UBC.py

---
 UBC.py | 380 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 380 insertions(+)
 create mode 100644 UBC.py

diff --git a/UBC.py b/UBC.py
new file mode 100644
index 0000000..b73da4e
--- /dev/null
+++ b/UBC.py
@@ -0,0 +1,380 @@
+import os
+from Bio import SeqIO
+from Bio.SeqUtils import gc_fraction
+
+SHORT_CODE = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'U', 'O',
+                'a', 'r', 'n', 'd', 'c', 'e', 'q', 'g', 'h', 'i', 'l', 'k', 'm', 'f', 'p', 's', 't', 'w', 'y', 'v', 'u', 'o']
+LONG_CODE = {'A': 'Ala', 'R': 'Arg', 'N': 'Asn', 'D': 'Asp', 'C': 'Cys', 'E': 'Glu', 'Q': 'Gln', 'G': 'Gly', 'H': 'His', 'I': 'Ile', 'L': 'Leu',
+            'K': 'Lys', 'M': 'Met', 'F': 'Phe', 'P': 'Pro', 'S': 'Ser', 'T': 'Thr', 'W': 'Trp', 'Y': 'Tyr', 'V': 'Val', 'U': 'Sec', 'O': 'Pyl',
+            'a': 'Ala', 'r': 'Arg', 'n': 'Asn', 'd': 'Asp', 'c': 'Cys', 'e': 'Glu', 'q': 'Gln', 'g': 'Gly', 'h': 'His', 'i': 'Ile', 'l': 'Leu',
+            'k': 'Lys', 'm': 'Met', 'f': 'Phe', 'p': 'Pro', 's': 'Ser', 't': 'Thr', 'w': 'Trp', 'y': 'Tyr', 'v': 'Val', 'u': 'Sec', 'o': 'Pyl'}
+MASS = {'A': 71.08, 'R': 156.2, 'N': 114.1, 'D': 115.1, 'C': 103.1, 'E': 129.1, 'Q': 128.1, 'G': 57.05, 'H': 137.1, 'I': 113.2, 'L': 113.2,
+        'K': 128.2, 'M': 131.2, 'F': 147.2, 'P': 97.12, 'S': 87.08, 'T': 101.1, 'W': 186.2, 'Y': 163.2, 'V': 99.13, 'U': 168.05, 'O': 255.3,
+        'a': 71.08, 'r': 156.2, 'n': 114.1, 'd': 115.1, 'c': 103.1, 'e': 129.1, 'q': 128.1, 'g': 57.05, 'h': 137.1, 'i': 113.2, 'l': 113.2,
+        'k': 128.2, 'm': 131.2, 'f': 147.2, 'p': 97.12, 's': 87.08, 't': 101.1, 'w': 186.2, 'y': 163.2, 'v': 99.13, 'u': 168.05, 'o': 255.3}
+DNA_COMPLEMENT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'a': 't', 't': 'a', 'c': 'g', 'g': 'c'}
+RNA_COMPLEMENT = {'A': 'U', 'U': 'A', 'C': 'G', 'G': 'C', 'a': 'u', 'u': 'a', 'c': 'g', 'g': 'c'}
+
+
+def molecular_weight(seq: str) -> float:
+    """
+    Function calculates molecular weight of the amino acid chain
+        Parameters:
+            seq (str): each letter refers to one-letter coded proteinogenic amino acids
+    Returns:
+        (float) Molecular weight of tge given amino acid chain in Da
+    """
+    m = 0
+    for acid in seq:
+        m += MASS[acid]
+    return m
+
+
+def three_letter_code(seq: str) -> str:
+    """
+    Function converts single letter translations to three letter translations
+        Parameters:
+            seq (str): each letter refers to one-letter coded proteinogenic amino acids
+        Returns:
+            (str) translated in three-letter code
+    """
+    recording = seq.maketrans(LONG_CODE)
+    return seq.translate(recording)
+
+
+def show_length(seq: str) -> int:
+    """
+    Function counts the number of amino acids in the given sequence
+        Parameters:
+            seq (str): amino acid sequence
+        Returns:
+            (int): integer number of amino acid residues
+    """
+    return len(seq)
+
+
+def folding(seq: str) -> str:
+    """
+    Counts the number of amino acids characteristic separately for alpha helixes and beta sheets,
+    and gives out what will be the structure of the protein more.
+    This function has been tested on proteins such as 2M3X, 6DT4 (PDB ID) and MHC, CRP.
+    The obtained results corresponded to reality.
+        Parameters:
+            seq (str): amino acid sequence
+        Returns:
+            (str): overcoming structure ('alfa_helix', 'beta_sheet', 'equally')
+    """
+    alfa_helix = ['A', 'E', 'L', 'M', 'G', 'Y', 'S', 'a', 'e', 'l', 'm', 'g', 'y', 's']
+    beta_sheet = ['Y', 'F', 'W', 'T', 'V', 'I', 'y', 'f', 'w', 't', 'v', 'i']
+    alfa_helix_counts = 0
+    beta_sheet_counts = 0
+    for amino_acid in seq:
+        if amino_acid in alfa_helix:
+            alfa_helix_counts += 1
+        elif amino_acid in beta_sheet:
+            beta_sheet_counts += 1
+    if alfa_helix_counts > beta_sheet_counts:
+        return 'alfa_helix'
+    elif alfa_helix_counts < beta_sheet_counts:
+        return 'beta_sheet'
+    elif alfa_helix_counts == beta_sheet_counts:
+        return 'equally'
+
+
+def aminoacid_seqs_only(seqs: list) -> list:
+    """
+    Leaves only the amino acid sequences from the fed into the function.
+        Parameters:
+            seqs (list): amino acid sequence list
+        Returns:
+            aminoacid_seqs (list): amino acid sequence list without non amino acid sequence
+    """
+    aminoacid_seqs = []
+    for seq in seqs:
+        unique_chars = set(seq)
+        amino_acid = set(SHORT_CODE)
+        if unique_chars <= amino_acid:
+            aminoacid_seqs.append(seq)
+    return aminoacid_seqs
+
+
+def dna_or_rna_only(seqs):
+    """
+        The function leaves only DNA and RNA sequences
+            Parameters:
+                seqs - sequences fed to the input of the main function
+            Return:
+                list of DNA and RNA sequences only
+    """
+    dna_seqs = []
+    for seq in seqs:
+        unique_chars = set(seq)
+        nuc = set('ATGCatgcUu')
+        if unique_chars <= nuc:
+            dna_seqs.append(seq)
+    return dna_seqs
+
+
+def transcribe(seq):
+    """
+        The function returns the transcribed sequence
+            Parameters:
+                seq (str) - sequence
+            Return:
+                (str) transcript sequence
+    """
+    transcript = seq.replace('T', 'U').replace('t', 'u')
+    return transcript
+
+
+def reverse(seq):
+    """
+        The function produces a reverse sequence
+            Parameters:
+                seq (str) - sequence
+            Return:
+                (str) reverse sequence
+    """
+    reverse_seq = seq[::-1]
+    return reverse_seq
+
+
+def complement(seq):
+    """
+        The function produces a complementary sequence
+            Parameters:
+                seq (str) - sequence
+            Return:
+                (str) complementary sequence
+    """
+    complement_seq = []
+    if 'U' in set(seq):
+        using_dict = RNA_COMPLEMENT
+    else:
+        using_dict = DNA_COMPLEMENT
+    for nucl in seq:
+        complement_seq.append(using_dict[nucl])
+    return ''.join(complement_seq)
+
+
+def reverse_complement(seq):
+    """
+        The function produces a reverse and complementary sequence
+            Parameters:
+                seq (str) - sequence
+            Return:
+                (str) reverse and complementary sequence
+    """
+    reverse_compl_seq = complement(reverse(seq))
+    return reverse_compl_seq
+
+
+def filter_fastq(input_path, gc_bounds=(0, 100), length_bounds=(0, 2**32), quality_threshold=0, output_filename=None):
+    """
+    Performs functions for working with fastq.
+        Parameters:
+            - input_path - takes as input the path to the FASTQ-file.
+                        A DNA or RNA sequences can consist of either uppercase or lowercase letters.
+            - gc_bounds - GC composition interval (in percent) for filtering, by default it is (0, 100).
+                        If you pass one number as an argument, it is considered to be the upper limit.
+                        Examples: gc_bounds = (20, 80) - save only reads with GC content from 20 to 80%,
+                        gc_bounds = 44.4 - save reads with GC content less than 44.4%.
+            - length_bounds - length interval for filtering, everything is similar to gc_bounds,
+                        but by default it is equal to (0, 2**32).
+            - quality_threshold - threshold value of average read quality for filtering, default is 0 (phred33 scale).
+                        Reads with average quality across all nucleotides below the threshold are discarded.
+            - output_filename - name of the file with the filtering result, if not specified, the name of the input file is assigned by default.
+                        Important: specify the file extension.
+        Example input:
+            filter_seqs(seqs = 'example_fastq.fastq', gc_bounds = (20, 80), length_bounds = (0, 89), quality_threshold = 34), output_filename = 'my_out_file'
+        Return:
+            The function returns a file consisting of only those sequences that satisfy all conditions.
+            It puts this file into the "fastq_filtrator_resuls" folder and creates it, if it does not exist.
+            All described intervals include both upper and lower boundaries.
+            Depending on the function being performed, the following returns will occur:
+                If the sequences in the file are RNA, then there will be no filtering by gc composition.
+                If you supply a tuple of more than 2 values for the gc_bounds and length_bounds arguments,
+                you will receive the errors "Incorrect gc_bounds input" and "Incorrect length_bounds input" respectively.
+    """
+    if type(gc_bounds) is tuple and len(gc_bounds) == 2:
+        min_gc_bound = gc_bounds[0]
+        max_gc_bound = gc_bounds[1]
+    elif isinstance(gc_bounds, (int, float)):
+        min_gc_bound = 0
+        max_gc_bound = gc_bounds
+    else:
+        return print("Incorrect gc_bounds input")
+
+    if type(length_bounds) is tuple and len(length_bounds) == 2:
+        min_len_bound = length_bounds[0]
+        max_len_bound = length_bounds[1]
+    elif type(length_bounds) is tuple and len(length_bounds) == 0:
+        min_len_bound = 0
+        max_len_bound = 2**32
+    elif isinstance(length_bounds, (int, float)):
+        min_len_bound = 0
+        max_len_bound = length_bounds
+    else:
+        return print("Incorrect length_bounds input")
+
+    filtered_dict_key = []
+    with open(input_path, 'r') as input_file:
+        for record in SeqIO.parse(input_file, 'fastq'):
+            seq_length = len(record.seq)
+            seq_gc_content = gc_fraction(record.seq)
+            seq_quality = sum(record.letter_annotations["phred_quality"]) / len(record)
+
+            if (min_len_bound <= seq_length <= max_len_bound) and (min_gc_bound <= seq_gc_content <= max_gc_bound) and (seq_quality >= quality_threshold):
+                filtered_dict_key.append(record)
+
+    if output_filename is None:
+        output_filename = input_path.split('.')[0] + '_filtered.fastq'
+    with open(output_filename, 'w') as output_handle:
+        SeqIO.write(filtered_dict_key, output_handle, 'fastq')
+
+    print(f"Filtered FastQ sequences saved to '{output_filename}'")
+
+
+def amino_acid_tools(*args: str):
+    """
+    Performs functions for working with protein sequences.
+        Parameters:
+            The function must accept an unlimited number of protein sequences (str) as input,
+            the last  variable must be the function (str) you want to execute.
+            The amino acid sequence can consist of both uppercase and lowercase letters.
+        Input example:
+            amino_acid_tools('PLPKVEL','VDviRIkLQ','PPDFGKT','folding')
+        Functions:
+            molecular_weight: calculates molecular weight of the amino acid chain
+            three_letter_code: converts single letter translations to three letter translations
+            show_length: counts the number of amino acids in the given sequence
+            folding: counts the number of amino acids characteristic separately for alpha helixes and beta sheets,
+                    and gives out what will be the structure of the protein more
+            seq_charge: evaluates the overall charge of the aminoacid chain in neutral aqueous solution (pH = 7)
+        Returns:
+            If one sequence is supplied, a string with the result is returned.
+            If several are submitted, a list of strings is returned.
+            Depending on the function performed, the following returns will occur:
+                molecular_weight (int) or (list): amino acid sequence molecular weight number or list of numbers
+                three_letter_code (str) or (list): translated sequence from one-letter in three-letter code
+                show_length (int) or (list): integer number of amino acid residues
+                folding (str) or (list): 'alpha_helix', if there are more alpha helices
+                                        'beta_sheet', if there are more beta sheets
+                                        'equally', if the probability of alpha spirals and beta sheets are the same
+                seq_charge(str) or (list): "positive", "negative" or "neutral"
+    """
+    *seqs, function = args
+    d_of_functions = {'molecular_weight': molecular_weight,
+                      'three_letter_code': three_letter_code,
+                      'show_length': show_length,
+                      'folding': folding}
+    answer = []
+    aminoacid_seqs = aminoacid_seqs_only(seqs)
+    for sequence in aminoacid_seqs:
+        if function in d_of_functions.keys():
+            answer.append(d_of_functions[function](sequence))
+        elif function == 'seq_charge':
+            ac_seq = AminoAcidSequence(sequence)
+            ac_seq_charge = ac_seq.seq_charge()
+            answer.append(ac_seq_charge)
+    if len(answer) == 1:
+        return answer[0]
+    return answer
+
+
+def run_dna_rna_tools(*args: str):
+    """
+        Performs functions for processing one or several DNA and/or RNA sequences.
+           Parameters:
+               The function must accept an unlimited number of sequences (str) as input.
+               the last variable should be the function (str) you want to execute.
+               The sequence can consist of both uppercase and lowercase letters.
+           Example input:
+               def run_dna_rna_tools("ATgAAaC", "cUgAuaC", "reverse")
+           Functions:
+               transcribe — print the transcribed sequence
+               reverse — print the reversed sequence
+               complement — print the complementary sequence
+               reverse_complement — print the reverse complementary sequence
+           Return:
+               If a single sequence is specified, a string containing the result is returned.
+               If multiple strings are sent, a list of strings is returned.
+               Depending on the function being performed, the following returns will occur:
+                   - if the sequences you pass are not DNA or RNA, then the result of the function will be a list without them
+                   - if the sequence is RNA, then the 'transcribe' procedure will produce the unchanged sequence
+    """
+    *seqs, function = args
+    d_of_functions = {'transcribe': transcribe, 
+                      'reverse': reverse,
+                      'reverse_complement': reverse_complement,
+                      'complement': complement,
+                     }
+    answer = []
+    dna_rna_seqs = dna_or_rna_only(seqs)
+    for sequence in dna_rna_seqs:
+        answer.append(d_of_functions[function](sequence))
+    if len(answer) == 1:
+        return answer[0]
+    return answer
+
+
+class BiologicalSequence():
+    def __init__(self, seq):
+        self.seq = seq
+    
+    def __len__(self):
+        return len(self.seq)
+    
+    def __getitem__(self, index):
+        if isinstance(index, int):
+            return self.seq[index]
+        elif isinstance(index, (list, tuple)) and len(index) == 2:
+            return self.seq[index[0]:index[1]]
+
+    def __str__(self):
+        return str(self.seq)
+    
+    def check_alphabet(self):
+        return set(str(self.seq)) <= set("ACGTUacgtu")
+
+
+class NucleicAcidSequence(BiologicalSequence):
+    def complement(self):
+        complement_seq = []
+        DNA_COMPLEMENT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'a': 't', 't': 'a', 'c': 'g', 'g': 'c'}
+        RNA_COMPLEMENT = {'A': 'U', 'U': 'A', 'C': 'G', 'G': 'C', 'a': 'u', 'u': 'a', 'c': 'g', 'g': 'c'}
+        if 'U' in set(self.seq):
+            using_dict = RNA_COMPLEMENT
+        else:
+            using_dict = DNA_COMPLEMENT
+        for nucl in self.seq:
+            complement_seq.append(using_dict[nucl])
+        return ''.join(complement_seq)
+    
+    def gc_content(self):
+        return gc_fraction(self.seq)
+
+
+class DNASequence(NucleicAcidSequence):
+    def transcribe(self):
+        return RNASequence(self.complement().replace('T', 'U').replace('t', 'u'))
+
+
+class RNASequence(NucleicAcidSequence):
+    def __init__(self, seq):
+        super().__init__(seq)
+        self.complement_seq = self.complement()
+
+class AminoAcidSequence(BiologicalSequence):
+    def seq_charge(self):
+        aminoacid_charge = {'R': 1, 'D': -1, 'E': -1, 'K': 1, 'O': 1, 'r': 1, 'd': -1, 'e': -1, 'k': 1, 'o': 1}
+        charge = 0
+        for aminoacid in self.seq:
+            if aminoacid in aminoacid_charge:
+                charge += aminoacid_charge[aminoacid]
+        if charge > 0:
+            return 'positive'
+        elif charge < 0:
+            return 'negative'
+        else:
+            return 'neutral'
\ No newline at end of file