From 97a198b9b2ec2da53451882c980e7c7c15617af2 Mon Sep 17 00:00:00 2001
From: Amy <amina.ibragimova@phystech.edu>
Date: Thu, 13 Oct 2022 22:49:02 +0300
Subject: [PATCH 1/6] Added fastq_filtrator.py and empty README

---
 hw2/README.md          |  0
 hw2/fastq_filtrator.py | 95 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+)
 create mode 100644 hw2/README.md
 create mode 100644 hw2/fastq_filtrator.py

diff --git a/hw2/README.md b/hw2/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/hw2/fastq_filtrator.py b/hw2/fastq_filtrator.py
new file mode 100644
index 0000000..9398d23
--- /dev/null
+++ b/hw2/fastq_filtrator.py
@@ -0,0 +1,95 @@
+# This function returns GC-content of sequence in percents
+def gc_content(read_seq):
+    return sum(map(read_seq.upper().count, ['G','C'])) / len(read_seq.strip()) * 100
+
+# This function checks whether GC-content of sequence is in bounds 
+def gc_filter(read_seq, gc_bounds):
+    if type(gc_bounds) == tuple:
+        min_gc, max_gc = gc_bounds
+    else:    # Only upper bound is given
+        min_gc = 0
+        max_gc = gc_bounds
+    if gc_content(read_seq) >= min_gc and gc_content(read_seq) <= max_gc:    # GC-content is in bounds
+        return 'Pass'
+    else:
+        return 'Fail'
+
+# This function returns length of sequence
+def length(read_seq):
+    return len(read_seq.strip())
+    
+# This function checks whether length of sequence is in bounds 
+def len_filter(read_seq, length_bounds):
+    if type(length_bounds) == tuple:
+        min_len, max_len = length_bounds
+    else:    # Only upper bound is given
+        min_len = 0
+        max_len = length_bounds
+    if length(read_seq) >= min_len and length(read_seq) <= max_len:    # Length is in bounds
+        return 'Pass'
+    else:
+        return 'Fail'
+    
+# This function returns mean q-score of sequence
+def mean_q_score(read_qual):
+    sum_of_q_scores = 0
+    for symbol in read_qual.strip():
+        sum_of_q_scores += ord(symbol) - 33
+    return sum_of_q_scores / len(read_qual.strip())
+
+# This function checks whether mean q-score of sequence is not less than quality_threshold  
+def qual_filter(read_qual, quality_threshold):        
+    if mean_q_score(read_qual) >= quality_threshold:
+        return 'Pass'
+    else:
+        return 'Fail'
+
+# This function adds read to output '_passed.fastq' file 
+def add_read_to_passed(
+        read_header, read_seq,
+        read_plus_str, read_qual,
+        output_file_prefix):
+    with open(output_file_prefix + '_passed.fastq', 'a') as file:
+        file.writelines([read_header, read_seq, read_plus_str, read_qual])
+
+# This function adds read to output '_failed.fastq' file 
+def add_read_to_failed(
+        read_header, read_seq,
+        read_plus_str, read_qual,
+        output_file_prefix):
+    with open(output_file_prefix + '_failed.fastq', 'a') as file:
+        file.writelines([read_header, read_seq, read_plus_str, read_qual])
+    
+
+    
+def main(input_fastq, output_file_prefix, gc_bounds=(0, 100),
+         length_bounds=(0, 2**32), quality_threshold=0, save_filtered=False):
+    
+    # Creating empty output files for passed and filtered reads
+    # If file already exists, it becomes empty           
+    with open(output_file_prefix + '_passed.fastq', 'w') as file:
+        pass
+    with open(output_file_prefix + '_failed.fastq', 'w') as file:
+        pass
+    # Reading input fastq file line by line 
+    with open(input_fastq, 'r') as file:
+        while True:
+            read_header = file.readline()     ## I'm not stripping '\n' here consciously
+            read_seq = file.readline()        ## for convenient appending to output files.
+            read_plus_str = file.readline()   ## I'm going to take '\n' into account 
+            read_qual = file.readline()       ## when I'll be counting length of these strings.
+            if read_header == '':    # Reached the end of input file, stop reading
+                break                            
+            # Checking read
+            if (gc_filter(read_seq, gc_bounds) == 'Pass' and
+                len_filter(read_seq, length_bounds) == 'Pass' and
+                qual_filter(read_qual, quality_threshold) == 'Pass'):
+            # Appending read to one of output files 
+                add_read_to_passed(read_header, read_seq,
+                                   read_plus_str, read_qual,
+                                   output_file_prefix)
+            else: 
+                if save_filtered == True:
+                    add_read_to_failed(read_header, read_seq,
+                                       read_plus_str, read_qual,
+                                       output_file_prefix)
\ No newline at end of file

From 377d71e4a267596839cb6550437a8a796088b1a7 Mon Sep 17 00:00:00 2001
From: Amy <amina.ibragimova@phystech.edu>
Date: Thu, 13 Oct 2022 22:55:44 +0300
Subject: [PATCH 2/6] Minor improvement

---
 hw2/fastq_filtrator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw2/fastq_filtrator.py b/hw2/fastq_filtrator.py
index 9398d23..23f27cf 100644
--- a/hw2/fastq_filtrator.py
+++ b/hw2/fastq_filtrator.py
@@ -92,4 +92,5 @@ def main(input_fastq, output_file_prefix, gc_bounds=(0, 100),
                 if save_filtered == True:
                     add_read_to_failed(read_header, read_seq,
                                        read_plus_str, read_qual,
-                                       output_file_prefix)
\ No newline at end of file
+                                       output_file_prefix)
+

From af6dbecd707e1b3a68a5ed26f72173a42cf98c1e Mon Sep 17 00:00:00 2001
From: Amy <amina.ibragimova@phystech.edu>
Date: Fri, 14 Oct 2022 15:30:43 +0300
Subject: [PATCH 3/6] Handling case when gc_bounds or length_bounds parameter
 is a list, not a tuple

---
 hw2/fastq_filtrator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw2/fastq_filtrator.py b/hw2/fastq_filtrator.py
index 23f27cf..065e1b9 100644
--- a/hw2/fastq_filtrator.py
+++ b/hw2/fastq_filtrator.py
@@ -4,7 +4,7 @@ def gc_content(read_seq):
 
 # This function checks whether GC-content of sequence is in bounds 
 def gc_filter(read_seq, gc_bounds):
-    if type(gc_bounds) == tuple:
+    if isinstance(gc_bounds, (tuple, list)):
         min_gc, max_gc = gc_bounds
     else:    # Only upper bound is given
         min_gc = 0
@@ -20,7 +20,7 @@ def length(read_seq):
     
 # This function checks whether length of sequence is in bounds 
 def len_filter(read_seq, length_bounds):
-    if type(length_bounds) == tuple:
+    if isinstance(length_bounds, (tuple, list)):
         min_len, max_len = length_bounds
     else:    # Only upper bound is given
         min_len = 0

From 98cd8eafdc8be6346a004e340361e4148e4f2a58 Mon Sep 17 00:00:00 2001
From: Amy <amina.ibragimova@phystech.edu>
Date: Fri, 14 Oct 2022 15:37:23 +0300
Subject: [PATCH 4/6] Minor improvements

---
 hw2/fastq_filtrator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw2/fastq_filtrator.py b/hw2/fastq_filtrator.py
index 065e1b9..86dd3c2 100644
--- a/hw2/fastq_filtrator.py
+++ b/hw2/fastq_filtrator.py
@@ -84,7 +84,7 @@ def main(input_fastq, output_file_prefix, gc_bounds=(0, 100),
             if (gc_filter(read_seq, gc_bounds) == 'Pass' and
                 len_filter(read_seq, length_bounds) == 'Pass' and
                 qual_filter(read_qual, quality_threshold) == 'Pass'):
-            # Appending read to one of output files 
+            # Appending read to output files 
                 add_read_to_passed(read_header, read_seq,
                                    read_plus_str, read_qual,
                                    output_file_prefix)

From b5dc285cfc2bb5809438900981f22e3d2b21eff9 Mon Sep 17 00:00:00 2001
From: Amy <amina.ibragimova@phystech.edu>
Date: Fri, 14 Oct 2022 15:43:11 +0300
Subject: [PATCH 5/6] Handling case when there is a dangling newline character
 at the end of input file

---
 hw2/fastq_filtrator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw2/fastq_filtrator.py b/hw2/fastq_filtrator.py
index 86dd3c2..b04b021 100644
--- a/hw2/fastq_filtrator.py
+++ b/hw2/fastq_filtrator.py
@@ -78,7 +78,7 @@ def main(input_fastq, output_file_prefix, gc_bounds=(0, 100),
             read_seq = file.readline()        ## for convenient appending to output files.
             read_plus_str = file.readline()   ## I'm going to take '\n' into account 
             read_qual = file.readline()       ## when I'll be counting length of these strings.
-            if read_header == '':    # Reached the end of input file, stop reading
+            if read_header == '' or read_header == '\n':    # Reached the end of input file, stop reading
                 break                            
             # Checking read
             if (gc_filter(read_seq, gc_bounds) == 'Pass' and

From 169808a2ab9042b7820671baa3ed4547a66abaf8 Mon Sep 17 00:00:00 2001
From: ibragimovaamina <104636744+ibragimovaamina@users.noreply.github.com>
Date: Fri, 14 Oct 2022 16:32:45 +0300
Subject: [PATCH 6/6] Added README

---
 hw2/README.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/hw2/README.md b/hw2/README.md
index e69de29..2813ab9 100644
--- a/hw2/README.md
+++ b/hw2/README.md
@@ -0,0 +1,15 @@
+Скрипт *fastq_filtrator.py* содержит программу для фильтрации ридов из fastq файла по длине, GC-составу и среднему q-score последовательности рида.
+
+Риды, прошедшие фильтрацию, записываются в новый файл.
+Отфильтрованные риды сохраняются в отдельный файл, если значения параметра **save_filtered** равно *True*.
+
+**main** принимает аргументы:
+
+1. **input_fastq** - путь к файлу, который подаётся на вход программе (обычный не сжатый fastq файл).
+2. **output_file_prefix** - префикс пути к файлу, в который будет записан результат. К префиксу прибавляется *"_passed.fastq"* для файла с ридами, прошедшими фильтрацию, и *"_failed.fastq"* для файлов с отфильтрованными ридами.
+3. **gc_bounds** - интервал GC-состава (в процентах) для фильтрации. По умолчанию равен *(0, 100)*, т. е. все риды сохраняются. Если в аргумент передать одно число, то считается, что это верхняя граница.
+4. **length_bounds** - интервал длины для фильтрации, аналогично gc_bounds. По умолчанию равен *(0, 2**32)*.
+5. **quality_threshold** - пороговое значение среднего качества рида для фильтрации. По умолчанию равно *0* (шкала phred33). Риды со средним качеством по всем нуклеотидам ниже порогового отбрасываются.
+6. **save_filtered** - сохранять ли отфильтрованные риды. По умолчанию равен *False*.
+
+Все описанные интервалы включают и верхнюю, и нижнюю границы.