From 97a198b9b2ec2da53451882c980e7c7c15617af2 Mon Sep 17 00:00:00 2001 From: Amy Date: Thu, 13 Oct 2022 22:49:02 +0300 Subject: [PATCH 1/6] Added fastq_filtrator.py and empty README --- hw2/README.md | 0 hw2/fastq_filtrator.py | 95 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 hw2/README.md create mode 100644 hw2/fastq_filtrator.py diff --git a/hw2/README.md b/hw2/README.md new file mode 100644 index 0000000..e69de29 diff --git a/hw2/fastq_filtrator.py b/hw2/fastq_filtrator.py new file mode 100644 index 0000000..9398d23 --- /dev/null +++ b/hw2/fastq_filtrator.py @@ -0,0 +1,95 @@ +# This function returns GC-content of sequence in percents +def gc_content(read_seq): + return sum(map(read_seq.upper().count, ['G','C'])) / len(read_seq.strip()) * 100 + +# This function checks whether GC-content of sequence is in bounds +def gc_filter(read_seq, gc_bounds): + if type(gc_bounds) == tuple: + min_gc, max_gc = gc_bounds + else: # Only upper bound is given + min_gc = 0 + max_gc = gc_bounds + if gc_content(read_seq) >= min_gc and gc_content(read_seq) <= max_gc: # GC-content is in bounds + return 'Pass' + else: + return 'Fail' + +# This function returns length of sequence +def length(read_seq): + return len(read_seq.strip()) + +# This function checks whether length of sequence is in bounds +def len_filter(read_seq, length_bounds): + if type(length_bounds) == tuple: + min_len, max_len = length_bounds + else: # Only upper bound is given + min_len = 0 + max_len = length_bounds + if length(read_seq) >= min_len and length(read_seq) <= max_len: # Length is in bounds + return 'Pass' + else: + return 'Fail' + +# This function returns mean q-score of sequence +def mean_q_score(read_qual): + sum_of_q_scores = 0 + for symbol in read_qual.strip(): + sum_of_q_scores += ord(symbol) - 33 + return sum_of_q_scores / len(read_qual.strip()) + +# This function checks whether mean q-score of sequence is not less than quality_threshold +def qual_filter(read_qual, quality_threshold): + if mean_q_score(read_qual) >= quality_threshold: + return 'Pass' + else: + return 'Fail' + +# This function adds read to output '_passed.fastq' file +def add_read_to_passed( + read_header, read_seq, + read_plus_str, read_qual, + output_file_prefix): + with open(output_file_prefix + '_passed.fastq', 'a') as file: + file.writelines([read_header, read_seq, read_plus_str, read_qual]) + +# This function adds read to output '_failed.fastq' file +def add_read_to_failed( + read_header, read_seq, + read_plus_str, read_qual, + output_file_prefix): + with open(output_file_prefix + '_failed.fastq', 'a') as file: + file.writelines([read_header, read_seq, read_plus_str, read_qual]) + + + +def main(input_fastq, output_file_prefix, gc_bounds=(0, 100), + length_bounds=(0, 2**32), quality_threshold=0, save_filtered=False): + + # Creating empty output files for passed and filtered reads + # If file already exists, it becomes empty + with open(output_file_prefix + '_passed.fastq', 'w') as file: + pass + with open(output_file_prefix + '_failed.fastq', 'w') as file: + pass + # Reading input fastq file line by line + with open(input_fastq, 'r') as file: + while True: + read_header = file.readline() ## I'm not stripping '\n' here consciously + read_seq = file.readline() ## for convenient appending to output files. + read_plus_str = file.readline() ## I'm going to take '\n' into account + read_qual = file.readline() ## when I'll be counting length of these strings. + if read_header == '': # Reached the end of input file, stop reading + break + # Checking read + if (gc_filter(read_seq, gc_bounds) == 'Pass' and + len_filter(read_seq, length_bounds) == 'Pass' and + qual_filter(read_qual, quality_threshold) == 'Pass'): + # Appending read to one of output files + add_read_to_passed(read_header, read_seq, + read_plus_str, read_qual, + output_file_prefix) + else: + if save_filtered == True: + add_read_to_failed(read_header, read_seq, + read_plus_str, read_qual, + output_file_prefix) \ No newline at end of file From 377d71e4a267596839cb6550437a8a796088b1a7 Mon Sep 17 00:00:00 2001 From: Amy Date: Thu, 13 Oct 2022 22:55:44 +0300 Subject: [PATCH 2/6] Minor improvement --- hw2/fastq_filtrator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw2/fastq_filtrator.py b/hw2/fastq_filtrator.py index 9398d23..23f27cf 100644 --- a/hw2/fastq_filtrator.py +++ b/hw2/fastq_filtrator.py @@ -92,4 +92,5 @@ def main(input_fastq, output_file_prefix, gc_bounds=(0, 100), if save_filtered == True: add_read_to_failed(read_header, read_seq, read_plus_str, read_qual, - output_file_prefix) \ No newline at end of file + output_file_prefix) + From af6dbecd707e1b3a68a5ed26f72173a42cf98c1e Mon Sep 17 00:00:00 2001 From: Amy Date: Fri, 14 Oct 2022 15:30:43 +0300 Subject: [PATCH 3/6] Handling case when gc_bounds or length_bounds parameter is a list, not a tuple --- hw2/fastq_filtrator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw2/fastq_filtrator.py b/hw2/fastq_filtrator.py index 23f27cf..065e1b9 100644 --- a/hw2/fastq_filtrator.py +++ b/hw2/fastq_filtrator.py @@ -4,7 +4,7 @@ def gc_content(read_seq): # This function checks whether GC-content of sequence is in bounds def gc_filter(read_seq, gc_bounds): - if type(gc_bounds) == tuple: + if isinstance(gc_bounds, (tuple, list)): min_gc, max_gc = gc_bounds else: # Only upper bound is given min_gc = 0 @@ -20,7 +20,7 @@ def length(read_seq): # This function checks whether length of sequence is in bounds def len_filter(read_seq, length_bounds): - if type(length_bounds) == tuple: + if isinstance(length_bounds, (tuple, list)): min_len, max_len = length_bounds else: # Only upper bound is given min_len = 0 From 98cd8eafdc8be6346a004e340361e4148e4f2a58 Mon Sep 17 00:00:00 2001 From: Amy Date: Fri, 14 Oct 2022 15:37:23 +0300 Subject: [PATCH 4/6] Minor improvements --- hw2/fastq_filtrator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw2/fastq_filtrator.py b/hw2/fastq_filtrator.py index 065e1b9..86dd3c2 100644 --- a/hw2/fastq_filtrator.py +++ b/hw2/fastq_filtrator.py @@ -84,7 +84,7 @@ def main(input_fastq, output_file_prefix, gc_bounds=(0, 100), if (gc_filter(read_seq, gc_bounds) == 'Pass' and len_filter(read_seq, length_bounds) == 'Pass' and qual_filter(read_qual, quality_threshold) == 'Pass'): - # Appending read to one of output files + # Appending read to output files add_read_to_passed(read_header, read_seq, read_plus_str, read_qual, output_file_prefix) From b5dc285cfc2bb5809438900981f22e3d2b21eff9 Mon Sep 17 00:00:00 2001 From: Amy Date: Fri, 14 Oct 2022 15:43:11 +0300 Subject: [PATCH 5/6] Handling case when there is a dangling newline character at the end of input file --- hw2/fastq_filtrator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw2/fastq_filtrator.py b/hw2/fastq_filtrator.py index 86dd3c2..b04b021 100644 --- a/hw2/fastq_filtrator.py +++ b/hw2/fastq_filtrator.py @@ -78,7 +78,7 @@ def main(input_fastq, output_file_prefix, gc_bounds=(0, 100), read_seq = file.readline() ## for convenient appending to output files. read_plus_str = file.readline() ## I'm going to take '\n' into account read_qual = file.readline() ## when I'll be counting length of these strings. - if read_header == '': # Reached the end of input file, stop reading + if read_header == '' or read_header == '\n': # Reached the end of input file, stop reading break # Checking read if (gc_filter(read_seq, gc_bounds) == 'Pass' and From 169808a2ab9042b7820671baa3ed4547a66abaf8 Mon Sep 17 00:00:00 2001 From: ibragimovaamina <104636744+ibragimovaamina@users.noreply.github.com> Date: Fri, 14 Oct 2022 16:32:45 +0300 Subject: [PATCH 6/6] Added README --- hw2/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/hw2/README.md b/hw2/README.md index e69de29..2813ab9 100644 --- a/hw2/README.md +++ b/hw2/README.md @@ -0,0 +1,15 @@ +Скрипт *fastq_filtrator.py* содержит программу для фильтрации ридов из fastq файла по длине, GC-составу и среднему q-score последовательности рида. + +Риды, прошедшие фильтрацию, записываются в новый файл. +Отфильтрованные риды сохраняются в отдельный файл, если значения параметра **save_filtered** равно *True*. + +**main** принимает аргументы: + +1. **input_fastq** - путь к файлу, который подаётся на вход программе (обычный не сжатый fastq файл). +2. **output_file_prefix** - префикс пути к файлу, в который будет записан результат. К префиксу прибавляется *"_passed.fastq"* для файла с ридами, прошедшими фильтрацию, и *"_failed.fastq"* для файлов с отфильтрованными ридами. +3. **gc_bounds** - интервал GC-состава (в процентах) для фильтрации. По умолчанию равен *(0, 100)*, т. е. все риды сохраняются. Если в аргумент передать одно число, то считается, что это верхняя граница. +4. **length_bounds** - интервал длины для фильтрации, аналогично gc_bounds. По умолчанию равен *(0, 2**32)*. +5. **quality_threshold** - пороговое значение среднего качества рида для фильтрации. По умолчанию равно *0* (шкала phred33). Риды со средним качеством по всем нуклеотидам ниже порогового отбрасываются. +6. **save_filtered** - сохранять ли отфильтрованные риды. По умолчанию равен *False*. + +Все описанные интервалы включают и верхнюю, и нижнюю границы.