From 8cb01e6764dbdb7821b31f525a27f886f74c6dbe Mon Sep 17 00:00:00 2001 From: Alexandra Kasianova Date: Mon, 13 Feb 2023 16:51:15 +0300 Subject: [PATCH 01/29] Added diputils.py. Added the ability to create a dictionary separately by haplotypes. Added function to calculate reference length by haplotypes. --- quast_libs/basic_stats.py | 7 +++ quast_libs/diputils.py | 30 +++++++++++++ quast_libs/reporting.py | 4 +- test_data/dip_reference.fasta | 73 ++++++++++++++++++++++++++++++++ test_data/dip_test_contigs.fasta | 37 ++++++++++++++++ 5 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 quast_libs/diputils.py create mode 100644 test_data/dip_reference.fasta create mode 100644 test_data/dip_test_contigs.fasta diff --git a/quast_libs/basic_stats.py b/quast_libs/basic_stats.py index 1c470a6216..e9f0f9a242 100644 --- a/quast_libs/basic_stats.py +++ b/quast_libs/basic_stats.py @@ -14,6 +14,8 @@ from quast_libs import fastaparser, qconfig, qutils, reporting, plotter from quast_libs.circos import set_window_size from quast_libs.log import get_logger +from quast_libs.diputils import DipQuastAnalyzer + logger = get_logger(qconfig.LOGGER_DEFAULT_NAME) MIN_HISTOGRAM_POINTS = 5 MIN_GC_WINDOW_SIZE = qconfig.GC_window_size // 2 @@ -323,6 +325,11 @@ def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir): report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)))) if ref_fpath: report.add_field(reporting.Fields.REFLEN, int(reference_length)) + + dipquast = DipQuastAnalyzer() + _, genome_size_by_haplotypes = dipquast.fill_dip_dict_by_chromosomes(ref_fpath) + report.add_field(reporting.Fields.REFLEN_HAPLOTYPE1, int(genome_size_by_haplotypes['haplotype1'])) + report.add_field(reporting.Fields.REFLEN_HAPLOTYPE2, int(genome_size_by_haplotypes['haplotype2'])) report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments) if not qconfig.is_combined_ref: report.add_field(reporting.Fields.REFGC, ('%.2f' % reference_GC if reference_GC is not None else None)) diff --git a/quast_libs/diputils.py b/quast_libs/diputils.py new file mode 100644 index 0000000000..1add258f41 --- /dev/null +++ b/quast_libs/diputils.py @@ -0,0 +1,30 @@ +from quast_libs.fastaparser import read_fasta + +class DipQuastAnalyzer: + def __init__(self): + self.dip_genome_by_chr = {} + self.dip_genome_by_chr_len = {} + self.genome_size_by_haplotypes = {} + self.__remember_haplotypes = [] + def fill_dip_dict_by_chromosomes(self, fasta_fpath): + for name, seq in read_fasta(fasta_fpath): + chr_name, haplotype = name.strip('\n').split('_') + chr_len = len(seq) + if haplotype not in self.dip_genome_by_chr_len.keys(): + self.dip_genome_by_chr_len[haplotype] = {} + self.dip_genome_by_chr[haplotype] = {} + self.__remember_haplotypes.append(haplotype) + self.dip_genome_by_chr_len[haplotype][chr_name] = chr_len + self.dip_genome_by_chr[haplotype][chr_name] = seq + + for haplotype_n in self.__remember_haplotypes: + self.genome_size_by_haplotypes[haplotype_n] = sum(self.dip_genome_by_chr_len[haplotype_n].values()) + + return self.dip_genome_by_chr, self.genome_size_by_haplotypes + + + + + + + diff --git a/quast_libs/reporting.py b/quast_libs/reporting.py index d1f764f8c5..c7262d3d24 100644 --- a/quast_libs/reporting.py +++ b/quast_libs/reporting.py @@ -172,6 +172,8 @@ class Fields: # Reference statistics REFLEN = 'Reference length' + REFLEN_HAPLOTYPE1 = 'Reference length of haplotype 1' + REFLEN_HAPLOTYPE2 = 'Reference length of haplotype 2' ESTREFLEN = 'Estimated reference length' REF_FRAGMENTS = 'Reference fragments' REFGC = 'Reference GC (%)' @@ -200,7 +202,7 @@ class Fields: SIMILAR_MIS_BLOCKS = '# similar misassembled blocks' ### content and order of metrics in MAIN REPORT (/report.txt, .tex, .tsv): - order = [NAME, CONTIGS__FOR_THRESHOLDS, TOTALLENS__FOR_THRESHOLDS, CONTIGS, LARGCONTIG, TOTALLEN, REFLEN, ESTREFLEN, GC, REFGC, + order = [NAME, CONTIGS__FOR_THRESHOLDS, TOTALLENS__FOR_THRESHOLDS, CONTIGS, LARGCONTIG, TOTALLEN, REFLEN, REFLEN_HAPLOTYPE1, REFLEN_HAPLOTYPE2, ESTREFLEN, GC, REFGC, N50, NG50, Nx, NGx, auN, auNG, L50, LG50, Lx, LGx, TOTAL_READS, LEFT_READS, RIGHT_READS, MAPPED_READS_PCNT, REF_MAPPED_READS_PCNT, diff --git a/test_data/dip_reference.fasta b/test_data/dip_reference.fasta new file mode 100644 index 0000000000..76c7639a67 --- /dev/null +++ b/test_data/dip_reference.fasta @@ -0,0 +1,73 @@ +>chr1_haplotype1 +ATGATTATTCGTTCGCCGGAACCAGAAGTCAAAATTTTGGTAGATAGGGATCCCATAAAAACTTCTTTCG +AGGAATGGGCTAAACCCGGTCATTTCTCAAGAACAATAGCTAAGGGACCTGATACTACCACTTGGATCTG +GAACCTACATGCTGATGCTCACGATTTTGATAGTCATACCAGTGATTTGGAGGAAATCTCTCGAAAAGTA +TTTAGTGCCCATTTCGGCCAACTCTCTATCATCTTTCTTTGGCTGAGTGGCATGTATTTCCATGGTGCTC +GTTTTTCCAATTATGAAGCATGGCTGAGTGATCCTACTCACATTGGACCTAGTGCTCAGGTGGTTTGGCC +AATAGTGGGCCAAGAAATCCTGAATGGAGATGTGGGCGGAGGCTTCCGAGGAATACAAATAACCTCAGGC +TTTTTTCAGATTTGGCGAGCATCCGGAATAACTAGTGAATTACAACTTTATTGTACCGCAATTGGCGCAT +TGGTCTTCGCAGCCTTAATGCTTTTTGCTGGTTGGTTCCATTATCACAAAGCAGCTCCAAAATTGGCTTG +GTTCCAAGATGTAGAATCTATGTTGAATCACCATTTAGCAGGGCTACTAGGACTTGGGTCCCTTTCTTGG +GCAGGACATCAAGTACATGTATCTTTACCGATTAACCAATTTCTAAACGCTGGAGTAGATCCTAAAGAAA +TACCGCTTCCTCATGAATTTATCTTGAATCGGGATCTTTTGGCTCAACTTTATCCAAGTTTTGCTGAAGG +AGCAACTCCCTTTTTTACCTTAAATTGGTCAAAATACTCGGAATTTCTTACTTTTCGTGGCGGATTAGAT +CCAGTGACTGGGGGTCTATGGTTAACCGATATAGCACATCATCATTTAGCTATCGCAATTCTTTTTCTAA +TCGCGGGTCATATGTATAGGACC +>chr2_haplotype1 +AGGTCCATTTACAGGCCAAGGCCATAAAGGCCTATATGAAATTCTAACAACATCATGGCATGCTCAATTA +TCTCTTAACCTAGCTATGTTAGGCTCTTTAACCATTATTGTAGCTCACCATATGTATTCCATGCCCCCTT +ATCCATATCTAGCTACTGACTATGCTACACAACTGTCATTGTTCACACATCACATGTGGATTGGTGGATT +TCTCATAGTTGGTGCTGCTGCGCATGCAGCCATTTTTATGGTAAGAGACTATGATCCAACTAATCGATAT +AACGATTTATTAGATCGTGTCCTGAGGCATCGCGATGCAATCATATCACATCTCAACTGGGTATGTATAT +TTCTAGGCTTCCACAGTTTTGGTTTGTATATTCATAATGATACCATGAGTGCTTTAGGGCGTCCACAAGA +TATGTTTTCAGATACTGCTATACAATTACAACCAGTCTTTGCTCAATGGATACAAAATACCCATGCTTTA +GCACCTGGTGTAACAGCCCCTGGTGAAACAGCGAGCACCAGTTTGACTTGGGGGGGCGGTGAGTTAGTAG +CAGTGGGTGGCAAAGTAGCTTT +>chr3_haplotype1 +TGCATTTACAATTCATGTGACGGTATTGATACTGTTGAAAGGTGTTCTATTTGCTCGTAGCTCGCGTTTA +ATACCAGATAAAGCAAATCTTGGTTTTCGTTTCCCTTGTGATGGGCCGGGAAGAGGAGGAACATGTCAAG +TATCTGCTTGGGATCATGTCTTCTTAGGACTATTCTGGATGTACAATGCTATTTCCGTAGTAATATTCCA +TTTCAGTTGGAAAATGCAGTCAGATGTTTGGGGTAGTATAAGTGATCAAGGGGTGGTAACTCATATTACT +GGAGGAAACTTTGCACAGAGTTCCATTACTATTAATGGGTGGCTCCGCGATTTCTTATGGGCACAAGCAT +CTCAGGTAATTCAGTCTTATGGTTCTTCGTTATCTGCATATGGTCTTTTTTTCCTAGGTGCTCATTTTGT +ATGGGCTTTCAGTTTAATGTTTCTATTCAGCGGGCGTGGTTATTGGCAAGAACTTATTGAATCCATTGTT +TGGGCTCATAATAAATTAAAAGTTGCTCCTGCTACTCAGCCTAGAGCCTTGAGCATTATACAAGGACGTG +CTGTAGGAGTAACCCATTACCTTCTGGGTGGAATTGCCACAACATGGGCGTTCTTCTTAGCAAGAATTAT +TGCAGTAGGATAAAACTGGGGTATTGGTCATGGTATAAAAGATATTTTAGAGGCTCATAAGTTACCTATT +CCATTAGGAACGGCAGATTTTTTGGTACATCATATTCA +>chr1_haplotype2 +ATGGAATTAAGATTTCCCAGGTTTAGCCAAGGCTTAGCTCAGGACCCCACTACTCGTCGTATTTGGTTTG +GTATTGCTACCGCACATGATTTCGAAAGTCATGATGATATTACTGAGGAACGTCTTTATCAGAACATTTT +TGCTTCTCACTTTGGGCAGTTAGCAATAATCTTTCTATGGACGTCCGGAAATCTGTTTCATGTAGCTTGG +CAAGGAAATTTTGAATCATGGATACAGGATCCTTTACACGTAAGACCTATTGCTCATGCCATTTGGGATC +CTCATTTTGGGCAACCCGCTGTGGAAGCCTTTACTCGAGGAGGTGCTGCCGGTCCAGTGAATATCGCTTA +TTCTGGGGTTTATCAGTGGTGGTATACAATTGGATTGCGCACCAATGAAGATCTTTATACTGGAGCTCTT +TTTCTATTATTTCTTTCTACGCTATCCTTAATAGGGGGTTGGTTACATCTACAACCCAAATGGAAGCCAA +GCCTTTCGTGGTTCAAAAACGCCGAATCTCGTCTGAATCATCATTTGTCAGGACTTTTCGGAGTAAGTTC +TTTGGCTTGGACAGGACATTTAGTTCATGTTGCTATTCCCGGATCCTCTAGGGGGGAGTACGTTCGATGG +AATAATTTCTTAGATGTATTACCCTATCCCCAGGGGTTGGGTCCCCTTCTGACGGGTCAGTGGAATCTTT +ATGCCCAAAATCCTGATTCGAGTAATCATTTATTTGGTACCACTCAAGGAGCGGGAACTGCCATTCTGAC +CCTTCTTGGGGGATTCC +>chr2_haplotype2 +ATTGCATTTATTTTTCTCATTGCCGGTCATATGTATCGAACTAACTTCGGAATTGGGCACAGTATCAAAG +ATCTTTTAGAAGCACATACTCCTCCGGGGGGTCGATTAGGACGTGGGCATAAAGGCCTTTATGATACAAT +CAATAATTCGATTCATTTTCAATTAGGCCTTGCTCTAGCTTCCTTAGGGGTTATTACTTCCTTAGTAGCT +CAACATATGTACTCTTTACCTGCTTATGCATTCATAGCACAAGACTTTACTACTCAAGCTGCTTTATATA +CTCATCACCAATACATTGCAGGGTTCATCATGACAGGGGCTTTTGCTCATGGAGCTATTTTTTTCATTAG +GGATTACAATCCGGAACAGAATGAAGATAATGTATTGGCAAGAATGTTAGACCATAAGGAAGCTATCATA +TCTCATTTAAGTTGGGCTAGCCTCTTCCTAGGATTCCATACCTTGGGCCCTTATGTTCATAACGACGTTA +TGCTTGCTTTTGGTACTCCAGAAAAGCAAATCTTGATTGAACCTATATTTGCCCAATGGATACAATCTGC +TCATGGTAAGACGACATATGGGTTCGATATACTCTTATCTTCAACGAATGGCCCCACTTTCAATGCAGGT +CGAAACATATGGTTGCCCGGATGGTTGAATGCTGTTAATGAGAATAGTAATTCGCTTTTCTTAACAATAG +GACCTGGGGATTTCTTGGTTCATCATGCTATTGCTCTAGGTTTGCATACAACTACATTGATTTTAGTAAA +GGGTGCTTTAGATGCACGCGGTTCCAAATTAATGCCGGATAAAAAGGATTTCGGGTATAGTTTT +>chr3_haplotype2 +GACGGCCCAGGGCGCGGCGGTACTTGTGATATTTCTGCTTGGGACGCGTTTTATTTGGCAGTTTTCTGGA +TGTTAAATACCATTGGATGGGTTACTTTTTATTGGCATTGGAAACACATTACATTATGGCAGGGCAACGT +TTCACAATTTAATGAATCCTCCACTTATTTGATGGGATGGTTAAGAGATTACCTATGGTTAAACTCTTCA +CAACTTATTAATGGATATAATCCTTTTGGGATGAATAGTTTATCAGTATGGGCTTGGATGTTCTTATTTG +GACATCTTGTTTGGGCTACAGGATTTATGTTCTTAATTTCCTGGCGTGGATATTGGCAGGAATTAATTGA +GACTTTAGCATGGGCTCATGAACGGACACCTTTGGCTAATTTAATTCGCTGGAGAGATAAGCCCGTGGCT +CTTTCCGTTGTGCAAGCAAGATTGGTCGGATTAGCCCACTTTTCCGTGGGTTATATATTCACTTATGCAG +CTTTCTTGATTGCCTCAACATCAGGCAAGTTCGGTTAAATCCACAAACACAAAGTTTGTGGCTGACCGAT +ATTGCTCATCATCATTTAGCTCCTTGC \ No newline at end of file diff --git a/test_data/dip_test_contigs.fasta b/test_data/dip_test_contigs.fasta new file mode 100644 index 0000000000..753ec42c48 --- /dev/null +++ b/test_data/dip_test_contigs.fasta @@ -0,0 +1,37 @@ +>cont_1 +ATGATTATTCGTTCGCCGGAACCAGAAGTCAAAATTTTGGTAGATAGGGATCCCATAAAAACTTCTTTCG +AGGAATGGGCTAAACCCGGTCATTTCTCAAGAACAATAGCTAAGGGACCTGATACTACCACTTGGATCTG +GAACCTACATGCTGATGCTCACGATTTTGATAGTCATACCAGTGATTTGGAGGAAATCTCTCGAAAAGTA +TTTAGTGCCCATTTCGGCCAACTCTCTATCATCTTTCTTTGGCTGAGTGGCATGTATTTCCATGGTGCTC +GTTTTTCCAATTATGAAGCATGGCTGAGTGATCCTACTCACATTGGACCTAGTGCTCAGGTGGTTTGGCC +AATAGTGGGCCAAGAAATCCTGAATGGAGATGTGGGCGGAGGCTTCCGAGGAATACAAATAACCTCAGGC +>cont_2 +TTTTTTCAGATTTGGCGAGCATCCGGAATAACTAGTGAATTACAACTTTATTGTACCGCAATTGGCGCAT +TGGTCTTCGCAGCCTTAATGCTTTTTGCTGGTTGGTTCCATTATCACAAAGCAGCTCCAAAATTGGCTTG +GTTCCAAGATGTAGAATCTATGTTGAATCACCATTTAGCAGGGCTACTAGGACTTGGGTCCCTTTCTTGG +GCAGGACATCAAGTACATGTATCTTTACCGATTAACCAATTTCTAAACGCTGGAGTAGATCCTAAAGAAA +TACCGCTTCCTCATGAATTTATCTTGAATCGGGATCTTTTGGCTCAACTTTATCCAAGTTTTGCTGAAGG +AGCAACTCCCTTTTTTACCTTAAATTGGTCAAAATACTCGGAATTTCTTACTTTTCGTGGCGGATTAGAT +CCAGTGACTGGGGGTCTATGGTTAACCGATATAGCACATCATCATTTAGCTATCGCAATTCTTTTTCTAA +TCGCGGGTCATATGTATAGGACC +>cont_3 +AGGTCCATTTACAGGCCAAGGCCATAAAGGCCTATATGAAATTCTAACAACATCATGGCATGCTCAATTA +TCTCTTAACCTAGCTATGTTAGGCTCTTTAACCATTATTGTAGCTCACCATATGTATTCCATGCCCCCTT +ATCCATATCTAGCTACTGACTATGCTACACAACTGTCATTGTTCACACATCACATGTGGATTGGTGGATT +TCTCATAGTTGGTGCTGCTGCGCATGCAGCCATTTTTATGGTAAGAGACTATGATCCAACTAATCGATAT +AACGATTTATTAGATCGTGTCCTGAGGCATCGCGATGCAATCATATCACATCTCAACTGGGTATGTATAT +TTC +>cont_4 +ATGGAATTAAGATTTCCCAGGTTTAGCCAAGGCTTAGCTCAGGACCCCACTACTCGTCGTATTTGGTTTG +GTATTGCTACCGCACATGATTTCGAAAGTCATGATGATATTACTGAGGAACGTCTTTATCAGAACATTTT +TGCTTCTCACTTTGGGCAGTTAGCAATAATCTTTCTATGGACGTCCGGAAATCTGTTTCATGTAGCTTGG +CAAGGAAATTTTGAATCATGGATACAGGATCCTTTACACGTAAGACCTATTGCTCATGCCATTTGGGATC +CTCATTCATAAAGGCCTTTATGATACAAT +>cont_5 +CAATAATTCGATTCATTTTCAATTAGGCCTTGCTCTAGCTTCCTTAGGGGTTATTACTTCCTTAGTAGCT +CAACATATGTACTCTTTACCTGCTTATGCATTCATAGCACAAGACTTTACTACTCAAGCTGCTTTATATA +CTCATCACCAATACATTGCAGGGTTCATCATGACAGGGGCTTTTGCTCATGGAGCTATTTTTTTCATTAG +GGATTACAATCCGGAACAGAATGAAGATAATGTATTGGCAAGAATGTTAGACCATAAGGAAGCTATCATA +TCTCATTTAAGTTGGGCTAGCCTCTTCCTAGGATTCCATACCTTGGGCCCTTATGTTCATAACGACGTTA +TGCTTGCTTTTGGTACTCCAGAAAAGCAAATCTTGATTGAACCTATATTTGCCCAATGGATACAATCTGC +TCATGGTAAGACGACAT \ No newline at end of file From 635f50728fdea551e94604e600ddfcd5b4f66512 Mon Sep 17 00:00:00 2001 From: Amy Date: Mon, 13 Feb 2023 23:36:11 +0300 Subject: [PATCH 02/29] Added value ploid for --ambiguity-usage option --- quast_libs/ca_utils/analyze_contigs.py | 21 +++++++++++++++++++++ quast_libs/options_parser.py | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/quast_libs/ca_utils/analyze_contigs.py b/quast_libs/ca_utils/analyze_contigs.py index ab3651a1c9..dbe0bdf55b 100644 --- a/quast_libs/ca_utils/analyze_contigs.py +++ b/quast_libs/ca_utils/analyze_contigs.py @@ -202,6 +202,18 @@ def analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fp top_aligns = top_aligns[1:] for align in top_aligns: ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n') + # This is a template for future "ploid" ambiguity-usage flag, need to change it later: + elif qconfig.ambiguity_usage == "ploid": + ca_output.stdout_f.write('\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):\n') + ca_output.stdout_f.write('\t\t\tAlignment: %s\n' % str(top_aligns[0])) + ca_output.icarus_out_f.write(top_aligns[0].icarus_report_str() + '\n') + ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0]) + aligned_lengths.append(top_aligns[0].len2) + contigs_aligned_lengths[-1] = top_aligns[0].len2 + ca_output.coords_filtered_f.write(top_aligns[0].coords_str() + '\n') + top_aligns = top_aligns[1:] + for align in top_aligns: + ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n') elif qconfig.ambiguity_usage == "all": ca_output.stdout_f.write('\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):\n') # we count only extra bases, so we shouldn't include bases in the first alignment @@ -250,6 +262,15 @@ def analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fp for idx in used_indexes: if idx not in the_best_set.indexes: ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(sorted_aligns[idx]) + '\n') + # This is a template for future "ploid" ambiguity-usage flag, need to change it later: + elif qconfig.ambiguity_usage == "ploid": + ambiguous_contigs_extra_bases += 0 + ca_output.stdout_f.write('\t\tUsing only the very best set (option --ambiguity-usage is set to "one").\n') + if len(the_best_set.indexes) < len(used_indexes): + ca_output.stdout_f.write('\t\tSo, skipping alignments from other sets:\n') + for idx in used_indexes: + if idx not in the_best_set.indexes: + ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(sorted_aligns[idx]) + '\n') elif qconfig.ambiguity_usage == "all": ca_output.stdout_f.write('\t\tUsing all alignments in these sets (option --ambiguity-usage is set to "all"):\n') ca_output.stdout_f.write('\t\t\tThe very best set is shown in details below, the rest are:\n') diff --git a/quast_libs/options_parser.py b/quast_libs/options_parser.py index b40b3d021d..03fd3d173a 100644 --- a/quast_libs/options_parser.py +++ b/quast_libs/options_parser.py @@ -502,7 +502,7 @@ def parse_options(logger, quast_args): action='callback', callback=check_str_arg_value, callback_args=(logger,), - callback_kwargs={'available_values': ['none', 'one', 'all']}) + callback_kwargs={'available_values': ['none', 'one', 'ploid', 'all']}) ), (['--ambiguity-score'], dict( dest='ambiguity_score', From c9f391234decfb98e8b506caef8256e1858cfa6f Mon Sep 17 00:00:00 2001 From: Amy Date: Mon, 20 Feb 2023 01:00:09 +0300 Subject: [PATCH 03/29] Added ploid value to the help menu --- quast_libs/qconfig.py | 166 +++++++++++++++++++++--------------------- 1 file changed, 83 insertions(+), 83 deletions(-) diff --git a/quast_libs/qconfig.py b/quast_libs/qconfig.py index 1e7270cbd1..050c8facdf 100644 --- a/quast_libs/qconfig.py +++ b/quast_libs/qconfig.py @@ -417,102 +417,102 @@ def usage(show_hidden=False, mode=None, short=True, stream=sys.stdout): stream.write("These are basic options. To see the full list, use --help\n") else: stream.write("Advanced options:\n") - stream.write("-s --split-scaffolds Split assemblies by continuous fragments of N's and add such \"contigs\" to the comparison\n") - stream.write("-l --labels \"label, label, ...\" Names of assemblies to use in reports, comma-separated. If contain spaces, use quotes\n") - stream.write("-L Take assembly names from their parent directory names\n") + stream.write("-s --split-scaffolds Split assemblies by continuous fragments of N's and add such \"contigs\" to the comparison\n") + stream.write("-l --labels \"label, label, ...\" Names of assemblies to use in reports, comma-separated. If contain spaces, use quotes\n") + stream.write("-L Take assembly names from their parent directory names\n") if mode != 'large': - stream.write("-e --eukaryote Genome is eukaryotic (primarily affects gene prediction)\n") - stream.write(" --fungus Genome is fungal (primarily affects gene prediction)\n") - stream.write(" --large Use optimal parameters for evaluation of large genomes\n") - stream.write(" In particular, imposes '-e -m %d -i %d -x %d' (can be overridden manually)\n" % + stream.write("-e --eukaryote Genome is eukaryotic (primarily affects gene prediction)\n") + stream.write(" --fungus Genome is fungal (primarily affects gene prediction)\n") + stream.write(" --large Use optimal parameters for evaluation of large genomes\n") + stream.write(" In particular, imposes '-e -m %d -i %d -x %d' (can be overridden manually)\n" % (LARGE_MIN_CONTIG, LARGE_MIN_ALIGNMENT, LARGE_EXTENSIVE_MIS_THRESHOLD)) - stream.write("-k --k-mer-stats Compute k-mer-based quality metrics (recommended for large genomes)\n" - " This may significantly increase memory and time consumption on large genomes\n") - stream.write(" --k-mer-size Size of k used in --k-mer-stats [default: %d]\n" % unique_kmer_len) - stream.write(" --circos Draw Circos plot\n") + stream.write("-k --k-mer-stats Compute k-mer-based quality metrics (recommended for large genomes)\n" + " This may significantly increase memory and time consumption on large genomes\n") + stream.write(" --k-mer-size Size of k used in --k-mer-stats [default: %d]\n" % unique_kmer_len) + stream.write(" --circos Draw Circos plot\n") if mode == 'meta': - stream.write("-f --gene-finding Predict genes using MetaGeneMark\n") + stream.write("-f --gene-finding Predict genes using MetaGeneMark\n") elif mode == 'large': - stream.write("-f --gene-finding Predict genes using GeneMark-ES\n") + stream.write("-f --gene-finding Predict genes using GeneMark-ES\n") else: - stream.write("-f --gene-finding Predict genes using GeneMarkS (prokaryotes, default) or GeneMark-ES (eukaryotes, use --eukaryote)\n") + stream.write("-f --gene-finding Predict genes using GeneMarkS (prokaryotes, default) or GeneMark-ES (eukaryotes, use --eukaryote)\n") if not meta: - stream.write(" --mgm Use MetaGeneMark for gene prediction (instead of the default finder, see above)\n") - stream.write(" --glimmer Use GlimmerHMM for gene prediction (instead of the default finder, see above)\n") - stream.write(" --gene-thresholds Comma-separated list of threshold lengths of genes to search with Gene Finding module\n") - stream.write(" [default: %s]\n" % genes_lengths) - stream.write(" --rna-finding Predict ribosomal RNA genes using Barrnap\n") - stream.write("-b --conserved-genes-finding Count conserved orthologs using BUSCO (only on Linux)\n") - stream.write(" --operons File with operon coordinates in the reference (GFF, BED, NCBI or TXT)\n") + stream.write(" --mgm Use MetaGeneMark for gene prediction (instead of the default finder, see above)\n") + stream.write(" --glimmer Use GlimmerHMM for gene prediction (instead of the default finder, see above)\n") + stream.write(" --gene-thresholds Comma-separated list of threshold lengths of genes to search with Gene Finding module\n") + stream.write(" [default: %s]\n" % genes_lengths) + stream.write(" --rna-finding Predict ribosomal RNA genes using Barrnap\n") + stream.write("-b --conserved-genes-finding Count conserved orthologs using BUSCO (only on Linux)\n") + stream.write(" --operons File with operon coordinates in the reference (GFF, BED, NCBI or TXT)\n") if not meta: - stream.write(" --est-ref-size Estimated reference size (for computing NGx metrics without a reference)\n") + stream.write(" --est-ref-size Estimated reference size (for computing NGx metrics without a reference)\n") else: - stream.write(" --max-ref-number Maximum number of references (per each assembly) to download after looking in SILVA database.\n") - stream.write(" Set 0 for not looking in SILVA at all [default: %s]\n" % max_references) - stream.write(" --blast-db Custom BLAST database (.nsq file). By default, MetaQUAST searches references in SILVA database\n") - stream.write(" --use-input-ref-order Use provided order of references in MetaQUAST summary plots (default order: by the best average value)\n") - stream.write(" --contig-thresholds Comma-separated list of contig length thresholds [default: %s]\n" % contig_thresholds) - stream.write(" --x-for-Nx Value of 'x' for Nx, Lx, etc metrics reported in addition to N50, L50, etc (0, 100) [default: %s]\n" % x_for_additional_Nx) + stream.write(" --max-ref-number Maximum number of references (per each assembly) to download after looking in SILVA database.\n") + stream.write(" Set 0 for not looking in SILVA at all [default: %s]\n" % max_references) + stream.write(" --blast-db Custom BLAST database (.nsq file). By default, MetaQUAST searches references in SILVA database\n") + stream.write(" --use-input-ref-order Use provided order of references in MetaQUAST summary plots (default order: by the best average value)\n") + stream.write(" --contig-thresholds Comma-separated list of contig length thresholds [default: %s]\n" % contig_thresholds) + stream.write(" --x-for-Nx Value of 'x' for Nx, Lx, etc metrics reported in addition to N50, L50, etc (0, 100) [default: %s]\n" % x_for_additional_Nx) if meta: stream.write(" --reuse-combined-alignments Reuse the alignments from the combined_reference stage on runs_per_reference stages.\n") - stream.write("-u --use-all-alignments Compute genome fraction, # genes, # operons in QUAST v1.* style.\n") - stream.write(" By default, QUAST filters Minimap\'s alignments to keep only best ones\n") - stream.write("-i --min-alignment The minimum alignment length [default: %s]\n" % i_default) - stream.write(" --min-identity The minimum alignment identity (80.0, 100.0) [default: %.1f]\n" % min_idy_default) - stream.write("-a --ambiguity-usage Use none, one, or all alignments of a contig when all of them\n") - stream.write(" are almost equally good (see --ambiguity-score) [default: %s]\n" % ambiguity_usage) - stream.write(" --ambiguity-score Score S for defining equally good alignments of a single contig. All alignments are sorted \n") - stream.write(" by decreasing LEN * IDY% value. All alignments with LEN * IDY% < S * best(LEN * IDY%) are \n") - stream.write(" discarded. S should be between 0.8 and 1.0 [default: %.2f]\n" % ambiguity_score) + stream.write("-u --use-all-alignments Compute genome fraction, # genes, # operons in QUAST v1.* style.\n") + stream.write(" By default, QUAST filters Minimap\'s alignments to keep only best ones\n") + stream.write("-i --min-alignment The minimum alignment length [default: %s]\n" % i_default) + stream.write(" --min-identity The minimum alignment identity (80.0, 100.0) [default: %.1f]\n" % min_idy_default) + stream.write("-a --ambiguity-usage Use none, one, ploid number / ploidy number / amount of ploidy or all alignments of a contig when all of them\n") + stream.write(" are almost equally good (see --ambiguity-score) [default: %s]\n" % ambiguity_usage) + stream.write(" --ambiguity-score Score S for defining equally good alignments of a single contig. All alignments are sorted \n") + stream.write(" by decreasing LEN * IDY% value. All alignments with LEN * IDY% < S * best(LEN * IDY%) are \n") + stream.write(" discarded. S should be between 0.8 and 1.0 [default: %.2f]\n" % ambiguity_score) if meta: stream.write(" --unique-mapping Disable --ambiguity-usage=all for the combined reference run,\n") stream.write(" i.e. use user-specified or default ('%s') value of --ambiguity-usage\n" % ambiguity_usage) - stream.write(" --strict-NA Break contigs in any misassembly event when compute NAx and NGAx.\n") - stream.write(" By default, QUAST breaks contigs only by extensive misassemblies (not local ones)\n") - stream.write("-x --extensive-mis-size Lower threshold for extensive misassembly size. All relocations with inconsistency\n") - stream.write(" less than extensive-mis-size are counted as local misassemblies [default: %s]\n" % x_default) - stream.write(" --local-mis-size Lower threshold on local misassembly size. Local misassemblies with inconsistency\n") - stream.write(" less than local-mis-size are counted as (long) indels [default: %s]\n" % local_misassembly_min_length) - stream.write(" --scaffold-gap-max-size Max allowed scaffold gap length difference. All relocations with inconsistency\n") - stream.write(" less than scaffold-gap-size are counted as scaffold gap misassemblies [default: %s]\n" % scaffolds_gap_threshold) - stream.write(" --unaligned-part-size Lower threshold for detecting partially unaligned contigs. Such contig should have\n") - stream.write(" at least one unaligned fragment >= the threshold [default: %s]\n" % unaligned_part_size) - stream.write(" --skip-unaligned-mis-contigs Do not distinguish contigs with >= 50% unaligned bases as a separate group\n") - stream.write(" By default, QUAST does not count misassemblies in them\n") - stream.write(" --fragmented Reference genome may be fragmented into small pieces (e.g. scaffolded reference) \n") - stream.write(" --fragmented-max-indent Mark translocation as fake if both alignments are located no further than N bases \n") - stream.write(" from the ends of the reference fragments [default: %s]\n" % local_misassembly_min_length) - stream.write(" Requires --fragmented option\n") - stream.write(" --upper-bound-assembly Simulate upper bound assembly based on the reference genome and reads\n") - stream.write(" --upper-bound-min-con Minimal number of 'connecting reads' needed for joining upper bound contigs into a scaffold\n") - stream.write(" [default: %d for mate-pairs and %d for long reads]\n" % (MIN_CONNECT_MP, MIN_CONNECT_LR)) - stream.write(" --est-insert-size Use provided insert size in upper bound assembly simulation [default: auto detect from reads or %d]\n" % optimal_assembly_default_IS) - stream.write(" --report-all-metrics Keep all quality metrics in the main report even if their values are '-' for all assemblies or \n" - " if they are not applicable (e.g., reference-based metrics in the no-reference mode)\n") - stream.write(" --plots-format Save plots in specified format [default: %s].\n" % plot_extension) - stream.write(" Supported formats: %s\n" % ', '.join(supported_plot_extensions)) - stream.write(" --memory-efficient Run everything using one thread, separately per each assembly.\n") - stream.write(" This may significantly reduce memory consumption on large genomes\n") - stream.write(" --space-efficient Create only reports and plots files. Aux files including .stdout, .stderr, .coords will not be created.\n") - stream.write(" This may significantly reduce space consumption on large genomes. Icarus viewers also will not be built\n") - stream.write("-1 --pe1 File with forward paired-end reads (in FASTQ format, may be gzipped)\n") - stream.write("-2 --pe2 File with reverse paired-end reads (in FASTQ format, may be gzipped)\n") - stream.write(" --pe12 File with interlaced forward and reverse paired-end reads. (in FASTQ format, may be gzipped)\n") - stream.write(" --mp1 File with forward mate-pair reads (in FASTQ format, may be gzipped)\n") - stream.write(" --mp2 File with reverse mate-pair reads (in FASTQ format, may be gzipped)\n") - stream.write(" --mp12 File with interlaced forward and reverse mate-pair reads (in FASTQ format, may be gzipped)\n") - stream.write(" --single File with unpaired short reads (in FASTQ format, may be gzipped)\n") - stream.write(" --pacbio File with PacBio reads (in FASTQ format, may be gzipped)\n") - stream.write(" --nanopore File with Oxford Nanopore reads (in FASTQ format, may be gzipped)\n") - stream.write(" --ref-sam SAM alignment file obtained by aligning reads to reference genome file\n") - stream.write(" --ref-bam BAM alignment file obtained by aligning reads to reference genome file\n") - stream.write(" --sam Comma-separated list of SAM alignment files obtained by aligning reads to assemblies\n" - " (use the same order as for files with contigs)\n") - stream.write(" --bam Comma-separated list of BAM alignment files obtained by aligning reads to assemblies\n" - " (use the same order as for files with contigs)\n") - stream.write(" Reads (or SAM/BAM file) are used for structural variation detection and\n") - stream.write(" coverage histogram building in Icarus\n") - stream.write(" --sv-bedpe File with structural variations (in BEDPE format)\n") + stream.write(" --strict-NA Break contigs in any misassembly event when compute NAx and NGAx.\n") + stream.write(" By default, QUAST breaks contigs only by extensive misassemblies (not local ones)\n") + stream.write("-x --extensive-mis-size Lower threshold for extensive misassembly size. All relocations with inconsistency\n") + stream.write(" less than extensive-mis-size are counted as local misassemblies [default: %s]\n" % x_default) + stream.write(" --local-mis-size Lower threshold on local misassembly size. Local misassemblies with inconsistency\n") + stream.write(" less than local-mis-size are counted as (long) indels [default: %s]\n" % local_misassembly_min_length) + stream.write(" --scaffold-gap-max-size Max allowed scaffold gap length difference. All relocations with inconsistency\n") + stream.write(" less than scaffold-gap-size are counted as scaffold gap misassemblies [default: %s]\n" % scaffolds_gap_threshold) + stream.write(" --unaligned-part-size Lower threshold for detecting partially unaligned contigs. Such contig should have\n") + stream.write(" at least one unaligned fragment >= the threshold [default: %s]\n" % unaligned_part_size) + stream.write(" --skip-unaligned-mis-contigs Do not distinguish contigs with >= 50% unaligned bases as a separate group\n") + stream.write(" By default, QUAST does not count misassemblies in them\n") + stream.write(" --fragmented Reference genome may be fragmented into small pieces (e.g. scaffolded reference) \n") + stream.write(" --fragmented-max-indent Mark translocation as fake if both alignments are located no further than N bases \n") + stream.write(" from the ends of the reference fragments [default: %s]\n" % local_misassembly_min_length) + stream.write(" Requires --fragmented option\n") + stream.write(" --upper-bound-assembly Simulate upper bound assembly based on the reference genome and reads\n") + stream.write(" --upper-bound-min-con Minimal number of 'connecting reads' needed for joining upper bound contigs into a scaffold\n") + stream.write(" [default: %d for mate-pairs and %d for long reads]\n" % (MIN_CONNECT_MP, MIN_CONNECT_LR)) + stream.write(" --est-insert-size Use provided insert size in upper bound assembly simulation [default: auto detect from reads or %d]\n" % optimal_assembly_default_IS) + stream.write(" --report-all-metrics Keep all quality metrics in the main report even if their values are '-' for all assemblies or \n" + " if they are not applicable (e.g., reference-based metrics in the no-reference mode)\n") + stream.write(" --plots-format Save plots in specified format [default: %s].\n" % plot_extension) + stream.write(" Supported formats: %s\n" % ', '.join(supported_plot_extensions)) + stream.write(" --memory-efficient Run everything using one thread, separately per each assembly.\n") + stream.write(" This may significantly reduce memory consumption on large genomes\n") + stream.write(" --space-efficient Create only reports and plots files. Aux files including .stdout, .stderr, .coords will not be created.\n") + stream.write(" This may significantly reduce space consumption on large genomes. Icarus viewers also will not be built\n") + stream.write("-1 --pe1 File with forward paired-end reads (in FASTQ format, may be gzipped)\n") + stream.write("-2 --pe2 File with reverse paired-end reads (in FASTQ format, may be gzipped)\n") + stream.write(" --pe12 File with interlaced forward and reverse paired-end reads. (in FASTQ format, may be gzipped)\n") + stream.write(" --mp1 File with forward mate-pair reads (in FASTQ format, may be gzipped)\n") + stream.write(" --mp2 File with reverse mate-pair reads (in FASTQ format, may be gzipped)\n") + stream.write(" --mp12 File with interlaced forward and reverse mate-pair reads (in FASTQ format, may be gzipped)\n") + stream.write(" --single File with unpaired short reads (in FASTQ format, may be gzipped)\n") + stream.write(" --pacbio File with PacBio reads (in FASTQ format, may be gzipped)\n") + stream.write(" --nanopore File with Oxford Nanopore reads (in FASTQ format, may be gzipped)\n") + stream.write(" --ref-sam SAM alignment file obtained by aligning reads to reference genome file\n") + stream.write(" --ref-bam BAM alignment file obtained by aligning reads to reference genome file\n") + stream.write(" --sam Comma-separated list of SAM alignment files obtained by aligning reads to assemblies\n" + " (use the same order as for files with contigs)\n") + stream.write(" --bam Comma-separated list of BAM alignment files obtained by aligning reads to assemblies\n" + " (use the same order as for files with contigs)\n") + stream.write(" Reads (or SAM/BAM file) are used for structural variation detection and\n") + stream.write(" coverage histogram building in Icarus\n") + stream.write(" --sv-bedpe File with structural variations (in BEDPE format)\n") stream.write("\n") stream.write("Speedup options:\n") stream.write(" --no-check Do not check and correct input fasta files. Use at your own risk (see manual)\n") From 9e6569ae2e90d754d02c8b5904a31751212e9e0a Mon Sep 17 00:00:00 2001 From: Alexandra Kasianova Date: Mon, 20 Feb 2023 19:50:31 +0300 Subject: [PATCH 04/29] Fixed calculation for reference length by haplotypes and added calculation genome fraction by haplotypes. --- quast_libs/basic_stats.py | 14 ++++++--- quast_libs/ca_utils/save_results.py | 16 ++++++++++ quast_libs/contigs_analyzer.py | 18 +++++++++-- quast_libs/diputils.py | 48 +++++++++++++++++------------ quast_libs/qconfig.py | 1 + quast_libs/qutils.py | 1 - quast_libs/reporting.py | 12 ++++---- test_data/dip_reference.fasta | 10 +++--- 8 files changed, 81 insertions(+), 39 deletions(-) diff --git a/quast_libs/basic_stats.py b/quast_libs/basic_stats.py index e9f0f9a242..544a3945c3 100644 --- a/quast_libs/basic_stats.py +++ b/quast_libs/basic_stats.py @@ -325,12 +325,16 @@ def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir): report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)))) if ref_fpath: report.add_field(reporting.Fields.REFLEN, int(reference_length)) - - dipquast = DipQuastAnalyzer() - _, genome_size_by_haplotypes = dipquast.fill_dip_dict_by_chromosomes(ref_fpath) - report.add_field(reporting.Fields.REFLEN_HAPLOTYPE1, int(genome_size_by_haplotypes['haplotype1'])) - report.add_field(reporting.Fields.REFLEN_HAPLOTYPE2, int(genome_size_by_haplotypes['haplotype2'])) report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments) + + if qconfig.ambiguity_usage == 'ploid': + dipquast = DipQuastAnalyzer() + _ = dipquast.fill_dip_dict_by_chromosomes(ref_fpath) + length_of_haplotypes = dict(sorted(dipquast.get_haplotypes_len(ref_fpath).items())) + report.add_field(reporting.Fields.REFLEN_HAPLOTYPES, + [int(l) for l in length_of_haplotypes.values()]) + + if not qconfig.is_combined_ref: report.add_field(reporting.Fields.REFGC, ('%.2f' % reference_GC if reference_GC is not None else None)) elif reference_length: diff --git a/quast_libs/ca_utils/save_results.py b/quast_libs/ca_utils/save_results.py index 06e77f3af3..f203ac92d8 100644 --- a/quast_libs/ca_utils/save_results.py +++ b/quast_libs/ca_utils/save_results.py @@ -11,6 +11,7 @@ from quast_libs import qconfig, qutils, reporting from quast_libs.ca_utils.analyze_misassemblies import Misassembly from quast_libs.ca_utils.misc import print_file, intergenomic_misassemblies_by_asm, ref_labels_by_chromosomes +from quast_libs.diputils import DipQuastAnalyzer def print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result): @@ -128,6 +129,21 @@ def save_result(result, report, fname, ref_fpath, genome_size): report.add_field(reporting.Fields.INDELSERROR, "%.2f" % (float(report.get_field(reporting.Fields.INDELS)) * 100000.0 / float(aligned_assembly_bases))) + if qconfig.ambiguity_usage == 'ploid': + dip_dict_haplotypes = DipQuastAnalyzer() + _ = dip_dict_haplotypes.fill_dip_dict_by_chromosomes(ref_fpath) + genome_len_by_haplotypes = dict(sorted(dip_dict_haplotypes.get_haplotypes_len(ref_fpath).items())) + aligned_ref_bases_by_haplotypes = dict(sorted(DipQuastAnalyzer.ploid_aligned.items())) + + genome_fraction_by_haplotypes = {} + for haplotype in genome_len_by_haplotypes.keys(): + genome_fraction_by_haplotypes[haplotype] = genome_fraction_by_haplotypes.get(haplotype, 0) + round(aligned_ref_bases_by_haplotypes[haplotype] * + 100 / genome_len_by_haplotypes[haplotype], 2) + + report.add_field(reporting.Fields.MAPPEDGENOME_BY_HAPLOTYPES, + [float(l) for l in genome_fraction_by_haplotypes.values()]) + + # for misassemblies report: report.add_field(reporting.Fields.MIS_ALL_EXTENSIVE, region_misassemblies.count(Misassembly.RELOCATION) + region_misassemblies.count(Misassembly.INVERSION) + region_misassemblies.count(Misassembly.TRANSLOCATION) + diff --git a/quast_libs/contigs_analyzer.py b/quast_libs/contigs_analyzer.py index e15ae0a7ce..4c19557ce0 100644 --- a/quast_libs/contigs_analyzer.py +++ b/quast_libs/contigs_analyzer.py @@ -37,6 +37,7 @@ from quast_libs.log import get_logger from quast_libs.qutils import is_python2, run_parallel +from quast_libs.diputils import DipQuastAnalyzer logger = get_logger(qconfig.LOGGER_DEFAULT_NAME) @@ -100,9 +101,11 @@ def analyze_coverage(ref_aligns, reference_chromosomes, ns_by_chromosomes, used_ genome_mapping[align.ref][pos] = 1 for i in ns_by_chromosomes[align.ref]: genome_mapping[align.ref][i] = 0 - - covered_ref_bases = sum([sum(genome_mapping[chrom]) for chrom in genome_mapping]) - return covered_ref_bases, indels_info + if qconfig.ambiguity_usage == 'ploid': + return genome_mapping, indels_info + else: + covered_ref_bases = sum([sum(genome_mapping[chrom]) for chrom in genome_mapping]) + return covered_ref_bases, indels_info # former plantagora and plantakolya @@ -197,6 +200,15 @@ def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath if qconfig.show_snps: log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n') aligned_ref_bases, indels_info = analyze_coverage(ref_aligns, reference_chromosomes, ns_by_chromosomes, used_snps_fpath) + + if qconfig.ambiguity_usage == 'ploid': + dip_dict_haplotypes = DipQuastAnalyzer().fill_dip_dict_by_chromosomes(ref_fpath) + for key, val in dip_dict_haplotypes.items(): + for chrom in val: + DipQuastAnalyzer.ploid_aligned[key] = DipQuastAnalyzer.ploid_aligned.get(key, 0) + sum(aligned_ref_bases[chrom]) + + aligned_ref_bases = sum([sum(aligned_ref_bases[chrom]) for chrom in aligned_ref_bases]) + total_indels_info += indels_info cov_stats = {'SNPs': total_indels_info.mismatches, 'indels_list': total_indels_info.indels_list, 'aligned_ref_bases': aligned_ref_bases} result.update(cov_stats) diff --git a/quast_libs/diputils.py b/quast_libs/diputils.py index 1add258f41..06daad1243 100644 --- a/quast_libs/diputils.py +++ b/quast_libs/diputils.py @@ -1,26 +1,36 @@ -from quast_libs.fastaparser import read_fasta +from collections import OrderedDict +import re + +from quast_libs.fastaparser import read_fasta, get_chr_lengths_from_fastafile class DipQuastAnalyzer: + ploid_aligned = {} def __init__(self): - self.dip_genome_by_chr = {} - self.dip_genome_by_chr_len = {} - self.genome_size_by_haplotypes = {} - self.__remember_haplotypes = [] + self._dip_genome_by_chr = OrderedDict() + self._length_of_haplotypes = OrderedDict() + def fill_dip_dict_by_chromosomes(self, fasta_fpath): - for name, seq in read_fasta(fasta_fpath): - chr_name, haplotype = name.strip('\n').split('_') - chr_len = len(seq) - if haplotype not in self.dip_genome_by_chr_len.keys(): - self.dip_genome_by_chr_len[haplotype] = {} - self.dip_genome_by_chr[haplotype] = {} - self.__remember_haplotypes.append(haplotype) - self.dip_genome_by_chr_len[haplotype][chr_name] = chr_len - self.dip_genome_by_chr[haplotype][chr_name] = seq - - for haplotype_n in self.__remember_haplotypes: - self.genome_size_by_haplotypes[haplotype_n] = sum(self.dip_genome_by_chr_len[haplotype_n].values()) - - return self.dip_genome_by_chr, self.genome_size_by_haplotypes + for name, _ in read_fasta(fasta_fpath): + haplotype = re.findall(r'(haplotype\d+)', name)[0] + if haplotype not in self._dip_genome_by_chr.keys(): + self._dip_genome_by_chr[haplotype] = [] + self._dip_genome_by_chr[haplotype].append(name) + + return self._dip_genome_by_chr + + def get_haplotypes_len(self, fpath): + chr_len_d = get_chr_lengths_from_fastafile(fpath) + for key, val in self._dip_genome_by_chr.items(): + for chrom in val: + self._length_of_haplotypes[key] = self._length_of_haplotypes.get(key, 0) + chr_len_d[chrom] + + return self._length_of_haplotypes + + + + + + diff --git a/quast_libs/qconfig.py b/quast_libs/qconfig.py index 050c8facdf..a50fe0115a 100644 --- a/quast_libs/qconfig.py +++ b/quast_libs/qconfig.py @@ -85,6 +85,7 @@ large_genome = False use_kmc = False report_all_metrics = False +var_haplotypes = [1,2,3,4,5,6,7,8] # ideal assembly section optimal_assembly = False diff --git a/quast_libs/qutils.py b/quast_libs/qutils.py index 5a43ddec95..72770395df 100644 --- a/quast_libs/qutils.py +++ b/quast_libs/qutils.py @@ -166,7 +166,6 @@ def add_lengths_to_report(lengths, reporting, contigs_fpath): [sum(l for l in lengths if l >= threshold) if threshold >= min_threshold else None for threshold in qconfig.contig_thresholds]) - def correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting): ## removing from contigs' names special characters because: ## 1) Some embedded tools can fail on some strings with "...", "+", "-", etc diff --git a/quast_libs/reporting.py b/quast_libs/reporting.py index c7262d3d24..6e3db6fef7 100644 --- a/quast_libs/reporting.py +++ b/quast_libs/reporting.py @@ -134,6 +134,7 @@ class Fields: UNCALLED_PERCENT = "# N's per 100 kbp" # Genome statistics + MAPPEDGENOME_BY_HAPLOTYPES = ('Genome fraction (%%) of haplotype %d', tuple(qconfig.var_haplotypes)) MAPPEDGENOME = 'Genome fraction (%)' DUPLICATION_RATIO = 'Duplication ratio' AVE_READ_SUPPORT = 'Avg contig read support' @@ -172,8 +173,7 @@ class Fields: # Reference statistics REFLEN = 'Reference length' - REFLEN_HAPLOTYPE1 = 'Reference length of haplotype 1' - REFLEN_HAPLOTYPE2 = 'Reference length of haplotype 2' + REFLEN_HAPLOTYPES = ('Reference length of haplotype %d', tuple(qconfig.var_haplotypes)) ESTREFLEN = 'Estimated reference length' REF_FRAGMENTS = 'Reference fragments' REFGC = 'Reference GC (%)' @@ -202,7 +202,7 @@ class Fields: SIMILAR_MIS_BLOCKS = '# similar misassembled blocks' ### content and order of metrics in MAIN REPORT (/report.txt, .tex, .tsv): - order = [NAME, CONTIGS__FOR_THRESHOLDS, TOTALLENS__FOR_THRESHOLDS, CONTIGS, LARGCONTIG, TOTALLEN, REFLEN, REFLEN_HAPLOTYPE1, REFLEN_HAPLOTYPE2, ESTREFLEN, GC, REFGC, + order = [NAME, CONTIGS__FOR_THRESHOLDS, TOTALLENS__FOR_THRESHOLDS, CONTIGS, LARGCONTIG, TOTALLEN, REFLEN, REFLEN_HAPLOTYPES, ESTREFLEN, GC, REFGC, N50, NG50, Nx, NGx, auN, auNG, L50, LG50, Lx, LGx, TOTAL_READS, LEFT_READS, RIGHT_READS, MAPPED_READS_PCNT, REF_MAPPED_READS_PCNT, @@ -211,7 +211,7 @@ class Fields: LARGE_MIS_EXTENSIVE, MISASSEMBL, MISCONTIGS, MISCONTIGSBASES, MISLOCAL, MIS_SCAFFOLDS_GAP, MIS_LOCAL_SCAFFOLDS_GAP, STRUCT_VARIATIONS, POTENTIAL_MGE, UNALIGNED_MISASSEMBLED_CTGS, - UNALIGNED, UNALIGNEDBASES, MAPPEDGENOME, DUPLICATION_RATIO, AVE_READ_SUPPORT, + UNALIGNED, UNALIGNEDBASES, MAPPEDGENOME, MAPPEDGENOME_BY_HAPLOTYPES, DUPLICATION_RATIO, AVE_READ_SUPPORT, UNCALLED_PERCENT, SUBSERROR, INDELSERROR, GENES, OPERONS, BUSCO_COMPLETE, BUSCO_PART, PREDICTED_GENES_UNIQUE, PREDICTED_GENES, RNA_GENES, @@ -250,7 +250,7 @@ class Fields: ### Grouping of metrics and set of main metrics for HTML version of main report grouped_order = [ - ('Genome statistics', [MAPPEDGENOME, DUPLICATION_RATIO, AVE_READ_SUPPORT, GENES, OPERONS, + ('Genome statistics', [MAPPEDGENOME, MAPPEDGENOME_BY_HAPLOTYPES, DUPLICATION_RATIO, AVE_READ_SUPPORT, GENES, OPERONS, LARGALIGN, TOTAL_ALIGNED_LEN, NG50, NGx, auNG, NA50, NAx, auNA, NGA50, NGAx, auNGA, LG50, LGx, LA50, LAx, LGA50, LGAx, BUSCO_COMPLETE, BUSCO_PART]), @@ -296,7 +296,7 @@ class Fields: TOTALLENS__FOR_1000_THRESHOLD, TOTALLENS__FOR_10000_THRESHOLD, TOTALLENS__FOR_50000_THRESHOLD, LARGE_MIS_EXTENSIVE, MIS_ALL_EXTENSIVE, MIS_EXTENSIVE_BASES, SUBSERROR, INDELSERROR, UNCALLED_PERCENT, - MAPPEDGENOME, DUPLICATION_RATIO, AVE_READ_SUPPORT, + MAPPEDGENOME, MAPPEDGENOME_BY_HAPLOTYPES, DUPLICATION_RATIO, AVE_READ_SUPPORT, MAPPED_READS_PCNT, PROPERLY_PAIRED_READS_PCNT, SINGLETONS_PCNT, MISJOINT_READS_PCNT, GENES, OPERONS, BUSCO_COMPLETE, BUSCO_PART, diff --git a/test_data/dip_reference.fasta b/test_data/dip_reference.fasta index 76c7639a67..a6f9789feb 100644 --- a/test_data/dip_reference.fasta +++ b/test_data/dip_reference.fasta @@ -1,4 +1,4 @@ ->chr1_haplotype1 +>chr1_haplotype3 ATGATTATTCGTTCGCCGGAACCAGAAGTCAAAATTTTGGTAGATAGGGATCCCATAAAAACTTCTTTCG AGGAATGGGCTAAACCCGGTCATTTCTCAAGAACAATAGCTAAGGGACCTGATACTACCACTTGGATCTG GAACCTACATGCTGATGCTCACGATTTTGATAGTCATACCAGTGATTTGGAGGAAATCTCTCGAAAAGTA @@ -13,7 +13,7 @@ TACCGCTTCCTCATGAATTTATCTTGAATCGGGATCTTTTGGCTCAACTTTATCCAAGTTTTGCTGAAGG AGCAACTCCCTTTTTTACCTTAAATTGGTCAAAATACTCGGAATTTCTTACTTTTCGTGGCGGATTAGAT CCAGTGACTGGGGGTCTATGGTTAACCGATATAGCACATCATCATTTAGCTATCGCAATTCTTTTTCTAA TCGCGGGTCATATGTATAGGACC ->chr2_haplotype1 +>chr2_haplotype3 AGGTCCATTTACAGGCCAAGGCCATAAAGGCCTATATGAAATTCTAACAACATCATGGCATGCTCAATTA TCTCTTAACCTAGCTATGTTAGGCTCTTTAACCATTATTGTAGCTCACCATATGTATTCCATGCCCCCTT ATCCATATCTAGCTACTGACTATGCTACACAACTGTCATTGTTCACACATCACATGTGGATTGGTGGATT @@ -23,7 +23,7 @@ TTCTAGGCTTCCACAGTTTTGGTTTGTATATTCATAATGATACCATGAGTGCTTTAGGGCGTCCACAAGA TATGTTTTCAGATACTGCTATACAATTACAACCAGTCTTTGCTCAATGGATACAAAATACCCATGCTTTA GCACCTGGTGTAACAGCCCCTGGTGAAACAGCGAGCACCAGTTTGACTTGGGGGGGCGGTGAGTTAGTAG CAGTGGGTGGCAAAGTAGCTTT ->chr3_haplotype1 +>chr3_haplotype2 TGCATTTACAATTCATGTGACGGTATTGATACTGTTGAAAGGTGTTCTATTTGCTCGTAGCTCGCGTTTA ATACCAGATAAAGCAAATCTTGGTTTTCGTTTCCCTTGTGATGGGCCGGGAAGAGGAGGAACATGTCAAG TATCTGCTTGGGATCATGTCTTCTTAGGACTATTCTGGATGTACAATGCTATTTCCGTAGTAATATTCCA @@ -48,7 +48,7 @@ TTTGGCTTGGACAGGACATTTAGTTCATGTTGCTATTCCCGGATCCTCTAGGGGGGAGTACGTTCGATGG AATAATTTCTTAGATGTATTACCCTATCCCCAGGGGTTGGGTCCCCTTCTGACGGGTCAGTGGAATCTTT ATGCCCAAAATCCTGATTCGAGTAATCATTTATTTGGTACCACTCAAGGAGCGGGAACTGCCATTCTGAC CCTTCTTGGGGGATTCC ->chr2_haplotype2 +>chr2_haplotype1 ATTGCATTTATTTTTCTCATTGCCGGTCATATGTATCGAACTAACTTCGGAATTGGGCACAGTATCAAAG ATCTTTTAGAAGCACATACTCCTCCGGGGGGTCGATTAGGACGTGGGCATAAAGGCCTTTATGATACAAT CAATAATTCGATTCATTTTCAATTAGGCCTTGCTCTAGCTTCCTTAGGGGTTATTACTTCCTTAGTAGCT @@ -61,7 +61,7 @@ TCATGGTAAGACGACATATGGGTTCGATATACTCTTATCTTCAACGAATGGCCCCACTTTCAATGCAGGT CGAAACATATGGTTGCCCGGATGGTTGAATGCTGTTAATGAGAATAGTAATTCGCTTTTCTTAACAATAG GACCTGGGGATTTCTTGGTTCATCATGCTATTGCTCTAGGTTTGCATACAACTACATTGATTTTAGTAAA GGGTGCTTTAGATGCACGCGGTTCCAAATTAATGCCGGATAAAAAGGATTTCGGGTATAGTTTT ->chr3_haplotype2 +>chr3_haplotype1 GACGGCCCAGGGCGCGGCGGTACTTGTGATATTTCTGCTTGGGACGCGTTTTATTTGGCAGTTTTCTGGA TGTTAAATACCATTGGATGGGTTACTTTTTATTGGCATTGGAAACACATTACATTATGGCAGGGCAACGT TTCACAATTTAATGAATCCTCCACTTATTTGATGGGATGGTTAAGAGATTACCTATGGTTAAACTCTTCA From 164a5962e31f5894cfde47246f9fa7a3303b49c9 Mon Sep 17 00:00:00 2001 From: Amy Date: Tue, 21 Feb 2023 00:08:12 +0300 Subject: [PATCH 05/29] Revert "Added ploid value to the help menu" This reverts commit c9f391234decfb98e8b506caef8256e1858cfa6f. --- quast_libs/qconfig.py | 166 +++++++++++++++++++++--------------------- 1 file changed, 83 insertions(+), 83 deletions(-) diff --git a/quast_libs/qconfig.py b/quast_libs/qconfig.py index a50fe0115a..5e57ec87f4 100644 --- a/quast_libs/qconfig.py +++ b/quast_libs/qconfig.py @@ -418,102 +418,102 @@ def usage(show_hidden=False, mode=None, short=True, stream=sys.stdout): stream.write("These are basic options. To see the full list, use --help\n") else: stream.write("Advanced options:\n") - stream.write("-s --split-scaffolds Split assemblies by continuous fragments of N's and add such \"contigs\" to the comparison\n") - stream.write("-l --labels \"label, label, ...\" Names of assemblies to use in reports, comma-separated. If contain spaces, use quotes\n") - stream.write("-L Take assembly names from their parent directory names\n") + stream.write("-s --split-scaffolds Split assemblies by continuous fragments of N's and add such \"contigs\" to the comparison\n") + stream.write("-l --labels \"label, label, ...\" Names of assemblies to use in reports, comma-separated. If contain spaces, use quotes\n") + stream.write("-L Take assembly names from their parent directory names\n") if mode != 'large': - stream.write("-e --eukaryote Genome is eukaryotic (primarily affects gene prediction)\n") - stream.write(" --fungus Genome is fungal (primarily affects gene prediction)\n") - stream.write(" --large Use optimal parameters for evaluation of large genomes\n") - stream.write(" In particular, imposes '-e -m %d -i %d -x %d' (can be overridden manually)\n" % + stream.write("-e --eukaryote Genome is eukaryotic (primarily affects gene prediction)\n") + stream.write(" --fungus Genome is fungal (primarily affects gene prediction)\n") + stream.write(" --large Use optimal parameters for evaluation of large genomes\n") + stream.write(" In particular, imposes '-e -m %d -i %d -x %d' (can be overridden manually)\n" % (LARGE_MIN_CONTIG, LARGE_MIN_ALIGNMENT, LARGE_EXTENSIVE_MIS_THRESHOLD)) - stream.write("-k --k-mer-stats Compute k-mer-based quality metrics (recommended for large genomes)\n" - " This may significantly increase memory and time consumption on large genomes\n") - stream.write(" --k-mer-size Size of k used in --k-mer-stats [default: %d]\n" % unique_kmer_len) - stream.write(" --circos Draw Circos plot\n") + stream.write("-k --k-mer-stats Compute k-mer-based quality metrics (recommended for large genomes)\n" + " This may significantly increase memory and time consumption on large genomes\n") + stream.write(" --k-mer-size Size of k used in --k-mer-stats [default: %d]\n" % unique_kmer_len) + stream.write(" --circos Draw Circos plot\n") if mode == 'meta': - stream.write("-f --gene-finding Predict genes using MetaGeneMark\n") + stream.write("-f --gene-finding Predict genes using MetaGeneMark\n") elif mode == 'large': - stream.write("-f --gene-finding Predict genes using GeneMark-ES\n") + stream.write("-f --gene-finding Predict genes using GeneMark-ES\n") else: - stream.write("-f --gene-finding Predict genes using GeneMarkS (prokaryotes, default) or GeneMark-ES (eukaryotes, use --eukaryote)\n") + stream.write("-f --gene-finding Predict genes using GeneMarkS (prokaryotes, default) or GeneMark-ES (eukaryotes, use --eukaryote)\n") if not meta: - stream.write(" --mgm Use MetaGeneMark for gene prediction (instead of the default finder, see above)\n") - stream.write(" --glimmer Use GlimmerHMM for gene prediction (instead of the default finder, see above)\n") - stream.write(" --gene-thresholds Comma-separated list of threshold lengths of genes to search with Gene Finding module\n") - stream.write(" [default: %s]\n" % genes_lengths) - stream.write(" --rna-finding Predict ribosomal RNA genes using Barrnap\n") - stream.write("-b --conserved-genes-finding Count conserved orthologs using BUSCO (only on Linux)\n") - stream.write(" --operons File with operon coordinates in the reference (GFF, BED, NCBI or TXT)\n") + stream.write(" --mgm Use MetaGeneMark for gene prediction (instead of the default finder, see above)\n") + stream.write(" --glimmer Use GlimmerHMM for gene prediction (instead of the default finder, see above)\n") + stream.write(" --gene-thresholds Comma-separated list of threshold lengths of genes to search with Gene Finding module\n") + stream.write(" [default: %s]\n" % genes_lengths) + stream.write(" --rna-finding Predict ribosomal RNA genes using Barrnap\n") + stream.write("-b --conserved-genes-finding Count conserved orthologs using BUSCO (only on Linux)\n") + stream.write(" --operons File with operon coordinates in the reference (GFF, BED, NCBI or TXT)\n") if not meta: - stream.write(" --est-ref-size Estimated reference size (for computing NGx metrics without a reference)\n") + stream.write(" --est-ref-size Estimated reference size (for computing NGx metrics without a reference)\n") else: - stream.write(" --max-ref-number Maximum number of references (per each assembly) to download after looking in SILVA database.\n") - stream.write(" Set 0 for not looking in SILVA at all [default: %s]\n" % max_references) - stream.write(" --blast-db Custom BLAST database (.nsq file). By default, MetaQUAST searches references in SILVA database\n") - stream.write(" --use-input-ref-order Use provided order of references in MetaQUAST summary plots (default order: by the best average value)\n") - stream.write(" --contig-thresholds Comma-separated list of contig length thresholds [default: %s]\n" % contig_thresholds) - stream.write(" --x-for-Nx Value of 'x' for Nx, Lx, etc metrics reported in addition to N50, L50, etc (0, 100) [default: %s]\n" % x_for_additional_Nx) + stream.write(" --max-ref-number Maximum number of references (per each assembly) to download after looking in SILVA database.\n") + stream.write(" Set 0 for not looking in SILVA at all [default: %s]\n" % max_references) + stream.write(" --blast-db Custom BLAST database (.nsq file). By default, MetaQUAST searches references in SILVA database\n") + stream.write(" --use-input-ref-order Use provided order of references in MetaQUAST summary plots (default order: by the best average value)\n") + stream.write(" --contig-thresholds Comma-separated list of contig length thresholds [default: %s]\n" % contig_thresholds) + stream.write(" --x-for-Nx Value of 'x' for Nx, Lx, etc metrics reported in addition to N50, L50, etc (0, 100) [default: %s]\n" % x_for_additional_Nx) if meta: stream.write(" --reuse-combined-alignments Reuse the alignments from the combined_reference stage on runs_per_reference stages.\n") - stream.write("-u --use-all-alignments Compute genome fraction, # genes, # operons in QUAST v1.* style.\n") - stream.write(" By default, QUAST filters Minimap\'s alignments to keep only best ones\n") - stream.write("-i --min-alignment The minimum alignment length [default: %s]\n" % i_default) - stream.write(" --min-identity The minimum alignment identity (80.0, 100.0) [default: %.1f]\n" % min_idy_default) - stream.write("-a --ambiguity-usage Use none, one, ploid number / ploidy number / amount of ploidy or all alignments of a contig when all of them\n") - stream.write(" are almost equally good (see --ambiguity-score) [default: %s]\n" % ambiguity_usage) - stream.write(" --ambiguity-score Score S for defining equally good alignments of a single contig. All alignments are sorted \n") - stream.write(" by decreasing LEN * IDY% value. All alignments with LEN * IDY% < S * best(LEN * IDY%) are \n") - stream.write(" discarded. S should be between 0.8 and 1.0 [default: %.2f]\n" % ambiguity_score) + stream.write("-u --use-all-alignments Compute genome fraction, # genes, # operons in QUAST v1.* style.\n") + stream.write(" By default, QUAST filters Minimap\'s alignments to keep only best ones\n") + stream.write("-i --min-alignment The minimum alignment length [default: %s]\n" % i_default) + stream.write(" --min-identity The minimum alignment identity (80.0, 100.0) [default: %.1f]\n" % min_idy_default) + stream.write("-a --ambiguity-usage Use none, one, or all alignments of a contig when all of them\n") + stream.write(" are almost equally good (see --ambiguity-score) [default: %s]\n" % ambiguity_usage) + stream.write(" --ambiguity-score Score S for defining equally good alignments of a single contig. All alignments are sorted \n") + stream.write(" by decreasing LEN * IDY% value. All alignments with LEN * IDY% < S * best(LEN * IDY%) are \n") + stream.write(" discarded. S should be between 0.8 and 1.0 [default: %.2f]\n" % ambiguity_score) if meta: stream.write(" --unique-mapping Disable --ambiguity-usage=all for the combined reference run,\n") stream.write(" i.e. use user-specified or default ('%s') value of --ambiguity-usage\n" % ambiguity_usage) - stream.write(" --strict-NA Break contigs in any misassembly event when compute NAx and NGAx.\n") - stream.write(" By default, QUAST breaks contigs only by extensive misassemblies (not local ones)\n") - stream.write("-x --extensive-mis-size Lower threshold for extensive misassembly size. All relocations with inconsistency\n") - stream.write(" less than extensive-mis-size are counted as local misassemblies [default: %s]\n" % x_default) - stream.write(" --local-mis-size Lower threshold on local misassembly size. Local misassemblies with inconsistency\n") - stream.write(" less than local-mis-size are counted as (long) indels [default: %s]\n" % local_misassembly_min_length) - stream.write(" --scaffold-gap-max-size Max allowed scaffold gap length difference. All relocations with inconsistency\n") - stream.write(" less than scaffold-gap-size are counted as scaffold gap misassemblies [default: %s]\n" % scaffolds_gap_threshold) - stream.write(" --unaligned-part-size Lower threshold for detecting partially unaligned contigs. Such contig should have\n") - stream.write(" at least one unaligned fragment >= the threshold [default: %s]\n" % unaligned_part_size) - stream.write(" --skip-unaligned-mis-contigs Do not distinguish contigs with >= 50% unaligned bases as a separate group\n") - stream.write(" By default, QUAST does not count misassemblies in them\n") - stream.write(" --fragmented Reference genome may be fragmented into small pieces (e.g. scaffolded reference) \n") - stream.write(" --fragmented-max-indent Mark translocation as fake if both alignments are located no further than N bases \n") - stream.write(" from the ends of the reference fragments [default: %s]\n" % local_misassembly_min_length) - stream.write(" Requires --fragmented option\n") - stream.write(" --upper-bound-assembly Simulate upper bound assembly based on the reference genome and reads\n") - stream.write(" --upper-bound-min-con Minimal number of 'connecting reads' needed for joining upper bound contigs into a scaffold\n") - stream.write(" [default: %d for mate-pairs and %d for long reads]\n" % (MIN_CONNECT_MP, MIN_CONNECT_LR)) - stream.write(" --est-insert-size Use provided insert size in upper bound assembly simulation [default: auto detect from reads or %d]\n" % optimal_assembly_default_IS) - stream.write(" --report-all-metrics Keep all quality metrics in the main report even if their values are '-' for all assemblies or \n" - " if they are not applicable (e.g., reference-based metrics in the no-reference mode)\n") - stream.write(" --plots-format Save plots in specified format [default: %s].\n" % plot_extension) - stream.write(" Supported formats: %s\n" % ', '.join(supported_plot_extensions)) - stream.write(" --memory-efficient Run everything using one thread, separately per each assembly.\n") - stream.write(" This may significantly reduce memory consumption on large genomes\n") - stream.write(" --space-efficient Create only reports and plots files. Aux files including .stdout, .stderr, .coords will not be created.\n") - stream.write(" This may significantly reduce space consumption on large genomes. Icarus viewers also will not be built\n") - stream.write("-1 --pe1 File with forward paired-end reads (in FASTQ format, may be gzipped)\n") - stream.write("-2 --pe2 File with reverse paired-end reads (in FASTQ format, may be gzipped)\n") - stream.write(" --pe12 File with interlaced forward and reverse paired-end reads. (in FASTQ format, may be gzipped)\n") - stream.write(" --mp1 File with forward mate-pair reads (in FASTQ format, may be gzipped)\n") - stream.write(" --mp2 File with reverse mate-pair reads (in FASTQ format, may be gzipped)\n") - stream.write(" --mp12 File with interlaced forward and reverse mate-pair reads (in FASTQ format, may be gzipped)\n") - stream.write(" --single File with unpaired short reads (in FASTQ format, may be gzipped)\n") - stream.write(" --pacbio File with PacBio reads (in FASTQ format, may be gzipped)\n") - stream.write(" --nanopore File with Oxford Nanopore reads (in FASTQ format, may be gzipped)\n") - stream.write(" --ref-sam SAM alignment file obtained by aligning reads to reference genome file\n") - stream.write(" --ref-bam BAM alignment file obtained by aligning reads to reference genome file\n") - stream.write(" --sam Comma-separated list of SAM alignment files obtained by aligning reads to assemblies\n" - " (use the same order as for files with contigs)\n") - stream.write(" --bam Comma-separated list of BAM alignment files obtained by aligning reads to assemblies\n" - " (use the same order as for files with contigs)\n") - stream.write(" Reads (or SAM/BAM file) are used for structural variation detection and\n") - stream.write(" coverage histogram building in Icarus\n") - stream.write(" --sv-bedpe File with structural variations (in BEDPE format)\n") + stream.write(" --strict-NA Break contigs in any misassembly event when compute NAx and NGAx.\n") + stream.write(" By default, QUAST breaks contigs only by extensive misassemblies (not local ones)\n") + stream.write("-x --extensive-mis-size Lower threshold for extensive misassembly size. All relocations with inconsistency\n") + stream.write(" less than extensive-mis-size are counted as local misassemblies [default: %s]\n" % x_default) + stream.write(" --local-mis-size Lower threshold on local misassembly size. Local misassemblies with inconsistency\n") + stream.write(" less than local-mis-size are counted as (long) indels [default: %s]\n" % local_misassembly_min_length) + stream.write(" --scaffold-gap-max-size Max allowed scaffold gap length difference. All relocations with inconsistency\n") + stream.write(" less than scaffold-gap-size are counted as scaffold gap misassemblies [default: %s]\n" % scaffolds_gap_threshold) + stream.write(" --unaligned-part-size Lower threshold for detecting partially unaligned contigs. Such contig should have\n") + stream.write(" at least one unaligned fragment >= the threshold [default: %s]\n" % unaligned_part_size) + stream.write(" --skip-unaligned-mis-contigs Do not distinguish contigs with >= 50% unaligned bases as a separate group\n") + stream.write(" By default, QUAST does not count misassemblies in them\n") + stream.write(" --fragmented Reference genome may be fragmented into small pieces (e.g. scaffolded reference) \n") + stream.write(" --fragmented-max-indent Mark translocation as fake if both alignments are located no further than N bases \n") + stream.write(" from the ends of the reference fragments [default: %s]\n" % local_misassembly_min_length) + stream.write(" Requires --fragmented option\n") + stream.write(" --upper-bound-assembly Simulate upper bound assembly based on the reference genome and reads\n") + stream.write(" --upper-bound-min-con Minimal number of 'connecting reads' needed for joining upper bound contigs into a scaffold\n") + stream.write(" [default: %d for mate-pairs and %d for long reads]\n" % (MIN_CONNECT_MP, MIN_CONNECT_LR)) + stream.write(" --est-insert-size Use provided insert size in upper bound assembly simulation [default: auto detect from reads or %d]\n" % optimal_assembly_default_IS) + stream.write(" --report-all-metrics Keep all quality metrics in the main report even if their values are '-' for all assemblies or \n" + " if they are not applicable (e.g., reference-based metrics in the no-reference mode)\n") + stream.write(" --plots-format Save plots in specified format [default: %s].\n" % plot_extension) + stream.write(" Supported formats: %s\n" % ', '.join(supported_plot_extensions)) + stream.write(" --memory-efficient Run everything using one thread, separately per each assembly.\n") + stream.write(" This may significantly reduce memory consumption on large genomes\n") + stream.write(" --space-efficient Create only reports and plots files. Aux files including .stdout, .stderr, .coords will not be created.\n") + stream.write(" This may significantly reduce space consumption on large genomes. Icarus viewers also will not be built\n") + stream.write("-1 --pe1 File with forward paired-end reads (in FASTQ format, may be gzipped)\n") + stream.write("-2 --pe2 File with reverse paired-end reads (in FASTQ format, may be gzipped)\n") + stream.write(" --pe12 File with interlaced forward and reverse paired-end reads. (in FASTQ format, may be gzipped)\n") + stream.write(" --mp1 File with forward mate-pair reads (in FASTQ format, may be gzipped)\n") + stream.write(" --mp2 File with reverse mate-pair reads (in FASTQ format, may be gzipped)\n") + stream.write(" --mp12 File with interlaced forward and reverse mate-pair reads (in FASTQ format, may be gzipped)\n") + stream.write(" --single File with unpaired short reads (in FASTQ format, may be gzipped)\n") + stream.write(" --pacbio File with PacBio reads (in FASTQ format, may be gzipped)\n") + stream.write(" --nanopore File with Oxford Nanopore reads (in FASTQ format, may be gzipped)\n") + stream.write(" --ref-sam SAM alignment file obtained by aligning reads to reference genome file\n") + stream.write(" --ref-bam BAM alignment file obtained by aligning reads to reference genome file\n") + stream.write(" --sam Comma-separated list of SAM alignment files obtained by aligning reads to assemblies\n" + " (use the same order as for files with contigs)\n") + stream.write(" --bam Comma-separated list of BAM alignment files obtained by aligning reads to assemblies\n" + " (use the same order as for files with contigs)\n") + stream.write(" Reads (or SAM/BAM file) are used for structural variation detection and\n") + stream.write(" coverage histogram building in Icarus\n") + stream.write(" --sv-bedpe File with structural variations (in BEDPE format)\n") stream.write("\n") stream.write("Speedup options:\n") stream.write(" --no-check Do not check and correct input fasta files. Use at your own risk (see manual)\n") From bb770d3b90171ad365ae217836a197c769715431 Mon Sep 17 00:00:00 2001 From: Amy Date: Tue, 21 Feb 2023 15:08:17 +0300 Subject: [PATCH 06/29] Added --ambiguity-usage "ploid" option to the help menu. --- quast_libs/qconfig.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quast_libs/qconfig.py b/quast_libs/qconfig.py index 5e57ec87f4..30cd5b3cbc 100644 --- a/quast_libs/qconfig.py +++ b/quast_libs/qconfig.py @@ -460,8 +460,8 @@ def usage(show_hidden=False, mode=None, short=True, stream=sys.stdout): stream.write(" By default, QUAST filters Minimap\'s alignments to keep only best ones\n") stream.write("-i --min-alignment The minimum alignment length [default: %s]\n" % i_default) stream.write(" --min-identity The minimum alignment identity (80.0, 100.0) [default: %.1f]\n" % min_idy_default) - stream.write("-a --ambiguity-usage Use none, one, or all alignments of a contig when all of them\n") - stream.write(" are almost equally good (see --ambiguity-score) [default: %s]\n" % ambiguity_usage) + stream.write("-a --ambiguity-usage