From dd1936425a1f3742ed5b05ed88185a42cde1a060 Mon Sep 17 00:00:00 2001 From: ozemsbg Date: Tue, 20 Nov 2018 15:45:53 +0300 Subject: [PATCH 1/2] Overlappig deletion check for sample-genome and unit test --- mitty/simulation/genome/sampledgenome.py | 36 ++++--- mitty/test/data/gt_free.vcf | 99 ----------------- .../test/data/test-sample-genome-with-af.vcf | 100 ++++++++++++++++++ .../simulation/genome/test_sampledgenome.py | 49 +++++++-- 4 files changed, 161 insertions(+), 123 deletions(-) delete mode 100644 mitty/test/data/gt_free.vcf create mode 100644 mitty/test/data/test-sample-genome-with-af.vcf diff --git a/mitty/simulation/genome/sampledgenome.py b/mitty/simulation/genome/sampledgenome.py index 84ee92f..d83d5a6 100644 --- a/mitty/simulation/genome/sampledgenome.py +++ b/mitty/simulation/genome/sampledgenome.py @@ -10,22 +10,30 @@ def assign_random_gt(input_vcf, output, sample_name="HG", default_af=0.01, seed= new_header.samples.add(sample_name) output.write(str(new_header)) - default_probs = [1 - default_af * (1 + default_af), default_af/2, default_af/2, default_af * default_af] + gt = ["0|0", "0|1", "1|0", "1|1"] + default_probs_all_gt = [1 - default_af * (1 + default_af), default_af/2, default_af/2, default_af * default_af] + default_probs_no_hom_alt = [1 - default_af, default_af] rng = RandomState(seed) - previous_pos = 0 + prev_locus = -1, -1 for rec in vcf_pointer.fetch(): - rec_copy = rec.copy() - if "GT" not in rec_copy.format.keys(): - if rec_copy.pos == previous_pos: - c = "0|0" - else: - if "AF" not in rec_copy.info.keys(): - gt_probs = default_probs + r_copy = rec.copy() + if "GT" not in r_copy.format.keys(): + locus = r_copy.pos + ok_alts = (locus > prev_locus[0]), (locus > prev_locus[1]) + if ok_alts[0] or ok_alts[1]: + hom_ok = ok_alts[0] and ok_alts[1] + if "AF" not in r_copy.info.keys(): + gt_p = default_probs_all_gt if hom_ok else default_probs_no_hom_alt else: - af = rec_copy.info["AF"] - gt_probs = [1 - af * (1 + af), af/2, af/2, af * af] - c = rng.choice(["0|0", "0|1", "1|0", "1|1"], p=gt_probs) - output.write("\t".join([str(rec_copy)[:-1], "GT", c]) + "\n") - previous_pos = rec_copy.pos + af = r_copy.info["AF"] + gt_p = [1 - af * (1 + af), af/2, af/2, af * af] if hom_ok else [1 - af, af] + c = rng.choice(gt if hom_ok else (["0|.", "1|."] if ok_alts[0] else [".|0", ".|1"]), p=gt_p) + prev_locus = (locus + r_copy.rlen - 1) if c[0] == '1' else locus, \ + (locus + r_copy.rlen - 1) if c[2] == '1' else locus + else: + c = ".|." + prev_locus = locus, locus + + output.write("\t".join([str(r_copy)[:-1], "GT", c]) + "\n") vcf_pointer.close() diff --git a/mitty/test/data/gt_free.vcf b/mitty/test/data/gt_free.vcf deleted file mode 100644 index e6b7bb9..0000000 --- a/mitty/test/data/gt_free.vcf +++ /dev/null @@ -1,99 +0,0 @@ -##fileformat=VCFv4.1 -##FILTER= -##contig= -#CHROM POS ID REF ALT QUAL FILTER INFO -1 10146 . AC A 10477.8 . . -1 10177 rs367896724 A AC 100 PASS . -1 10352 rs555500075 T TA 100 PASS . -1 10403 . ACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC A 2352.6 . . -1 10409 . ACCCTAACCCTAACCCTAACCCTAACCCTAAC A 4101.66 . . -1 10415 . ACCCTAACCCTAACCCTAACCCTAAC A 7015.42 . . -1 10439 . AC A 42046.2 . . -1 10616 rs376342519 CCGCCGTTGCAAAGGCGCGCCG C 2500.98 PASS . -1 10622 . T G 8160.65 PASS . -1 10623 . T C 8017.39 PASS . -1 11008 rs575272151 C G 100 PASS . -1 11012 rs544419019 C G 100 PASS . -1 12783 . G A 69628 PASS . -1 12882 . C G 2488.76 PASS . -1 13110 rs540538026 G A 100 PASS . -1 13116 rs62635286 T G 100 PASS . -1 13118 rs200579949 A G 100 PASS . -1 13273 rs531730856 G C 100 PASS . -1 13417 . C CGAGA 8428.59 . . -1 13656 . CAG C 59960.3 . . -1 13957 . TC T 2780.52 . . -1 14397 . CTGT C 3266.75 . . -1 14464 rs546169444 A T 100 PASS . -1 14599 rs531646671 T A 100 PASS . -1 14604 rs541940975 A G 100 PASS . -1 14930 rs75454623 A G 100 PASS . -1 14933 rs199856693 G A 100 PASS . -1 15211 rs78601809 T G 100 PASS . -1 15274 rs62636497 A G 232859 PASS . -1 15274 rs62636497 A T 232859 PASS . -1 15774 rs374029747 G A 100 PASS . -1 15777 rs568149713 A G 100 PASS . -1 15903 rs557514207 G GC 14510.5 PASS . -1 16742 . G C 8160.34 . . -1 16742 . G GTGGTGGTGC 8160.34 . . -1 16949 rs199745162 A C 100 PASS . -1 17594 . C T 1349.45 PASS . -1 17614 . G A 4329.84 PASS . -1 17697 . G C 11485.6 PASS . -1 17722 . A G 10754.2 PASS . -1 17728 . G T 1186.09 PASS . -1 17730 . C A 9976.88 PASS . -1 17746 . A G 41569.4 PASS . -1 17765 . G A 5953 PASS . -1 18849 rs533090414 C G 100 PASS . -1 18934 . C T 880.87 PASS . -1 19004 . A G 7681.64 PASS . -1 19190 . GC G 15361.2 . . -1 19432 . A G 9791.2 . . -1 19432 . ATGG A 9791.2 . . -1 19872 . G C 380.03 PASS . -1 20094 . TAA T 7959.68 . . -1 20098 . CAG C 51648.2 . . -1 20316 . GA G 8228.66 . . -1 20321 . A C 105294 PASS . -1 20467 . C T 85296.9 PASS . -1 20512 . G T 41287.9 PASS . -1 20547 . A G 4457.88 PASS . -1 20595 . A G 5331.1 PASS . -1 20628 . A G 1449.47 PASS . -1 23343 . C A 621.72 PASS . -1 28326 . GT G 3140.23 . . -1 28343 . G C 2038.73 PASS . -1 28354 . C T 1812.24 PASS . -1 28367 . G A 499.33 PASS . -1 28376 . G A 1384.9 PASS . -1 28494 . T C 1391.82 PASS . -1 28563 . A G 12646.9 PASS . -1 28590 . T TTGG 1894.86 . . -1 28663 . T A 276.97 PASS . -1 28835 . A G 849.36 PASS . -1 29571 . G C 1812.1 PASS . -1 30548 . T G 7557.21 PASS . -1 30912 . CTCTCTCTCTCGCTATCTCATTTT C 1870.96 . . -1 30923 rs806731 G T 100 PASS . -1 38775 . C A 394.77 PASS . -1 39406 . G C 11561 PASS . -1 39423 . T C 3037.34 PASS . -1 39430 . A C 1650.01 PASS . -1 39487 . A AC 1442.83 . . -1 46633 . T A 8419.66 PASS . -1 46873 . A T 21342.4 PASS . -1 47159 rs540662756 T C 100 PASS . -1 47190 . G GA 1140.99 . . -1 47960 . T A 21972.4 PASS . -1 48448 . C T 14635.2 PASS . -1 48518 . A G 565.8 PASS . -1 49298 rs200943160 T C 100 PASS . -1 49514 . AG A 1766.83 . . -1 49554 rs539322794 A G 100 PASS . -1 50481 . G GGT 4416.44 . . -1 50481 . GGT G 4416.44 . . -1 51479 rs116400033 T A 33993.1 PASS . -1 51762 rs559190862 A G 100 PASS . -1 51765 rs575564077 C G 100 PASS . \ No newline at end of file diff --git a/mitty/test/data/test-sample-genome-with-af.vcf b/mitty/test/data/test-sample-genome-with-af.vcf new file mode 100644 index 0000000..17b0887 --- /dev/null +++ b/mitty/test/data/test-sample-genome-with-af.vcf @@ -0,0 +1,100 @@ +##fileformat=VCFv4.1 +##FILTER= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 10146 . AC A 10477.8 . AF=0.2 +1 10177 rs367896724 A AC 100 PASS AF=0.3 +1 10352 rs555500075 T TA 100 PASS AF=0.2 +1 10403 . ACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC A 2352.6 . AF=0.3 +1 10409 . ACCCTAACCCTAACCCTAACCCTAACCCTAAC A 4101.66 . AF=0.2 +1 10415 . ACCCTAACCCTAACCCTAACCCTAAC A 7015.42 . AF=0.15 +1 10439 . AC A 42046.2 . AF=0.2 +1 10616 rs376342519 CCGCCGTTGCAAAGGCGCGCCG C 2500.98 PASS AF=0.15 +1 10622 . T G 8160.65 PASS AF=0.2 +1 10623 . T C 8017.39 PASS AF=0.3 +1 11008 rs575272151 C G 100 PASS AF=0.2 +1 11012 rs544419019 C G 100 PASS AF=0.3 +1 12783 . G A 69628 PASS AF=0.2 +1 12882 . C G 2488.76 PASS AF=0.15 +1 13110 rs540538026 G A 100 PASS AF=0.2 +1 13116 rs62635286 T G 100 PASS AF=0.15 +1 13118 rs200579949 A G 100 PASS AF=0.2 +1 13273 rs531730856 G C 100 PASS AF=0.3 +1 13417 . C CGAGA 8428.59 . . +1 13656 . CAG C 59960.3 . . +1 13957 . TC T 2780.52 . . +1 14397 . CTGT C 3266.75 . . +1 14464 rs546169444 A T 100 PASS AF=0.1 +1 14599 rs531646671 T A 100 PASS AF=0.2 +1 14604 rs541940975 A G 100 PASS AF=0.15 +1 14930 rs75454623 A G 100 PASS AF=0.2 +1 14933 rs199856693 G A 100 PASS AF=0.3 +1 15211 rs78601809 T G 100 PASS AF=0.2 +1 15274 rs62636497 A G 232859 PASS AF=0.25 +1 15274 rs62636497 A T 232859 PASS AF=0.2 +1 15774 rs374029747 G A 100 PASS AF=0.25 +1 15777 rs568149713 A G 100 PASS AF=0.2 +1 15903 rs557514207 G GC 14510.5 PASS AF=0.15 +1 16742 . G C 8160.34 . . +1 16742 . G GTGGTGGTGC 8160.34 . . +1 16949 rs199745162 A C 100 PASS AF=0.2 +1 17594 . C T 1349.45 PASS AF=0.3 +1 17614 . G A 4329.84 PASS AF=0.2 +1 17697 . G C 11485.6 PASS AF=0.3 +1 17722 . A G 10754.2 PASS AF=0.2 +1 17728 . G T 1186.09 PASS AF=0.22 +1 17730 . C A 9976.88 PASS AF=0.2 +1 17746 . A G 41569.4 PASS AF=0.15 +1 17765 . G A 5953 PASS AF=0.2 +1 18849 rs533090414 C G 100 PASS AF=0.12 +1 18934 . C T 880.87 PASS AF=0.2 +1 19004 . A G 7681.64 PASS AF=0.3 +1 19190 . GC G 15361.2 . . +1 19432 . A G 9791.2 . . +1 19432 . ATGG A 9791.2 . . +1 19872 . G C 380.03 PASS AF=0.2 +1 20094 . TAA T 7959.68 . . +1 20098 . CAG C 51648.2 . . +1 20316 . GA G 8228.66 . . +1 20321 . A C 105294 PASS AF=0.2 +1 20467 . C T 85296.9 PASS AF=0.3 +1 20512 . G T 41287.9 PASS AF=0.2 +1 20547 . A G 4457.88 PASS AF=0.25 +1 20595 . A G 5331.1 PASS AF=0.2 +1 20628 . A G 1449.47 PASS AF=0.15 +1 23343 . C A 621.72 PASS AF=0.2 +1 28326 . GT G 3140.23 . . +1 28343 . G C 2038.73 PASS AF=0.12 +1 28354 . C T 1812.24 PASS AF=0.2 +1 28367 . G A 499.33 PASS AF=0.21 +1 28376 . G A 1384.9 PASS AF=0.2 +1 28494 . T C 1391.82 PASS AF=0.2 +1 28563 . A G 12646.9 PASS AF=0.2 +1 28590 . T TTGG 1894.86 . . +1 28663 . T A 276.97 PASS AF=0.3 +1 28835 . A G 849.36 PASS AF=0.2 +1 29571 . G C 1812.1 PASS AF=0.28 +1 30548 . T G 7557.21 PASS AF=0.2 +1 30912 . CTCTCTCTCTCGCTATCTCATTTT C 1870.96 . . +1 30923 rs806731 G T 100 PASS AF=0.2 +1 38775 . C A 394.77 PASS AF=0.15 +1 39406 . G C 11561 PASS AF=0.2 +1 39423 . T C 3037.34 PASS AF=0.2 +1 39430 . A C 1650.01 PASS AF=0.13 +1 39487 . A AC 1442.83 . . +1 46633 . T A 8419.66 PASS AF=0.2 +1 46873 . A T 21342.4 PASS AF=0.2 +1 47159 rs540662756 T C 100 PASS AF=0.22 +1 47190 . G GA 1140.99 . . +1 47960 . T A 21972.4 PASS AF=0.2 +1 48448 . C T 14635.2 PASS AF=0.3 +1 48518 . A G 565.8 PASS AF=0.2 +1 49298 rs200943160 T C 100 PASS AF=0.2 +1 49514 . AG A 1766.83 . . +1 49554 rs539322794 A G 100 PASS AF=0.3 +1 50481 . G GGT 4416.44 . . +1 50481 . GGT G 4416.44 . . +1 51479 rs116400033 T A 33993.1 PASS AF=0.15 +1 51762 rs559190862 A G 100 PASS AF=0.2 +1 51765 rs575564077 C G 100 PASS AF=0.2 \ No newline at end of file diff --git a/mitty/test/simulation/genome/test_sampledgenome.py b/mitty/test/simulation/genome/test_sampledgenome.py index 356a071..6c1703a 100644 --- a/mitty/test/simulation/genome/test_sampledgenome.py +++ b/mitty/test/simulation/genome/test_sampledgenome.py @@ -9,24 +9,53 @@ def read_data(outname): vcf_pointer = pysam.VariantFile(filename=outname) sample = vcf_pointer.header.samples[0] - gts = [] + gts = {} for rec in vcf_pointer.fetch(): - gts.append("|".join([str(ind) for ind in rec.samples["HG"].allele_indices])) + gts[rec.pos] = "|".join([(str(ind) if ind is not None else ".") for ind in rec.samples["HG"].allele_indices]) return sample, gts def test_assign_random_gt(): """Simulation: sampled-genome""" tempfilename = "tempfile.vcf" - sg.assign_random_gt(input_vcf=os.path.join(mitty.test.example_data_dir, 'gt_free.vcf'), + + #gt for overlapping deletion is 0|1 + sg.assign_random_gt(input_vcf=os.path.join(mitty.test.example_data_dir, 'test-sample-genome-with-af.vcf'), output=open(tempfilename, "w"), sample_name="HG", default_af=0.1, - seed=5081973) + seed=1) - sample_name, gt_array = read_data(tempfilename) - os.remove(tempfilename) + sample_name, gt_map = read_data(tempfilename) assert sample_name == "HG" - assert "0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|1_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|1" \ - "_1|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_1|0" \ - "_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|1_0|0_0|0_0|0_0|0_0|0_0|0_1|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0" \ - "_0|0_1|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0" == "_".join(gt_array) + assert gt_map[30923] == "0|." or gt_map[30923] == "1|." + + # gt for overlapping deletion is 0|0 + sg.assign_random_gt(input_vcf=os.path.join(mitty.test.example_data_dir, 'test-sample-genome-with-af.vcf'), + output=open(tempfilename, "w"), sample_name="HG", default_af=0.1, + seed=2) + + _, gt_map = read_data(tempfilename) + assert gt_map[30923] != ".|." + + # gt for overlapping deletion is 1|1 + sg.assign_random_gt(input_vcf=os.path.join(mitty.test.example_data_dir, 'test-sample-genome-with-af.vcf'), + output=open(tempfilename, "w"), sample_name="HG", default_af=0.1, + seed=204) + + _, gt_map = read_data(tempfilename) + assert gt_map[30923] == ".|." + + # gt for overlapping deletion is 1|0 + sg.assign_random_gt(input_vcf=os.path.join(mitty.test.example_data_dir, 'test-sample-genome-with-af.vcf'), + output=open(tempfilename, "w"), sample_name="HG", default_af=0.1, + seed=124) + + _, gt_map = read_data(tempfilename) + assert gt_map[30923] == ".|0" or gt_map[30923] == ".|1" + + os.remove(tempfilename) + + assert "0|0_0|1_0|0_0|0_0|0_0|0_0|0_0|0_0|0_1|0_0|0_0|0_0|0_1|0_0|0_0|0_0|0_0|1_0|0_0|1_0|0_0|0_0|0_0|0_0|0_0|0_0|0_" \ + "1|0_.|._1|1_0|0_0|0_.|._0|0_0|0_0|1_0|0_0|0_0|0_1|0_0|0_0|0_0|0_0|0_0|0_0|1_.|._0|0_0|0_0|0_0|0_1|0_0|0_0|0_" \ + "0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|1_0|0_0|0_0|0_0|0_0|0_0|1_0|0_0|0_1|0_.|0_0|0_0|1_0|0_1|0_1|0_0|0_0|0_0|0_0|0_" \ + "0|1_0|0_0|0_0|0_0|0_1|1_.|._0|0_1|0_0|0" == "_".join(gt_map.values()) From 0afc706262cec62f612d6517385a22c91152599e Mon Sep 17 00:00:00 2001 From: ozemsbg Date: Tue, 20 Nov 2018 15:49:55 +0300 Subject: [PATCH 2/2] Check if an allele is None before working on it --- mitty/lib/vcfio.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/mitty/lib/vcfio.py b/mitty/lib/vcfio.py index 20e5242..725266b 100644 --- a/mitty/lib/vcfio.py +++ b/mitty/lib/vcfio.py @@ -108,18 +108,21 @@ def parse(v, cpy, v_check): raise ValueError("Unusable variants present in VCF. Please filter or refactor these.") alt = v.samples[0].alleles[cpy] - l_r, l_a = len(v.ref), len(alt) - if l_r == 1: - if l_a == 1: - op, op_len = 'X', 0 - else: - op, op_len = 'I', l_a - l_r - elif l_a == 1: - op, op_len = 'D', l_r - l_a - else: - raise ValueError("Complex variants present in VCF. Please filter or refactor these.") + if alt: + l_r, l_a = len(v.ref), len(alt) + if l_r == 1: + if l_a == 1: + op, op_len = 'X', 0 + else: + op, op_len = 'I', l_a - l_r + elif l_a == 1: + op, op_len = 'D', l_r - l_a + else: + raise ValueError("Complex variants present in VCF. Please filter or refactor these.") - return Variant(v.pos, v.ref, v.samples[0].alleles[cpy], op, op_len) + return Variant(v.pos, v.ref, v.samples[0].alleles[cpy], op, op_len) + else: + return None class UnusableVariantFilter: @@ -161,7 +164,7 @@ def _complex_variant(_v): @staticmethod def _angle_bracketed_id(_v, var): for alt in var.alleles: - if alt[0] == '<': + if alt and alt[0] == '<': logger.debug('Angle bracketed variant entry {}:{} {} -> {}'.format(_v.contig, _v.pos, _v.ref, var.alleles)) return True return False