Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 15 additions & 12 deletions mitty/lib/vcfio.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,18 +108,21 @@ def parse(v, cpy, v_check):
raise ValueError("Unusable variants present in VCF. Please filter or refactor these.")

alt = v.samples[0].alleles[cpy]
l_r, l_a = len(v.ref), len(alt)
if l_r == 1:
if l_a == 1:
op, op_len = 'X', 0
else:
op, op_len = 'I', l_a - l_r
elif l_a == 1:
op, op_len = 'D', l_r - l_a
else:
raise ValueError("Complex variants present in VCF. Please filter or refactor these.")
if alt:
l_r, l_a = len(v.ref), len(alt)
if l_r == 1:
if l_a == 1:
op, op_len = 'X', 0
else:
op, op_len = 'I', l_a - l_r
elif l_a == 1:
op, op_len = 'D', l_r - l_a
else:
raise ValueError("Complex variants present in VCF. Please filter or refactor these.")

return Variant(v.pos, v.ref, v.samples[0].alleles[cpy], op, op_len)
return Variant(v.pos, v.ref, v.samples[0].alleles[cpy], op, op_len)
else:
return None


class UnusableVariantFilter:
Expand Down Expand Up @@ -161,7 +164,7 @@ def _complex_variant(_v):
@staticmethod
def _angle_bracketed_id(_v, var):
for alt in var.alleles:
if alt[0] == '<':
if alt and alt[0] == '<':
logger.debug('Angle bracketed variant entry {}:{} {} -> {}'.format(_v.contig, _v.pos, _v.ref, var.alleles))
return True
return False
Expand Down
36 changes: 22 additions & 14 deletions mitty/simulation/genome/sampledgenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,30 @@ def assign_random_gt(input_vcf, output, sample_name="HG", default_af=0.01, seed=
new_header.samples.add(sample_name)
output.write(str(new_header))

default_probs = [1 - default_af * (1 + default_af), default_af/2, default_af/2, default_af * default_af]
gt = ["0|0", "0|1", "1|0", "1|1"]
default_probs_all_gt = [1 - default_af * (1 + default_af), default_af/2, default_af/2, default_af * default_af]
default_probs_no_hom_alt = [1 - default_af, default_af]
rng = RandomState(seed)
previous_pos = 0
prev_locus = -1, -1
for rec in vcf_pointer.fetch():
rec_copy = rec.copy()
if "GT" not in rec_copy.format.keys():
if rec_copy.pos == previous_pos:
c = "0|0"
else:
if "AF" not in rec_copy.info.keys():
gt_probs = default_probs
r_copy = rec.copy()
if "GT" not in r_copy.format.keys():
locus = r_copy.pos
ok_alts = (locus > prev_locus[0]), (locus > prev_locus[1])
if ok_alts[0] or ok_alts[1]:
hom_ok = ok_alts[0] and ok_alts[1]
if "AF" not in r_copy.info.keys():
gt_p = default_probs_all_gt if hom_ok else default_probs_no_hom_alt
else:
af = rec_copy.info["AF"]
gt_probs = [1 - af * (1 + af), af/2, af/2, af * af]
c = rng.choice(["0|0", "0|1", "1|0", "1|1"], p=gt_probs)
output.write("\t".join([str(rec_copy)[:-1], "GT", c]) + "\n")
previous_pos = rec_copy.pos
af = r_copy.info["AF"]
gt_p = [1 - af * (1 + af), af/2, af/2, af * af] if hom_ok else [1 - af, af]
c = rng.choice(gt if hom_ok else (["0|.", "1|."] if ok_alts[0] else [".|0", ".|1"]), p=gt_p)
prev_locus = (locus + r_copy.rlen - 1) if c[0] == '1' else locus, \
(locus + r_copy.rlen - 1) if c[2] == '1' else locus
else:
c = ".|."
prev_locus = locus, locus

output.write("\t".join([str(r_copy)[:-1], "GT", c]) + "\n")

vcf_pointer.close()
99 changes: 0 additions & 99 deletions mitty/test/data/gt_free.vcf

This file was deleted.

100 changes: 100 additions & 0 deletions mitty/test/data/test-sample-genome-with-af.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
##INFO=<ID=AF,Number=1,Type=Float,Description="Allele Frequency">
##contig=<ID=1>
#CHROM POS ID REF ALT QUAL FILTER INFO
1 10146 . AC A 10477.8 . AF=0.2
1 10177 rs367896724 A AC 100 PASS AF=0.3
1 10352 rs555500075 T TA 100 PASS AF=0.2
1 10403 . ACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC A 2352.6 . AF=0.3
1 10409 . ACCCTAACCCTAACCCTAACCCTAACCCTAAC A 4101.66 . AF=0.2
1 10415 . ACCCTAACCCTAACCCTAACCCTAAC A 7015.42 . AF=0.15
1 10439 . AC A 42046.2 . AF=0.2
1 10616 rs376342519 CCGCCGTTGCAAAGGCGCGCCG C 2500.98 PASS AF=0.15
1 10622 . T G 8160.65 PASS AF=0.2
1 10623 . T C 8017.39 PASS AF=0.3
1 11008 rs575272151 C G 100 PASS AF=0.2
1 11012 rs544419019 C G 100 PASS AF=0.3
1 12783 . G A 69628 PASS AF=0.2
1 12882 . C G 2488.76 PASS AF=0.15
1 13110 rs540538026 G A 100 PASS AF=0.2
1 13116 rs62635286 T G 100 PASS AF=0.15
1 13118 rs200579949 A G 100 PASS AF=0.2
1 13273 rs531730856 G C 100 PASS AF=0.3
1 13417 . C CGAGA 8428.59 . .
1 13656 . CAG C 59960.3 . .
1 13957 . TC T 2780.52 . .
1 14397 . CTGT C 3266.75 . .
1 14464 rs546169444 A T 100 PASS AF=0.1
1 14599 rs531646671 T A 100 PASS AF=0.2
1 14604 rs541940975 A G 100 PASS AF=0.15
1 14930 rs75454623 A G 100 PASS AF=0.2
1 14933 rs199856693 G A 100 PASS AF=0.3
1 15211 rs78601809 T G 100 PASS AF=0.2
1 15274 rs62636497 A G 232859 PASS AF=0.25
1 15274 rs62636497 A T 232859 PASS AF=0.2
1 15774 rs374029747 G A 100 PASS AF=0.25
1 15777 rs568149713 A G 100 PASS AF=0.2
1 15903 rs557514207 G GC 14510.5 PASS AF=0.15
1 16742 . G C 8160.34 . .
1 16742 . G GTGGTGGTGC 8160.34 . .
1 16949 rs199745162 A C 100 PASS AF=0.2
1 17594 . C T 1349.45 PASS AF=0.3
1 17614 . G A 4329.84 PASS AF=0.2
1 17697 . G C 11485.6 PASS AF=0.3
1 17722 . A G 10754.2 PASS AF=0.2
1 17728 . G T 1186.09 PASS AF=0.22
1 17730 . C A 9976.88 PASS AF=0.2
1 17746 . A G 41569.4 PASS AF=0.15
1 17765 . G A 5953 PASS AF=0.2
1 18849 rs533090414 C G 100 PASS AF=0.12
1 18934 . C T 880.87 PASS AF=0.2
1 19004 . A G 7681.64 PASS AF=0.3
1 19190 . GC G 15361.2 . .
1 19432 . A G 9791.2 . .
1 19432 . ATGG A 9791.2 . .
1 19872 . G C 380.03 PASS AF=0.2
1 20094 . TAA T 7959.68 . .
1 20098 . CAG C 51648.2 . .
1 20316 . GA G 8228.66 . .
1 20321 . A C 105294 PASS AF=0.2
1 20467 . C T 85296.9 PASS AF=0.3
1 20512 . G T 41287.9 PASS AF=0.2
1 20547 . A G 4457.88 PASS AF=0.25
1 20595 . A G 5331.1 PASS AF=0.2
1 20628 . A G 1449.47 PASS AF=0.15
1 23343 . C A 621.72 PASS AF=0.2
1 28326 . GT G 3140.23 . .
1 28343 . G C 2038.73 PASS AF=0.12
1 28354 . C T 1812.24 PASS AF=0.2
1 28367 . G A 499.33 PASS AF=0.21
1 28376 . G A 1384.9 PASS AF=0.2
1 28494 . T C 1391.82 PASS AF=0.2
1 28563 . A G 12646.9 PASS AF=0.2
1 28590 . T TTGG 1894.86 . .
1 28663 . T A 276.97 PASS AF=0.3
1 28835 . A G 849.36 PASS AF=0.2
1 29571 . G C 1812.1 PASS AF=0.28
1 30548 . T G 7557.21 PASS AF=0.2
1 30912 . CTCTCTCTCTCGCTATCTCATTTT C 1870.96 . .
1 30923 rs806731 G T 100 PASS AF=0.2
1 38775 . C A 394.77 PASS AF=0.15
1 39406 . G C 11561 PASS AF=0.2
1 39423 . T C 3037.34 PASS AF=0.2
1 39430 . A C 1650.01 PASS AF=0.13
1 39487 . A AC 1442.83 . .
1 46633 . T A 8419.66 PASS AF=0.2
1 46873 . A T 21342.4 PASS AF=0.2
1 47159 rs540662756 T C 100 PASS AF=0.22
1 47190 . G GA 1140.99 . .
1 47960 . T A 21972.4 PASS AF=0.2
1 48448 . C T 14635.2 PASS AF=0.3
1 48518 . A G 565.8 PASS AF=0.2
1 49298 rs200943160 T C 100 PASS AF=0.2
1 49514 . AG A 1766.83 . .
1 49554 rs539322794 A G 100 PASS AF=0.3
1 50481 . G GGT 4416.44 . .
1 50481 . GGT G 4416.44 . .
1 51479 rs116400033 T A 33993.1 PASS AF=0.15
1 51762 rs559190862 A G 100 PASS AF=0.2
1 51765 rs575564077 C G 100 PASS AF=0.2
49 changes: 39 additions & 10 deletions mitty/test/simulation/genome/test_sampledgenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,53 @@
def read_data(outname):
vcf_pointer = pysam.VariantFile(filename=outname)
sample = vcf_pointer.header.samples[0]
gts = []
gts = {}
for rec in vcf_pointer.fetch():
gts.append("|".join([str(ind) for ind in rec.samples["HG"].allele_indices]))
gts[rec.pos] = "|".join([(str(ind) if ind is not None else ".") for ind in rec.samples["HG"].allele_indices])
return sample, gts


def test_assign_random_gt():
"""Simulation: sampled-genome"""
tempfilename = "tempfile.vcf"
sg.assign_random_gt(input_vcf=os.path.join(mitty.test.example_data_dir, 'gt_free.vcf'),

#gt for overlapping deletion is 0|1
sg.assign_random_gt(input_vcf=os.path.join(mitty.test.example_data_dir, 'test-sample-genome-with-af.vcf'),
output=open(tempfilename, "w"), sample_name="HG", default_af=0.1,
seed=5081973)
seed=1)

sample_name, gt_array = read_data(tempfilename)
os.remove(tempfilename)
sample_name, gt_map = read_data(tempfilename)

assert sample_name == "HG"
assert "0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|1_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|1" \
"_1|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_1|0" \
"_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|1_0|0_0|0_0|0_0|0_0|0_0|0_1|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0" \
"_0|0_1|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|0" == "_".join(gt_array)
assert gt_map[30923] == "0|." or gt_map[30923] == "1|."

# gt for overlapping deletion is 0|0
sg.assign_random_gt(input_vcf=os.path.join(mitty.test.example_data_dir, 'test-sample-genome-with-af.vcf'),
output=open(tempfilename, "w"), sample_name="HG", default_af=0.1,
seed=2)

_, gt_map = read_data(tempfilename)
assert gt_map[30923] != ".|."

# gt for overlapping deletion is 1|1
sg.assign_random_gt(input_vcf=os.path.join(mitty.test.example_data_dir, 'test-sample-genome-with-af.vcf'),
output=open(tempfilename, "w"), sample_name="HG", default_af=0.1,
seed=204)

_, gt_map = read_data(tempfilename)
assert gt_map[30923] == ".|."

# gt for overlapping deletion is 1|0
sg.assign_random_gt(input_vcf=os.path.join(mitty.test.example_data_dir, 'test-sample-genome-with-af.vcf'),
output=open(tempfilename, "w"), sample_name="HG", default_af=0.1,
seed=124)

_, gt_map = read_data(tempfilename)
assert gt_map[30923] == ".|0" or gt_map[30923] == ".|1"

os.remove(tempfilename)

assert "0|0_0|1_0|0_0|0_0|0_0|0_0|0_0|0_0|0_1|0_0|0_0|0_0|0_1|0_0|0_0|0_0|0_0|1_0|0_0|1_0|0_0|0_0|0_0|0_0|0_0|0_0|0_" \
"1|0_.|._1|1_0|0_0|0_.|._0|0_0|0_0|1_0|0_0|0_0|0_1|0_0|0_0|0_0|0_0|0_0|0_0|1_.|._0|0_0|0_0|0_0|0_1|0_0|0_0|0_" \
"0|0_0|0_0|0_0|0_0|0_0|0_0|0_0|1_0|0_0|0_0|0_0|0_0|0_0|1_0|0_0|0_1|0_.|0_0|0_0|1_0|0_1|0_1|0_0|0_0|0_0|0_0|0_" \
"0|1_0|0_0|0_0|0_0|0_1|1_.|._0|0_1|0_0|0" == "_".join(gt_map.values())