From e4336f07395645cc30cd13d7f7bcc0c152425847 Mon Sep 17 00:00:00 2001 From: SexyERIC0723 Date: Fri, 20 Mar 2026 11:44:55 +0000 Subject: [PATCH] fix: correct _is_phased() early return and prevent KeyError on duplicate variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. _is_phased(): The function returned on the first sample without checking all samples. Now iterates through all GT fields — returns False if ANY sample is unphased, True only if ALL are phased. An empty gt_field still returns None. 2. extract_pgx_variants(): .pop() calls on the per-position dicts in ref_pos_dynamic can raise KeyError when multiple input variants normalize to the same coordinate. Changed to .pop(key, None) to handle collisions gracefully. Fixes #222 --- preprocessor/pcat/utilities.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/preprocessor/pcat/utilities.py b/preprocessor/pcat/utilities.py index bae640da..977927c1 100644 --- a/preprocessor/pcat/utilities.py +++ b/preprocessor/pcat/utilities.py @@ -879,14 +879,15 @@ def normalize_vcf(reference_genome: Path, vcf_file: Path, output_dir: Path, outp def _is_phased(gt_field) -> bool | None: """ Determines the phasing status of a position. - If any GT fields have a '/', this means at least one sample is unphased. + Returns False if any sample is unphased (contains '/'), + True if all samples are phased, None if gt_field is empty. 
""" + if not gt_field: + return None for x in gt_field: if '/' in x: return False - else: - return True - return None + return True def _is_haploid(gt_field) -> bool: @@ -1094,7 +1095,7 @@ def extract_pgx_variants(pharmcat_positions: Path, reference_fasta: Path, vcf_fi out_f.write('\t'.join(fields) + '\n') # elimination: remove the dictionary item so that the variant won't be matched again if input_chr_pos in ref_pos_dynamic: - ref_pos_dynamic[input_chr_pos].pop(input_ref_alt) + ref_pos_dynamic[input_chr_pos].pop(input_ref_alt, None) # remove a position if all of its alts are present in the input if ref_pos_dynamic[input_chr_pos] == {}: del ref_pos_dynamic[input_chr_pos] @@ -1117,7 +1118,7 @@ def extract_pgx_variants(pharmcat_positions: Path, reference_fasta: Path, vcf_fi # for hom ref SNPs, remove the position from the dict for record if input_chr_pos in ref_pos_dynamic: - ref_pos_dynamic[input_chr_pos].pop((ref_alleles[i], alt_alleles[i])) + ref_pos_dynamic[input_chr_pos].pop((ref_alleles[i], alt_alleles[i]), None) # remove a position if all of its alts are present in the input if ref_pos_dynamic[input_chr_pos] == {}: del ref_pos_dynamic[input_chr_pos]