From 076eb17096165a04576035c8b627bbf72268fdbf Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 17 Mar 2026 01:48:57 -0400 Subject: [PATCH 01/48] feat: add genome_kit.df subpackage --- genome_kit/__init__.py | 1 + genome_kit/df/__init__.py | 3 + genome_kit/df/gk_structs.py | 97 +++++++++ genome_kit/df/registry.py | 351 +++++++++++++++++++++++++++++++++ genome_kit/df/serialization.py | 199 +++++++++++++++++++ 5 files changed, 651 insertions(+) create mode 100644 genome_kit/df/__init__.py create mode 100644 genome_kit/df/gk_structs.py create mode 100644 genome_kit/df/registry.py create mode 100644 genome_kit/df/serialization.py diff --git a/genome_kit/__init__.py b/genome_kit/__init__.py index dc1f1cb3..3aa97eee 100644 --- a/genome_kit/__init__.py +++ b/genome_kit/__init__.py @@ -49,6 +49,7 @@ from .variant_genome import VariantGenome from .vcf_table import VCFTable, VCFVariant from . import serialize +from .df import to_parquet, from_parquet ######################################################################### diff --git a/genome_kit/df/__init__.py b/genome_kit/df/__init__.py new file mode 100644 index 00000000..a7c1137c --- /dev/null +++ b/genome_kit/df/__init__.py @@ -0,0 +1,3 @@ +from .serialization import from_parquet, to_parquet + +__all__ = ["from_parquet", "to_parquet"] \ No newline at end of file diff --git a/genome_kit/df/gk_structs.py b/genome_kit/df/gk_structs.py new file mode 100644 index 00000000..5534acee --- /dev/null +++ b/genome_kit/df/gk_structs.py @@ -0,0 +1,97 @@ +from enum import StrEnum + +import polars as pl + + +class GkDfType(StrEnum): + GENOME = "genome" + INTERVAL = "interval" + TRANSCRIPT = "transcript" + GENE = "gene" + EXON = "exon" + INTRON = "intron" + CDS = "cds" + UTR = "utr" + + +class GkDfVersion(StrEnum): + V1 = "1.0" + + +CURRENT_VERSION = GkDfVersion.V1 + +GenomeStruct = pl.Struct( + [ + pl.Field("gkdf_type", pl.Utf8), + pl.Field("schema_version", pl.Utf8), + pl.Field("genome_str", pl.Utf8), # reference or annotation 
genome + ] +) + +IntervalStruct = pl.Struct( + [ + pl.Field("gkdf_type", pl.Utf8), + pl.Field("schema_version", pl.Utf8), + pl.Field("chromosome", pl.Utf8), + pl.Field("strand", pl.Utf8), + pl.Field("start", pl.Int32), + pl.Field("end", pl.Int32), + pl.Field("genome_str", pl.Utf8), # reference or annotation genome + ] +) + +TranscriptStruct = pl.Struct( + [ + pl.Field("gkdf_type", pl.Utf8), + pl.Field("schema_version", pl.Utf8), + # index of transcript within annotation genome transcript table + pl.Field("transcript_table_index", pl.Int32), + pl.Field("genome_str", pl.Utf8), # annotation genome + ] +) + +GeneStruct = pl.Struct( + [ + pl.Field("gkdf_type", pl.Utf8), + pl.Field("schema_version", pl.Utf8), + pl.Field("gene_table_index", pl.Int32), + pl.Field("genome_str", pl.Utf8), # annotation genome + ] +) + +ExonStruct = pl.Struct( + [ + pl.Field("gkdf_type", pl.Utf8), + pl.Field("schema_version", pl.Utf8), + pl.Field("exon_table_index", pl.Int32), + pl.Field("genome_str", pl.Utf8), # annotation genome + ] +) + +IntronStruct = pl.Struct( + [ + pl.Field("gkdf_type", pl.Utf8), + pl.Field("schema_version", pl.Utf8), + pl.Field("intron_table_index", pl.Int32), + pl.Field("genome_str", pl.Utf8), # annotation genome + ] +) + +CdsStruct = pl.Struct( + [ + pl.Field("gkdf_type", pl.Utf8), + pl.Field("schema_version", pl.Utf8), + pl.Field("cds_table_index", pl.Int32), + pl.Field("genome_str", pl.Utf8), # annotation genome + ] +) + +UtrStruct = pl.Struct( + [ + pl.Field("gkdf_type", pl.Utf8), + pl.Field("schema_version", pl.Utf8), + pl.Field("utr_type", pl.Utf8), # "5prime" or "3prime" + pl.Field("utr_table_index", pl.Int64), + pl.Field("genome_str", pl.Utf8), # annotation genome + ] +) diff --git a/genome_kit/df/registry.py b/genome_kit/df/registry.py new file mode 100644 index 00000000..aee84240 --- /dev/null +++ b/genome_kit/df/registry.py @@ -0,0 +1,351 @@ +from dataclasses import dataclass +from functools import cache +from typing import Callable + +import polars as pl + 
+import genome_kit as gk + +from .gk_structs import ( + CdsStruct, + ExonStruct, + GeneStruct, + GenomeStruct, + GkDfType, + GkDfVersion, + IntervalStruct, + IntronStruct, + TranscriptStruct, + UtrStruct, +) + +# mapping from GenomeKit object types to the gkdf type strings +GK_TO_STRUCT: dict[type[gk.GenomeAnnotation], GkDfType] = { + gk.Genome: GkDfType.GENOME, + gk.Interval: GkDfType.INTERVAL, + gk.Transcript: GkDfType.TRANSCRIPT, + gk.Gene: GkDfType.GENE, + gk.Exon: GkDfType.EXON, + gk.Intron: GkDfType.INTRON, + gk.Cds: GkDfType.CDS, + gk.Utr: GkDfType.UTR, +} + + +# entry for the gkdf registry +@dataclass +class GKTypeEntry: + struct: pl.Struct + serializer: Callable[[pl.Series], pl.Series] + deserializer: Callable[[pl.Series], pl.Series] + + +_GKDF_TYPE_FIELD = "gkdf_type" +_SCHEMA_VERSION_FIELD = "schema_version" + +SUPPORTED_VERSIONS = {v for v in GkDfVersion.__members__.values()} + + +def _serialize_genome(s: pl.Series) -> pl.Series: + """Serialize a Series of GenomeKit Genome objects by genome name.""" + return pl.Series( + name=s.name, + values=[ + { + _GKDF_TYPE_FIELD: GkDfType.GENOME.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + # config gives annotation genome name if applicable + "genome_str": genome.config, + } + for genome in s + ], + dtype=GenomeStruct, + ) + + +def _deserialize_genome(s: pl.Series) -> pl.Series: + """Deserialize a Series of GenomeStruct back into GenomeKit Genome objects.""" + return pl.Series( + name=s.name, + values=[gk.Genome(struct["genome_str"]) for struct in s], + dtype=pl.Object, + ) + + +def _serialize_interval(s: pl.Series) -> pl.Series: + """Serialize a Series of GenomeKit Interval objects.""" + return pl.Series( + name=s.name, + values=[ + { + _GKDF_TYPE_FIELD: GkDfType.INTERVAL.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + "chromosome": interval.chrom, + "strand": interval.strand, + "start": interval.start, + "end": interval.end, + # intervals related to reference genome only + "genome_str": 
interval.reference_genome, + } + for interval in s + ], + dtype=IntervalStruct, + ) + + +def _deserialize_interval(s: pl.Series) -> pl.Series: + """Deserialize a Series of IntervalStruct back into GenomeKit Interval objects.""" + return pl.Series( + name=s.name, + values=[ + gk.Interval( + chromosome=struct["chrom"], + strand=struct["strand"], + start=struct["start"], + end=struct["end"], + reference_genome=struct["genome_str"], + ) + for struct in s + ], + dtype=pl.Object, + ) + + +def _serialize_transcript(s: pl.Series) -> pl.Series: + """Serialize a Series of GenomeKit Transcript objects.""" + + return pl.Series( + name=s.name, + values=[ + { + _GKDF_TYPE_FIELD: GkDfType.TRANSCRIPT.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + "transcript_table_index": transcript.annotation_genome.transcripts.index_of( + transcript + ), + "genome_str": transcript.annotation_genome.config, + } + for transcript in s + ], + dtype=TranscriptStruct, + ) + + +def _deserialize_transcript(s: pl.Series) -> pl.Series: + """Deserialize a Series of TranscriptStruct back into GenomeKit Transcript objects.""" + return pl.Series( + name=s.name, + values=[ + gk.Genome(struct["genome_str"]).transcripts[ + struct["transcript_table_index"] + ] + for struct in s + ], + dtype=pl.Object, + ) + + +def _serialize_gene(s: pl.Series) -> pl.Series: + """Serialize a Series of GenomeKit Gene objects.""" + return pl.Series( + name=s.name, + values=[ + { + _GKDF_TYPE_FIELD: GkDfType.GENE.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + "gene_table_index": gene.annotation_genome.genes.index_of(gene), + "genome_str": gene.annotation_genome.config, + } + for gene in s + ], + dtype=GeneStruct, + ) + + +def _deserialize_gene(s: pl.Series) -> pl.Series: + """Deserialize a Series of GeneStruct back into GenomeKit Gene objects.""" + return pl.Series( + name=s.name, + values=[ + gk.Genome(struct["genome_str"]).genes[struct["gene_table_index"]] + for struct in s + ], + dtype=pl.Object, + ) + + +def 
_serialize_exon(s: pl.Series) -> pl.Series: + """Serialize a Series of GenomeKit Exon objects.""" + return pl.Series( + name=s.name, + values=[ + { + _GKDF_TYPE_FIELD: GkDfType.EXON.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + "exon_table_index": exon.annotation_genome.exons.index_of(exon), + "genome_str": exon.annotation_genome.config, + } + for exon in s + ], + dtype=ExonStruct, + ) + + +def _deserialize_exon(s: pl.Series) -> pl.Series: + """Deserialize a Series of ExonStruct back into GenomeKit Exon objects.""" + return pl.Series( + name=s.name, + values=[ + gk.Genome(struct["genome_str"]).exons[struct["exon_table_index"]] + for struct in s + ], + dtype=pl.Object, + ) + + +def _serialize_intron(s: pl.Series) -> pl.Series: + """Serialize a Series of GenomeKit Intron objects.""" + return pl.Series( + name=s.name, + values=[ + { + _GKDF_TYPE_FIELD: GkDfType.INTRON.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + "intron_table_index": intron.annotation_genome.introns.index_of(intron), + "genome_str": intron.annotation_genome.config, + } + for intron in s + ], + dtype=IntronStruct, + ) + + +def _deserialize_intron(s: pl.Series) -> pl.Series: + """Deserialize a Series of IntronStruct back into GenomeKit Intron objects.""" + return pl.Series( + name=s.name, + values=[ + gk.Genome(struct["genome_str"]).introns[struct["intron_table_index"]] + for struct in s + ], + dtype=pl.Object, + ) + + +def _serialize_cds(s: pl.Series) -> pl.Series: + """Serialize a Series of GenomeKit Cds objects.""" + return pl.Series( + name=s.name, + values=[ + { + _GKDF_TYPE_FIELD: GkDfType.CDS.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + "cds_table_index": cds.annotation_genome.cdss.index_of(cds), + "genome_str": cds.annotation_genome.config, + } + for cds in s + ], + dtype=CdsStruct, + ) + + +def _deserialize_cds(s: pl.Series) -> pl.Series: + """Deserialize a Series of CDSStruct back into GenomeKit Cds objects.""" + return pl.Series( + name=s.name, + values=[ + 
gk.Genome(struct["genome_str"]).cdss[struct["cds_table_index"]] + for struct in s + ], + dtype=pl.Object, + ) + + +def _serialize_utr(s: pl.Series) -> pl.Series: + """Serialize a Series of GenomeKit Utr objects. + + UTRs serialized with their index within the relevant table. + """ + values = [] + for utr in s: + ser_dict = { + _GKDF_TYPE_FIELD: GkDfType.UTR.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + } + genome = utr.annotation_genome + try: + ser_dict["utr_table_index"] = genome.utr5s.index_of(utr) + ser_dict["utr_type"] = "5prime" + except ValueError: + ser_dict["utr_table_index"] = genome.utr3s.index_of(utr) + ser_dict["utr_type"] = "3prime" + + ser_dict["genome_str"] = genome.config + values.append(ser_dict) + + return pl.Series( + name=s.name, + values=values, + dtype=UtrStruct, + ) + + +def _deserialize_utr(s: pl.Series) -> pl.Series: + """Deserialize a Series of UtrStruct back into GenomeKit Utr objects.""" + return pl.Series( + name=s.name, + values=[ + ( + gk.Genome(struct["genome_str"]).utr5s[struct["utr_table_index"]] + if struct["utr_type"] == "5prime" + else gk.Genome(struct["genome_str"]).utr3s[struct["utr_table_index"]] + ) + for struct in s + ], + ) + + +REGISTRY: dict[GkDfVersion, dict[GkDfType, GKTypeEntry]] = { + GkDfVersion.V1: { + GkDfType.GENOME: GKTypeEntry( + struct=GenomeStruct, + serializer=_serialize_genome, + deserializer=_deserialize_genome, + ), + GkDfType.INTERVAL: GKTypeEntry( + struct=IntervalStruct, + serializer=_serialize_interval, + deserializer=_deserialize_interval, + ), + GkDfType.TRANSCRIPT: GKTypeEntry( + struct=TranscriptStruct, + serializer=_serialize_transcript, + deserializer=_deserialize_transcript, + ), + GkDfType.GENE: GKTypeEntry( + struct=GeneStruct, + serializer=_serialize_gene, + deserializer=_deserialize_gene, + ), + GkDfType.EXON: GKTypeEntry( + struct=ExonStruct, + serializer=_serialize_exon, + deserializer=_deserialize_exon, + ), + GkDfType.INTRON: GKTypeEntry( + struct=IntronStruct, + 
serializer=_serialize_intron, + deserializer=_deserialize_intron, + ), + GkDfType.CDS: GKTypeEntry( + struct=CdsStruct, + serializer=_serialize_cds, + deserializer=_deserialize_cds, + ), + GkDfType.UTR: GKTypeEntry( + struct=UtrStruct, + serializer=_serialize_utr, + deserializer=_deserialize_utr, + ), + } +} diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py new file mode 100644 index 00000000..e1e922a2 --- /dev/null +++ b/genome_kit/df/serialization.py @@ -0,0 +1,199 @@ +import functools +import json +import time +import warnings +from collections.abc import Callable +from inspect import signature + +import polars as pl + +import genome_kit as gk + +from .gk_structs import CURRENT_VERSION, GkDfType, GkDfVersion +from .registry import GK_TO_STRUCT, REGISTRY + + +def _map_batches_safe(fn: Callable): + """Helper function to wrap a UDF and run safely with polars map_batches. + + Polars has a bug in map_batches that incorrectly forwards the return_dtype argument + to the UDF. See https://github.com/pola-rs/polars/issues/24840. + + Args: + fn: The user defined function to wrap. + """ + sig = signature(fn) + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + accepted = sig.parameters + filtered_kwargs = {k: v for k, v in kwargs.items() if k in accepted} + return fn(*args, **filtered_kwargs) + + return wrapper + + +def detect_gk_cols( + lf: pl.LazyFrame, columns: list[str] | None = None +) -> dict[str, GkDfType]: + """Detect columns in the LazyFrame that contains GenomeKit objects. + + Args: + lf: The LazyFrame to inspect. + columns: Optional list of column names to check. If None, all columns will be checked. + + Returns: + A dictionary mapping column names to their corresponding GenomeKit types. 
+ """ + + lf_cols = lf.collect_schema().names() + + if not columns: + columns = lf_cols + + target_cols = {} + + # polars Struct inferred from first row, same behaviour as Polars + # see https://docs.pola.rs/user-guide/expressions/structs/#inferring-the-data-type-struct-from-dictionaries + # materialize the first row to check data types, need the exact type not pl.Object + first_row = lf.head(1).collect()[0] + + for col in columns: + if col not in lf_cols: + raise ValueError( + f"Column '{col}' not found in the DataFrame, please check the column names and try again." + ) + # item from first row of the column + col_type = GK_TO_STRUCT.get(type(first_row[col][0]), None) + + if col_type is None: + # column is not a genomekit type, so no serialization needed + pass + else: + target_cols[col] = col_type + + return target_cols + + +# TODO: add union of pd.DataFrame +def to_parquet( + df: pl.DataFrame | pl.LazyFrame, + path: str, + columns: list[str] | None = None, +) -> None: + """Serialize a DataFrame with GenomeKit objects to a Parquet file. + + Args: + df: A Polars DataFrame or LazyFrame with columns containing GenomeKit objects. + path: The file path to write the Parquet file to. + columns: Optional list of column names to serialize. If None, all GenomeKit + columns will be serialized. + + """ + if isinstance(df, pl.DataFrame): + df = df.lazy() + + target_cols = detect_gk_cols(df, columns) + + if not target_cols: + warnings.warn( + "No GenomeKit columns detected for serialization, writing DataFrame as is." 
+ ) + df.sink_parquet(path) + return + + df = df.with_columns( + pl.col(col) + .map_batches( + _map_batches_safe(REGISTRY[CURRENT_VERSION][target_cols[col]].serializer), + return_dtype=REGISTRY[CURRENT_VERSION][target_cols[col]].struct, + ) + .alias(col) + for col in target_cols + ) + + metadata = { + "gkdf_version": CURRENT_VERSION.value, + "gk_version": gk.__version__, + "target_cols": json.dumps(target_cols), + } + + df.sink_parquet(path, metadata=metadata) + + +def _init_gk_annotations(lf: pl.LazyFrame, target_cols: dict[str, GkDfType]) -> None: + """Initialize GenomeKit annotations for all unique genomes in the LazyFrame. + + Prevents race conditions when opening dganno files during polars operations. + + Args: + lf: The LazyFrame containing the serialized GenomeKit objects. + target_cols: A dictionary mapping column names to their corresponding GenomeKit types. + """ + genomes_exprs = [pl.col(c).struct.field("genome_str") for c in target_cols.keys()] + genomes = ( + lf.select( + pl.concat_list(genomes_exprs) + .explode() + .drop_nulls() + .unique() + .alias("genome_str") + ) + .collect()["genome_str"] + .to_list() + ) + + for genome_str in genomes: + gk.Genome(genome_str).genes + + +def _validate_gkdf_metadata(metadata: dict[str, str]) -> None: + """Validate the parquet metadata for a gk.""" + + try: + version = GkDfVersion(metadata.get("gkdf_version")) + assert version in GkDfVersion, ( + f"Unrecognized gkdf version in Parquet metadata, expected one of {[v.value for v in GkDfVersion]}" + ) + except ValueError: + raise ValueError( + "Invalid or missing gkdf_version in Parquet metadata, unable to deserialize GenomeKit objects. " + ) + + +def from_parquet( + path: str, columns: list[str] | None = None, lazy: bool = False +) -> pl.DataFrame | pl.LazyFrame: + """Deserialize a Parquet file containing GenomeKit objects into a Polars DataFrame or LazyFrame. + + Args: + path: The file path to read the Parquet file from. 
+ columns: Optional list of columns to deserialize. If None, all detected + GenomeKit columns will be deserialized. + lazy: If True, return a LazyFrame. Otherwise, return a DataFrame. + + Returns: + A Polars DataFrame or LazyFrame with deserialized GenomeKit objects. + """ + + metadata = pl.read_parquet_metadata(path) + _validate_gkdf_metadata(metadata) + target_cols = json.loads(metadata.get("target_cols")) + + lf = pl.scan_parquet(path) + + # collect unique genome strings in the file to initialize, prevents race conditions + # on opening dganno files + _init_gk_annotations(lf, target_cols) + + lf = lf.with_columns( + pl.col(col) + .map_batches( + _map_batches_safe(REGISTRY[CURRENT_VERSION][target_cols[col]].deserializer), + return_dtype=pl.Object, + ) + .alias(col) + for col in target_cols + ) + + return lf if lazy else lf.collect() From 62a806e2cbf0fa04edb7a98600cfb14cc77d1c19 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 17 Mar 2026 01:51:02 -0400 Subject: [PATCH 02/48] chore: remove unused import --- genome_kit/df/serialization.py | 1 - 1 file changed, 1 deletion(-) diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index e1e922a2..510d8b21 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -1,6 +1,5 @@ import functools import json -import time import warnings from collections.abc import Callable from inspect import signature From 3f16e0f7057fda38ba6a222e1c5278234e4eed11 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 17 Mar 2026 10:36:45 -0400 Subject: [PATCH 03/48] chore: remove unused arg and update docstring --- genome_kit/df/registry.py | 6 +----- genome_kit/df/serialization.py | 4 +--- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/genome_kit/df/registry.py b/genome_kit/df/registry.py index aee84240..786c2e39 100644 --- a/genome_kit/df/registry.py +++ b/genome_kit/df/registry.py @@ -113,7 +113,6 @@ def _deserialize_interval(s: pl.Series) -> pl.Series: def 
_serialize_transcript(s: pl.Series) -> pl.Series: """Serialize a Series of GenomeKit Transcript objects.""" - return pl.Series( name=s.name, values=[ @@ -262,10 +261,7 @@ def _deserialize_cds(s: pl.Series) -> pl.Series: def _serialize_utr(s: pl.Series) -> pl.Series: - """Serialize a Series of GenomeKit Utr objects. - - UTRs serialized with their index within the relevant table. - """ + """Serialize a Series of GenomeKit Utr objects.""" values = [] for utr in s: ser_dict = { diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index 510d8b21..67923f38 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -160,9 +160,7 @@ def _validate_gkdf_metadata(metadata: dict[str, str]) -> None: ) -def from_parquet( - path: str, columns: list[str] | None = None, lazy: bool = False -) -> pl.DataFrame | pl.LazyFrame: +def from_parquet(path: str, lazy: bool = False) -> pl.DataFrame | pl.LazyFrame: """Deserialize a Parquet file containing GenomeKit objects into a Polars DataFrame or LazyFrame. Args: From 7bef461eb23669c4ae5f3e6a6658379b743e1105 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 18 Mar 2026 09:18:58 -0400 Subject: [PATCH 04/48] chore: remove unused arg and add extract helper --- genome_kit/df/serialization.py | 57 ++++++++++++++++------------------ 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index 67923f38..dd036176 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -32,9 +32,7 @@ def wrapper(*args, **kwargs): return wrapper -def detect_gk_cols( - lf: pl.LazyFrame, columns: list[str] | None = None -) -> dict[str, GkDfType]: +def detect_gk_cols(lf: pl.LazyFrame) -> dict[str, GkDfType]: """Detect columns in the LazyFrame that contains GenomeKit objects. 
Args: @@ -47,9 +45,6 @@ def detect_gk_cols( lf_cols = lf.collect_schema().names() - if not columns: - columns = lf_cols - target_cols = {} # polars Struct inferred from first row, same behaviour as Polars @@ -57,11 +52,8 @@ def detect_gk_cols( # materialize the first row to check data types, need the exact type not pl.Object first_row = lf.head(1).collect()[0] - for col in columns: - if col not in lf_cols: - raise ValueError( - f"Column '{col}' not found in the DataFrame, please check the column names and try again." - ) + # TODO: support list of GenomeKit objects + for col in lf_cols: # item from first row of the column col_type = GK_TO_STRUCT.get(type(first_row[col][0]), None) @@ -75,24 +67,17 @@ def detect_gk_cols( # TODO: add union of pd.DataFrame -def to_parquet( - df: pl.DataFrame | pl.LazyFrame, - path: str, - columns: list[str] | None = None, -) -> None: +def to_parquet(df: pl.DataFrame | pl.LazyFrame, path: str) -> None: """Serialize a DataFrame with GenomeKit objects to a Parquet file. Args: df: A Polars DataFrame or LazyFrame with columns containing GenomeKit objects. path: The file path to write the Parquet file to. - columns: Optional list of column names to serialize. If None, all GenomeKit - columns will be serialized. - """ if isinstance(df, pl.DataFrame): df = df.lazy() - target_cols = detect_gk_cols(df, columns) + target_cols = detect_gk_cols(df) if not target_cols: warnings.warn( @@ -160,13 +145,31 @@ def _validate_gkdf_metadata(metadata: dict[str, str]) -> None: ) +def _deserialize_gk_cols( + lf: pl.LazyFrame, target_cols: dict[str, GkDfType] +) -> pl.LazyFrame: + """Deserialize columns containing GenomeKit objects. + + Args: + lf: The LazyFrame containing the serialized GenomeKit objects. + target_cols: A dictionary mapping column names to their corresponding GkDf types. 
+ """ + return lf.with_columns( + pl.col(col) + .map_batches( + _map_batches_safe(REGISTRY[CURRENT_VERSION][target_cols[col]].deserializer), + return_dtype=pl.Object, + ) + .alias(col) + for col in target_cols + ) + + def from_parquet(path: str, lazy: bool = False) -> pl.DataFrame | pl.LazyFrame: """Deserialize a Parquet file containing GenomeKit objects into a Polars DataFrame or LazyFrame. Args: path: The file path to read the Parquet file from. - columns: Optional list of columns to deserialize. If None, all detected - GenomeKit columns will be deserialized. lazy: If True, return a LazyFrame. Otherwise, return a DataFrame. Returns: @@ -183,14 +186,6 @@ def from_parquet(path: str, lazy: bool = False) -> pl.DataFrame | pl.LazyFrame: # on opening dganno files _init_gk_annotations(lf, target_cols) - lf = lf.with_columns( - pl.col(col) - .map_batches( - _map_batches_safe(REGISTRY[CURRENT_VERSION][target_cols[col]].deserializer), - return_dtype=pl.Object, - ) - .alias(col) - for col in target_cols - ) + lf = _deserialize_gk_cols(lf, target_cols) return lf if lazy else lf.collect() From 7b5130ebf8f7ee84038baca48bee303a084759cb Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 18 Mar 2026 09:31:30 -0400 Subject: [PATCH 05/48] feat: add subpackage discovery and optional polars dependency --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b514af43..1b4b6890 100644 --- a/setup.py +++ b/setup.py @@ -403,10 +403,13 @@ def _compile_obj(obj): "importlib-metadata", "typing-extensions", ], + extras_require={ + "df": ["polars"] + }, license="Apache License 2.0", license_files=(COPYRIGHT_FILE, LICENSE_FILE,), name="genomekit", - packages=find_packages(include=["genome_kit"]), + packages=find_packages(include=["genome_kit", "genome_kit.*"]), project_urls={ "Documentation": "https://deepgenomics.github.io/GenomeKit" }, From 518e45619c14ed8a16fcb5a0ce2896bcf74f6703 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 18 Mar 
2026 09:51:02 -0400 Subject: [PATCH 06/48] docs: add df subpackage docs --- docs-src/df.rst | 65 ++++++++++++++++++++++++++++++++++++++++++++++ docs-src/index.rst | 1 + 2 files changed, 66 insertions(+) create mode 100644 docs-src/df.rst diff --git a/docs-src/df.rst b/docs-src/df.rst new file mode 100644 index 00000000..2759fc3a --- /dev/null +++ b/docs-src/df.rst @@ -0,0 +1,65 @@ +.. _df: + +GenomeKit DataFrame Utilities +============================= + +The :py:mod:`genome_kit.df` subpackage contains utilities for working with Polars DataFrames that contain GenomeKit objects. This includes utilities for serializing DataFrames with GenomeKit objects to Parquet and deserializing them back to GenomeKit objects. This is useful when sharing tabular data sets, or when saving intermediate DataFrames to disk during data processing. + +.. important:: + + ``genome_kit.df`` depends on optional ``polars`` dependencies, which are not installed by default. This can be installed with the ``[df]`` extra: + + .. code-block:: bash + + mamba install "genomekit[df]" + + The ``[df]`` extra is not included in the default installation. + + +Quickstart +----------- +The serialization and deserialization entry points are :py:func:`~genome_kit.df.to_parquet` and :py:func:`~genome_kit.df.from_parquet`: + +.. code-block:: python + + import polars as pl + import genome_kit as gk + + genome = gk.Genome("ncbi_refseq.v110") + df = pl.DataFrame( + { + "gene": [genome.genes[0], genome.genes[1]], + "score": [0.1, 0.8], + } + ) + + gk.to_parquet(df, "genes.parquet") + ... + ... + restored_df = gk.from_parquet("genes.parquet") + + +.. note:: + + The written parquet files can be read by any software that supports the parquet format, but the GenomeKit objects will only be restored when read with :py:func:`~genome_kit.df.from_parquet`. 
+ + +Supported GenomeKit Objects +--------------------------- +The currently supported GenomeKit objects for serialization are: + +- :py:class:`genome_kit.Genome` +- :py:class:`genome_kit.Interval` +- :py:class:`genome_kit.Transcript` +- :py:class:`genome_kit.Gene` +- :py:class:`genome_kit.Exon` +- :py:class:`genome_kit.Intron` +- :py:class:`genome_kit.CDS` +- :py:class:`genome_kit.UTR` + +Public API +---------------- +.. automodule:: genome_kit.df + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs-src/index.rst b/docs-src/index.rst index 46b7b4b6..ec365912 100644 --- a/docs-src/index.rst +++ b/docs-src/index.rst @@ -73,6 +73,7 @@ Contents: anchors api genomes + df develop data_org From 6de6f85b6884487d1aa7729e71a25eafb4f4cd72 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Thu, 19 Mar 2026 21:34:06 -0400 Subject: [PATCH 07/48] feat: add lazy polars import and ensure gk annotation lifetime --- genome_kit/_optional.py | 13 + genome_kit/df/__init__.py | 2 +- genome_kit/df/gk_structs.py | 174 ++++++---- genome_kit/df/registry.py | 614 ++++++++++++++++----------------- genome_kit/df/serialization.py | 46 ++- 5 files changed, 450 insertions(+), 399 deletions(-) create mode 100644 genome_kit/_optional.py diff --git a/genome_kit/_optional.py b/genome_kit/_optional.py new file mode 100644 index 00000000..715a1e68 --- /dev/null +++ b/genome_kit/_optional.py @@ -0,0 +1,13 @@ +from __future__ import annotations + + +def require_polars(): + """Import Polars if available, otherwise fail gracefully.""" + try: + import polars as pl + except ModuleNotFoundError as e: + raise ImportError( + "Optional dependency 'polars' is required for this functionality. Please install with `pip install genomekit[df]`." 
+ ) from e + + return pl diff --git a/genome_kit/df/__init__.py b/genome_kit/df/__init__.py index a7c1137c..0ca20f42 100644 --- a/genome_kit/df/__init__.py +++ b/genome_kit/df/__init__.py @@ -1,3 +1,3 @@ from .serialization import from_parquet, to_parquet -__all__ = ["from_parquet", "to_parquet"] \ No newline at end of file +__all__ = ["from_parquet", "to_parquet"] diff --git a/genome_kit/df/gk_structs.py b/genome_kit/df/gk_structs.py index 5534acee..0e6b0f01 100644 --- a/genome_kit/df/gk_structs.py +++ b/genome_kit/df/gk_structs.py @@ -1,6 +1,12 @@ +from __future__ import annotations + from enum import StrEnum +from typing import TYPE_CHECKING + +from genome_kit._optional import require_polars -import polars as pl +if TYPE_CHECKING: # import polars for type checking + import polars as pl class GkDfType(StrEnum): @@ -20,78 +26,94 @@ class GkDfVersion(StrEnum): CURRENT_VERSION = GkDfVersion.V1 -GenomeStruct = pl.Struct( - [ - pl.Field("gkdf_type", pl.Utf8), - pl.Field("schema_version", pl.Utf8), - pl.Field("genome_str", pl.Utf8), # reference or annotation genome - ] -) - -IntervalStruct = pl.Struct( - [ - pl.Field("gkdf_type", pl.Utf8), - pl.Field("schema_version", pl.Utf8), - pl.Field("chromosome", pl.Utf8), - pl.Field("strand", pl.Utf8), - pl.Field("start", pl.Int32), - pl.Field("end", pl.Int32), - pl.Field("genome_str", pl.Utf8), # reference or annotation genome - ] -) - -TranscriptStruct = pl.Struct( - [ - pl.Field("gkdf_type", pl.Utf8), - pl.Field("schema_version", pl.Utf8), - # index of transcript within annotation genome transcript table - pl.Field("transcript_table_index", pl.Int32), - pl.Field("genome_str", pl.Utf8), # annotation genome - ] -) - -GeneStruct = pl.Struct( - [ - pl.Field("gkdf_type", pl.Utf8), - pl.Field("schema_version", pl.Utf8), - pl.Field("gene_table_index", pl.Int32), - pl.Field("genome_str", pl.Utf8), # annotation genome - ] -) - -ExonStruct = pl.Struct( - [ - pl.Field("gkdf_type", pl.Utf8), - pl.Field("schema_version", pl.Utf8), - 
pl.Field("exon_table_index", pl.Int32), - pl.Field("genome_str", pl.Utf8), # annotation genome - ] -) - -IntronStruct = pl.Struct( - [ - pl.Field("gkdf_type", pl.Utf8), - pl.Field("schema_version", pl.Utf8), - pl.Field("intron_table_index", pl.Int32), - pl.Field("genome_str", pl.Utf8), # annotation genome - ] -) - -CdsStruct = pl.Struct( - [ - pl.Field("gkdf_type", pl.Utf8), - pl.Field("schema_version", pl.Utf8), - pl.Field("cds_table_index", pl.Int32), - pl.Field("genome_str", pl.Utf8), # annotation genome - ] -) - -UtrStruct = pl.Struct( - [ - pl.Field("gkdf_type", pl.Utf8), - pl.Field("schema_version", pl.Utf8), - pl.Field("utr_type", pl.Utf8), # "5prime" or "3prime" - pl.Field("utr_table_index", pl.Int64), - pl.Field("genome_str", pl.Utf8), # annotation genome - ] -) + +def get_structs() -> dict[GkDfType, pl.Struct]: + """Return a mapping of GkDfType to their corresponding Polars Struct definitions.""" + pl = require_polars() + + GenomeStruct = pl.Struct( + [ + pl.Field("gkdf_type", pl.Utf8), + pl.Field("schema_version", pl.Utf8), + pl.Field("genome_str", pl.Utf8), # reference or annotation genome + ] + ) + + IntervalStruct = pl.Struct( + [ + pl.Field("gkdf_type", pl.Utf8), + pl.Field("schema_version", pl.Utf8), + pl.Field("chromosome", pl.Utf8), + pl.Field("strand", pl.Utf8), + pl.Field("start", pl.Int32), + pl.Field("end", pl.Int32), + pl.Field("genome_str", pl.Utf8), # reference or annotation genome + ] + ) + + TranscriptStruct = pl.Struct( + [ + pl.Field("gkdf_type", pl.Utf8), + pl.Field("schema_version", pl.Utf8), + # index of transcript within annotation genome transcript table + pl.Field("transcript_table_index", pl.Int32), + pl.Field("genome_str", pl.Utf8), # annotation genome + ] + ) + + GeneStruct = pl.Struct( + [ + pl.Field("gkdf_type", pl.Utf8), + pl.Field("schema_version", pl.Utf8), + pl.Field("gene_table_index", pl.Int32), + pl.Field("genome_str", pl.Utf8), # annotation genome + ] + ) + + ExonStruct = pl.Struct( + [ + pl.Field("gkdf_type", 
pl.Utf8), + pl.Field("schema_version", pl.Utf8), + pl.Field("exon_table_index", pl.Int32), + pl.Field("genome_str", pl.Utf8), # annotation genome + ] + ) + + IntronStruct = pl.Struct( + [ + pl.Field("gkdf_type", pl.Utf8), + pl.Field("schema_version", pl.Utf8), + pl.Field("intron_table_index", pl.Int32), + pl.Field("genome_str", pl.Utf8), # annotation genome + ] + ) + + CdsStruct = pl.Struct( + [ + pl.Field("gkdf_type", pl.Utf8), + pl.Field("schema_version", pl.Utf8), + pl.Field("cds_table_index", pl.Int32), + pl.Field("genome_str", pl.Utf8), # annotation genome + ] + ) + + UtrStruct = pl.Struct( + [ + pl.Field("gkdf_type", pl.Utf8), + pl.Field("schema_version", pl.Utf8), + pl.Field("utr_type", pl.Utf8), # "5prime" or "3prime" + pl.Field("utr_table_index", pl.Int64), + pl.Field("genome_str", pl.Utf8), # annotation genome + ] + ) + + return { + GkDfType.GENOME: GenomeStruct, + GkDfType.INTERVAL: IntervalStruct, + GkDfType.TRANSCRIPT: TranscriptStruct, + GkDfType.GENE: GeneStruct, + GkDfType.EXON: ExonStruct, + GkDfType.INTRON: IntronStruct, + GkDfType.CDS: CdsStruct, + GkDfType.UTR: UtrStruct, + } diff --git a/genome_kit/df/registry.py b/genome_kit/df/registry.py index 786c2e39..fb82ef5e 100644 --- a/genome_kit/df/registry.py +++ b/genome_kit/df/registry.py @@ -1,23 +1,16 @@ +from __future__ import annotations + from dataclasses import dataclass -from functools import cache -from typing import Callable +from functools import lru_cache +from typing import TYPE_CHECKING, Callable -import polars as pl +if TYPE_CHECKING: + import polars as pl import genome_kit as gk +from genome_kit._optional import require_polars -from .gk_structs import ( - CdsStruct, - ExonStruct, - GeneStruct, - GenomeStruct, - GkDfType, - GkDfVersion, - IntervalStruct, - IntronStruct, - TranscriptStruct, - UtrStruct, -) +from .gk_structs import GkDfType, GkDfVersion, get_structs # mapping from GenomeKit object types to the gkdf type strings GK_TO_STRUCT: dict[type[gk.GenomeAnnotation], GkDfType] = { 
@@ -46,302 +39,303 @@ class GKTypeEntry: SUPPORTED_VERSIONS = {v for v in GkDfVersion.__members__.values()} -def _serialize_genome(s: pl.Series) -> pl.Series: - """Serialize a Series of GenomeKit Genome objects by genome name.""" - return pl.Series( - name=s.name, - values=[ - { - _GKDF_TYPE_FIELD: GkDfType.GENOME.value, - _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, - # config gives annotation genome name if applicable - "genome_str": genome.config, - } - for genome in s - ], - dtype=GenomeStruct, - ) - - -def _deserialize_genome(s: pl.Series) -> pl.Series: - """Deserialize a Series of GenomeStruct back into GenomeKit Genome objects.""" - return pl.Series( - name=s.name, - values=[gk.Genome(struct["genome_str"]) for struct in s], - dtype=pl.Object, - ) - - -def _serialize_interval(s: pl.Series) -> pl.Series: - """Serialize a Series of GenomeKit Interval objects.""" - return pl.Series( - name=s.name, - values=[ - { - _GKDF_TYPE_FIELD: GkDfType.INTERVAL.value, - _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, - "chromosome": interval.chrom, - "strand": interval.strand, - "start": interval.start, - "end": interval.end, - # intervals related to reference genome only - "genome_str": interval.reference_genome, - } - for interval in s - ], - dtype=IntervalStruct, - ) - - -def _deserialize_interval(s: pl.Series) -> pl.Series: - """Deserialize a Series of IntervalStruct back into GenomeKit Interval objects.""" - return pl.Series( - name=s.name, - values=[ - gk.Interval( - chromosome=struct["chrom"], - strand=struct["strand"], - start=struct["start"], - end=struct["end"], - reference_genome=struct["genome_str"], - ) - for struct in s - ], - dtype=pl.Object, - ) - - -def _serialize_transcript(s: pl.Series) -> pl.Series: - """Serialize a Series of GenomeKit Transcript objects.""" - return pl.Series( - name=s.name, - values=[ - { - _GKDF_TYPE_FIELD: GkDfType.TRANSCRIPT.value, - _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, - "transcript_table_index": 
transcript.annotation_genome.transcripts.index_of( - transcript - ), - "genome_str": transcript.annotation_genome.config, - } - for transcript in s - ], - dtype=TranscriptStruct, - ) - - -def _deserialize_transcript(s: pl.Series) -> pl.Series: - """Deserialize a Series of TranscriptStruct back into GenomeKit Transcript objects.""" - return pl.Series( - name=s.name, - values=[ - gk.Genome(struct["genome_str"]).transcripts[ - struct["transcript_table_index"] - ] - for struct in s - ], - dtype=pl.Object, - ) - - -def _serialize_gene(s: pl.Series) -> pl.Series: - """Serialize a Series of GenomeKit Gene objects.""" - return pl.Series( - name=s.name, - values=[ - { - _GKDF_TYPE_FIELD: GkDfType.GENE.value, +@lru_cache(maxsize=1) # cache to avoid recreating registry in same session +def get_registry() -> dict[GkDfVersion, dict[GkDfType, GKTypeEntry]]: + """Fetch the registry containing serialization and deserilization functions. + + Returns: + Dictionary mapping GkDfType to their corresponding serializer and deserializer + functions, for each supported GkDfVersion. 
+ """ + pl = require_polars() + gkdf_structs = get_structs() + + def _serialize_genome(s: pl.Series) -> pl.Series: + """Serialize a Series of GenomeKit Genome objects by genome name.""" + return pl.Series( + name=s.name, + values=[ + { + _GKDF_TYPE_FIELD: GkDfType.GENOME.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + # config gives annotation genome name if applicable + "genome_str": genome.config, + } + for genome in s + ], + dtype=gkdf_structs[GkDfType.GENOME], + ) + + def _deserialize_genome(s: pl.Series) -> pl.Series: + """Deserialize a Series of GenomeStruct back into GenomeKit Genome objects.""" + return pl.Series( + name=s.name, + values=[gk.Genome(struct["genome_str"]) for struct in s], + dtype=pl.Object, + ) + + def _serialize_interval(s: pl.Series) -> pl.Series: + """Serialize a Series of GenomeKit Interval objects.""" + return pl.Series( + name=s.name, + values=[ + { + _GKDF_TYPE_FIELD: GkDfType.INTERVAL.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + "chromosome": interval.chrom, + "strand": interval.strand, + "start": interval.start, + "end": interval.end, + # intervals related to reference genome only + "genome_str": interval.reference_genome, + } + for interval in s + ], + dtype=gkdf_structs[GkDfType.INTERVAL], + ) + + def _deserialize_interval(s: pl.Series) -> pl.Series: + """Deserialize a Series of IntervalStruct back into GenomeKit Interval objects.""" + return pl.Series( + name=s.name, + values=[ + gk.Interval( + chromosome=struct["chrom"], + strand=struct["strand"], + start=struct["start"], + end=struct["end"], + reference_genome=struct["genome_str"], + ) + for struct in s + ], + dtype=pl.Object, + ) + + def _serialize_transcript(s: pl.Series) -> pl.Series: + """Serialize a Series of GenomeKit Transcript objects.""" + return pl.Series( + name=s.name, + values=[ + { + _GKDF_TYPE_FIELD: GkDfType.TRANSCRIPT.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + "transcript_table_index": 
transcript.annotation_genome.transcripts.index_of( + transcript + ), + "genome_str": transcript.annotation_genome.config, + } + for transcript in s + ], + dtype=gkdf_structs[GkDfType.TRANSCRIPT], + ) + + def _deserialize_transcript(s: pl.Series) -> pl.Series: + """Deserialize a Series of TranscriptStruct back into GenomeKit Transcript objects.""" + return pl.Series( + name=s.name, + values=[ + gk.Genome(struct["genome_str"]).transcripts[ + struct["transcript_table_index"] + ] + for struct in s + ], + dtype=pl.Object, + ) + + def _serialize_gene(s: pl.Series) -> pl.Series: + """Serialize a Series of GenomeKit Gene objects.""" + return pl.Series( + name=s.name, + values=[ + { + _GKDF_TYPE_FIELD: GkDfType.GENE.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + "gene_table_index": gene.annotation_genome.genes.index_of(gene), + "genome_str": gene.annotation_genome.config, + } + for gene in s + ], + dtype=gkdf_structs[GkDfType.GENE], + ) + + def _deserialize_gene(s: pl.Series) -> pl.Series: + """Deserialize a Series of GeneStruct back into GenomeKit Gene objects.""" + return pl.Series( + name=s.name, + values=[ + gk.Genome(struct["genome_str"]).genes[struct["gene_table_index"]] + for struct in s + ], + dtype=pl.Object, + ) + + def _serialize_exon(s: pl.Series) -> pl.Series: + """Serialize a Series of GenomeKit Exon objects.""" + return pl.Series( + name=s.name, + values=[ + { + _GKDF_TYPE_FIELD: GkDfType.EXON.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + "exon_table_index": exon.annotation_genome.exons.index_of(exon), + "genome_str": exon.annotation_genome.config, + } + for exon in s + ], + dtype=gkdf_structs[GkDfType.EXON], + ) + + def _deserialize_exon(s: pl.Series) -> pl.Series: + """Deserialize a Series of ExonStruct back into GenomeKit Exon objects.""" + return pl.Series( + name=s.name, + values=[ + gk.Genome(struct["genome_str"]).exons[struct["exon_table_index"]] + for struct in s + ], + dtype=pl.Object, + ) + + def _serialize_intron(s: pl.Series) -> 
pl.Series: + """Serialize a Series of GenomeKit Intron objects.""" + return pl.Series( + name=s.name, + values=[ + { + _GKDF_TYPE_FIELD: GkDfType.INTRON.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + "intron_table_index": intron.annotation_genome.introns.index_of( + intron + ), + "genome_str": intron.annotation_genome.config, + } + for intron in s + ], + dtype=gkdf_structs[GkDfType.INTRON], + ) + + def _deserialize_intron(s: pl.Series) -> pl.Series: + """Deserialize a Series of IntronStruct back into GenomeKit Intron objects.""" + return pl.Series( + name=s.name, + values=[ + gk.Genome(struct["genome_str"]).introns[struct["intron_table_index"]] + for struct in s + ], + dtype=pl.Object, + ) + + def _serialize_cds(s: pl.Series) -> pl.Series: + """Serialize a Series of GenomeKit Cds objects.""" + return pl.Series( + name=s.name, + values=[ + { + _GKDF_TYPE_FIELD: GkDfType.CDS.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + "cds_table_index": cds.annotation_genome.cdss.index_of(cds), + "genome_str": cds.annotation_genome.config, + } + for cds in s + ], + dtype=gkdf_structs[GkDfType.CDS], + ) + + def _deserialize_cds(s: pl.Series) -> pl.Series: + """Deserialize a Series of CDSStruct back into GenomeKit Cds objects.""" + return pl.Series( + name=s.name, + values=[ + gk.Genome(struct["genome_str"]).cdss[struct["cds_table_index"]] + for struct in s + ], + dtype=pl.Object, + ) + + def _serialize_utr(s: pl.Series) -> pl.Series: + """Serialize a Series of GenomeKit Utr objects.""" + values = [] + for utr in s: + ser_dict = { + _GKDF_TYPE_FIELD: GkDfType.UTR.value, _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, - "gene_table_index": gene.annotation_genome.genes.index_of(gene), - "genome_str": gene.annotation_genome.config, } - for gene in s - ], - dtype=GeneStruct, - ) - - -def _deserialize_gene(s: pl.Series) -> pl.Series: - """Deserialize a Series of GeneStruct back into GenomeKit Gene objects.""" - return pl.Series( - name=s.name, - values=[ - 
gk.Genome(struct["genome_str"]).genes[struct["gene_table_index"]] - for struct in s - ], - dtype=pl.Object, - ) - - -def _serialize_exon(s: pl.Series) -> pl.Series: - """Serialize a Series of GenomeKit Exon objects.""" - return pl.Series( - name=s.name, - values=[ - { - _GKDF_TYPE_FIELD: GkDfType.EXON.value, - _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, - "exon_table_index": exon.annotation_genome.exons.index_of(exon), - "genome_str": exon.annotation_genome.config, - } - for exon in s - ], - dtype=ExonStruct, - ) - - -def _deserialize_exon(s: pl.Series) -> pl.Series: - """Deserialize a Series of ExonStruct back into GenomeKit Exon objects.""" - return pl.Series( - name=s.name, - values=[ - gk.Genome(struct["genome_str"]).exons[struct["exon_table_index"]] - for struct in s - ], - dtype=pl.Object, - ) - - -def _serialize_intron(s: pl.Series) -> pl.Series: - """Serialize a Series of GenomeKit Intron objects.""" - return pl.Series( - name=s.name, - values=[ - { - _GKDF_TYPE_FIELD: GkDfType.INTRON.value, - _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, - "intron_table_index": intron.annotation_genome.introns.index_of(intron), - "genome_str": intron.annotation_genome.config, - } - for intron in s - ], - dtype=IntronStruct, - ) - - -def _deserialize_intron(s: pl.Series) -> pl.Series: - """Deserialize a Series of IntronStruct back into GenomeKit Intron objects.""" - return pl.Series( - name=s.name, - values=[ - gk.Genome(struct["genome_str"]).introns[struct["intron_table_index"]] - for struct in s - ], - dtype=pl.Object, - ) - - -def _serialize_cds(s: pl.Series) -> pl.Series: - """Serialize a Series of GenomeKit Cds objects.""" - return pl.Series( - name=s.name, - values=[ - { - _GKDF_TYPE_FIELD: GkDfType.CDS.value, - _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, - "cds_table_index": cds.annotation_genome.cdss.index_of(cds), - "genome_str": cds.annotation_genome.config, - } - for cds in s - ], - dtype=CdsStruct, - ) - - -def _deserialize_cds(s: pl.Series) -> pl.Series: - 
"""Deserialize a Series of CDSStruct back into GenomeKit Cds objects.""" - return pl.Series( - name=s.name, - values=[ - gk.Genome(struct["genome_str"]).cdss[struct["cds_table_index"]] - for struct in s - ], - dtype=pl.Object, - ) - - -def _serialize_utr(s: pl.Series) -> pl.Series: - """Serialize a Series of GenomeKit Utr objects.""" - values = [] - for utr in s: - ser_dict = { - _GKDF_TYPE_FIELD: GkDfType.UTR.value, - _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + genome = utr.annotation_genome + try: + ser_dict["utr_table_index"] = genome.utr5s.index_of(utr) + ser_dict["utr_type"] = "5prime" + except ValueError: + ser_dict["utr_table_index"] = genome.utr3s.index_of(utr) + ser_dict["utr_type"] = "3prime" + + ser_dict["genome_str"] = genome.config + values.append(ser_dict) + + return pl.Series( + name=s.name, + values=values, + dtype=gkdf_structs[GkDfType.UTR], + ) + + def _deserialize_utr(s: pl.Series) -> pl.Series: + """Deserialize a Series of UtrStruct back into GenomeKit Utr objects.""" + return pl.Series( + name=s.name, + values=[ + ( + gk.Genome(struct["genome_str"]).utr5s[struct["utr_table_index"]] + if struct["utr_type"] == "5prime" + else gk.Genome(struct["genome_str"]).utr3s[ + struct["utr_table_index"] + ] + ) + for struct in s + ], + ) + + REGISTRY: dict[GkDfVersion, dict[GkDfType, GKTypeEntry]] = { + GkDfVersion.V1: { + GkDfType.GENOME: GKTypeEntry( + struct=gkdf_structs[GkDfType.GENOME], + serializer=_serialize_genome, + deserializer=_deserialize_genome, + ), + GkDfType.INTERVAL: GKTypeEntry( + struct=gkdf_structs[GkDfType.INTERVAL], + serializer=_serialize_interval, + deserializer=_deserialize_interval, + ), + GkDfType.TRANSCRIPT: GKTypeEntry( + struct=gkdf_structs[GkDfType.TRANSCRIPT], + serializer=_serialize_transcript, + deserializer=_deserialize_transcript, + ), + GkDfType.GENE: GKTypeEntry( + struct=gkdf_structs[GkDfType.GENE], + serializer=_serialize_gene, + deserializer=_deserialize_gene, + ), + GkDfType.EXON: GKTypeEntry( + 
struct=gkdf_structs[GkDfType.EXON], + serializer=_serialize_exon, + deserializer=_deserialize_exon, + ), + GkDfType.INTRON: GKTypeEntry( + struct=gkdf_structs[GkDfType.INTRON], + serializer=_serialize_intron, + deserializer=_deserialize_intron, + ), + GkDfType.CDS: GKTypeEntry( + struct=gkdf_structs[GkDfType.CDS], + serializer=_serialize_cds, + deserializer=_deserialize_cds, + ), + GkDfType.UTR: GKTypeEntry( + struct=gkdf_structs[GkDfType.UTR], + serializer=_serialize_utr, + deserializer=_deserialize_utr, + ), } - genome = utr.annotation_genome - try: - ser_dict["utr_table_index"] = genome.utr5s.index_of(utr) - ser_dict["utr_type"] = "5prime" - except ValueError: - ser_dict["utr_table_index"] = genome.utr3s.index_of(utr) - ser_dict["utr_type"] = "3prime" - - ser_dict["genome_str"] = genome.config - values.append(ser_dict) - - return pl.Series( - name=s.name, - values=values, - dtype=UtrStruct, - ) - - -def _deserialize_utr(s: pl.Series) -> pl.Series: - """Deserialize a Series of UtrStruct back into GenomeKit Utr objects.""" - return pl.Series( - name=s.name, - values=[ - ( - gk.Genome(struct["genome_str"]).utr5s[struct["utr_table_index"]] - if struct["utr_type"] == "5prime" - else gk.Genome(struct["genome_str"]).utr3s[struct["utr_table_index"]] - ) - for struct in s - ], - ) - - -REGISTRY: dict[GkDfVersion, dict[GkDfType, GKTypeEntry]] = { - GkDfVersion.V1: { - GkDfType.GENOME: GKTypeEntry( - struct=GenomeStruct, - serializer=_serialize_genome, - deserializer=_deserialize_genome, - ), - GkDfType.INTERVAL: GKTypeEntry( - struct=IntervalStruct, - serializer=_serialize_interval, - deserializer=_deserialize_interval, - ), - GkDfType.TRANSCRIPT: GKTypeEntry( - struct=TranscriptStruct, - serializer=_serialize_transcript, - deserializer=_deserialize_transcript, - ), - GkDfType.GENE: GKTypeEntry( - struct=GeneStruct, - serializer=_serialize_gene, - deserializer=_deserialize_gene, - ), - GkDfType.EXON: GKTypeEntry( - struct=ExonStruct, - serializer=_serialize_exon, - 
deserializer=_deserialize_exon, - ), - GkDfType.INTRON: GKTypeEntry( - struct=IntronStruct, - serializer=_serialize_intron, - deserializer=_deserialize_intron, - ), - GkDfType.CDS: GKTypeEntry( - struct=CdsStruct, - serializer=_serialize_cds, - deserializer=_deserialize_cds, - ), - GkDfType.UTR: GKTypeEntry( - struct=UtrStruct, - serializer=_serialize_utr, - deserializer=_deserialize_utr, - ), } -} + + return REGISTRY diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index dd036176..adb209f2 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -1,15 +1,20 @@ +from __future__ import annotations + import functools import json import warnings from collections.abc import Callable from inspect import signature +from typing import TYPE_CHECKING -import polars as pl +if TYPE_CHECKING: + import polars as pl import genome_kit as gk +from genome_kit._optional import require_polars from .gk_structs import CURRENT_VERSION, GkDfType, GkDfVersion -from .registry import GK_TO_STRUCT, REGISTRY +from .registry import GK_TO_STRUCT, get_registry def _map_batches_safe(fn: Callable): @@ -74,6 +79,8 @@ def to_parquet(df: pl.DataFrame | pl.LazyFrame, path: str) -> None: df: A Polars DataFrame or LazyFrame with columns containing GenomeKit objects. path: The file path to write the Parquet file to. 
""" + pl = require_polars() + if isinstance(df, pl.DataFrame): df = df.lazy() @@ -86,11 +93,12 @@ def to_parquet(df: pl.DataFrame | pl.LazyFrame, path: str) -> None: df.sink_parquet(path) return - df = df.with_columns( + registry = get_registry() + df = df.with_columns( # TODO check if with_columns_seq has better performance pl.col(col) .map_batches( - _map_batches_safe(REGISTRY[CURRENT_VERSION][target_cols[col]].serializer), - return_dtype=REGISTRY[CURRENT_VERSION][target_cols[col]].struct, + _map_batches_safe(registry[CURRENT_VERSION][target_cols[col]].serializer), + return_dtype=registry[CURRENT_VERSION][target_cols[col]].struct, ) .alias(col) for col in target_cols @@ -105,7 +113,9 @@ def to_parquet(df: pl.DataFrame | pl.LazyFrame, path: str) -> None: df.sink_parquet(path, metadata=metadata) -def _init_gk_annotations(lf: pl.LazyFrame, target_cols: dict[str, GkDfType]) -> None: +def _init_gk_annotations( + lf: pl.LazyFrame, target_cols: dict[str, GkDfType] +) -> list[gk.Genome]: """Initialize GenomeKit annotations for all unique genomes in the LazyFrame. Prevents race conditions when opening dganno files during polars operations. @@ -114,6 +124,10 @@ def _init_gk_annotations(lf: pl.LazyFrame, target_cols: dict[str, GkDfType]) -> lf: The LazyFrame containing the serialized GenomeKit objects. target_cols: A dictionary mapping column names to their corresponding GenomeKit types. 
""" + pl = require_polars() + + annotations = [] + genomes_exprs = [pl.col(c).struct.field("genome_str") for c in target_cols.keys()] genomes = ( lf.select( @@ -127,8 +141,11 @@ def _init_gk_annotations(lf: pl.LazyFrame, target_cols: dict[str, GkDfType]) -> .to_list() ) + # warms annotations for all unique genomes in the file for genome_str in genomes: - gk.Genome(genome_str).genes + annotations.append(gk.Genome(genome_str).genes) + + return annotations def _validate_gkdf_metadata(metadata: dict[str, str]) -> None: @@ -154,10 +171,13 @@ def _deserialize_gk_cols( lf: The LazyFrame containing the serialized GenomeKit objects. target_cols: A dictionary mapping column names to their corresponding GkDf types. """ - return lf.with_columns( + pl = require_polars() + registry = get_registry() + + return lf.with_columns_seq( pl.col(col) .map_batches( - _map_batches_safe(REGISTRY[CURRENT_VERSION][target_cols[col]].deserializer), + _map_batches_safe(registry[CURRENT_VERSION][target_cols[col]].deserializer), return_dtype=pl.Object, ) .alias(col) @@ -175,6 +195,7 @@ def from_parquet(path: str, lazy: bool = False) -> pl.DataFrame | pl.LazyFrame: Returns: A Polars DataFrame or LazyFrame with deserialized GenomeKit objects. """ + pl = require_polars() metadata = pl.read_parquet_metadata(path) _validate_gkdf_metadata(metadata) @@ -182,9 +203,10 @@ def from_parquet(path: str, lazy: bool = False) -> pl.DataFrame | pl.LazyFrame: lf = pl.scan_parquet(path) - # collect unique genome strings in the file to initialize, prevents race conditions - # on opening dganno files - _init_gk_annotations(lf, target_cols) + # collect unique genome strings in the file and initialize, prevents race conditions + # on opening dganno files. 
+ # genomes returned in dummy variable to keep weak reference alive for deserialization + _ = _init_gk_annotations(lf, target_cols) lf = _deserialize_gk_cols(lf, target_cols) From 1d433345a5444dadaf2f4dac0ea41eec0824d9ee Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Fri, 20 Mar 2026 12:11:32 -0400 Subject: [PATCH 08/48] fix: add fix for strenum import errors --- genome_kit/df/gk_structs.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/genome_kit/df/gk_structs.py b/genome_kit/df/gk_structs.py index 0e6b0f01..cbfd0a26 100644 --- a/genome_kit/df/gk_structs.py +++ b/genome_kit/df/gk_structs.py @@ -1,6 +1,5 @@ from __future__ import annotations -from enum import StrEnum from typing import TYPE_CHECKING from genome_kit._optional import require_polars @@ -8,6 +7,17 @@ if TYPE_CHECKING: # import polars for type checking import polars as pl +# minimal shim for python <3.11 compatability +try: + from enum import StrEnum +except ImportError: + from enum import Enum + + class StrEnum(str, Enum): + def __str__(self): + return str(self.value) + + class GkDfType(StrEnum): GENOME = "genome" From 0497eec24ab92688e234e56bfe32ce354fcbeb18 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Fri, 20 Mar 2026 17:10:43 -0400 Subject: [PATCH 09/48] docs: update with_columns_seq comments and docstring --- genome_kit/df/serialization.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index adb209f2..c5c839bb 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -94,7 +94,8 @@ def to_parquet(df: pl.DataFrame | pl.LazyFrame, path: str) -> None: return registry = get_registry() - df = df.with_columns( # TODO check if with_columns_seq has better performance + + df = df.with_columns( pl.col(col) .map_batches( _map_batches_safe(registry[CURRENT_VERSION][target_cols[col]].serializer), @@ -149,7 +150,7 @@ def _init_gk_annotations( def 
_validate_gkdf_metadata(metadata: dict[str, str]) -> None: - """Validate the parquet metadata for a gk.""" + """Validate the parquet metadata for a gkdf parquet file.""" try: version = GkDfVersion(metadata.get("gkdf_version")) @@ -174,6 +175,7 @@ def _deserialize_gk_cols( pl = require_polars() registry = get_registry() + # with_columns_seq provides a 2x speedup here over with_columns return lf.with_columns_seq( pl.col(col) .map_batches( From 6f9c71dfaf4d7283406bbd0c5dfc9c5efefc3e9e Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 24 Mar 2026 12:04:10 -0400 Subject: [PATCH 10/48] feat: add optional polars configuration --- .github/workflows/run-tests.yaml | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml index 890450d3..abcdd462 100644 --- a/.github/workflows/run-tests.yaml +++ b/.github/workflows/run-tests.yaml @@ -31,15 +31,17 @@ jobs: needs: build strategy: matrix: + python-version: ['3.9', '3.10', '3.11', '3.12'] + platform: ["linux-64", "osx-64"] + extras: ["none", "df"] include: - - {"pyver-short": "39", "python-version": "3.9", "platform": "linux-64", "runs-on": "ubuntu-latest"} - - {"pyver-short": "39", "python-version": "3.9", "platform": "osx-64", "runs-on": "macos-latest"} - - {"pyver-short": "310", "python-version": "3.10", "platform": "linux-64", "runs-on": "ubuntu-latest"} - - {"pyver-short": "310", "python-version": "3.10", "platform": "osx-64", "runs-on": "macos-latest"} - - {"pyver-short": "311", "python-version": "3.11", "platform": "linux-64", "runs-on": "ubuntu-latest"} - - {"pyver-short": "311", "python-version": "3.11", "platform": "osx-64", "runs-on": "macos-latest"} - - {"pyver-short": "312", "python-version": "3.12", "platform": "linux-64", "runs-on": "ubuntu-latest"} - - {"pyver-short": "312", "python-version": "3.12", "platform": "osx-64", "runs-on": "macos-latest"} + - {"python-version": "3.9", "pyver-short": "39"} + - 
{"python-version": "3.10", "pyver-short": "310"} + - {"python-version": "3.11", "pyver-short": "311"} + - {"python-version": "3.12", "pyver-short": "312"} + - {"platform": "osx-64", "runs-on": "macos-latest"} + - {"platform": "linux-64", "runs-on": "ubuntu-latest"} + runs-on: ${{ matrix.runs-on }} steps: - uses: actions/checkout@v4 @@ -87,6 +89,8 @@ jobs: - name: run unittests id: run_unittests shell: bash -l -e {0} + env: + GK_EXTRAS: ${{ matrix.extras }} run: | set -x micromamba activate test @@ -95,7 +99,13 @@ jobs: if [ ! -e "${files[0]}" ]; then echo "No files matched for py${{ matrix.pyver-short }}" exit 1 - fi - conda mambabuild --croot /tmp/conda-bld -t $files --extra-deps python=${{ matrix.python-version }} + fi + + build_cmd=(conda mambabuild --croot /tmp/conda-bld -t "${files[@]}" --python "${{ matrix.python-version }}") + if [ "$GK_EXTRAS" = "df" ]; then + build_cmd+=("--extra-deps" "polars=1.39.2") + fi + # run command with optional extra deps + "${build_cmd[@]}" conda clean -it set +x From a905a2b940fe2b184f848335cd4be54c594117c0 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 24 Mar 2026 14:06:11 -0400 Subject: [PATCH 11/48] chore: update syntax and chnage polars version: --- .github/workflows/run-tests.yaml | 33 +++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml index abcdd462..7d10fdd5 100644 --- a/.github/workflows/run-tests.yaml +++ b/.github/workflows/run-tests.yaml @@ -17,15 +17,20 @@ jobs: build: strategy: matrix: - platform: [["linux-64", "ubuntu-latest"], ["osx-64", "macos-latest"]] - runs-on: ${{ matrix.platform[1] }} + platform: ["linux-64", "osx-64"] + include: + - platform: "linux-64" + runs-on: "ubuntu-latest" + - platform: "osx-64" + runs-on: "macos-latest" + runs-on: ${{ matrix.runs-on }} if: "!startsWith(github.head_ref, 'release-please')" steps: - uses: actions/checkout@v4 - uses: ./.github/actions/build-gk with: - 
platform: ${{ matrix.platform[0] }} + platform: ${{ matrix.platform }} test: needs: build @@ -34,13 +39,19 @@ jobs: python-version: ['3.9', '3.10', '3.11', '3.12'] platform: ["linux-64", "osx-64"] extras: ["none", "df"] - include: - - {"python-version": "3.9", "pyver-short": "39"} - - {"python-version": "3.10", "pyver-short": "310"} - - {"python-version": "3.11", "pyver-short": "311"} - - {"python-version": "3.12", "pyver-short": "312"} - - {"platform": "osx-64", "runs-on": "macos-latest"} - - {"platform": "linux-64", "runs-on": "ubuntu-latest"} + include: # specify additional fields for all configs + - python-version: "3.9" + pyver-short: "39" + - python-version: "3.10" + pyver-short: "310" + - python-version: "3.11" + pyver-short: "311" + - python-version: "3.12" + pyver-short: "312" + - platform: "osx-64" + runs-on: "macos-latest" + - platform: "linux-64" + runs-on: "ubuntu-latest" runs-on: ${{ matrix.runs-on }} steps: @@ -103,7 +114,7 @@ jobs: build_cmd=(conda mambabuild --croot /tmp/conda-bld -t "${files[@]}" --python "${{ matrix.python-version }}") if [ "$GK_EXTRAS" = "df" ]; then - build_cmd+=("--extra-deps" "polars=1.39.2") + build_cmd+=("--extra-deps" "polars=1.36.1") fi # run command with optional extra deps "${build_cmd[@]}" From 6f87c23b8a387f41c95f9b12b979ce915de37f48 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 24 Mar 2026 15:00:22 -0400 Subject: [PATCH 12/48] fix: correct python env version --- .github/workflows/run-tests.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml index 7d10fdd5..5a0b2d7b 100644 --- a/.github/workflows/run-tests.yaml +++ b/.github/workflows/run-tests.yaml @@ -112,11 +112,11 @@ jobs: exit 1 fi - build_cmd=(conda mambabuild --croot /tmp/conda-bld -t "${files[@]}" --python "${{ matrix.python-version }}") + extra_deps=(python = ${{ matrix.python-version }}) + # run command with optional extra deps if [ "$GK_EXTRAS" = "df" ]; 
then - build_cmd+=("--extra-deps" "polars=1.36.1") + extra_deps+=("polars=1.36.1") fi - # run command with optional extra deps - "${build_cmd[@]}" + conda mambabuild --croot /tmp/conda-bld -t "${files[@]}" --extra-deps "${extra_deps[@]}" conda clean -it set +x From fc4d0ddc06b756e0272d9730e3323f2420965492 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 24 Mar 2026 15:26:40 -0400 Subject: [PATCH 13/48] fix: add null check for registry functions --- genome_kit/df/registry.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/genome_kit/df/registry.py b/genome_kit/df/registry.py index fb82ef5e..98d07d94 100644 --- a/genome_kit/df/registry.py +++ b/genome_kit/df/registry.py @@ -55,12 +55,16 @@ def _serialize_genome(s: pl.Series) -> pl.Series: return pl.Series( name=s.name, values=[ - { - _GKDF_TYPE_FIELD: GkDfType.GENOME.value, - _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, - # config gives annotation genome name if applicable - "genome_str": genome.config, - } + ( + { + _GKDF_TYPE_FIELD: GkDfType.GENOME.value, + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, + # config gives annotation genome name if applicable + "genome_str": genome.config, + } + if genome is not None + else None + ) for genome in s ], dtype=gkdf_structs[GkDfType.GENOME], @@ -70,7 +74,10 @@ def _deserialize_genome(s: pl.Series) -> pl.Series: """Deserialize a Series of GenomeStruct back into GenomeKit Genome objects.""" return pl.Series( name=s.name, - values=[gk.Genome(struct["genome_str"]) for struct in s], + values=[ + gk.Genome(struct["genome_str"]) if struct is not None else None + for struct in s + ], dtype=pl.Object, ) @@ -89,6 +96,7 @@ def _serialize_interval(s: pl.Series) -> pl.Series: # intervals related to reference genome only "genome_str": interval.reference_genome, } + if interval is not None else None for interval in s ], dtype=gkdf_structs[GkDfType.INTERVAL], @@ -106,6 +114,7 @@ def _deserialize_interval(s: pl.Series) -> 
pl.Series: end=struct["end"], reference_genome=struct["genome_str"], ) + if struct is not None else None for struct in s ], dtype=pl.Object, @@ -124,6 +133,7 @@ def _serialize_transcript(s: pl.Series) -> pl.Series: ), "genome_str": transcript.annotation_genome.config, } + if transcript is not None else None for transcript in s ], dtype=gkdf_structs[GkDfType.TRANSCRIPT], @@ -137,6 +147,7 @@ def _deserialize_transcript(s: pl.Series) -> pl.Series: gk.Genome(struct["genome_str"]).transcripts[ struct["transcript_table_index"] ] + if struct is not None else None for struct in s ], dtype=pl.Object, @@ -153,6 +164,7 @@ def _serialize_gene(s: pl.Series) -> pl.Series: "gene_table_index": gene.annotation_genome.genes.index_of(gene), "genome_str": gene.annotation_genome.config, } + if gene is not None else None for gene in s ], dtype=gkdf_structs[GkDfType.GENE], @@ -164,6 +176,7 @@ def _deserialize_gene(s: pl.Series) -> pl.Series: name=s.name, values=[ gk.Genome(struct["genome_str"]).genes[struct["gene_table_index"]] + if struct is not None else None for struct in s ], dtype=pl.Object, @@ -180,6 +193,7 @@ def _serialize_exon(s: pl.Series) -> pl.Series: "exon_table_index": exon.annotation_genome.exons.index_of(exon), "genome_str": exon.annotation_genome.config, } + if exon is not None else None for exon in s ], dtype=gkdf_structs[GkDfType.EXON], @@ -191,6 +205,7 @@ def _deserialize_exon(s: pl.Series) -> pl.Series: name=s.name, values=[ gk.Genome(struct["genome_str"]).exons[struct["exon_table_index"]] + if struct is not None else None for struct in s ], dtype=pl.Object, @@ -209,6 +224,7 @@ def _serialize_intron(s: pl.Series) -> pl.Series: ), "genome_str": intron.annotation_genome.config, } + if intron is not None else None for intron in s ], dtype=gkdf_structs[GkDfType.INTRON], @@ -220,6 +236,7 @@ def _deserialize_intron(s: pl.Series) -> pl.Series: name=s.name, values=[ gk.Genome(struct["genome_str"]).introns[struct["intron_table_index"]] + if struct is not None else None for 
struct in s ], dtype=pl.Object, @@ -236,6 +253,7 @@ def _serialize_cds(s: pl.Series) -> pl.Series: "cds_table_index": cds.annotation_genome.cdss.index_of(cds), "genome_str": cds.annotation_genome.config, } + if cds is not None else None for cds in s ], dtype=gkdf_structs[GkDfType.CDS], @@ -247,6 +265,7 @@ def _deserialize_cds(s: pl.Series) -> pl.Series: name=s.name, values=[ gk.Genome(struct["genome_str"]).cdss[struct["cds_table_index"]] + if struct is not None else None for struct in s ], dtype=pl.Object, @@ -256,6 +275,9 @@ def _serialize_utr(s: pl.Series) -> pl.Series: """Serialize a Series of GenomeKit Utr objects.""" values = [] for utr in s: + if utr is None: + values.append(None) + continue ser_dict = { _GKDF_TYPE_FIELD: GkDfType.UTR.value, _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, @@ -289,6 +311,7 @@ def _deserialize_utr(s: pl.Series) -> pl.Series: struct["utr_table_index"] ] ) + if struct is not None else None for struct in s ], ) From 57004856a18ab99b3490a36a38d7808ed921f2e8 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 24 Mar 2026 15:31:13 -0400 Subject: [PATCH 14/48] fix: remove space in array --- .github/workflows/run-tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml index 5a0b2d7b..43550ef4 100644 --- a/.github/workflows/run-tests.yaml +++ b/.github/workflows/run-tests.yaml @@ -112,7 +112,7 @@ jobs: exit 1 fi - extra_deps=(python = ${{ matrix.python-version }}) + extra_deps=(python=${{ matrix.python-version }}) # run command with optional extra deps if [ "$GK_EXTRAS" = "df" ]; then extra_deps+=("polars=1.36.1") From 3939f676e99f4d7d481e7300211371f7aa871d18 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 24 Mar 2026 16:12:14 -0400 Subject: [PATCH 15/48] fix: downgrade polars again --- .github/workflows/run-tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-tests.yaml 
b/.github/workflows/run-tests.yaml index 43550ef4..df025dee 100644 --- a/.github/workflows/run-tests.yaml +++ b/.github/workflows/run-tests.yaml @@ -115,7 +115,7 @@ jobs: extra_deps=(python=${{ matrix.python-version }}) # run command with optional extra deps if [ "$GK_EXTRAS" = "df" ]; then - extra_deps+=("polars=1.36.1") + extra_deps+=("polars=1.34.0") fi conda mambabuild --croot /tmp/conda-bld -t "${files[@]}" --extra-deps "${extra_deps[@]}" conda clean -it From 257504f161710d4eefe443dcc1b8a9eb6375a3b2 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 25 Mar 2026 09:35:03 -0400 Subject: [PATCH 16/48] fix: downgrade polars version for conda-forge --- .github/workflows/run-tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml index df025dee..a01d329f 100644 --- a/.github/workflows/run-tests.yaml +++ b/.github/workflows/run-tests.yaml @@ -115,7 +115,7 @@ jobs: extra_deps=(python=${{ matrix.python-version }}) # run command with optional extra deps if [ "$GK_EXTRAS" = "df" ]; then - extra_deps+=("polars=1.34.0") + extra_deps+=("polars=1.33.1") fi conda mambabuild --croot /tmp/conda-bld -t "${files[@]}" --extra-deps "${extra_deps[@]}" conda clean -it From cf4ff13335b7545d488e739c152db98d3576b37d Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 25 Mar 2026 15:43:31 -0400 Subject: [PATCH 17/48] chore: support path or str --- genome_kit/df/serialization.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index c5c839bb..e5ef7805 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -6,6 +6,7 @@ from collections.abc import Callable from inspect import signature from typing import TYPE_CHECKING +from pathlib import Path if TYPE_CHECKING: import polars as pl @@ -57,7 +58,7 @@ def detect_gk_cols(lf: pl.LazyFrame) -> dict[str, GkDfType]: # materialize the first 
row to check data types, need the exact type not pl.Object first_row = lf.head(1).collect()[0] - # TODO: support list of GenomeKit objects + # TODO: support list of GenomeKit objects, all of list assumed same type as first for col in lf_cols: # item from first row of the column col_type = GK_TO_STRUCT.get(type(first_row[col][0]), None) @@ -72,7 +73,7 @@ def detect_gk_cols(lf: pl.LazyFrame) -> dict[str, GkDfType]: # TODO: add union of pd.DataFrame -def to_parquet(df: pl.DataFrame | pl.LazyFrame, path: str) -> None: +def to_parquet(df: pl.DataFrame | pl.LazyFrame, path: str | Path) -> None: """Serialize a DataFrame with GenomeKit objects to a Parquet file. Args: @@ -81,6 +82,7 @@ def to_parquet(df: pl.DataFrame | pl.LazyFrame, path: str) -> None: """ pl = require_polars() + path = Path(path) if isinstance(df, pl.DataFrame): df = df.lazy() @@ -187,7 +189,7 @@ def _deserialize_gk_cols( ) -def from_parquet(path: str, lazy: bool = False) -> pl.DataFrame | pl.LazyFrame: +def from_parquet(path: str | Path, lazy: bool = False) -> pl.DataFrame | pl.LazyFrame: """Deserialize a Parquet file containing GenomeKit objects into a Polars DataFrame or LazyFrame. 
Args: @@ -199,6 +201,7 @@ def from_parquet(path: str, lazy: bool = False) -> pl.DataFrame | pl.LazyFrame: """ pl = require_polars() + path = Path(path) metadata = pl.read_parquet_metadata(path) _validate_gkdf_metadata(metadata) target_cols = json.loads(metadata.get("target_cols")) From 4b89f79ba3b80363c58bdda2fe8c236d3ef0aa80 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 25 Mar 2026 15:54:46 -0400 Subject: [PATCH 18/48] fix: add metadata check --- genome_kit/df/serialization.py | 1 + 1 file changed, 1 insertion(+) diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index e5ef7805..844d62f2 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -163,6 +163,7 @@ def _validate_gkdf_metadata(metadata: dict[str, str]) -> None: raise ValueError( "Invalid or missing gkdf_version in Parquet metadata, unable to deserialize GenomeKit objects. " ) + assert metadata.get("target_cols") is not None, "Missing target_cols in Parquet metadata, unable to deserialize GenomeKit objects." 
def _deserialize_gk_cols( From 57d5b743a9961101eedc6af8c150576c3b2ca765 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 25 Mar 2026 16:50:22 -0400 Subject: [PATCH 19/48] chore: make chrom name more clear --- genome_kit/df/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/genome_kit/df/registry.py b/genome_kit/df/registry.py index 98d07d94..f2c7e783 100644 --- a/genome_kit/df/registry.py +++ b/genome_kit/df/registry.py @@ -89,7 +89,7 @@ def _serialize_interval(s: pl.Series) -> pl.Series: { _GKDF_TYPE_FIELD: GkDfType.INTERVAL.value, _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, - "chromosome": interval.chrom, + "chromosome": interval.chromosome, "strand": interval.strand, "start": interval.start, "end": interval.end, @@ -108,7 +108,7 @@ def _deserialize_interval(s: pl.Series) -> pl.Series: name=s.name, values=[ gk.Interval( - chromosome=struct["chrom"], + chromosome=struct["chromosome"], strand=struct["strand"], start=struct["start"], end=struct["end"], From 17edfb207809d9c70e00c65006a8f882028d671c Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 25 Mar 2026 16:50:51 -0400 Subject: [PATCH 20/48] fix: don't init reference genomes --- genome_kit/df/serialization.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index 844d62f2..abbe5424 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -146,6 +146,10 @@ def _init_gk_annotations( # warms annotations for all unique genomes in the file for genome_str in genomes: + genome = gk.Genome(genome_str) + if genome.config == genome.reference_genome: + # identifies reference genomes, instead of annotation genomes + continue annotations.append(gk.Genome(genome_str).genes) return annotations From cb6fd93c71315c14a88160a76a2449c69bea6813 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 25 Mar 2026 16:58:41 -0400 Subject: [PATCH 21/48] docs: update comments --- genome_kit/df/gk_structs.py | 
3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/genome_kit/df/gk_structs.py b/genome_kit/df/gk_structs.py index cbfd0a26..9fa64b62 100644 --- a/genome_kit/df/gk_structs.py +++ b/genome_kit/df/gk_structs.py @@ -57,7 +57,7 @@ def get_structs() -> dict[GkDfType, pl.Struct]: pl.Field("strand", pl.Utf8), pl.Field("start", pl.Int32), pl.Field("end", pl.Int32), - pl.Field("genome_str", pl.Utf8), # reference or annotation genome + pl.Field("genome_str", pl.Utf8), # reference genome ] ) @@ -66,6 +66,7 @@ def get_structs() -> dict[GkDfType, pl.Struct]: pl.Field("gkdf_type", pl.Utf8), pl.Field("schema_version", pl.Utf8), # index of transcript within annotation genome transcript table + # Int32 matches index type in C++ backend (see src/table.g:22) pl.Field("transcript_table_index", pl.Int32), pl.Field("genome_str", pl.Utf8), # annotation genome ] From eb125d4f6d48d0277724dbb8539c0f9799ff1658 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 25 Mar 2026 17:12:33 -0400 Subject: [PATCH 22/48] test: add gkdf tests --- tests/test_gkdf.py | 139 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 tests/test_gkdf.py diff --git a/tests/test_gkdf.py b/tests/test_gkdf.py new file mode 100644 index 00000000..0c2f500b --- /dev/null +++ b/tests/test_gkdf.py @@ -0,0 +1,139 @@ +import importlib.util + +from genome_kit import Genome, Interval +from genome_kit.df import from_parquet, to_parquet +from . 
import MiniGenome +import polars as pl +import tempfile +import unittest +from pathlib import Path + + + +class TestGkdfRoundTrip(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.tmp_dir = tempfile.TemporaryDirectory() + cls.addClassCleanup(cls.tmp_dir.cleanup) + cls.tmp_dir_path = Path(cls.tmp_dir.name) + + @unittest.skip("MiniGenome and Genome type mismatch") + def test_genome(self): + # plain reference genome as well as gencode and refseq annotations + genomes = ["hg38.p12", "gencode.v41", "ucsc_refseq.2017-06-25"] + + for genome_str in genomes: + g = MiniGenome(genome_str) + df = pl.DataFrame({"genome": [g]}) + + path = self.tmp_dir_path / f"{genome_str}.parquet" + to_parquet(df, path) + re_df = from_parquet(path, lazy=False) + + self.assertEqual(re_df.item(), df.item()) + + + def test_interval(self): + interval = Interval("chr5", "+", 2000, 3000, "hg19") + df = pl.DataFrame({"interval": [interval]}) + + path = self.tmp_dir_path / "interval.parquet" + to_parquet(df, path) + re_df = from_parquet(path, lazy=False) + self.assertEqual(re_df.item(), df.item()) + + + def test_transcript(self): + genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] + for genome_str in genomes: + g = Genome(genome_str) + transcript = g.genes[0].transcripts[0] + df = pl.DataFrame({"transcript": [transcript]}) + + path = self.tmp_dir_path / f"{genome_str}_transcript.parquet" + to_parquet(df, path) + re_df = from_parquet(path, lazy=False) + self.assertEqual(re_df.item(), df.item()) + + + def test_gene(self): + genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] + for genome_str in genomes: + g = Genome(genome_str) + gene = g.genes[0] + df = pl.DataFrame({"gene": [gene]}) + + path = self.tmp_dir_path / f"{genome_str}_gene.parquet" + to_parquet(df, path) + re_df = from_parquet(path, lazy=False) + self.assertEqual(re_df.item(), df.item()) + + def test_exon(self): + genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] + for genome_str in genomes: + g = Genome(genome_str) + exon = 
g.exons[0] + df = pl.DataFrame({"exon": [exon]}) + + path = self.tmp_dir_path / f"{genome_str}_exon.parquet" + to_parquet(df, path) + re_df = from_parquet(path, lazy=False) + self.assertEqual(re_df.item(), df.item()) + + + def test_intron(self): + genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] + for genome_str in genomes: + g = Genome(genome_str) + intron = g.introns[0] + df = pl.DataFrame({"intron": [intron]}) + + path = self.tmp_dir_path / f"{genome_str}_intron.parquet" + to_parquet(df, path) + re_df = from_parquet(path, lazy=False) + self.assertEqual(re_df.item(), df.item()) + + + def test_cds(self): + genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] + for genome_str in genomes: + g = Genome(genome_str) + cds = g.cdss[0] + df = pl.DataFrame({"cds": [cds]}) + + path = self.tmp_dir_path / f"{genome_str}_cds.parquet" + to_parquet(df, path) + re_df = from_parquet(path, lazy=False) + self.assertEqual(re_df.item(), df.item()) + + + def test_utr3(self): + genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] + for genome_str in genomes: + g = Genome(genome_str) + utr3 = g.utr3s[0] + df = pl.DataFrame({"utr3": [utr3]}) + + path = self.tmp_dir_path / f"{genome_str}_utr3.parquet" + to_parquet(df, path) + re_df = from_parquet(path, lazy=False) + self.assertEqual(re_df.item(), df.item()) + + + def test_utr5(self): + genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] + for genome_str in genomes: + g = Genome(genome_str) + utr5 = g.utr5s[0] + df = pl.DataFrame({"utr5": [utr5]}) + + path = self.tmp_dir_path / f"{genome_str}_utr5.parquet" + to_parquet(df, path) + re_df = from_parquet(path, lazy=False) + self.assertEqual(re_df.item(), df.item()) + + + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From 0437b70772eee868abbb4620b625b2d35e9f44fe Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 25 Mar 2026 17:43:27 -0400 Subject: [PATCH 23/48] test: skip tests when no polars --- tests/test_gkdf.py | 16 ++++++++++++---- 1 file changed, 12 
insertions(+), 4 deletions(-) diff --git a/tests/test_gkdf.py b/tests/test_gkdf.py index 0c2f500b..2d88ae05 100644 --- a/tests/test_gkdf.py +++ b/tests/test_gkdf.py @@ -3,12 +3,14 @@ from genome_kit import Genome, Interval from genome_kit.df import from_parquet, to_parquet from . import MiniGenome -import polars as pl + import tempfile import unittest from pathlib import Path - +HAS_POLARS = importlib.util.find_spec("polars") is not None +if HAS_POLARS: + import polars as pl class TestGkdfRoundTrip(unittest.TestCase): @classmethod @@ -32,7 +34,7 @@ def test_genome(self): self.assertEqual(re_df.item(), df.item()) - + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") def test_interval(self): interval = Interval("chr5", "+", 2000, 3000, "hg19") df = pl.DataFrame({"interval": [interval]}) @@ -42,7 +44,7 @@ def test_interval(self): re_df = from_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) - + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") def test_transcript(self): genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] for genome_str in genomes: @@ -56,6 +58,7 @@ def test_transcript(self): self.assertEqual(re_df.item(), df.item()) + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") def test_gene(self): genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] for genome_str in genomes: @@ -68,6 +71,7 @@ def test_gene(self): re_df = from_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") def test_exon(self): genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] for genome_str in genomes: @@ -81,6 +85,7 @@ def test_exon(self): self.assertEqual(re_df.item(), df.item()) + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") def test_intron(self): genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] for genome_str in genomes: @@ 
-94,6 +99,7 @@ def test_intron(self): self.assertEqual(re_df.item(), df.item()) + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") def test_cds(self): genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] for genome_str in genomes: @@ -107,6 +113,7 @@ def test_cds(self): self.assertEqual(re_df.item(), df.item()) + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") def test_utr3(self): genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] for genome_str in genomes: @@ -120,6 +127,7 @@ def test_utr3(self): self.assertEqual(re_df.item(), df.item()) + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") def test_utr5(self): genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] for genome_str in genomes: From 7565646ad88ab666c99bc2d2ef09b90c4e5e1d2a Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 25 Mar 2026 17:47:31 -0400 Subject: [PATCH 24/48] chore: ruff formatting --- genome_kit/df/gk_structs.py | 1 - genome_kit/df/registry.py | 39 ++++++++++++++++++++++------------ genome_kit/df/serialization.py | 6 ++++-- tests/test_gkdf.py | 18 ++++++---------- 4 files changed, 36 insertions(+), 28 deletions(-) diff --git a/genome_kit/df/gk_structs.py b/genome_kit/df/gk_structs.py index 9fa64b62..78ce54a0 100644 --- a/genome_kit/df/gk_structs.py +++ b/genome_kit/df/gk_structs.py @@ -18,7 +18,6 @@ def __str__(self): return str(self.value) - class GkDfType(StrEnum): GENOME = "genome" INTERVAL = "interval" diff --git a/genome_kit/df/registry.py b/genome_kit/df/registry.py index f2c7e783..dac96391 100644 --- a/genome_kit/df/registry.py +++ b/genome_kit/df/registry.py @@ -96,7 +96,8 @@ def _serialize_interval(s: pl.Series) -> pl.Series: # intervals related to reference genome only "genome_str": interval.reference_genome, } - if interval is not None else None + if interval is not None + else None for interval in s ], dtype=gkdf_structs[GkDfType.INTERVAL], @@ -114,7 +115,8 @@ def 
_deserialize_interval(s: pl.Series) -> pl.Series: end=struct["end"], reference_genome=struct["genome_str"], ) - if struct is not None else None + if struct is not None + else None for struct in s ], dtype=pl.Object, @@ -133,7 +135,8 @@ def _serialize_transcript(s: pl.Series) -> pl.Series: ), "genome_str": transcript.annotation_genome.config, } - if transcript is not None else None + if transcript is not None + else None for transcript in s ], dtype=gkdf_structs[GkDfType.TRANSCRIPT], @@ -147,7 +150,8 @@ def _deserialize_transcript(s: pl.Series) -> pl.Series: gk.Genome(struct["genome_str"]).transcripts[ struct["transcript_table_index"] ] - if struct is not None else None + if struct is not None + else None for struct in s ], dtype=pl.Object, @@ -164,7 +168,8 @@ def _serialize_gene(s: pl.Series) -> pl.Series: "gene_table_index": gene.annotation_genome.genes.index_of(gene), "genome_str": gene.annotation_genome.config, } - if gene is not None else None + if gene is not None + else None for gene in s ], dtype=gkdf_structs[GkDfType.GENE], @@ -176,7 +181,8 @@ def _deserialize_gene(s: pl.Series) -> pl.Series: name=s.name, values=[ gk.Genome(struct["genome_str"]).genes[struct["gene_table_index"]] - if struct is not None else None + if struct is not None + else None for struct in s ], dtype=pl.Object, @@ -193,7 +199,8 @@ def _serialize_exon(s: pl.Series) -> pl.Series: "exon_table_index": exon.annotation_genome.exons.index_of(exon), "genome_str": exon.annotation_genome.config, } - if exon is not None else None + if exon is not None + else None for exon in s ], dtype=gkdf_structs[GkDfType.EXON], @@ -205,7 +212,8 @@ def _deserialize_exon(s: pl.Series) -> pl.Series: name=s.name, values=[ gk.Genome(struct["genome_str"]).exons[struct["exon_table_index"]] - if struct is not None else None + if struct is not None + else None for struct in s ], dtype=pl.Object, @@ -224,7 +232,8 @@ def _serialize_intron(s: pl.Series) -> pl.Series: ), "genome_str": intron.annotation_genome.config, } - 
if intron is not None else None + if intron is not None + else None for intron in s ], dtype=gkdf_structs[GkDfType.INTRON], @@ -236,7 +245,8 @@ def _deserialize_intron(s: pl.Series) -> pl.Series: name=s.name, values=[ gk.Genome(struct["genome_str"]).introns[struct["intron_table_index"]] - if struct is not None else None + if struct is not None + else None for struct in s ], dtype=pl.Object, @@ -253,7 +263,8 @@ def _serialize_cds(s: pl.Series) -> pl.Series: "cds_table_index": cds.annotation_genome.cdss.index_of(cds), "genome_str": cds.annotation_genome.config, } - if cds is not None else None + if cds is not None + else None for cds in s ], dtype=gkdf_structs[GkDfType.CDS], @@ -265,7 +276,8 @@ def _deserialize_cds(s: pl.Series) -> pl.Series: name=s.name, values=[ gk.Genome(struct["genome_str"]).cdss[struct["cds_table_index"]] - if struct is not None else None + if struct is not None + else None for struct in s ], dtype=pl.Object, @@ -311,7 +323,8 @@ def _deserialize_utr(s: pl.Series) -> pl.Series: struct["utr_table_index"] ] ) - if struct is not None else None + if struct is not None + else None for struct in s ], ) diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index abbe5424..f3f0c162 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -97,7 +97,7 @@ def to_parquet(df: pl.DataFrame | pl.LazyFrame, path: str | Path) -> None: registry = get_registry() - df = df.with_columns( + df = df.with_columns( pl.col(col) .map_batches( _map_batches_safe(registry[CURRENT_VERSION][target_cols[col]].serializer), @@ -167,7 +167,9 @@ def _validate_gkdf_metadata(metadata: dict[str, str]) -> None: raise ValueError( "Invalid or missing gkdf_version in Parquet metadata, unable to deserialize GenomeKit objects. " ) - assert metadata.get("target_cols") is not None, "Missing target_cols in Parquet metadata, unable to deserialize GenomeKit objects." 
+ assert metadata.get("target_cols") is not None, ( + "Missing target_cols in Parquet metadata, unable to deserialize GenomeKit objects." + ) def _deserialize_gk_cols( diff --git a/tests/test_gkdf.py b/tests/test_gkdf.py index 2d88ae05..eb750eb8 100644 --- a/tests/test_gkdf.py +++ b/tests/test_gkdf.py @@ -1,17 +1,18 @@ import importlib.util +import tempfile +import unittest +from pathlib import Path from genome_kit import Genome, Interval from genome_kit.df import from_parquet, to_parquet -from . import MiniGenome -import tempfile -import unittest -from pathlib import Path +from . import MiniGenome HAS_POLARS = importlib.util.find_spec("polars") is not None if HAS_POLARS: import polars as pl + class TestGkdfRoundTrip(unittest.TestCase): @classmethod def setUpClass(cls): @@ -57,7 +58,6 @@ def test_transcript(self): re_df = from_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) - @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") def test_gene(self): genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] @@ -84,7 +84,6 @@ def test_exon(self): re_df = from_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) - @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") def test_intron(self): genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] @@ -98,7 +97,6 @@ def test_intron(self): re_df = from_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) - @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") def test_cds(self): genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] @@ -112,7 +110,6 @@ def test_cds(self): re_df = from_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) - @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") def test_utr3(self): genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] @@ -126,7 +123,6 @@ def test_utr3(self): re_df = from_parquet(path, lazy=False) 
self.assertEqual(re_df.item(), df.item()) - @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") def test_utr5(self): genomes = ["gencode.v41", "ucsc_refseq.2017-06-25"] @@ -141,7 +137,5 @@ def test_utr5(self): self.assertEqual(re_df.item(), df.item()) - - if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() From 42338a029a618643fdb3be8b616e5b92e69c764f Mon Sep 17 00:00:00 2001 From: declanyewlim Date: Thu, 26 Mar 2026 14:25:15 -0400 Subject: [PATCH 25/48] fix: make build and runner info consistent --- .github/workflows/run-tests.yaml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml index a01d329f..024028f4 100644 --- a/.github/workflows/run-tests.yaml +++ b/.github/workflows/run-tests.yaml @@ -17,12 +17,14 @@ jobs: build: strategy: matrix: - platform: ["linux-64", "osx-64"] + platform: ["linux-64", "osx-arm64", "osx-64"] include: - platform: "linux-64" runs-on: "ubuntu-latest" - - platform: "osx-64" + - platform: "osx-arm64" runs-on: "macos-latest" + - platform: "osx-64" + runs-on: "macos-26-intel" runs-on: ${{ matrix.runs-on }} if: "!startsWith(github.head_ref, 'release-please')" steps: @@ -37,7 +39,7 @@ jobs: strategy: matrix: python-version: ['3.9', '3.10', '3.11', '3.12'] - platform: ["linux-64", "osx-64"] + platform: ["linux-64", "osx-arm64", "osx-64"] extras: ["none", "df"] include: # specify additional fields for all configs - python-version: "3.9" @@ -48,8 +50,10 @@ jobs: pyver-short: "311" - python-version: "3.12" pyver-short: "312" - - platform: "osx-64" + - platform: "osx-arm64" runs-on: "macos-latest" + - platform: "osx-64" + runs-on: "macos-26-intel" - platform: "linux-64" runs-on: "ubuntu-latest" From 77f6f3cec19d8b4d6d4f32d9e2f1314869c8fd84 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Thu, 26 Mar 2026 15:50:06 -0400 Subject: [PATCH 26/48] revert: revert github workflow --- 
.github/workflows/run-tests.yaml | 53 +++++++++----------------------- 1 file changed, 14 insertions(+), 39 deletions(-) diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml index 024028f4..890450d3 100644 --- a/.github/workflows/run-tests.yaml +++ b/.github/workflows/run-tests.yaml @@ -17,46 +17,29 @@ jobs: build: strategy: matrix: - platform: ["linux-64", "osx-arm64", "osx-64"] - include: - - platform: "linux-64" - runs-on: "ubuntu-latest" - - platform: "osx-arm64" - runs-on: "macos-latest" - - platform: "osx-64" - runs-on: "macos-26-intel" - runs-on: ${{ matrix.runs-on }} + platform: [["linux-64", "ubuntu-latest"], ["osx-64", "macos-latest"]] + runs-on: ${{ matrix.platform[1] }} if: "!startsWith(github.head_ref, 'release-please')" steps: - uses: actions/checkout@v4 - uses: ./.github/actions/build-gk with: - platform: ${{ matrix.platform }} + platform: ${{ matrix.platform[0] }} test: needs: build strategy: matrix: - python-version: ['3.9', '3.10', '3.11', '3.12'] - platform: ["linux-64", "osx-arm64", "osx-64"] - extras: ["none", "df"] - include: # specify additional fields for all configs - - python-version: "3.9" - pyver-short: "39" - - python-version: "3.10" - pyver-short: "310" - - python-version: "3.11" - pyver-short: "311" - - python-version: "3.12" - pyver-short: "312" - - platform: "osx-arm64" - runs-on: "macos-latest" - - platform: "osx-64" - runs-on: "macos-26-intel" - - platform: "linux-64" - runs-on: "ubuntu-latest" - + include: + - {"pyver-short": "39", "python-version": "3.9", "platform": "linux-64", "runs-on": "ubuntu-latest"} + - {"pyver-short": "39", "python-version": "3.9", "platform": "osx-64", "runs-on": "macos-latest"} + - {"pyver-short": "310", "python-version": "3.10", "platform": "linux-64", "runs-on": "ubuntu-latest"} + - {"pyver-short": "310", "python-version": "3.10", "platform": "osx-64", "runs-on": "macos-latest"} + - {"pyver-short": "311", "python-version": "3.11", "platform": "linux-64", "runs-on": 
"ubuntu-latest"} + - {"pyver-short": "311", "python-version": "3.11", "platform": "osx-64", "runs-on": "macos-latest"} + - {"pyver-short": "312", "python-version": "3.12", "platform": "linux-64", "runs-on": "ubuntu-latest"} + - {"pyver-short": "312", "python-version": "3.12", "platform": "osx-64", "runs-on": "macos-latest"} runs-on: ${{ matrix.runs-on }} steps: - uses: actions/checkout@v4 @@ -104,8 +87,6 @@ jobs: - name: run unittests id: run_unittests shell: bash -l -e {0} - env: - GK_EXTRAS: ${{ matrix.extras }} run: | set -x micromamba activate test @@ -114,13 +95,7 @@ jobs: if [ ! -e "${files[0]}" ]; then echo "No files matched for py${{ matrix.pyver-short }}" exit 1 - fi - - extra_deps=(python=${{ matrix.python-version }}) - # run command with optional extra deps - if [ "$GK_EXTRAS" = "df" ]; then - extra_deps+=("polars=1.33.1") - fi - conda mambabuild --croot /tmp/conda-bld -t "${files[@]}" --extra-deps "${extra_deps[@]}" + fi + conda mambabuild --croot /tmp/conda-bld -t $files --extra-deps python=${{ matrix.python-version }} conda clean -it set +x From 575b7c5acdacec5cee522c2c7baf6c1a0cac85d5 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Mon, 30 Mar 2026 20:53:16 -0400 Subject: [PATCH 27/48] feat: add list of gk support --- genome_kit/df/gk_structs.py | 24 ++++ genome_kit/df/serialization.py | 207 ++++++++++++++++++++++++++------- tests/test_gkdf.py | 77 ++++++++++++ 3 files changed, 264 insertions(+), 44 deletions(-) diff --git a/genome_kit/df/gk_structs.py b/genome_kit/df/gk_structs.py index 78ce54a0..fbd79551 100644 --- a/genome_kit/df/gk_structs.py +++ b/genome_kit/df/gk_structs.py @@ -1,5 +1,6 @@ from __future__ import annotations +from dataclasses import dataclass from typing import TYPE_CHECKING from genome_kit._optional import require_polars @@ -29,6 +30,29 @@ class GkDfType(StrEnum): UTR = "utr" +class CellType(StrEnum): + SCALAR = "scalar" + LIST = "list" + + +@dataclass(frozen=True) +class ColumnInfo: + """Dataclass to store metadata about a 
 single column in a dataframe. + + Assumes that all cells in a column have the same type. If the cell contains a list, + assumes all items in the list are of the same type. + """ + + cell_type: CellType + gkdf_type: GkDfType + + def to_dict(self) -> dict: + return { + "cell_type": self.cell_type.value, + "gkdf_type": self.gkdf_type.value, + } + + class GkDfVersion(StrEnum): V1 = "1.0" diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index f3f0c162..e8d931a4 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -5,8 +5,8 @@ import warnings from collections.abc import Callable from inspect import signature -from typing import TYPE_CHECKING from pathlib import Path +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: import polars as pl @@ -14,7 +14,7 @@ import genome_kit as gk from genome_kit._optional import require_polars -from .gk_structs import CURRENT_VERSION, GkDfType, GkDfVersion +from .gk_structs import CURRENT_VERSION, CellType, ColumnInfo, GkDfVersion from .registry import GK_TO_STRUCT, get_registry @@ -38,7 +38,7 @@ def wrapper(*args, **kwargs): return wrapper -def detect_gk_cols(lf: pl.LazyFrame) -> dict[str, GkDfType]: +def _detect_gk_cols(lf: pl.LazyFrame) -> dict[str, ColumnInfo]: """Detect columns in the LazyFrame that contains GenomeKit objects. Args: @@ -46,7 +46,8 @@ def detect_gk_cols(lf: pl.LazyFrame) -> dict[str, GkDfType]: columns: Optional list of column names to check. If None, all columns will be checked. Returns: - A dictionary mapping column names to their corresponding GenomeKit types. + A dictionary mapping column names to the ColumnInfo dataclass containing the + GkDfType and CellType for the column. 
""" lf_cols = lf.collect_schema().names() @@ -58,20 +59,58 @@ def detect_gk_cols(lf: pl.LazyFrame) -> dict[str, GkDfType]: # materialize the first row to check data types, need the exact type not pl.Object first_row = lf.head(1).collect()[0] - # TODO: support list of GenomeKit objects, all of list assumed same type as first for col in lf_cols: + item = first_row[col][0] + if type(item) == list: + cell_type = CellType.LIST + if item: + # infer type based on the first item in the list + item = item[0] + else: + # empty list, assume not a list of GenomeKit objects + continue + else: + cell_type = CellType.SCALAR + # item from first row of the column - col_type = GK_TO_STRUCT.get(type(first_row[col][0]), None) + col_type = GK_TO_STRUCT.get(type(item), None) if col_type is None: # column is not a genomekit type, so no serialization needed - pass - else: - target_cols[col] = col_type + continue + + target_cols[col] = ColumnInfo(cell_type=cell_type, gkdf_type=col_type) return target_cols +def _list_serializer( + serializer: Callable[[pl.Series], pl.Series], return_dtype: Any +) -> Callable[[pl.Series], pl.Series]: + """Helper function to convert a serializer to accept lists of GenomeKit objects. + + Args: + serializer: A serializer function for a series of GenomeKit objects + return_dtype: The return data type for the serialized series + + Returns: + A serializer function for a series of lists of GenomeKit objects. + """ + pl = require_polars() + + def _serialize_list(s: pl.Series) -> pl.Series: + return pl.Series( + name=s.name, + values=[ + serializer(pl.Series(values=l)).to_list() if l is not None else None + for l in s + ], + dtype=return_dtype, + ) + + return _serialize_list + + # TODO: add union of pd.DataFrame def to_parquet(df: pl.DataFrame | pl.LazyFrame, path: str | Path) -> None: """Serialize a DataFrame with GenomeKit objects to a Parquet file. 
@@ -86,7 +125,8 @@ def to_parquet(df: pl.DataFrame | pl.LazyFrame, path: str | Path) -> None: if isinstance(df, pl.DataFrame): df = df.lazy() - target_cols = detect_gk_cols(df) + # mapping from column name to ColumnInfo dataclass + target_cols = _detect_gk_cols(df) if not target_cols: warnings.warn( @@ -97,27 +137,44 @@ def to_parquet(df: pl.DataFrame | pl.LazyFrame, path: str | Path) -> None: registry = get_registry() - df = df.with_columns( - pl.col(col) - .map_batches( - _map_batches_safe(registry[CURRENT_VERSION][target_cols[col]].serializer), - return_dtype=registry[CURRENT_VERSION][target_cols[col]].struct, + def _build_serialization_expr(col: str) -> pl.Expr: + col_info = target_cols[col] # ColumnInfo dataclass + gkdf_type = col_info.gkdf_type + if col_info.cell_type == CellType.LIST: + return_dtype = pl.List(inner=registry[CURRENT_VERSION][gkdf_type].struct) + serializer = _list_serializer( + registry[CURRENT_VERSION][gkdf_type].serializer, + return_dtype=return_dtype, + ) + else: + return_dtype = registry[CURRENT_VERSION][gkdf_type].struct + serializer = registry[CURRENT_VERSION][gkdf_type].serializer + + return ( + pl.col(col) + .map_batches( + _map_batches_safe(serializer), + return_dtype=return_dtype, + ) + .alias(col) ) - .alias(col) - for col in target_cols - ) + + df = df.with_columns(_build_serialization_expr(col) for col in target_cols) + + # convert ColumnInfo dataclass to a serializable format + target_col_metadata = {col: target_cols[col].to_dict() for col in target_cols} metadata = { "gkdf_version": CURRENT_VERSION.value, "gk_version": gk.__version__, - "target_cols": json.dumps(target_cols), + "target_cols": json.dumps(target_col_metadata), } df.sink_parquet(path, metadata=metadata) def _init_gk_annotations( - lf: pl.LazyFrame, target_cols: dict[str, GkDfType] + lf: pl.LazyFrame, target_cols: dict[str, dict] ) -> list[gk.Genome]: """Initialize GenomeKit annotations for all unique genomes in the LazyFrame. 
@@ -125,26 +182,51 @@ def _init_gk_annotations( Args: lf: The LazyFrame containing the serialized GenomeKit objects. - target_cols: A dictionary mapping column names to their corresponding GenomeKit types. + target_cols: A dictionary mapping column names to their corresponding ColumnInfo. """ pl = require_polars() annotations = [] - genomes_exprs = [pl.col(c).struct.field("genome_str") for c in target_cols.keys()] - genomes = ( - lf.select( - pl.concat_list(genomes_exprs) - .explode() - .drop_nulls() - .unique() - .alias("genome_str") + # extract genome_str field from every column + genomes_exprs = [] + genomes_list_exprs = [] + for c in target_cols.keys(): + if target_cols[c]["cell_type"] == CellType.SCALAR: + genomes_exprs.append(pl.col(c).struct.field("genome_str")) + else: + genomes_list_exprs.append(pl.col(c).explode().struct.field("genome_str")) + + # expressions to extract genome_str must be run separately since exploded lists + # may have more rows than the original dataframe + plans = [] + + if genomes_exprs: + plans.append( + lf.select( + pl.concat_list(genomes_exprs) + .explode() + .drop_nulls() + .unique() + .alias("genome_str") + ) + ) + + if genomes_list_exprs: + plans.append( + lf.select( + pl.concat_list(genomes_list_exprs) + .explode() + .drop_nulls() + .unique() + .alias("genome_str") + ) ) - .collect()["genome_str"] - .to_list() - ) - # warms annotations for all unique genomes in the file + genomes = pl.concat(plans).unique().collect()["genome_str"].to_list() + + # warms annotations for all unique annotation genomes in the file. 
+ # all annotations available for serialization are contained in dganno file for genome_str in genomes: genome = gk.Genome(genome_str) if genome.config == genome.reference_genome: @@ -172,28 +254,65 @@ def _validate_gkdf_metadata(metadata: dict[str, str]) -> None: ) +def _list_deserializer( + deserializer: Callable[[pl.Series], pl.Series], +) -> Callable[[pl.Series], pl.Series]: + """Helper function to convert a deserializer to accept lists of serialized GenomeKit objects. + + Args: + deserializer: A deserializer function for a series of serialized GenomeKit objects + + Returns: + A deserializer function for a series of lists of serialized GenomeKit objects. + """ + pl = require_polars() + + def _deserialize_list(s: pl.Series) -> pl.Series: + return pl.Series( + name=s.name, + values=[ + deserializer(pl.Series(values=l)).to_list() if l is not None else None + for l in s + ], + dtype=pl.Object, + ) + + return _deserialize_list + + def _deserialize_gk_cols( - lf: pl.LazyFrame, target_cols: dict[str, GkDfType] + lf: pl.LazyFrame, target_cols: dict[str, dict] ) -> pl.LazyFrame: """Deserialize columns containing GenomeKit objects. Args: lf: The LazyFrame containing the serialized GenomeKit objects. - target_cols: A dictionary mapping column names to their corresponding GkDf types. + target_cols: A dictionary mapping column names to their corresponding ColumnInfo. 
""" pl = require_polars() registry = get_registry() - # with_columns_seq provides a 2x speedup here over with_columns - return lf.with_columns_seq( - pl.col(col) - .map_batches( - _map_batches_safe(registry[CURRENT_VERSION][target_cols[col]].deserializer), - return_dtype=pl.Object, + def _build_deserialization_expr(col: str) -> pl.Expr: + col_info = target_cols[col] # dict representation of ColumnInfo + gkdf_type = col_info["gkdf_type"] + if col_info["cell_type"] == CellType.LIST: + deserializer = _list_deserializer( + registry[CURRENT_VERSION][gkdf_type].deserializer + ) + else: + deserializer = registry[CURRENT_VERSION][gkdf_type].deserializer + + return ( + pl.col(col) + .map_batches( + _map_batches_safe(deserializer), + return_dtype=pl.Object, + ) + .alias(col) ) - .alias(col) - for col in target_cols - ) + + # with_columns_seq provides a 2x speedup here over with_columns + return lf.with_columns_seq(_build_deserialization_expr(col) for col in target_cols) def from_parquet(path: str | Path, lazy: bool = False) -> pl.DataFrame | pl.LazyFrame: diff --git a/tests/test_gkdf.py b/tests/test_gkdf.py index eb750eb8..6490e494 100644 --- a/tests/test_gkdf.py +++ b/tests/test_gkdf.py @@ -136,6 +136,83 @@ def test_utr5(self): re_df = from_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") + def test_list_of_intervals(self): + intervals = [ + Interval("chr1", "+", 2000, 3000, "hg19"), + Interval("chr4", "-", 5000, 6000, "hg19"), + ] + df = pl.DataFrame({"intervals": [intervals]}, schema={"intervals": pl.Object}) + + path = self.tmp_dir_path / "list_of_intervals.parquet" + to_parquet(df, path) + re_df = from_parquet(path, lazy=False) + self.assertEqual(re_df.item(), df.item()) + + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") + def test_list_of_genomes(self): + genomes = [Genome("hg38.p12"), Genome("gencode.v41")] + df = 
pl.DataFrame({"genomes": [genomes]}, schema={"genomes": pl.Object}) + + path = self.tmp_dir_path / "list_of_genomes.parquet" + to_parquet(df, path) + re_df = from_parquet(path, lazy=False) + self.assertEqual(re_df.item(), df.item()) + + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") + def test_list_of_transcripts(self): + g = Genome("gencode.v41") + transcripts = list(g.transcripts)[:10] + df = pl.DataFrame( + {"transcripts": [transcripts]}, schema={"transcripts": pl.Object} + ) + + path = self.tmp_dir_path / "list_of_transcripts.parquet" + to_parquet(df, path) + re_df = from_parquet(path, lazy=False) + self.assertEqual(re_df.item(), df.item()) + + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") + def test_multiple_types(self): + g = Genome("gencode.v41") + + interval = Interval("chr5", "+", 2000, 3000, "hg19") + transcript = g.genes[0].transcripts[0] + gene = g.genes[0] + exon = g.exons[0] + + df = pl.DataFrame( + { + "interval": [interval], + "transcript": [transcript], + "gene": [gene], + "exon": [exon], + } + ) + + path = self.tmp_dir_path / "multiple_types.parquet" + to_parquet(df, path) + re_df = from_parquet(path, lazy=False) + self.assertEqual(re_df["interval"].item(), df["interval"].item()) + self.assertEqual(re_df["transcript"].item(), df["transcript"].item()) + self.assertEqual(re_df["gene"].item(), df["gene"].item()) + self.assertEqual(re_df["exon"].item(), df["exon"].item()) + + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") + def test_multiple_genomes(self): + # test dataframe with multiple reference genomes in a single column + g1 = Genome("gencode.v41") + g2 = Genome("ucsc_refseq.2017-06-25") + + genes = [g1.genes[0], g2.genes[0]] + df = pl.DataFrame({"genes": genes}, schema={"genes": pl.Object}) + + path = self.tmp_dir_path / "multiple_genomes.parquet" + to_parquet(df, path) + re_df = from_parquet(path, lazy=False) + 
self.assertEqual(re_df["genes"][0], df["genes"][0]) + self.assertEqual(re_df["genes"][1], df["genes"][1]) + if __name__ == "__main__": unittest.main() From 8e07c2a941f79368aaec76f947b93ebe05e7e3a7 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 31 Mar 2026 09:15:03 -0400 Subject: [PATCH 28/48] ci: run gkdf tests --- .github/workflows/run-tests.yaml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml index 08efdb08..eb781ca7 100644 --- a/.github/workflows/run-tests.yaml +++ b/.github/workflows/run-tests.yaml @@ -40,6 +40,7 @@ jobs: matrix: python-version: ['3.10', '3.11', '3.12'] platform: ["linux-64", "osx-arm64", "osx-64"] + extras: ["none", "df"] include: # specify additional fields for all configs - python-version: "3.10" pyver-short: "310" @@ -100,6 +101,8 @@ jobs: - name: run unittests id: run_unittests shell: bash -l -e {0} + env: + GK_EXTRAS: ${{ matrix.extras }} run: | set -x micromamba activate test @@ -108,7 +111,12 @@ jobs: if [ ! 
-e "${files[0]}" ]; then echo "No files matched for py${{ matrix.pyver-short }}" exit 1 - fi - conda mambabuild --croot /tmp/conda-bld -t $files --extra-deps python=${{ matrix.python-version }} + fi + extra_deps=(python=${{ matrix.python-version }}) + # run command with optional extra deps + if [ "$GK_EXTRAS" = "df" ]; then + extra_deps+=("polars=1.39.3") + fi + conda mambabuild --croot /tmp/conda-bld -t "${files[@]}" --extra-deps "${extra_deps[@]}" conda clean -it - set +x + set +x \ No newline at end of file From 59cfb998915326f07b7b50fff2dac23e2397c364 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 31 Mar 2026 13:30:20 -0400 Subject: [PATCH 29/48] chore: change serde functions names --- genome_kit/__init__.py | 4 +- genome_kit/df/__init__.py | 4 +- genome_kit/df/serialization.py | 126 ++++++++++++++++----------------- tests/test_gkdf.py | 58 +++++++-------- 4 files changed, 97 insertions(+), 95 deletions(-) diff --git a/genome_kit/__init__.py b/genome_kit/__init__.py index 3aa97eee..b888147a 100644 --- a/genome_kit/__init__.py +++ b/genome_kit/__init__.py @@ -49,7 +49,7 @@ from .variant_genome import VariantGenome from .vcf_table import VCFTable, VCFVariant from . 
import serialize -from .df import to_parquet, from_parquet +from .df import write_parquet, read_parquet ######################################################################### @@ -94,6 +94,7 @@ "JunctionTable", "ReadAlignments", "ReadDistributions", + "read_parquet", "Transcript", "TranscriptTable", "Utr", @@ -103,6 +104,7 @@ "VariantTable", "VCFTable", "VCFVariant", + "write_parquet", ] ######################################################################### diff --git a/genome_kit/df/__init__.py b/genome_kit/df/__init__.py index 0ca20f42..317335f9 100644 --- a/genome_kit/df/__init__.py +++ b/genome_kit/df/__init__.py @@ -1,3 +1,3 @@ -from .serialization import from_parquet, to_parquet +from .serialization import read_parquet, write_parquet -__all__ = ["from_parquet", "to_parquet"] +__all__ = ["read_parquet", "write_parquet"] diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index e8d931a4..9c69c9a7 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -111,68 +111,6 @@ def _serialize_list(s: pl.Series) -> pl.Series: return _serialize_list -# TODO: add union of pd.DataFrame -def to_parquet(df: pl.DataFrame | pl.LazyFrame, path: str | Path) -> None: - """Serialize a DataFrame with GenomeKit objects to a Parquet file. - - Args: - df: A Polars DataFrame or LazyFrame with columns containing GenomeKit objects. - path: The file path to write the Parquet file to. - """ - pl = require_polars() - - path = Path(path) - if isinstance(df, pl.DataFrame): - df = df.lazy() - - # mapping from column name to ColumnInfo dataclass - target_cols = _detect_gk_cols(df) - - if not target_cols: - warnings.warn( - "No GenomeKit columns detected for serialization, writing DataFrame as is." 
- ) - df.sink_parquet(path) - return - - registry = get_registry() - - def _build_serialization_expr(col: str) -> pl.Expr: - col_info = target_cols[col] # ColumnInfo dataclass - gkdf_type = col_info.gkdf_type - if col_info.cell_type == CellType.LIST: - return_dtype = pl.List(inner=registry[CURRENT_VERSION][gkdf_type].struct) - serializer = _list_serializer( - registry[CURRENT_VERSION][gkdf_type].serializer, - return_dtype=return_dtype, - ) - else: - return_dtype = registry[CURRENT_VERSION][gkdf_type].struct - serializer = registry[CURRENT_VERSION][gkdf_type].serializer - - return ( - pl.col(col) - .map_batches( - _map_batches_safe(serializer), - return_dtype=return_dtype, - ) - .alias(col) - ) - - df = df.with_columns(_build_serialization_expr(col) for col in target_cols) - - # convert ColumnInfo dataclass to a serializable format - target_col_metadata = {col: target_cols[col].to_dict() for col in target_cols} - - metadata = { - "gkdf_version": CURRENT_VERSION.value, - "gk_version": gk.__version__, - "target_cols": json.dumps(target_col_metadata), - } - - df.sink_parquet(path, metadata=metadata) - - def _init_gk_annotations( lf: pl.LazyFrame, target_cols: dict[str, dict] ) -> list[gk.Genome]: @@ -315,7 +253,69 @@ def _build_deserialization_expr(col: str) -> pl.Expr: return lf.with_columns_seq(_build_deserialization_expr(col) for col in target_cols) -def from_parquet(path: str | Path, lazy: bool = False) -> pl.DataFrame | pl.LazyFrame: +# TODO: add union of pd.DataFrame +def write_parquet(df: pl.DataFrame | pl.LazyFrame, path: str | Path) -> None: + """Serialize a DataFrame with GenomeKit objects to a Parquet file. + + Args: + df: A Polars DataFrame or LazyFrame with columns containing GenomeKit objects. + path: The file path to write the Parquet file to. 
+ """ + pl = require_polars() + + path = Path(path) + if isinstance(df, pl.DataFrame): + df = df.lazy() + + # mapping from column name to ColumnInfo dataclass + target_cols = _detect_gk_cols(df) + + if not target_cols: + warnings.warn( + "No GenomeKit columns detected for serialization, writing DataFrame as is." + ) + df.sink_parquet(path) + return + + registry = get_registry() + + def _build_serialization_expr(col: str) -> pl.Expr: + col_info = target_cols[col] # ColumnInfo dataclass + gkdf_type = col_info.gkdf_type + if col_info.cell_type == CellType.LIST: + return_dtype = pl.List(inner=registry[CURRENT_VERSION][gkdf_type].struct) + serializer = _list_serializer( + registry[CURRENT_VERSION][gkdf_type].serializer, + return_dtype=return_dtype, + ) + else: + return_dtype = registry[CURRENT_VERSION][gkdf_type].struct + serializer = registry[CURRENT_VERSION][gkdf_type].serializer + + return ( + pl.col(col) + .map_batches( + _map_batches_safe(serializer), + return_dtype=return_dtype, + ) + .alias(col) + ) + + df = df.with_columns(_build_serialization_expr(col) for col in target_cols) + + # convert ColumnInfo dataclass to a serializable format + target_col_metadata = {col: target_cols[col].to_dict() for col in target_cols} + + metadata = { + "gkdf_version": CURRENT_VERSION.value, + "gk_version": gk.__version__, + "target_cols": json.dumps(target_col_metadata), + } + + df.sink_parquet(path, metadata=metadata) + + +def read_parquet(path: str | Path, lazy: bool = False) -> pl.DataFrame | pl.LazyFrame: """Deserialize a Parquet file containing GenomeKit objects into a Polars DataFrame or LazyFrame. Args: diff --git a/tests/test_gkdf.py b/tests/test_gkdf.py index 6490e494..a363e58e 100644 --- a/tests/test_gkdf.py +++ b/tests/test_gkdf.py @@ -4,7 +4,7 @@ from pathlib import Path from genome_kit import Genome, Interval -from genome_kit.df import from_parquet, to_parquet +from genome_kit.df import read_parquet, write_parquet from . 
import MiniGenome @@ -30,8 +30,8 @@ def test_genome(self): df = pl.DataFrame({"genome": [g]}) path = self.tmp_dir_path / f"{genome_str}.parquet" - to_parquet(df, path) - re_df = from_parquet(path, lazy=False) + write_parquet(df, path) + re_df = read_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) @@ -41,8 +41,8 @@ def test_interval(self): df = pl.DataFrame({"interval": [interval]}) path = self.tmp_dir_path / "interval.parquet" - to_parquet(df, path) - re_df = from_parquet(path, lazy=False) + write_parquet(df, path) + re_df = read_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") @@ -54,8 +54,8 @@ def test_transcript(self): df = pl.DataFrame({"transcript": [transcript]}) path = self.tmp_dir_path / f"{genome_str}_transcript.parquet" - to_parquet(df, path) - re_df = from_parquet(path, lazy=False) + write_parquet(df, path) + re_df = read_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") @@ -67,8 +67,8 @@ def test_gene(self): df = pl.DataFrame({"gene": [gene]}) path = self.tmp_dir_path / f"{genome_str}_gene.parquet" - to_parquet(df, path) - re_df = from_parquet(path, lazy=False) + write_parquet(df, path) + re_df = read_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") @@ -80,8 +80,8 @@ def test_exon(self): df = pl.DataFrame({"exon": [exon]}) path = self.tmp_dir_path / f"{genome_str}_exon.parquet" - to_parquet(df, path) - re_df = from_parquet(path, lazy=False) + write_parquet(df, path) + re_df = read_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") @@ -93,8 +93,8 @@ def test_intron(self): df = pl.DataFrame({"intron": [intron]}) path = 
self.tmp_dir_path / f"{genome_str}_intron.parquet" - to_parquet(df, path) - re_df = from_parquet(path, lazy=False) + write_parquet(df, path) + re_df = read_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") @@ -106,8 +106,8 @@ def test_cds(self): df = pl.DataFrame({"cds": [cds]}) path = self.tmp_dir_path / f"{genome_str}_cds.parquet" - to_parquet(df, path) - re_df = from_parquet(path, lazy=False) + write_parquet(df, path) + re_df = read_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") @@ -119,8 +119,8 @@ def test_utr3(self): df = pl.DataFrame({"utr3": [utr3]}) path = self.tmp_dir_path / f"{genome_str}_utr3.parquet" - to_parquet(df, path) - re_df = from_parquet(path, lazy=False) + write_parquet(df, path) + re_df = read_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") @@ -132,8 +132,8 @@ def test_utr5(self): df = pl.DataFrame({"utr5": [utr5]}) path = self.tmp_dir_path / f"{genome_str}_utr5.parquet" - to_parquet(df, path) - re_df = from_parquet(path, lazy=False) + write_parquet(df, path) + re_df = read_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") @@ -145,8 +145,8 @@ def test_list_of_intervals(self): df = pl.DataFrame({"intervals": [intervals]}, schema={"intervals": pl.Object}) path = self.tmp_dir_path / "list_of_intervals.parquet" - to_parquet(df, path) - re_df = from_parquet(path, lazy=False) + write_parquet(df, path) + re_df = read_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") @@ -155,8 +155,8 @@ def test_list_of_genomes(self): df = 
pl.DataFrame({"genomes": [genomes]}, schema={"genomes": pl.Object}) path = self.tmp_dir_path / "list_of_genomes.parquet" - to_parquet(df, path) - re_df = from_parquet(path, lazy=False) + write_parquet(df, path) + re_df = read_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") @@ -168,8 +168,8 @@ def test_list_of_transcripts(self): ) path = self.tmp_dir_path / "list_of_transcripts.parquet" - to_parquet(df, path) - re_df = from_parquet(path, lazy=False) + write_parquet(df, path) + re_df = read_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") @@ -191,8 +191,8 @@ def test_multiple_types(self): ) path = self.tmp_dir_path / "multiple_types.parquet" - to_parquet(df, path) - re_df = from_parquet(path, lazy=False) + write_parquet(df, path) + re_df = read_parquet(path, lazy=False) self.assertEqual(re_df["interval"].item(), df["interval"].item()) self.assertEqual(re_df["transcript"].item(), df["transcript"].item()) self.assertEqual(re_df["gene"].item(), df["gene"].item()) @@ -208,8 +208,8 @@ def test_multiple_genomes(self): df = pl.DataFrame({"genes": genes}, schema={"genes": pl.Object}) path = self.tmp_dir_path / "multiple_genomes.parquet" - to_parquet(df, path) - re_df = from_parquet(path, lazy=False) + write_parquet(df, path) + re_df = read_parquet(path, lazy=False) self.assertEqual(re_df["genes"][0], df["genes"][0]) self.assertEqual(re_df["genes"][1], df["genes"][1]) From 5eaf0c9cf57bcf49e64f3cfa3e4809201a7342c5 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 31 Mar 2026 15:41:33 -0400 Subject: [PATCH 30/48] feat: add schema inference on multiple rows --- genome_kit/df/serialization.py | 51 ++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index 
9c69c9a7..ed98ace4 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -38,7 +38,9 @@ def wrapper(*args, **kwargs): return wrapper -def _detect_gk_cols(lf: pl.LazyFrame) -> dict[str, ColumnInfo]: +def _detect_gk_cols( + lf: pl.LazyFrame, infer_schema_length: int = 100 +) -> dict[str, ColumnInfo]: """Detect columns in the LazyFrame that contains GenomeKit objects. Args: @@ -49,31 +51,43 @@ def _detect_gk_cols(lf: pl.LazyFrame) -> dict[str, ColumnInfo]: A dictionary mapping column names to the ColumnInfo dataclass containing the GkDfType and CellType for the column. """ + pl = require_polars() lf_cols = lf.collect_schema().names() target_cols = {} - # polars Struct inferred from first row, same behaviour as Polars - # see https://docs.pola.rs/user-guide/expressions/structs/#inferring-the-data-type-struct-from-dictionaries - # materialize the first row to check data types, need the exact type not pl.Object - first_row = lf.head(1).collect()[0] + # datatype inference done on first n=infer_schema_length rows. Follows inference + # logic from Polars DataFrames when rows are provided. + # see https://github.com/pola-rs/polars/blob/1cd236c60c01572c5ec6fdd252d8b20218d7b440/py-polars/src/polars/dataframe/frame.py#L248-L251 + head = lf.head(infer_schema_length).collect() for col in lf_cols: - item = first_row[col][0] - if type(item) == list: + # remove nulls for type inference, list/scalar cols depend on first non-null value + vals = head.get_column(col).drop_nulls() + first = vals[0] + + if type(first) == list: cell_type = CellType.LIST - if item: - # infer type based on the first item in the list - item = item[0] - else: - # empty list, assume not a list of GenomeKit objects - continue + # ensure all values are lists within a col + assert all(type(v) == list for v in vals), ( + f"Column {col} contains mixed data types. Please ensure all cells are the same type before serialization." 
+ ) + # cannot use Polars list expressions since lists of GenomeKit objects are stored as pl.Object + col_types = {type(item) for v in vals for item in v} + else: cell_type = CellType.SCALAR + # ensure all values are not lists within a col + assert all(type(v) != list for v in vals), ( + f"Column {col} contains mixed data types. Please ensure all cells are the same type before serialization." + ) + col_types = set(vals.map_elements(type, return_dtype=pl.Object)) - # item from first row of the column - col_type = GK_TO_STRUCT.get(type(item), None) + assert len(col_types) == 1, ( + f"Column {col} contains mixed data types. Please ensure all cells are the same type before serialization." + ) + col_type = GK_TO_STRUCT.get(col_types.pop(), None) if col_type is None: # column is not a genomekit type, so no serialization needed @@ -254,12 +268,15 @@ def _build_deserialization_expr(col: str) -> pl.Expr: # TODO: add union of pd.DataFrame -def write_parquet(df: pl.DataFrame | pl.LazyFrame, path: str | Path) -> None: +def write_parquet( + df: pl.DataFrame | pl.LazyFrame, path: str | Path, infer_schema_length: int = 100 +) -> None: """Serialize a DataFrame with GenomeKit objects to a Parquet file. Args: df: A Polars DataFrame or LazyFrame with columns containing GenomeKit objects. path: The file path to write the Parquet file to. + infer_schema_length: The number of rows to use for schema inference when writing the Parquet file. 
""" pl = require_polars() @@ -268,7 +285,7 @@ def write_parquet(df: pl.DataFrame | pl.LazyFrame, path: str | Path) -> None: df = df.lazy() # mapping from column name to ColumnInfo dataclass - target_cols = _detect_gk_cols(df) + target_cols = _detect_gk_cols(df, infer_schema_length=infer_schema_length) if not target_cols: warnings.warn( From 37867e3c4ff6b5c05ed4b24760a0b42b43a82568 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 31 Mar 2026 16:17:29 -0400 Subject: [PATCH 31/48] fix: fix type inference and add test --- genome_kit/df/serialization.py | 4 ++-- tests/test_gkdf.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index ed98ace4..f8e972b1 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -64,7 +64,7 @@ def _detect_gk_cols( for col in lf_cols: # remove nulls for type inference, list/scalar cols depend on first non-null value - vals = head.get_column(col).drop_nulls() + vals = head.get_column(col).drop_nulls() # removes scalar nulls first = vals[0] if type(first) == list: @@ -74,7 +74,7 @@ def _detect_gk_cols( f"Column {col} contains mixed data types. Please ensure all cells are the same type before serialization." 
) # cannot use Polars list expressions since lists of GenomeKit objects are stored as pl.Object - col_types = {type(item) for v in vals for item in v} + col_types = {type(item) for v in vals for item in v if item is not None} else: cell_type = CellType.SCALAR diff --git a/tests/test_gkdf.py b/tests/test_gkdf.py index a363e58e..d079799d 100644 --- a/tests/test_gkdf.py +++ b/tests/test_gkdf.py @@ -172,6 +172,20 @@ def test_list_of_transcripts(self): re_df = read_parquet(path, lazy=False) self.assertEqual(re_df.item(), df.item()) + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") + def test_list_of_gk_with_null(self): + g = Genome("gencode.v41") + transcripts = list(g.transcripts)[:10] + transcripts[:3] = [None] * 3 + df = pl.DataFrame( + {"transcripts": [transcripts]}, schema={"transcripts": pl.Object} + ) + + path = self.tmp_dir_path / "list_of_transcripts_with_null.parquet" + write_parquet(df, path) + re_df = read_parquet(path, lazy=False) + self.assertEqual(re_df.item(), df.item()) + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") def test_multiple_types(self): g = Genome("gencode.v41") From ef2cdabfdb2795f63ddf863992817e911b791c16 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 1 Apr 2026 11:08:39 -0400 Subject: [PATCH 32/48] feat: additional macos rosetta checks --- genome_kit/_optional.py | 61 +++++++++++++++++++++++++++++++++++++++-- setup.py | 4 ++- 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/genome_kit/_optional.py b/genome_kit/_optional.py index 715a1e68..a88c6454 100644 --- a/genome_kit/_optional.py +++ b/genome_kit/_optional.py @@ -1,13 +1,70 @@ from __future__ import annotations +from importlib.metadata import PackageNotFoundError, version + def require_polars(): - """Import Polars if available, otherwise fail gracefully.""" + """Import Polars if available, otherwise provide helpful error messages. 
+ + Also checks for compatibility on MacOS with Apple Silicon, which may require + an additional package if running Python under Rosetta translation. + """ try: import polars as pl + + if check_under_rosetta(): + if not check_rtcompat(): + raise ImportError( + "Polars is not compatible with Apple Silicon.\n" + "Please install with `pip install genomekit[df-mac]` to include " + "the polars-runtime-compat package required for Rosetta " + "translation." + ) except ModuleNotFoundError as e: raise ImportError( - "Optional dependency 'polars' is required for this functionality. Please install with `pip install genomekit[df]`." + "Optional dependency 'polars' is required for this functionality. Please " + "install with `pip install genomekit[df]`.\n" + "If you are running this on MacOS with Apple Silicon, please install with " + "`pip install genomekit[df-mac]` to include the polars-runtime-compat " + "package required for Rosetta translation." ) from e return pl + + +def check_under_rosetta(): + """Check if program is running under Rosetta translation on Apple Silicon. + + The default version of Polars is incompatible with Rosetta, and requires + polars-runtime-compat to be installed. + + Can be checked with the sysctl.proc_translated flag in sysctl. + See https://developer.apple.com/documentation/apple-silicon/about-the-rosetta-translation-environment#Determine-Whether-Your-App-Is-Running-as-a-Translated-Binary + """ + import subprocess + + try: + result = subprocess.run( + ["sysctl", "-n", "sysctl.proc_translated"], + capture_output=True, + text=True, + check=True, + ) + # output will be 0 if running command directly from the terminal, and 1 if + # running through Python under Rosetta translation + return result.stdout.strip() == "1" + except subprocess.CalledProcessError: + # sysctl.proc_translated won't exist on non-Apple Silicon machines + return False + + +def check_rtcompat(): + """Check if polars-runtime-compat is installed. 
+ + Required for Polars to run on MacOS machines under Rosetta translation. + """ + try: + version("polars-runtime-compat") + return True + except PackageNotFoundError: + return False diff --git a/setup.py b/setup.py index 1c079bf2..9bbaddaf 100644 --- a/setup.py +++ b/setup.py @@ -404,7 +404,9 @@ def _compile_obj(obj): "typing-extensions", ], extras_require={ - "df": ["polars"] + "df": ["polars"], + "df-mac": ["polars", "polars-runtime-compat"], + }, license="Apache License 2.0", license_files=(COPYRIGHT_FILE, LICENSE_FILE,), From 96a81b347a3354985a0eb8a06f6bbf9e70acaf27 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 1 Apr 2026 11:09:06 -0400 Subject: [PATCH 33/48] ci: add additional test config with rosetta --- .github/workflows/run-tests.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml index eb781ca7..22d75f34 100644 --- a/.github/workflows/run-tests.yaml +++ b/.github/workflows/run-tests.yaml @@ -54,6 +54,13 @@ jobs: runs-on: "macos-26-intel" - platform: "linux-64" runs-on: "ubuntu-latest" + # single test for running intel build on arm64 macos runner with rosetta + - python-version: "3.10" + pyver-short: "310" + platform: "osx-64" # intel build + runs-on: "macos-latest" # defaults to arm64 runner with M1 chip + extras: "df" + rosetta: true runs-on: ${{ matrix.runs-on }} steps: - uses: actions/checkout@v4 @@ -103,6 +110,7 @@ jobs: shell: bash -l -e {0} env: GK_EXTRAS: ${{ matrix.extras }} + ROSETTA: ${{ matrix.rosetta }} run: | set -x micromamba activate test @@ -117,6 +125,9 @@ jobs: if [ "$GK_EXTRAS" = "df" ]; then extra_deps+=("polars=1.39.3") fi + if [ "$ROSETTA" = "true" ]; then + extra_deps+=("polars-runtime-compat=1.39.3") + fi conda mambabuild --croot /tmp/conda-bld -t "${files[@]}" --extra-deps "${extra_deps[@]}" conda clean -it set +x \ No newline at end of file From 9cd8f083b59a6f1f7102f01ce4f8c78563e97e8c Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 1 
Apr 2026 11:47:51 -0400 Subject: [PATCH 34/48] docs: update docs --- docs-src/df.rst | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/docs-src/df.rst b/docs-src/df.rst index 2759fc3a..68bc1e44 100644 --- a/docs-src/df.rst +++ b/docs-src/df.rst @@ -13,12 +13,19 @@ The :py:mod:`genome_kit.df` subpackage contains utilities for working with Polar mamba install "genomekit[df]" - The ``[df]`` extra is not included in the default installation. + The ``[df]`` extra is cd included in the default installation. + + If you are running an x86 version of Python on an Apple Silicon Mac (e.g. M1 chip), the ``polars-runtime-compat`` package is also required. Install this with the ``[df-mac]`` extra: + + .. code-block:: bash + + mamba install "genomekit[df-mac]" + Quickstart ----------- -The serialization and deserialization entry points are :py:func:`~genome_kit.df.to_parquet` and :py:func:`~genome_kit.df.from_parquet`: +The serialization and deserialization entry points are :py:func:`~genome_kit.df.read_parquet` and :py:func:`~genome_kit.df.write_parquet`: .. code-block:: python @@ -33,15 +40,15 @@ The serialization and deserialization entry points are :py:func:`~genome_kit.df. } ) - gk.to_parquet(df, "genes.parquet") + gk.read_parquet(df, "genes.parquet") ... ... - restored_df = gk.from_parquet("genes.parquet") + restored_df = gk.write_parquet("genes.parquet") .. note:: - The written parquet files can be read by any software that supports the parquet format, but the GenomeKit objects will only be restored when read with :py:func:`~genome_kit.df.from_parquet`. + The written parquet files can be read by any software that supports the parquet format, but the GenomeKit objects will only be restored when read with :py:func:`genome_kit.df.read_parquet`. 
Supported GenomeKit Objects From 8e7b2402b9161694d3fd2cc4354af5db5de06a54 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 1 Apr 2026 11:54:35 -0400 Subject: [PATCH 35/48] docs: format and update docstrings --- genome_kit/df/serialization.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index f8e972b1..cca31093 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -18,7 +18,7 @@ from .registry import GK_TO_STRUCT, get_registry -def _map_batches_safe(fn: Callable): +def _map_batches_safe(fn: Callable) -> Callable: """Helper function to wrap a UDF and run safely with polars map_batches. Polars has a bug in map_batches that incorrectly forwards the return_dtype argument @@ -26,6 +26,9 @@ def _map_batches_safe(fn: Callable): Args: fn: The user defined function to wrap. + + Returns: + A wrapped version of the UDF that can be safely used with map_batches. """ sig = signature(fn) @@ -104,11 +107,11 @@ def _list_serializer( """Helper function to convert a serializer to accept lists of GenomeKit objects. Args: - serializer: A serializer function for a series of GenomeKit objects - return_dtype: The return data type for the serialized series + serializer: A serializer function for a series of GenomeKit objects + return_dtype: The return data type for the serialized series Returns: - A serializer function for a series of lists of GenomeKit objects. + A serializer function for a series of lists of GenomeKit objects. """ pl = require_polars() @@ -130,11 +133,15 @@ def _init_gk_annotations( ) -> list[gk.Genome]: """Initialize GenomeKit annotations for all unique genomes in the LazyFrame. - Prevents race conditions when opening dganno files during polars operations. + Prevents race conditions when opening dganno files during polars operations. + Objects are returned in a list to keep weak references alive. 
Args: lf: The LazyFrame containing the serialized GenomeKit objects. target_cols: A dictionary mapping column names to their corresponding ColumnInfo. + + Returns: + A list of initialized Genome objects for the unique genomes in the LazyFrame. """ pl = require_polars() @@ -190,7 +197,11 @@ def _init_gk_annotations( def _validate_gkdf_metadata(metadata: dict[str, str]) -> None: - """Validate the parquet metadata for a gkdf parquet file.""" + """Validate the parquet metadata for a gkdf parquet file. + + Args: + metadata: The parquet metadata to validate. + """ try: version = GkDfVersion(metadata.get("gkdf_version")) @@ -212,10 +223,10 @@ def _list_deserializer( """Helper function to convert a deserializer to accept lists of serialized GenomeKit objects. Args: - deserializer: A deserializer function for a series of serialized GenomeKit objects + deserializer: A deserializer function for a series of serialized GenomeKit objects Returns: - A deserializer function for a series of lists of serialized GenomeKit objects. + A deserializer function for a series of lists of serialized GenomeKit objects. """ pl = require_polars() @@ -240,6 +251,9 @@ def _deserialize_gk_cols( Args: lf: The LazyFrame containing the serialized GenomeKit objects. target_cols: A dictionary mapping column names to their corresponding ColumnInfo. + + Returns: + A LazyFrame with deserialized GenomeKit objects in the target columns. 
""" pl = require_polars() registry = get_registry() From 0516acf0dc8753307d9aad29c4cb9855f555d919 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 1 Apr 2026 16:34:26 -0400 Subject: [PATCH 36/48] respond to PR comments --- docs-src/df.rst | 12 +++---- genome_kit/_optional.py | 4 +-- genome_kit/df/gk_structs.py | 4 +-- genome_kit/df/registry.py | 1 + genome_kit/df/serialization.py | 60 +++++++++++++++++++++------------- 5 files changed, 49 insertions(+), 32 deletions(-) diff --git a/docs-src/df.rst b/docs-src/df.rst index 68bc1e44..07b3e1e4 100644 --- a/docs-src/df.rst +++ b/docs-src/df.rst @@ -7,19 +7,19 @@ The :py:mod:`genome_kit.df` subpackage contains utilities for working with Polar .. important:: - ``genome_kit.df`` depends on optional ``polars`` dependencies, which are not installed by default. This can be installed with the ``[df]`` extra: + ``genome_kit.df`` depends on optional ``polars`` dependencies, which are not installed by default. These can be installed with the ``[df]`` extra: .. code-block:: bash - mamba install "genomekit[df]" + pip install "genomekit[df]" - The ``[df]`` extra is cd included in the default installation. + The ``[df]`` extra is not included in the default ``genomekit`` installation. If you are running an x86 version of Python on an Apple Silicon Mac (e.g. M1 chip), the ``polars-runtime-compat`` package is also required. Install this with the ``[df-mac]`` extra: .. code-block:: bash - mamba install "genomekit[df-mac]" + pip install "genomekit[df-mac]" @@ -40,10 +40,10 @@ The serialization and deserialization entry points are :py:func:`~genome_kit.df. } ) - gk.read_parquet(df, "genes.parquet") + gk.write_parquet(df, "genes.parquet") ... ... - restored_df = gk.write_parquet("genes.parquet") + restored_df = gk.read_parquet("genes.parquet") .. 
note:: diff --git a/genome_kit/_optional.py b/genome_kit/_optional.py index a88c6454..1392c5c5 100644 --- a/genome_kit/_optional.py +++ b/genome_kit/_optional.py @@ -6,7 +6,7 @@ def require_polars(): """Import Polars if available, otherwise provide helpful error messages. - Also checks for compability on MacOS with Apple Silicon, which may require + Also checks for compatibility on MacOS with Apple Silicon, which may require an additional package if running Python under Rosetta translation. """ try: @@ -50,7 +50,7 @@ def check_under_rosetta(): text=True, check=True, ) - # output will be 0 if running commnad directly from the terminal, and 1 if + # output will be 0 if running command directly from the terminal, and 1 if # running through Python under Rosetta translation return result.stdout.strip() == "1" except subprocess.CalledProcessError: diff --git a/genome_kit/df/gk_structs.py b/genome_kit/df/gk_structs.py index fbd79551..a602e04d 100644 --- a/genome_kit/df/gk_structs.py +++ b/genome_kit/df/gk_structs.py @@ -8,7 +8,7 @@ if TYPE_CHECKING: # import polars for type checking import polars as pl -# minimal shim for python <3.11 compatability +# minimal shim for python <3.11 compatibility try: from enum import StrEnum except ImportError: @@ -39,7 +39,7 @@ class CellType(StrEnum): class ColumnInfo: """Dataclass to store metadata about a single column in a dataframe. - Assumes that all cells in a column have the saame type. If the cell contains a list, + Assumes that all cells in a column have the same type. If the cell contains a list, assumes all items in the list are of the same type. 
""" diff --git a/genome_kit/df/registry.py b/genome_kit/df/registry.py index dac96391..b4bd3736 100644 --- a/genome_kit/df/registry.py +++ b/genome_kit/df/registry.py @@ -327,6 +327,7 @@ def _deserialize_utr(s: pl.Series) -> pl.Series: else None for struct in s ], + dtype=pl.Object, ) REGISTRY: dict[GkDfVersion, dict[GkDfType, GKTypeEntry]] = { diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index cca31093..3d350a6e 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -48,7 +48,8 @@ def _detect_gk_cols( Args: lf: The LazyFrame to inspect. - columns: Optional list of column names to check. If None, all columns will be checked. + infer_schema_length: The number of rows to use for schema inference when + detecting GenomeKit columns. Returns: A dictionary mapping column names to the ColumnInfo dataclass containing the @@ -68,28 +69,40 @@ def _detect_gk_cols( for col in lf_cols: # remove nulls for type inference, list/scalar cols depend on first non-null value vals = head.get_column(col).drop_nulls() # removes scalar nulls - first = vals[0] + # column only contains null values in the first infer_schema_length rows + if len(vals) == 0: + warnings.warn( + f"Column {col} contains only null values in the first {infer_schema_length} rows, " + "unable to infer type for serialization. Please ensure this column " + "contains non-null values for accurate serialization." + ) + continue + + first = vals[0] if type(first) == list: cell_type = CellType.LIST # ensure all values are lists within a col - assert all(type(v) == list for v in vals), ( - f"Column {col} contains mixed data types. Please ensure all cells are the same type before serialization." - ) + if not all(type(v) == list for v in vals): + raise ValueError( + f"Column {col} contains mixed data types. Please ensure all cells are the same type before serialization." 
+ ) # cannot use Polars list expressions since lists of GenomeKit objects are stored as pl.Object col_types = {type(item) for v in vals for item in v if item is not None} else: cell_type = CellType.SCALAR # ensure all values are not lists within a col - assert all(type(v) != list for v in vals), ( - f"Column {col} contains mixed data types. Please ensure all cells are the same type before serialization." - ) + if not all(type(v) != list for v in vals): + raise ValueError( + f"Column {col} contains mixed data types. Please ensure all cells are the same type before serialization." + ) col_types = set(vals.map_elements(type, return_dtype=pl.Object)) - assert len(col_types) == 1, ( - f"Column {col} contains mixed data types. Please ensure all cells are the same type before serialization." - ) + if len(col_types) != 1: + raise ValueError( + f"Column {col} contains mixed data types. Please ensure all cells are the same type before serialization." + ) col_type = GK_TO_STRUCT.get(col_types.pop(), None) if col_type is None: @@ -133,15 +146,16 @@ def _init_gk_annotations( ) -> list[gk.Genome]: """Initialize GenomeKit annotations for all unique genomes in the LazyFrame. - Prevents race conditions when opening dganno files during polars operations. - Objects are returned in a list to keep weak references alive. + Prevents race conditions when opening dganno files during polars operations. + Objects are returned in a list to keep weak references alive. Args: lf: The LazyFrame containing the serialized GenomeKit objects. - target_cols: A dictionary mapping column names to their corresponding ColumnInfo. + target_cols: A dictionary mapping column names to their column information. + target_cols is a dict from the ColumnInfo dataclass. Returns: - A list of initialized Genome objects for the unique genomes in the LazyFrame. + A list of initialized gene tables for the unique genomes in the LazyFrame. 
""" pl = require_polars() @@ -198,23 +212,25 @@ def _init_gk_annotations( def _validate_gkdf_metadata(metadata: dict[str, str]) -> None: """Validate the parquet metadata for a gkdf parquet file. - + Args: metadata: The parquet metadata to validate. """ try: version = GkDfVersion(metadata.get("gkdf_version")) - assert version in GkDfVersion, ( - f"Unrecognized gkdf version in Parquet metadata, expected one of {[v.value for v in GkDfVersion]}" - ) + if version not in GkDfVersion: + raise ValueError( + f"Unrecognized gkdf version in Parquet metadata, expected one of {[v.value for v in GkDfVersion]}" + ) except ValueError: raise ValueError( "Invalid or missing gkdf_version in Parquet metadata, unable to deserialize GenomeKit objects. " ) - assert metadata.get("target_cols") is not None, ( - "Missing target_cols in Parquet metadata, unable to deserialize GenomeKit objects." - ) + if metadata.get("target_cols") is None: + raise ValueError( + "Missing target_cols in Parquet metadata, unable to deserialize GenomeKit objects." 
+ ) def _list_deserializer( From e730a08822f56c9ff004e47a4e2518fab119826c Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Thu, 2 Apr 2026 13:41:56 -0400 Subject: [PATCH 37/48] ci: add missing var for config --- .github/workflows/run-tests.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml index 22d75f34..69ab8dad 100644 --- a/.github/workflows/run-tests.yaml +++ b/.github/workflows/run-tests.yaml @@ -41,6 +41,7 @@ jobs: python-version: ['3.10', '3.11', '3.12'] platform: ["linux-64", "osx-arm64", "osx-64"] extras: ["none", "df"] + rosetta: [false] include: # specify additional fields for all configs - python-version: "3.10" pyver-short: "310" From 240606c6fa96d32a55d30a47f42f1f8a659e59e0 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 7 Apr 2026 10:18:00 -0400 Subject: [PATCH 38/48] chore: simplify extras installation [skip ci] --- docs-src/df.rst | 7 +------ setup.py | 9 ++++++--- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/docs-src/df.rst b/docs-src/df.rst index 07b3e1e4..782ba517 100644 --- a/docs-src/df.rst +++ b/docs-src/df.rst @@ -15,12 +15,7 @@ The :py:mod:`genome_kit.df` subpackage contains utilities for working with Polar The ``[df]`` extra is not included in the default ``genomekit`` installation. - If you are running an x86 version of Python on an Apple Silicon Mac (e.g. M1 chip), the ``polars-runtime-compat`` package is also required. Install this with the ``[df-mac]`` extra: - - .. code-block:: bash - - pip install "genomekit[df-mac]" - + If you are running an x86 version of Python on an Apple Silicon Mac (e.g. M1 chip), this will also install the ``polars-runtime-compat`` package, which is required to run Polars on Apple Silicon due to AVX features compatibility issues. 
Quickstart diff --git a/setup.py b/setup.py index 9bbaddaf..d8a2019f 100644 --- a/setup.py +++ b/setup.py @@ -404,9 +404,12 @@ def _compile_obj(obj): "typing-extensions", ], extras_require={ - "df": ["polars"], - "df-mac": ["polars", "polars-runtime-compat"], - + # install polars-runtime-compat if running on x86_64 Python on macOS + # required to run polars due to AVX features compatibility issues + "df": [ + "polars", + "polars-runtime-compat; sys_platform == 'darwin' and platform_machine == 'x86_64'", + ] }, license="Apache License 2.0", license_files=(COPYRIGHT_FILE, LICENSE_FILE,), From bbf8962c0695f0644fc2c70144c35b6f5c76f0a6 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 7 Apr 2026 14:37:07 -0400 Subject: [PATCH 39/48] misc: change struct from string to enum --- genome_kit/df/gk_structs.py | 4 +++- genome_kit/df/registry.py | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/genome_kit/df/gk_structs.py b/genome_kit/df/gk_structs.py index a602e04d..2193150e 100644 --- a/genome_kit/df/gk_structs.py +++ b/genome_kit/df/gk_structs.py @@ -131,11 +131,13 @@ def get_structs() -> dict[GkDfType, pl.Struct]: ] ) + UtrType = pl.Enum(["5prime", "3prime"]) + UtrStruct = pl.Struct( [ pl.Field("gkdf_type", pl.Utf8), pl.Field("schema_version", pl.Utf8), - pl.Field("utr_type", pl.Utf8), # "5prime" or "3prime" + pl.Field("utr_type", UtrType), pl.Field("utr_table_index", pl.Int64), pl.Field("genome_str", pl.Utf8), # annotation genome ] diff --git a/genome_kit/df/registry.py b/genome_kit/df/registry.py index b4bd3736..59061d76 100644 --- a/genome_kit/df/registry.py +++ b/genome_kit/df/registry.py @@ -305,6 +305,9 @@ def _serialize_utr(s: pl.Series) -> pl.Series: ser_dict["genome_str"] = genome.config values.append(ser_dict) + # pl.Series constructor doesn't strictly enforce nested struct types errors with + # "utr_type" field may silently become nulls instead of enums. 
+ # TODO: check polars implementation, related issue #18841 return pl.Series( name=s.name, values=values, From 8d03a0ceacb2fcaeb72c91f81ce2be3e8c131c7d Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Tue, 7 Apr 2026 15:11:06 -0400 Subject: [PATCH 40/48] feat: add gk version check --- genome_kit/df/serialization.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index 3d350a6e..f3d6e12b 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -216,7 +216,7 @@ def _validate_gkdf_metadata(metadata: dict[str, str]) -> None: Args: metadata: The parquet metadata to validate. """ - + # gkdf version try: version = GkDfVersion(metadata.get("gkdf_version")) if version not in GkDfVersion: @@ -227,10 +227,24 @@ def _validate_gkdf_metadata(metadata: dict[str, str]) -> None: raise ValueError( "Invalid or missing gkdf_version in Parquet metadata, unable to deserialize GenomeKit objects. " ) + + # target cols if metadata.get("target_cols") is None: raise ValueError( "Missing target_cols in Parquet metadata, unable to deserialize GenomeKit objects." ) + + # gk version + gk_version = metadata.get("gk_version") + if gk_version is None: + raise ValueError( + "Missing gk_version in Parquet metadata." + ) + elif gk_version != gk.__version__: + warnings.warn( + f"Parquet file was written with GenomeKit version {gk_version}, but current version is {gk.__version__}. " + "Deserializing GenomeKit objects may not be consistent across versions." 
+ ) def _list_deserializer( From 13168b6be525ebdac980f71023cf3212244acd04 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Wed, 15 Apr 2026 17:26:40 -0400 Subject: [PATCH 41/48] fix: address PR comments --- .github/workflows/run-tests.yaml | 4 +- docs-src/df.rst | 4 +- genome_kit/_optional.py | 4 +- genome_kit/df/gk_structs.py | 32 +++++++------- genome_kit/df/registry.py | 51 +++++++++-------------- genome_kit/df/serialization.py | 71 ++++++++++++++++++-------------- tests/test_gkdf.py | 23 +++++++++++ 7 files changed, 102 insertions(+), 87 deletions(-) diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml index 69ab8dad..588a1ae8 100644 --- a/.github/workflows/run-tests.yaml +++ b/.github/workflows/run-tests.yaml @@ -41,7 +41,7 @@ jobs: python-version: ['3.10', '3.11', '3.12'] platform: ["linux-64", "osx-arm64", "osx-64"] extras: ["none", "df"] - rosetta: [false] + rosetta: ["false"] include: # specify additional fields for all configs - python-version: "3.10" pyver-short: "310" @@ -61,7 +61,7 @@ jobs: platform: "osx-64" # intel build runs-on: "macos-latest" # defaults to arm64 runner with M1 chip extras: "df" - rosetta: true + rosetta: "true" runs-on: ${{ matrix.runs-on }} steps: - uses: actions/checkout@v4 diff --git a/docs-src/df.rst b/docs-src/df.rst index 782ba517..d7fb90dd 100644 --- a/docs-src/df.rst +++ b/docs-src/df.rst @@ -1,7 +1,7 @@ .. _df: -GenomeKit DataFrame Utilities -============================= +DataFrame Utilities +=================== The :py:mod:`genome_kit.df` subpackage contains utilities for working with Polars DataFrames that contain GenomeKit objects. This includes utilities for serializing DataFrames with GenomeKit objects to Parquet and deserializing them back to GenomeKit objects. This is useful when sharing tabular data sets, or when saving intermediate DataFrames to disk during data processing. 
diff --git a/genome_kit/_optional.py b/genome_kit/_optional.py index 1392c5c5..d76a0c34 100644 --- a/genome_kit/_optional.py +++ b/genome_kit/_optional.py @@ -50,8 +50,8 @@ def check_under_rosetta(): text=True, check=True, ) - # output will be 0 if running command directly from the terminal, and 1 if - # running through Python under Rosetta translation + # output will be 0 if running natively on Apple Silicon, and 1 if running under + # Rosetta translation return result.stdout.strip() == "1" except subprocess.CalledProcessError: # sysctl.proc_translated won't exist on non-Apple Silicon machines diff --git a/genome_kit/df/gk_structs.py b/genome_kit/df/gk_structs.py index 2193150e..413cbd0b 100644 --- a/genome_kit/df/gk_structs.py +++ b/genome_kit/df/gk_structs.py @@ -12,12 +12,16 @@ try: from enum import StrEnum except ImportError: - from enum import Enum + from enum import Enum, auto class StrEnum(str, Enum): def __str__(self): return str(self.value) + @staticmethod + def _generate_next_value_(name, start, count, last_values): + return name.lower() + class GkDfType(StrEnum): GENOME = "genome" @@ -66,68 +70,61 @@ def get_structs() -> dict[GkDfType, pl.Struct]: GenomeStruct = pl.Struct( [ - pl.Field("gkdf_type", pl.Utf8), pl.Field("schema_version", pl.Utf8), - pl.Field("genome_str", pl.Utf8), # reference or annotation genome + pl.Field("genome_name", pl.Utf8), # reference or annotation genome ] ) IntervalStruct = pl.Struct( [ - pl.Field("gkdf_type", pl.Utf8), pl.Field("schema_version", pl.Utf8), pl.Field("chromosome", pl.Utf8), pl.Field("strand", pl.Utf8), pl.Field("start", pl.Int32), pl.Field("end", pl.Int32), - pl.Field("genome_str", pl.Utf8), # reference genome + pl.Field("refg", pl.Utf8), # reference genome ] ) TranscriptStruct = pl.Struct( [ - pl.Field("gkdf_type", pl.Utf8), pl.Field("schema_version", pl.Utf8), # index of transcript within annotation genome transcript table - # Int32 matches index type in C++ backend (see src/table.g:22) + # Int32 matches index 
type in C++ backend (see src/table.h:22) pl.Field("transcript_table_index", pl.Int32), - pl.Field("genome_str", pl.Utf8), # annotation genome + pl.Field("anno", pl.Utf8), # annotation genome ] ) GeneStruct = pl.Struct( [ - pl.Field("gkdf_type", pl.Utf8), pl.Field("schema_version", pl.Utf8), pl.Field("gene_table_index", pl.Int32), - pl.Field("genome_str", pl.Utf8), # annotation genome + pl.Field("anno", pl.Utf8), # annotation genome ] ) ExonStruct = pl.Struct( [ - pl.Field("gkdf_type", pl.Utf8), pl.Field("schema_version", pl.Utf8), pl.Field("exon_table_index", pl.Int32), - pl.Field("genome_str", pl.Utf8), # annotation genome + pl.Field("anno", pl.Utf8), # annotation genome ] ) IntronStruct = pl.Struct( [ - pl.Field("gkdf_type", pl.Utf8), pl.Field("schema_version", pl.Utf8), pl.Field("intron_table_index", pl.Int32), - pl.Field("genome_str", pl.Utf8), # annotation genome + pl.Field("anno", pl.Utf8), # annotation genome ] ) CdsStruct = pl.Struct( [ - pl.Field("gkdf_type", pl.Utf8), pl.Field("schema_version", pl.Utf8), pl.Field("cds_table_index", pl.Int32), - pl.Field("genome_str", pl.Utf8), # annotation genome + pl.Field("anno", pl.Utf8), # annotation genome ] ) @@ -135,11 +132,10 @@ def get_structs() -> dict[GkDfType, pl.Struct]: UtrStruct = pl.Struct( [ - pl.Field("gkdf_type", pl.Utf8), pl.Field("schema_version", pl.Utf8), pl.Field("utr_type", UtrType), pl.Field("utr_table_index", pl.Int64), - pl.Field("genome_str", pl.Utf8), # annotation genome + pl.Field("anno", pl.Utf8), # annotation genome ] ) diff --git a/genome_kit/df/registry.py b/genome_kit/df/registry.py index 59061d76..d3a5bd0c 100644 --- a/genome_kit/df/registry.py +++ b/genome_kit/df/registry.py @@ -57,10 +57,9 @@ def _serialize_genome(s: pl.Series) -> pl.Series: values=[ ( { - _GKDF_TYPE_FIELD: GkDfType.GENOME.value, _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, # config gives annotation genome name if applicable - "genome_str": genome.config, + "genome_name": genome.config, } if genome is not None else 
None @@ -75,7 +74,7 @@ def _deserialize_genome(s: pl.Series) -> pl.Series: return pl.Series( name=s.name, values=[ - gk.Genome(struct["genome_str"]) if struct is not None else None + gk.Genome(struct["genome_name"]) if struct is not None else None for struct in s ], dtype=pl.Object, @@ -87,14 +86,12 @@ def _serialize_interval(s: pl.Series) -> pl.Series: name=s.name, values=[ { - _GKDF_TYPE_FIELD: GkDfType.INTERVAL.value, - _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, "chromosome": interval.chromosome, "strand": interval.strand, "start": interval.start, "end": interval.end, # intervals related to reference genome only - "genome_str": interval.reference_genome, + "refg": interval.reference_genome, } if interval is not None else None @@ -113,7 +110,7 @@ def _deserialize_interval(s: pl.Series) -> pl.Series: strand=struct["strand"], start=struct["start"], end=struct["end"], - reference_genome=struct["genome_str"], + reference_genome=struct["refg"], ) if struct is not None else None @@ -128,12 +125,11 @@ def _serialize_transcript(s: pl.Series) -> pl.Series: name=s.name, values=[ { - _GKDF_TYPE_FIELD: GkDfType.TRANSCRIPT.value, _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, "transcript_table_index": transcript.annotation_genome.transcripts.index_of( transcript ), - "genome_str": transcript.annotation_genome.config, + "anno": transcript.annotation_genome.config, } if transcript is not None else None @@ -147,9 +143,7 @@ def _deserialize_transcript(s: pl.Series) -> pl.Series: return pl.Series( name=s.name, values=[ - gk.Genome(struct["genome_str"]).transcripts[ - struct["transcript_table_index"] - ] + gk.Genome(struct["anno"]).transcripts[struct["transcript_table_index"]] if struct is not None else None for struct in s @@ -163,10 +157,9 @@ def _serialize_gene(s: pl.Series) -> pl.Series: name=s.name, values=[ { - _GKDF_TYPE_FIELD: GkDfType.GENE.value, _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, "gene_table_index": gene.annotation_genome.genes.index_of(gene), - "genome_str": 
gene.annotation_genome.config, + "anno": gene.annotation_genome.config, } if gene is not None else None @@ -180,7 +173,7 @@ def _deserialize_gene(s: pl.Series) -> pl.Series: return pl.Series( name=s.name, values=[ - gk.Genome(struct["genome_str"]).genes[struct["gene_table_index"]] + gk.Genome(struct["anno"]).genes[struct["gene_table_index"]] if struct is not None else None for struct in s @@ -194,10 +187,9 @@ def _serialize_exon(s: pl.Series) -> pl.Series: name=s.name, values=[ { - _GKDF_TYPE_FIELD: GkDfType.EXON.value, _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, "exon_table_index": exon.annotation_genome.exons.index_of(exon), - "genome_str": exon.annotation_genome.config, + "anno": exon.annotation_genome.config, } if exon is not None else None @@ -211,7 +203,7 @@ def _deserialize_exon(s: pl.Series) -> pl.Series: return pl.Series( name=s.name, values=[ - gk.Genome(struct["genome_str"]).exons[struct["exon_table_index"]] + gk.Genome(struct["anno"]).exons[struct["exon_table_index"]] if struct is not None else None for struct in s @@ -225,12 +217,11 @@ def _serialize_intron(s: pl.Series) -> pl.Series: name=s.name, values=[ { - _GKDF_TYPE_FIELD: GkDfType.INTRON.value, _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, "intron_table_index": intron.annotation_genome.introns.index_of( intron ), - "genome_str": intron.annotation_genome.config, + "anno": intron.annotation_genome.config, } if intron is not None else None @@ -244,7 +235,7 @@ def _deserialize_intron(s: pl.Series) -> pl.Series: return pl.Series( name=s.name, values=[ - gk.Genome(struct["genome_str"]).introns[struct["intron_table_index"]] + gk.Genome(struct["anno"]).introns[struct["intron_table_index"]] if struct is not None else None for struct in s @@ -258,10 +249,9 @@ def _serialize_cds(s: pl.Series) -> pl.Series: name=s.name, values=[ { - _GKDF_TYPE_FIELD: GkDfType.CDS.value, _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, "cds_table_index": cds.annotation_genome.cdss.index_of(cds), - "genome_str": 
cds.annotation_genome.config, + "anno": cds.annotation_genome.config, } if cds is not None else None @@ -275,7 +265,7 @@ def _deserialize_cds(s: pl.Series) -> pl.Series: return pl.Series( name=s.name, values=[ - gk.Genome(struct["genome_str"]).cdss[struct["cds_table_index"]] + gk.Genome(struct["anno"]).cdss[struct["cds_table_index"]] if struct is not None else None for struct in s @@ -291,7 +281,6 @@ def _serialize_utr(s: pl.Series) -> pl.Series: values.append(None) continue ser_dict = { - _GKDF_TYPE_FIELD: GkDfType.UTR.value, _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, } genome = utr.annotation_genome @@ -302,11 +291,11 @@ def _serialize_utr(s: pl.Series) -> pl.Series: ser_dict["utr_table_index"] = genome.utr3s.index_of(utr) ser_dict["utr_type"] = "3prime" - ser_dict["genome_str"] = genome.config + ser_dict["anno"] = genome.config values.append(ser_dict) - # pl.Series constructor doesn't strictly enforce nested struct types errors with - # "utr_type" field may silently become nulls instead of enums. + # pl.Series constructor doesn't strictly enforce nested struct types. Errors + # with "utr_type" field may silently manifest as null values. 
# TODO: check polars implementation, related issue #18841 return pl.Series( name=s.name, @@ -320,11 +309,9 @@ def _deserialize_utr(s: pl.Series) -> pl.Series: name=s.name, values=[ ( - gk.Genome(struct["genome_str"]).utr5s[struct["utr_table_index"]] + gk.Genome(struct["anno"]).utr5s[struct["utr_table_index"]] if struct["utr_type"] == "5prime" - else gk.Genome(struct["genome_str"]).utr3s[ - struct["utr_table_index"] - ] + else gk.Genome(struct["anno"]).utr3s[struct["utr_table_index"]] ) if struct is not None else None diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index f3d6e12b..ee343057 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -1,6 +1,7 @@ from __future__ import annotations import functools +import itertools import json import warnings from collections.abc import Callable @@ -14,7 +15,7 @@ import genome_kit as gk from genome_kit._optional import require_polars -from .gk_structs import CURRENT_VERSION, CellType, ColumnInfo, GkDfVersion +from .gk_structs import CURRENT_VERSION, CellType, ColumnInfo, GkDfType, GkDfVersion from .registry import GK_TO_STRUCT, get_registry @@ -80,29 +81,26 @@ def _detect_gk_cols( continue first = vals[0] - if type(first) == list: - cell_type = CellType.LIST - # ensure all values are lists within a col - if not all(type(v) == list for v in vals): + head_types = {type(v) for v in vals} + + if isinstance(first, list): + if head_types != {list}: raise ValueError( - f"Column {col} contains mixed data types. Please ensure all cells are the same type before serialization." + f"Column {col} contains mixed data types: {list(itertools.islice(head_types, 3))}.\n" + "Please ensure all cells are the same type before serialization." 
) - # cannot use Polars list expressions since lists of GenomeKit objects are stored as pl.Object + cell_type = CellType.LIST col_types = {type(item) for v in vals for item in v if item is not None} - else: cell_type = CellType.SCALAR - # ensure all values are not lists within a col - if not all(type(v) != list for v in vals): - raise ValueError( - f"Column {col} contains mixed data types. Please ensure all cells are the same type before serialization." - ) col_types = set(vals.map_elements(type, return_dtype=pl.Object)) if len(col_types) != 1: raise ValueError( - f"Column {col} contains mixed data types. Please ensure all cells are the same type before serialization." + f"Column {col} contains mixed data types: {list(itertools.islice(col_types, 3))}.\n" + "Please ensure all cells are the same type before serialization." ) + col_type = GK_TO_STRUCT.get(col_types.pop(), None) if col_type is None: @@ -152,23 +150,34 @@ def _init_gk_annotations( Args: lf: The LazyFrame containing the serialized GenomeKit objects. target_cols: A dictionary mapping column names to their column information. - target_cols is a dict from the ColumnInfo dataclass. + Each value is a dictionary representation of the ColumnInfo dataclass. Returns: A list of initialized gene tables for the unique genomes in the LazyFrame. 
""" pl = require_polars() - annotations = [] + def genome_str_field(col_info: dict) -> str: + gkdf_type = col_info["gkdf_type"] + if gkdf_type == GkDfType.GENOME: + return "genome_name" + elif gkdf_type == GkDfType.INTERVAL: + return "refg" + else: + return "anno" + + anno_strong_refs = [] # extract genome_str field from every column genomes_exprs = [] genomes_list_exprs = [] + for c in target_cols.keys(): + genome_field = genome_str_field(target_cols[c]) if target_cols[c]["cell_type"] == CellType.SCALAR: - genomes_exprs.append(pl.col(c).struct.field("genome_str")) + genomes_exprs.append(pl.col(c).struct.field(genome_field)) else: - genomes_list_exprs.append(pl.col(c).explode().struct.field("genome_str")) + genomes_list_exprs.append(pl.col(c).explode().struct.field(genome_field)) # expressions to extract genome_str must be run separately since exploded lists # may have more rows than the original dataframe @@ -202,12 +211,13 @@ def _init_gk_annotations( # all annotations available for serialization are contained in dganno file for genome_str in genomes: genome = gk.Genome(genome_str) - if genome.config == genome.reference_genome: - # identifies reference genomes, instead of annotation genomes + try: + anno_strong_refs.append(genome.genes) + except ValueError: + # reference genomes don't have annotations continue - annotations.append(gk.Genome(genome_str).genes) - return annotations + return anno_strong_refs def _validate_gkdf_metadata(metadata: dict[str, str]) -> None: @@ -219,27 +229,25 @@ def _validate_gkdf_metadata(metadata: dict[str, str]) -> None: # gkdf version try: version = GkDfVersion(metadata.get("gkdf_version")) - if version not in GkDfVersion: - raise ValueError( - f"Unrecognized gkdf version in Parquet metadata, expected one of {[v.value for v in GkDfVersion]}" + if version != CURRENT_VERSION: + raise IOError( + f"Expected GkDfVersion {CURRENT_VERSION}, but found {version}." 
) except ValueError: raise ValueError( "Invalid or missing gkdf_version in Parquet metadata, unable to deserialize GenomeKit objects. " ) - + # target cols if metadata.get("target_cols") is None: raise ValueError( "Missing target_cols in Parquet metadata, unable to deserialize GenomeKit objects." ) - + # gk version gk_version = metadata.get("gk_version") if gk_version is None: - raise ValueError( - "Missing gk_version in Parquet metadata." - ) + raise ValueError("Missing gk_version in Parquet metadata.") elif gk_version != gk.__version__: warnings.warn( f"Parquet file was written with GenomeKit version {gk_version}, but current version is {gk.__version__}. " @@ -280,7 +288,8 @@ def _deserialize_gk_cols( Args: lf: The LazyFrame containing the serialized GenomeKit objects. - target_cols: A dictionary mapping column names to their corresponding ColumnInfo. + target_cols: A dictionary mapping column names to their column information. + Each value is a dictionary representation of the ColumnInfo dataclass. Returns: A LazyFrame with deserialized GenomeKit objects in the target columns. 
diff --git a/tests/test_gkdf.py b/tests/test_gkdf.py index d079799d..140e3daa 100644 --- a/tests/test_gkdf.py +++ b/tests/test_gkdf.py @@ -227,6 +227,29 @@ def test_multiple_genomes(self): self.assertEqual(re_df["genes"][0], df["genes"][0]) self.assertEqual(re_df["genes"][1], df["genes"][1]) + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") + def test_mismatch_types(self): + # test that error is raised when cols have different types + g = Genome("gencode.v41") + gene = g.genes[0] + interval = Interval("chr5", "+", 2000, 3000, "hg19") + + df = pl.DataFrame({"mixed": [gene, interval]}, schema={"mixed": pl.Object}) + path = self.tmp_dir_path / "mismatch_types.parquet" + with self.assertRaises(ValueError): + write_parquet(df, path) + + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") + def test_mismatch_list_types(self): + # test that error is raised when cols have different types + g = Genome("gencode.v41") + gene = g.genes[0] + + df = pl.DataFrame({"mixed": [gene, [gene]]}, schema={"mixed": pl.Object}) + path = self.tmp_dir_path / "mismatch_list_types.parquet" + with self.assertRaises(ValueError): + write_parquet(df, path) + if __name__ == "__main__": unittest.main() From 9f1d32c7a97e8bd9d748ec56950718dcbedd0565 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Thu, 16 Apr 2026 16:47:43 -0400 Subject: [PATCH 42/48] chore: remove unused variable --- genome_kit/df/registry.py | 1 - 1 file changed, 1 deletion(-) diff --git a/genome_kit/df/registry.py b/genome_kit/df/registry.py index d3a5bd0c..9b249cd5 100644 --- a/genome_kit/df/registry.py +++ b/genome_kit/df/registry.py @@ -33,7 +33,6 @@ class GKTypeEntry: deserializer: Callable[[pl.Series], pl.Series] -_GKDF_TYPE_FIELD = "gkdf_type" _SCHEMA_VERSION_FIELD = "schema_version" SUPPORTED_VERSIONS = {v for v in GkDfVersion.__members__.values()} From 6102e290d97a248bcc9722559a47c02e81d1b835 Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Thu, 16 Apr 2026 
19:52:17 -0400 Subject: [PATCH 43/48] fix: add missing struct value --- genome_kit/df/registry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/genome_kit/df/registry.py b/genome_kit/df/registry.py index 9b249cd5..3e0cc309 100644 --- a/genome_kit/df/registry.py +++ b/genome_kit/df/registry.py @@ -85,6 +85,7 @@ def _serialize_interval(s: pl.Series) -> pl.Series: name=s.name, values=[ { + _SCHEMA_VERSION_FIELD: GkDfVersion.V1.value, "chromosome": interval.chromosome, "strand": interval.strand, "start": interval.start, From 92b1113b1a275090ade82406f67750707d9b9c6e Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Thu, 16 Apr 2026 19:52:45 -0400 Subject: [PATCH 44/48] fix: change expr for lists --- genome_kit/df/serialization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index ee343057..be5452bc 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -197,7 +197,7 @@ def genome_str_field(col_info: dict) -> str: if genomes_list_exprs: plans.append( lf.select( - pl.concat_list(genomes_list_exprs) + pl.concat(genomes_list_exprs) .explode() .drop_nulls() .unique() From da5447588682df1da678dea0d8bc90ccab035bcc Mon Sep 17 00:00:00 2001 From: declanyewlim Date: Tue, 21 Apr 2026 15:10:28 -0400 Subject: [PATCH 45/48] chore: use auto for enums --- genome_kit/df/gk_structs.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/genome_kit/df/gk_structs.py b/genome_kit/df/gk_structs.py index 413cbd0b..97d6ae60 100644 --- a/genome_kit/df/gk_structs.py +++ b/genome_kit/df/gk_structs.py @@ -10,7 +10,7 @@ # minimal shim for python <3.11 compatibility try: - from enum import StrEnum + from enum import StrEnum, auto except ImportError: from enum import Enum, auto @@ -24,19 +24,19 @@ def _generate_next_value_(name, start, count, last_values): class GkDfType(StrEnum): - GENOME = "genome" - INTERVAL = "interval" - TRANSCRIPT = 
"transcript" - GENE = "gene" - EXON = "exon" - INTRON = "intron" - CDS = "cds" - UTR = "utr" + GENOME = auto() + INTERVAL = auto() + TRANSCRIPT = auto() + GENE = auto() + EXON = auto() + INTRON = auto() + CDS = auto() + UTR = auto() class CellType(StrEnum): - SCALAR = "scalar" - LIST = "list" + SCALAR = auto() + LIST = auto() @dataclass(frozen=True) From 6ced9e2c3da302bd56b03d51fefb93ed11698bd6 Mon Sep 17 00:00:00 2001 From: declanyewlim Date: Tue, 21 Apr 2026 15:33:31 -0400 Subject: [PATCH 46/48] chore: update const name and add comment --- genome_kit/df/gk_structs.py | 4 +++- genome_kit/df/registry.py | 2 +- genome_kit/df/serialization.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/genome_kit/df/gk_structs.py b/genome_kit/df/gk_structs.py index 97d6ae60..b2d0c8b7 100644 --- a/genome_kit/df/gk_structs.py +++ b/genome_kit/df/gk_structs.py @@ -22,7 +22,9 @@ def __str__(self): def _generate_next_value_(name, start, count, last_values): return name.lower() - +# serializable representations of the supported GKDF types, with a one-to-one mapping +# between GkDfType and GenomeKit object types. Serves as the key for struct and function +# definitions in registry.py, keeping serialization and deserialization paths symmetric. 
class GkDfType(StrEnum): GENOME = auto() INTERVAL = auto() diff --git a/genome_kit/df/registry.py b/genome_kit/df/registry.py index 3e0cc309..6ff7dcef 100644 --- a/genome_kit/df/registry.py +++ b/genome_kit/df/registry.py @@ -13,7 +13,7 @@ from .gk_structs import GkDfType, GkDfVersion, get_structs # mapping from GenomeKit object types to the gkdf type strings -GK_TO_STRUCT: dict[type[gk.GenomeAnnotation], GkDfType] = { +GK_TO_GKDF_TYPE: dict[type[gk.GenomeAnnotation], GkDfType] = { gk.Genome: GkDfType.GENOME, gk.Interval: GkDfType.INTERVAL, gk.Transcript: GkDfType.TRANSCRIPT, diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index be5452bc..e2621d8b 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -16,7 +16,7 @@ from genome_kit._optional import require_polars from .gk_structs import CURRENT_VERSION, CellType, ColumnInfo, GkDfType, GkDfVersion -from .registry import GK_TO_STRUCT, get_registry +from .registry import GK_TO_GKDF_TYPE, get_registry def _map_batches_safe(fn: Callable) -> Callable: @@ -101,7 +101,7 @@ def _detect_gk_cols( "Please ensure all cells are the same type before serialization." 
) - col_type = GK_TO_STRUCT.get(col_types.pop(), None) + col_type = GK_TO_GKDF_TYPE.get(col_types.pop(), None) if col_type is None: # column is not a genomekit type, so no serialization needed From 07b2102ccc74213bc3a1c858e3119665abdbf29e Mon Sep 17 00:00:00 2001 From: declanyewlim Date: Tue, 21 Apr 2026 15:58:56 -0400 Subject: [PATCH 47/48] tests: simplify logic and add tests --- genome_kit/df/serialization.py | 13 +++++------- tests/test_gkdf.py | 39 ++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/genome_kit/df/serialization.py b/genome_kit/df/serialization.py index e2621d8b..228bc0ef 100644 --- a/genome_kit/df/serialization.py +++ b/genome_kit/df/serialization.py @@ -227,15 +227,12 @@ def _validate_gkdf_metadata(metadata: dict[str, str]) -> None: metadata: The parquet metadata to validate. """ # gkdf version - try: - version = GkDfVersion(metadata.get("gkdf_version")) - if version != CURRENT_VERSION: - raise IOError( - f"Expected GkDfVersion {CURRENT_VERSION}, but found {version}." - ) - except ValueError: + metadata_version = metadata.get("gkdf_version") + version = GkDfVersion(metadata_version) if metadata_version is not None else None + if version != CURRENT_VERSION: raise ValueError( - "Invalid or missing gkdf_version in Parquet metadata, unable to deserialize GenomeKit objects. " + f"Invalid or missing gkdf_version in Parquet metadata, unable to deserialize GenomeKit objects. " + f"Expected GkDfVersion {CURRENT_VERSION}, but found {version}." ) # target cols diff --git a/tests/test_gkdf.py b/tests/test_gkdf.py index 140e3daa..be20904e 100644 --- a/tests/test_gkdf.py +++ b/tests/test_gkdf.py @@ -1,10 +1,12 @@ import importlib.util +import json import tempfile import unittest from pathlib import Path from genome_kit import Genome, Interval from genome_kit.df import read_parquet, write_parquet +from genome_kit.df.gk_structs import CURRENT_VERSION from . 
import MiniGenome @@ -250,6 +252,43 @@ def test_mismatch_list_types(self): with self.assertRaises(ValueError): write_parquet(df, path) + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") + def test_no_gkdf_version(self): + # test that error raised when no gkdf version is found in metadata + df = pl.DataFrame({"genome": ["hg38.p12"]}) + + path = self.tmp_dir_path / "no_gkdf_version.parquet" + df.write_parquet(path, metadata={"some_other_key": "value"}) + with self.assertRaises(ValueError): + read_parquet(path, lazy=False) + + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") + def test_no_target_cols(self): + # test that error raised when no target_cols is found in metadata + df = pl.DataFrame({"genome": ["hg38.p12"]}) + + path = self.tmp_dir_path / "no_target_cols.parquet" + df.write_parquet(path, metadata={"gkdf_version": CURRENT_VERSION}) + with self.assertRaises(ValueError): + read_parquet(path, lazy=False) + + @unittest.skipUnless(HAS_POLARS, "Polars is required for this genome_kit.df tests") + def test_no_gk_version(self): + # test that error raised when no gk version is found in metadata + df = pl.DataFrame({"genome": ["hg38.p12"]}) + + path = self.tmp_dir_path / "no_gk_version.parquet" + target_cols = {"genome": {"cell_type": "scalar", "gkdf_type": "genome"}} + df.write_parquet( + path, + metadata={ + "gkdf_version": CURRENT_VERSION, + "target_cols": json.dumps(target_cols), + }, + ) + with self.assertRaises(ValueError): + read_parquet(path, lazy=False) + if __name__ == "__main__": unittest.main() From 5eaa2808ef50feccf188b49d38efba0bbbd325ba Mon Sep 17 00:00:00 2001 From: Declan Lim Date: Fri, 24 Apr 2026 19:56:52 +0000 Subject: [PATCH 48/48] fix: catch relevant errors --- genome_kit/_optional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genome_kit/_optional.py b/genome_kit/_optional.py index d76a0c34..f7eefab8 100644 --- a/genome_kit/_optional.py +++ 
b/genome_kit/_optional.py @@ -53,7 +53,7 @@ def check_under_rosetta(): # output will be 0 if running natively on Apple Silicon, and 1 if running under # Rosetta translation return result.stdout.strip() == "1" - except subprocess.CalledProcessError: + except (subprocess.CalledProcessError, OSError): # sysctl.proc_translated won't exist on non-Apple Silicon machines return False