deepgenomics · ovesh · Apr 28, 2026 · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026
diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml
@@ -40,6 +40,8 @@ jobs:
       matrix:
         python-version: ['3.10', '3.11', '3.12']
         platform: ["linux-64", "osx-arm64", "osx-64"]
+        extras: ["none", "df"]
+        rosetta: ["false"]
         include: # specify additional fields for all configs
           - python-version: "3.10"
             pyver-short: "310"
@@ -53,6 +55,13 @@ jobs:
             runs-on: "macos-26-intel"
           - platform: "linux-64"
             runs-on: "ubuntu-latest"
+          # single test for running intel build on arm64 macos runner with rosetta
+          - python-version: "3.10"
+            pyver-short: "310"
+            platform: "osx-64" # intel build
+            runs-on: "macos-latest" # defaults to arm64 runner with M1 chip
+            extras: "df"
+            rosetta: "true"
     runs-on: ${{ matrix.runs-on }}
     steps:
       - uses: actions/checkout@v4
@@ -100,6 +109,9 @@ jobs:
       - name: run unittests
         id: run_unittests
         shell: bash -l -e {0}
+        env:
+          GK_EXTRAS: ${{ matrix.extras }}
+          ROSETTA: ${{ matrix.rosetta }}
         run: |
           set -x
           micromamba activate test
@@ -108,7 +120,15 @@ jobs:
           if [ ! -e "${files[0]}" ]; then
               echo "No files matched for py${{ matrix.pyver-short }}"
               exit 1 
-          fi          
-          conda mambabuild --croot /tmp/conda-bld -t $files --extra-deps python=${{ matrix.python-version }}
+          fi
+          extra_deps=(python=${{ matrix.python-version }})   
+          # run command with optional extra deps
+          if [ "$GK_EXTRAS" = "df" ]; then
+            extra_deps+=("polars=1.39.3")
+          fi
+          if [ "$ROSETTA" = "true" ]; then
+            extra_deps+=("polars-runtime-compat=1.39.3")
+          fi
+          conda mambabuild --croot /tmp/conda-bld -t "${files[@]}" --extra-deps "${extra_deps[@]}"
           conda clean -it
-          set +x
+          set +x
diff --git a/docs-src/df.rst b/docs-src/df.rst
@@ -0,0 +1,67 @@
+.. _df:
+
+DataFrame Utilities
+===================
+
+The :py:mod:`genome_kit.df` subpackage contains utilities for working with Polars DataFrames that contain GenomeKit objects. This includes utilities for serializing DataFrames with GenomeKit objects to Parquet and deserializing them back to GenomeKit objects. This is useful when sharing tabular data sets, or when saving intermediate DataFrames to disk during data processing.
+
+.. important::
+
+    ``genome_kit.df`` depends on optional ``polars`` dependencies, which are not installed by default. These can be installed with the ``[df]`` extra:
+
+    .. code-block:: bash
+
+        pip install "genomekit[df]"
+
+    The ``[df]`` extra is not included in the default ``genomekit`` installation.
+
+    If you are running an x86 version of Python on an Apple Silicon Mac (e.g. M1 chip), this will also install the ``polars-runtime-compat`` package, which is required to run Polars under Rosetta translation because the default Polars runtime relies on AVX CPU features that may not be available there.
+
+
+Quickstart
+-----------
+The serialization and deserialization entry points are :py:func:`~genome_kit.df.write_parquet` and :py:func:`~genome_kit.df.read_parquet`, respectively:
+
+.. code-block:: python
+
+    import polars as pl
+    import genome_kit as gk
+
+    genome = gk.Genome("ncbi_refseq.v110")
+    df = pl.DataFrame(
+        {
+            "gene": [genome.genes[0], genome.genes[1]],
+            "score": [0.1, 0.8],
+        }
+    )
+
+    gk.write_parquet(df, "genes.parquet")
+    ...
+    ...
+    restored_df = gk.read_parquet("genes.parquet")
+
+
+.. note::
+
+    The written parquet files can be read by any software that supports the parquet format, but the GenomeKit objects will only be restored when read with :py:func:`genome_kit.df.read_parquet`.
+
+
+Supported GenomeKit Objects
+---------------------------
+The currently supported GenomeKit objects for serialization are:
+
+- :py:class:`genome_kit.Genome`
+- :py:class:`genome_kit.Interval`
+- :py:class:`genome_kit.Transcript`
+- :py:class:`genome_kit.Gene`
+- :py:class:`genome_kit.Exon`
+- :py:class:`genome_kit.Intron`
+- :py:class:`genome_kit.Cds`
+- :py:class:`genome_kit.Utr`
+
+Public API
+----------------
+.. automodule:: genome_kit.df
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs-src/index.rst b/docs-src/index.rst
@@ -73,6 +73,7 @@ Contents:
     anchors
     api
     genomes
+    df
     develop
     data_org
 

diff --git a/genome_kit/__init__.py b/genome_kit/__init__.py
@@ -49,6 +49,7 @@
 from .variant_genome import VariantGenome
 from .vcf_table import VCFTable, VCFVariant
 from . import serialize
+from .df import write_parquet, read_parquet
 
 #########################################################################
 
@@ -93,6 +94,7 @@
     "JunctionTable",
     "ReadAlignments",
     "ReadDistributions",
+    "read_parquet",
     "Transcript",
     "TranscriptTable",
     "Utr",
@@ -102,6 +104,7 @@
     "VariantTable",
     "VCFTable",
     "VCFVariant",
+    "write_parquet",
 ]
 
 #########################################################################
diff --git a/genome_kit/_optional.py b/genome_kit/_optional.py
@@ -0,0 +1,70 @@
+from __future__ import annotations
+
+from importlib.metadata import PackageNotFoundError, version
+
+
+def require_polars():
+    """Import Polars if available, otherwise provide helpful error messages.
+
+    Also checks for compatibility on MacOS with Apple Silicon, which may require
+    an additional package if running Python under Rosetta translation.
+    """
+    try:
+        import polars as pl
+
+        if check_under_rosetta():
+            if not check_rtcompat():
+                raise ImportError(
+                    "Polars is not compatible with Apple Silicon.\n"
+                    "Please install with `pip install genomekit[df-mac]` to include "
+                    "the polars-runtime-compat package required for Rosetta "
+                    "translation."
+                )
+    except ModuleNotFoundError as e:
+        raise ImportError(
+            "Optional dependency 'polars' is required for this functionality. Please "
+            "install with `pip install genomekit[df]`.\n"
+            "If you are running this on MacOS with Apple Silicon, please install with "
+            "`pip install genomekit[df-mac]` to include the polars-runtime-compat "
+            "package required for Rosetta translation."
+        ) from e
+
+    return pl
+
+
+def check_under_rosetta():
+    """Check if program is running under Rosetta translation on Apple Silicon.
+
+    The default version of Polars is incompatible with Rosetta, and requires
+    polars-runtime-compat to be installed.
+
+    Can be checked with the sysctl.proc_translated flag in sysctl.
+    See https://developer.apple.com/documentation/apple-silicon/about-the-rosetta-translation-environment#Determine-Whether-Your-App-Is-Running-as-a-Translated-Binary
+    """
+    import subprocess
+
+    try:
+        result = subprocess.run(
+            ["sysctl", "-n", "sysctl.proc_translated"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        # output will be 0 if running natively on Apple Silicon, and 1 if running under 
+        # Rosetta translation
+        return result.stdout.strip() == "1"
+    except (subprocess.CalledProcessError, OSError):
+        # sysctl.proc_translated won't exist on non-Apple Silicon machines
+        return False
+
+
+def check_rtcompat() -> bool:
+    """Check if polars-runtime-compat is installed.
+
+    Required for Polars to run on MacOS machines under Rosetta translation.
+
+    Returns True if distribution metadata for polars-runtime-compat can be found,
+    and False if the lookup raises PackageNotFoundError.
+    """
+    try:
+        # only the presence of the distribution matters; the version is discarded
+        version("polars-runtime-compat")
+        return True
+    except PackageNotFoundError:
+        return False
diff --git a/genome_kit/df/__init__.py b/genome_kit/df/__init__.py
@@ -0,0 +1,3 @@
+from .serialization import read_parquet, write_parquet
+
+__all__ = ["read_parquet", "write_parquet"]
diff --git a/genome_kit/df/gk_structs.py b/genome_kit/df/gk_structs.py
@@ -0,0 +1,153 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from genome_kit._optional import require_polars
+
+if TYPE_CHECKING:  # import polars for type checking
+    import polars as pl
+
+# minimal shim for python <3.11 compatibility: use enum.StrEnum where it can be
+# imported, otherwise define a small stand-in class under the same name
+try:
+    from enum import StrEnum, auto
+except ImportError:
+    from enum import Enum, auto
+
+    class StrEnum(str, Enum):
+        # str() of a member gives its value
+        def __str__(self):
+            return str(self.value)
+
+        # auto() assigns the lower-cased member name as the member's value
+        @staticmethod
+        def _generate_next_value_(name, start, count, last_values):
+            return name.lower()
+
+# serializable representations of the supported GKDF types, with a one-to-one mapping
+# between GkDfType and GenomeKit object types. Serves as the key for struct and function
+# definitions in registry.py, keeping serialization and deserialization paths symmetric.
+class GkDfType(StrEnum):
-class GkDfType(StrEnum):
+# an intermediate mapping (GK types -> GkDfType -> PL types) is added as a defensive
+# coding measure to avoid an accidental import of pl from the main package.
+class GkDfType(StrEnum):
-class GkDfType(StrEnum):
+# an intermediate mapping (GK types -> GkDfType -> PL types) is added as a defensive
+# coding measure to avoid an accidental import of pl from the main package.
+class GkDfType(StrEnum):
+    GENOME = auto()
+    INTERVAL = auto()
+    TRANSCRIPT = auto()
+    GENE = auto()
+    EXON = auto()
+    INTRON = auto()
+    CDS = auto()
+    UTR = auto()
+
+
+class CellType(StrEnum):
+    """Kind of value held in each cell of a column: one item or a list of items."""
+
+    SCALAR = auto()
+    LIST = auto()
+
+
+@dataclass(frozen=True)
+class ColumnInfo:
+    """Dataclass to store metadata about a single column in a dataframe.
+
+    Assumes that all cells in a column have the same type. If the cell contains a list,
+    assumes all items in the list are of the same type.
+    """
+
+    # whether each cell holds a scalar or a list
+    cell_type: CellType
+    # which GKDF type the cells (or the list items) hold
+    gkdf_type: GkDfType
+
+    def to_dict(self) -> dict:
+        """Return the metadata as a dict holding the value of each enum member."""
+        return {
+            "cell_type": self.cell_type.value,
+            "gkdf_type": self.gkdf_type.value,
+        }
+
+
+class GkDfVersion(StrEnum):
+    """Known GKDF versions."""
+
+    V1 = "1.0"
+
+
+# NOTE(review): presumably the value stored in the "schema_version" struct fields;
+# confirm against the serialization code
+CURRENT_VERSION = GkDfVersion.V1
+
+
+def get_structs() -> dict[GkDfType, pl.Struct]:
+    """Return a mapping of GkDfType to their corresponding Polars Struct definitions."""
+    pl = require_polars()
+
+    GenomeStruct = pl.Struct(
+        [
+            pl.Field("schema_version", pl.Utf8),
+            pl.Field("genome_name", pl.Utf8),  # reference or annotation genome
+        ]
+    )
+
+    IntervalStruct = pl.Struct(
+        [
+            pl.Field("schema_version", pl.Utf8),
+            pl.Field("chromosome", pl.Utf8),
+            pl.Field("strand", pl.Utf8),
+            pl.Field("start", pl.Int32),
+            pl.Field("end", pl.Int32),
+            pl.Field("refg", pl.Utf8),  # reference genome
+        ]
+    )
+
+    TranscriptStruct = pl.Struct(
+        [
+            pl.Field("schema_version", pl.Utf8),
+            # index of transcript within annotation genome transcript table
+            # Int32 matches index type in C++ backend (see src/table.h:22)
+            pl.Field("transcript_table_index", pl.Int32),
+            pl.Field("anno", pl.Utf8),  # annotation genome
+        ]
+    )
+
+    GeneStruct = pl.Struct(
+        [
+            pl.Field("schema_version", pl.Utf8),
+            pl.Field("gene_table_index", pl.Int32),
+            pl.Field("anno", pl.Utf8),  # annotation genome
+        ]
+    )
+
+    ExonStruct = pl.Struct(
+        [
+            pl.Field("schema_version", pl.Utf8),
+            pl.Field("exon_table_index", pl.Int32),
+            pl.Field("anno", pl.Utf8),  # annotation genome
+        ]
+    )
+
+    IntronStruct = pl.Struct(
+        [
+            pl.Field("schema_version", pl.Utf8),
+            pl.Field("intron_table_index", pl.Int32),
+            pl.Field("anno", pl.Utf8),  # annotation genome
+        ]
+    )
+
+    CdsStruct = pl.Struct(
+        [
+            pl.Field("schema_version", pl.Utf8),
+            pl.Field("cds_table_index", pl.Int32),
+            pl.Field("anno", pl.Utf8),  # annotation genome
+        ]
+    )
+
+    UtrType = pl.Enum(["5prime", "3prime"])
+
+    UtrStruct = pl.Struct(
+        [
+            pl.Field("schema_version", pl.Utf8),
+            pl.Field("utr_type", UtrType),
+            pl.Field("utr_table_index", pl.Int64),
+            pl.Field("anno", pl.Utf8),  # annotation genome
+        ]
+    )
+
+    return {
+        GkDfType.GENOME: GenomeStruct,
+        GkDfType.INTERVAL: IntervalStruct,
+        GkDfType.TRANSCRIPT: TranscriptStruct,
+        GkDfType.GENE: GeneStruct,
+        GkDfType.EXON: ExonStruct,
+        GkDfType.INTRON: IntronStruct,
+        GkDfType.CDS: CdsStruct,
+        GkDfType.UTR: UtrStruct,
+    }
-Original file line number
+Diff line change
@@ Expand Up / @@ -73,6 +73,7 @@ Contents: @@
         anchors
         api
         genomes
+        df
         develop
         data_org
@@ Expand Down @@
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from .serialization import read_parquet, write_parquet

		__all__ = ["read_parquet", "write_parquet"]