-
Notifications
You must be signed in to change notification settings - Fork 7
feat: genomekit dataframe serialization #204
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
076eb17
62a806e
3f16e0f
7bef461
7b5130e
518e456
6de6f85
1d43334
0497eec
6f9c71d
a905a2b
6f87c23
fc4d0dd
5700485
3939f67
257504f
cf4ff13
4b89f79
57d5b74
17edfb2
cb6fd93
eb125d4
0437b70
7565646
42338a0
77f6f3c
575b7c5
325f530
8e07c2a
59cfb99
5eaf0c9
37867e3
ef2cdab
96a81b3
9cd8f08
8e7b240
0516acf
e730a08
240606c
bbf8962
8d03a0c
13168b6
9f1d32c
6102e29
92b1113
da54475
6ced9e2
07b2102
5eaa280
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,67 @@ | ||
| .. _df: | ||
|
|
||
| DataFrame Utilities | ||
| =================== | ||
|
|
||
| The :py:mod:`genome_kit.df` subpackage contains utilities for working with Polars DataFrames that contain GenomeKit objects. This includes utilities for serializing DataFrames with GenomeKit objects to Parquet and deserializing them back to GenomeKit objects. This is useful when sharing tabular data sets, or when saving intermediate DataFrames to disk during data processing. | ||
|
|
||
| .. important:: | ||
|
|
||
| ``genome_kit.df`` depends on optional ``polars`` dependencies, which are not installed by default. These can be installed with the ``[df]`` extra: | ||
|
|
||
| .. code-block:: bash | ||
|
|
||
| pip install "genomekit[df]" | ||
|
|
||
| The ``[df]`` extra is not included in the default ``genomekit`` installation. | ||
|
|
||
| If you are running an x86 build of Python on an Apple Silicon Mac (e.g. M1 chip), Python runs under Rosetta translation. In that case this will also install the ``polars-runtime-compat`` package, which is required to run Polars under Rosetta because of AVX feature compatibility issues. | ||
|
|
||
|
|
||
| Quickstart | ||
| ----------- | ||
| The serialization and deserialization entry points are :py:func:`~genome_kit.df.write_parquet` and :py:func:`~genome_kit.df.read_parquet`, respectively: | ||
|
|
||
| .. code-block:: python | ||
|
|
||
| import polars as pl | ||
| import genome_kit as gk | ||
|
|
||
| genome = gk.Genome("ncbi_refseq.v110") | ||
| df = pl.DataFrame( | ||
| { | ||
| "gene": [genome.genes[0], genome.genes[1]], | ||
| "score": [0.1, 0.8], | ||
| } | ||
| ) | ||
|
|
||
| gk.write_parquet(df, "genes.parquet") | ||
| ... | ||
| ... | ||
| restored_df = gk.read_parquet("genes.parquet") | ||
|
|
||
|
|
||
| .. note:: | ||
|
|
||
| The written Parquet files can be read by any software that supports the Parquet format, but the GenomeKit objects will only be restored when the files are read with :py:func:`genome_kit.df.read_parquet`. | ||
|
|
||
|
|
||
| Supported GenomeKit Objects | ||
| --------------------------- | ||
| The currently supported GenomeKit objects for serialization are: | ||
|
|
||
| - :py:class:`genome_kit.Genome` | ||
| - :py:class:`genome_kit.Interval` | ||
| - :py:class:`genome_kit.Transcript` | ||
| - :py:class:`genome_kit.Gene` | ||
| - :py:class:`genome_kit.Exon` | ||
| - :py:class:`genome_kit.Intron` | ||
| - :py:class:`genome_kit.CDS` | ||
| - :py:class:`genome_kit.UTR` | ||
|
|
||
| Public API | ||
| ---------------- | ||
| .. automodule:: genome_kit.df | ||
| :members: | ||
| :undoc-members: | ||
| :show-inheritance: |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -73,6 +73,7 @@ Contents: | |
| anchors | ||
| api | ||
| genomes | ||
| df | ||
| develop | ||
| data_org | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,70 @@ | ||
| from __future__ import annotations | ||
|
|
||
| from importlib.metadata import PackageNotFoundError, version | ||
|
|
||
|
|
||
def require_polars():
    """Return the ``polars`` module, or raise an ImportError explaining the fix.

    Two failure modes are reported:

    * ``polars`` is not installed: the underlying ``ModuleNotFoundError`` is
      re-raised as an ``ImportError`` that names the extras to install.
    * ``polars`` is installed, but Python is running under Rosetta translation
      on Apple Silicon without ``polars-runtime-compat``.

    Returns:
        The imported ``polars`` module.

    Raises:
        ImportError: in either of the situations described above.
    """
    # Only the import itself is guarded, so nothing else can be mistaken for
    # a missing-package error.
    try:
        import polars as pl
    except ModuleNotFoundError as missing:
        raise ImportError(
            "Optional dependency 'polars' is required for this functionality. Please "
            "install with `pip install genomekit[df]`.\n"
            "If you are running this on MacOS with Apple Silicon, please install with "
            "`pip install genomekit[df-mac]` to include the polars-runtime-compat "
            "package required for Rosetta translation."
        ) from missing

    # The runtime-compat check is only consulted when actually under Rosetta.
    if check_under_rosetta() and not check_rtcompat():
        raise ImportError(
            "Polars is not compatible with Apple Silicon.\n"
            "Please install with `pip install genomekit[df-mac]` to include "
            "the polars-runtime-compat package required for Rosetta "
            "translation."
        )

    return pl
|
|
||
|
|
||
def check_under_rosetta():
    """Check if the program is running under Rosetta translation on Apple Silicon.

    The default version of Polars is incompatible with Rosetta, and requires
    polars-runtime-compat to be installed.

    Detection uses the ``sysctl.proc_translated`` flag exposed through ``sysctl``.
    See https://developer.apple.com/documentation/apple-silicon/about-the-rosetta-translation-environment#Determine-Whether-Your-App-Is-Running-as-a-Translated-Binary

    Returns:
        bool: True only when ``sysctl`` reports ``1`` for the flag. False when
        running natively, when not on macOS, or when the flag cannot be queried.
    """
    import subprocess
    import sys

    # Rosetta only exists on macOS, so skip spawning a process everywhere else.
    if sys.platform != "darwin":
        return False

    try:
        result = subprocess.run(
            ["sysctl", "-n", "sysctl.proc_translated"],
            capture_output=True,
            text=True,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # sysctl.proc_translated won't exist on non-Apple Silicon machines
        return False

    # output will be 0 if running natively on Apple Silicon, and 1 if running under
    # Rosetta translation
    return result.stdout.strip() == "1"
|
|
||
|
|
||
def check_rtcompat():
    """Report whether the ``polars-runtime-compat`` distribution is installed.

    Required for Polars to run on MacOS machines under Rosetta translation.

    Returns:
        bool: True when package metadata for ``polars-runtime-compat`` can be
        found, False otherwise.
    """
    try:
        # Only the metadata lookup matters; the version string itself is unused.
        version("polars-runtime-compat")
    except PackageNotFoundError:
        return False
    else:
        return True
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| from .serialization import read_parquet, write_parquet | ||
|
|
||
| __all__ = ["read_parquet", "write_parquet"] |
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,153 @@ | ||||||||||
| from __future__ import annotations | ||||||||||
|
|
||||||||||
| from dataclasses import dataclass | ||||||||||
| from typing import TYPE_CHECKING | ||||||||||
|
|
||||||||||
| from genome_kit._optional import require_polars | ||||||||||
|
|
||||||||||
| if TYPE_CHECKING: # import polars for type checking | ||||||||||
| import polars as pl | ||||||||||
|
|
||||||||||
| # minimal shim for python <3.11 compatibility | ||||||||||
try:
    from enum import StrEnum, auto
except ImportError:
    from enum import Enum, auto

    class StrEnum(str, Enum):
        """Fallback used when ``enum.StrEnum`` cannot be imported.

        Members are ``str`` instances, ``auto()`` yields the lower-cased member
        name, and ``str(member)`` returns the member's value.
        """

        @staticmethod
        def _generate_next_value_(name, start, count, last_values):
            # auto() resolves to the lower-cased member name
            return name.lower()

        def __str__(self):
            return str(self.value)
|
|
||||||||||
| # serializable representations of the supported GKDF types, with a one-to-one mapping | ||||||||||
| # between GkDfType and GenomeKit object types. Serves as the key for struct and function | ||||||||||
| # definitions in registry.py, keeping serialization and deserialization paths symmetric. | ||||||||||
| class GkDfType(StrEnum): | ||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did I get the motivation right?
Suggested change
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wouldn’t classify this as defensive programming, the part that is defensive with regards to polars imports is the The main idea is to use the |
||||||||||
| GENOME = auto() | ||||||||||
| INTERVAL = auto() | ||||||||||
| TRANSCRIPT = auto() | ||||||||||
| GENE = auto() | ||||||||||
| EXON = auto() | ||||||||||
| INTRON = auto() | ||||||||||
| CDS = auto() | ||||||||||
| UTR = auto() | ||||||||||
|
|
||||||||||
|
|
||||||||||
| class CellType(StrEnum): | ||||||||||
| SCALAR = auto() | ||||||||||
| LIST = auto() | ||||||||||
|
|
||||||||||
|
|
||||||||||
@dataclass(frozen=True)
class ColumnInfo:
    """Immutable metadata describing one dataframe column.

    Assumes that all cells in a column have the same type. If the cell contains a list,
    assumes all items in the list are of the same type.
    """

    # whether each cell holds a single object or a list of objects
    cell_type: CellType
    # which GenomeKit object type the column stores
    gkdf_type: GkDfType

    def to_dict(self) -> dict:
        """Return the column metadata as a dict of the enum members' values."""
        return dict(
            cell_type=self.cell_type.value,
            gkdf_type=self.gkdf_type.value,
        )
|
|
||||||||||
|
|
||||||||||
| class GkDfVersion(StrEnum): | ||||||||||
| V1 = "1.0" | ||||||||||
|
|
||||||||||
|
|
||||||||||
| CURRENT_VERSION = GkDfVersion.V1 | ||||||||||
|
|
||||||||||
|
|
||||||||||
def get_structs() -> dict[GkDfType, pl.Struct]:
    """Return a mapping of GkDfType to their corresponding Polars Struct definitions.

    Polars is imported lazily through ``require_polars``, so this raises
    whatever that helper raises when Polars is unavailable.

    Every struct starts with a ``schema_version`` string field. Annotation
    features (transcript, gene, exon, intron, CDS, UTR) are stored as an index
    into the matching annotation table plus the annotation genome name.
    """
    pl = require_polars()

    def versioned(*fields):
        # all structs share the leading schema_version field
        return pl.Struct([pl.Field("schema_version", pl.Utf8), *fields])

    def table_entry(index_field, index_dtype, *leading):
        # layout: schema_version, <leading fields>, <table index>, anno
        return versioned(
            *leading,
            pl.Field(index_field, index_dtype),
            pl.Field("anno", pl.Utf8),  # annotation genome
        )

    genome_struct = versioned(
        pl.Field("genome_name", pl.Utf8),  # reference or annotation genome
    )

    interval_struct = versioned(
        pl.Field("chromosome", pl.Utf8),
        pl.Field("strand", pl.Utf8),
        pl.Field("start", pl.Int32),
        pl.Field("end", pl.Int32),
        pl.Field("refg", pl.Utf8),  # reference genome
    )

    # index of transcript within annotation genome transcript table
    # Int32 matches index type in C++ backend (see src/table.h:22)
    # NOTE(review): table indices may differ between GenomeKit versions that
    # process the annotation file differently; confirm that readers check the
    # GenomeKit version that wrote the file.
    transcript_struct = table_entry("transcript_table_index", pl.Int32)
    gene_struct = table_entry("gene_table_index", pl.Int32)
    exon_struct = table_entry("exon_table_index", pl.Int32)
    intron_struct = table_entry("intron_table_index", pl.Int32)
    cds_struct = table_entry("cds_table_index", pl.Int32)

    utr_kind = pl.Enum(["5prime", "3prime"])
    # NOTE(review): this index is Int64 while every other table index is
    # Int32; confirm whether the difference is intentional.
    utr_struct = table_entry(
        "utr_table_index",
        pl.Int64,
        pl.Field("utr_type", utr_kind),
    )

    return {
        GkDfType.GENOME: genome_struct,
        GkDfType.INTERVAL: interval_struct,
        GkDfType.TRANSCRIPT: transcript_struct,
        GkDfType.GENE: gene_struct,
        GkDfType.EXON: exon_struct,
        GkDfType.INTRON: intron_struct,
        GkDfType.CDS: cds_struct,
        GkDfType.UTR: utr_struct,
    }
Uh oh!
There was an error while loading. Please reload this page.