Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
784adc2
Transform labels instead of images
joshua-gould Jun 9, 2026
5a00ffe
Merge branch 'main' of https://github.com/Genentech/scallops into wor…
joshua-gould Jun 9, 2026
0fb20ab
Transform labels instead of images
joshua-gould Jun 9, 2026
92a4b35
Transform labels instead of images
joshua-gould Jun 9, 2026
fd1fa64
Transform labels instead of images
joshua-gould Jun 9, 2026
e6f66ab
Transform labels instead of images
joshua-gould Jun 9, 2026
ce6634f
registration transform across times
joshua-gould Jun 11, 2026
cc23675
Registration transform across times
joshua-gould Jun 12, 2026
e524243
Registration transform across times
joshua-gould Jun 12, 2026
6fc0e55
storage options
joshua-gould Jun 12, 2026
baa48d3
pattern
joshua-gould Jun 12, 2026
a6cdfb7
handle timepoint in segmentation
joshua-gould Jun 15, 2026
c9db0a2
handle timepoint in segmentation
joshua-gould Jun 15, 2026
3815857
handle timepoint in segmentation
joshua-gould Jun 15, 2026
3cea7ec
handle timepoint in segmentation
joshua-gould Jun 16, 2026
6778f6a
Merge branch 'main' of https://github.com/Genentech/scallops into wor…
joshua-gould Jun 16, 2026
13a32ca
Segmentation time
joshua-gould Jun 16, 2026
33ce260
Segmentation time
joshua-gould Jun 16, 2026
5f88d47
Segmentation time
joshua-gould Jun 16, 2026
970a5bd
Segmentation time
joshua-gould Jun 16, 2026
eda1e38
Segmentation time
joshua-gould Jun 17, 2026
6a7cd74
Segmentation time
joshua-gould Jun 17, 2026
c376bcc
Segmentation time
joshua-gould Jun 18, 2026
f70e984
Merge branch 'main' of https://github.com/Genentech/scallops into wor…
joshua-gould Jun 22, 2026
0141e89
Merge branch 'main' of https://github.com/Genentech/scallops into wor…
joshua-gould Jun 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
249 changes: 140 additions & 109 deletions scallops/cli/extract_crops.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@
import pyarrow.parquet as pq
from array_api_compat import get_namespace
from skimage.util import img_as_ubyte
from zarr import Group

from scallops.cli.features import _read_merged_or_objects, get_labels
from scallops.cli.features import (
_find_labels,
_image_key_without_time_and_selected_time,
_read_merged_or_objects,
)
from scallops.cli.find_objects import get_path
from scallops.cli.util import (
_get_cli_logger,
cli_metadata,
Expand All @@ -27,23 +31,25 @@
logger = _get_cli_logger()


def _norm_block(image: np.ndarray, percentiles) -> np.ndarray:
xp = get_namespace(image)
percentiles = xp.percentile(image, percentiles, axis=(1, 2), keepdims=True)
image = (image - percentiles[0]) / (percentiles[1] - percentiles[0])
image = xp.clip(image, 0, 1)
return image
def _norm_block(intensity_image: np.ndarray, percentiles) -> np.ndarray:
xp = get_namespace(intensity_image)
percentiles = xp.percentile(
intensity_image, percentiles, axis=(1, 2), keepdims=True
)
intensity_image = (intensity_image - percentiles[0]) / (
percentiles[1] - percentiles[0]
)
intensity_image = xp.clip(intensity_image, 0, 1)
return intensity_image


def single_crop(
group: str, # NOT USED
file_list: list[str],
metadata: dict,
labels_group: Group,
label_paths: list[str],
output_dir: str,
output_sep: str,
merge_dir: str,
merge_dir_sep: str,
merge_dirs: list[str],
crop_size: tuple[int, int],
output_format: Literal["tiff", "npy"],
label_name: str,
Expand All @@ -57,111 +63,136 @@ def single_crop(
force: bool,
):
image_key = metadata["id"]

output_dir = f"{output_dir}{output_sep}{label_name}{output_sep}{image_key}"

output_parquet_path = f"{output_dir}.parquet"
if not force and is_parquet_file(output_parquet_path):
logger.info(f"Skipping features for {image_key} {label_name}")
return

output_fs, _ = fsspec.core.url_to_fs(output_dir)
output_fs.makedirs(output_dir, exist_ok=True)
image = _images2fov(file_list, metadata, dask=True).squeeze().data
logger.info(f"{image_key} image shape {image.shape}")
zarr_labels = get_labels(
labels_group=labels_group,
name=image_key,
suffix=label_name, # e.g. nuclei

image_key_without_t, selected_timepoint = _image_key_without_time_and_selected_time(
metadata
)

if zarr_labels is None:
raise ValueError(f"Unable to read {label_name} labels for {image_key}.")
merged_df = _read_merged_or_objects(
merge_dir=merge_dir,
merge_dir_sep=merge_dir_sep,
label_name=label_name,
g, timepoints = _find_labels(
label_paths=label_paths,
image_key=image_key,
label_filter=label_filter,
)
if merged_df is None:
raise ValueError(f"Unable to read merged data for {image_key}.")
n_labels_before_filtering = len(merged_df)
if label_filter is not None:
merged_df = merged_df.query(label_filter)
label_prefix = _label_name_to_prefix[label_name]
area_column = f"{label_prefix}_AreaShape_Area"
merged_df = merged_df.query(f"{area_column}>=2")
n_labels_filtered = n_labels_before_filtering - len(merged_df)
logger.info(
f"Removed {n_labels_filtered:,} out of {n_labels_before_filtering:,} labels for {image_key}."
label_name=label_name,
image_key_without_t=image_key_without_t,
selected_timepoint=selected_timepoint,
)
if len(merged_df) == 0:
raise ValueError(f"No labels found for {image_key}.")
# e.g. CHAMMI-75
if g is None:
logger.info(f"No labels found for {image_key}")
return
labels_array = da.from_array(g[list(g.keys())[0]])
output_fs, _ = fsspec.core.url_to_fs(output_dir)
output_sep = output_fs.sep
for timepoint in timepoints:
features_path = get_path(
output_dir, output_sep, label_name, image_key, timepoint, ".parquet"
)
output_dir = get_path(
output_dir, output_sep, label_name, image_key, timepoint, ""
)

if percentile_normalize is not None:
if local_percentile_normalize:
chunksize = list(image.chunksize)
for i in range(len(chunksize) - 2):
chunksize[i] = -1
if chunks is not None:
chunksize[-2] = chunks
chunksize[-1] = chunks
else:
logger.info(
f"{image_key} chunk size: {chunksize[-2]:,} by {chunksize[-1]:,}"
)
image = image.rechunk(tuple(chunksize))
depth = None
if local_normalize_overlap is not None and local_normalize_overlap > 0:
depth = {
image.ndim - 2: local_normalize_overlap,
image.ndim - 1: local_normalize_overlap,
}
image = da.map_overlap(
_norm_block,
image,
percentiles=percentile_normalize,
depth=depth,
dtype=float,
if not force and is_parquet_file(features_path):
logger.info(
f"Skipping crops for {image_key} {label_name}{' at t=' + timepoint if timepoint is not None else ''}."
)
continue
if timepoint is not None and labels_array.ndim == 3:
timepoint_index = timepoints.index(timepoint)
label_image = labels_array[timepoint_index]
else:
percentiles = da.percentile(
image, percentile_normalize, axis=(1, 2), keepdims=True
)
image = (image - percentiles[0]) / (percentiles[1] - percentiles[0])
image = da.clip(image, 0, 1)
image = da.map_blocks(img_as_ubyte, image)
label_col = "label" if "label" in merged_df.columns else None
merged_df = to_label_crops(
label_image=da.from_zarr(zarr_labels),
intensity_image=image,
df=merged_df,
label_col=label_col,
output_dir=output_dir,
crop_size=crop_size,
output_format=output_format,
centroid_cols=[
f"{label_prefix}_AreaShape_Center_Y",
f"{label_prefix}_AreaShape_Center_X",
],
gaussian_sigma=gaussian_sigma,
)
label_image = labels_array
intensity_image = (
image.sel(t=timepoint)
if timepoint is not None and image.sizes.get("t", 0) > 1
else image
)
merged_df = _read_merged_or_objects(
paths=merge_dirs,
label_name=label_name,
timepoint=timepoint,
image_key=image_key,
image_key_without_t=image_key_without_t,
label_filter=label_filter,
)
if merged_df is None:
raise ValueError(f"Unable to read metadata for {image_key}.")
n_labels_before_filtering = len(merged_df)
if label_filter is not None:
merged_df = merged_df.query(label_filter)
label_prefix = _label_name_to_prefix[label_name]
area_column = f"{label_prefix}_AreaShape_Area"
merged_df = merged_df.query(f"{area_column}>=2")
n_labels_filtered = n_labels_before_filtering - len(merged_df)
logger.info(
f"Removed {n_labels_filtered:,} out of {n_labels_before_filtering:,} labels for {image_key}."
)
if len(merged_df) == 0:
raise ValueError(f"No labels found for {image_key}.")
# e.g. CHAMMI-75

if percentile_normalize is not None:
if local_percentile_normalize:
chunksize = list(intensity_image.chunksize)
for i in range(len(chunksize) - 2):
chunksize[i] = -1
if chunks is not None:
chunksize[-2] = chunks
chunksize[-1] = chunks
else:
logger.info(
f"{image_key} chunk size: {chunksize[-2]:,} by {chunksize[-1]:,}"
)
intensity_image = intensity_image.rechunk(tuple(chunksize))
depth = None
if local_normalize_overlap is not None and local_normalize_overlap > 0:
depth = {
intensity_image.ndim - 2: local_normalize_overlap,
intensity_image.ndim - 1: local_normalize_overlap,
}
intensity_image = da.map_overlap(
_norm_block,
intensity_image,
percentiles=percentile_normalize,
depth=depth,
dtype=float,
)
else:
percentiles = da.percentile(
intensity_image, percentile_normalize, axis=(1, 2), keepdims=True
)
intensity_image = (intensity_image - percentiles[0]) / (
percentiles[1] - percentiles[0]
)
intensity_image = da.clip(intensity_image, 0, 1)
intensity_image = da.map_blocks(img_as_ubyte, intensity_image)
label_col = "label" if "label" in merged_df.columns else None
merged_df = to_label_crops(
label_image=label_image,
intensity_image=intensity_image,
df=merged_df,
label_col=label_col,
output_dir=output_dir,
crop_size=crop_size,
output_format=output_format,
centroid_cols=[
f"{label_prefix}_AreaShape_Center_Y",
f"{label_prefix}_AreaShape_Center_X",
],
gaussian_sigma=gaussian_sigma,
)

output_metadata = cli_metadata() if not no_version else dict()
output_metadata = cli_metadata() if not no_version else dict()

table = pa.Table.from_pandas(merged_df, preserve_index=True)
table = table.replace_schema_metadata(
{
"scallops".encode(): json.dumps(output_metadata).encode(),
**table.schema.metadata,
}
)
table = pa.Table.from_pandas(merged_df, preserve_index=True)
table = table.replace_schema_metadata(
{
"scallops".encode(): json.dumps(output_metadata).encode(),
**table.schema.metadata,
}
)

fs, output_parquet_path = fsspec.url_to_fs(output_parquet_path)
pq.write_table(
table,
output_parquet_path,
filesystem=fs,
)
fs, features_path = fsspec.url_to_fs(features_path)
pq.write_table(
table,
features_path,
filesystem=fs,
)
26 changes: 5 additions & 21 deletions scallops/cli/extract_crops_main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import argparse

import fsspec
import zarr
from dask.bag import from_sequence

from scallops.cli.arg_parser import _sort_groups
Expand Down Expand Up @@ -39,12 +37,13 @@ def run_pipeline_extract_crops(arguments: argparse.Namespace):

image_patterns = arguments.image_pattern
output_dir = arguments.output
merge_dir = arguments.merge
merge_dirs = arguments.merge
subset = arguments.subset
force = arguments.force
groupby = arguments.groupby
crop_size = arguments.crop_size
crop_size = (crop_size, crop_size)
label_paths = arguments.labels
label_filter = arguments.label_filter
percentile_min = arguments.percentile_min

Expand All @@ -67,22 +66,9 @@ def run_pipeline_extract_crops(arguments: argparse.Namespace):
label_name = arguments.label_name # cell, cytosol, nuclei
chunks = arguments.chunks
if dask_server_url is None and arguments.dask_cluster is None:
dask_cluster_parameters = _dask_workers_threads()
dask_cluster_parameters = _dask_workers_threads(threads_per_worker=4)

merge_dir_sep = None
if merge_dir is not None:
merge_dir_sep = fsspec.core.url_to_fs(merge_dir)[0].sep
merge_dir = merge_dir.rstrip(merge_dir_sep)

output_fs, _ = fsspec.core.url_to_fs(output_dir)
output_dir = output_dir.rstrip(output_fs.sep)

labels_path = arguments.labels
no_version = arguments.no_version
assert labels_path is not None, "No labels provided"
label_root = zarr.open(labels_path, mode="r")
labels_group = label_root["labels"]

image_seq = from_sequence(
_set_up_experiment(
images_paths,
Expand All @@ -101,10 +87,8 @@ def run_pipeline_extract_crops(arguments: argparse.Namespace):
image_seq.starmap(
single_crop,
output_dir=output_dir,
output_sep=output_fs.sep,
merge_dir=merge_dir,
merge_dir_sep=merge_dir_sep,
labels_group=labels_group,
merge_dirs=merge_dirs,
label_paths=label_paths,
label_filter=label_filter,
label_name=label_name,
percentile_normalize=percentile_normalize,
Expand Down
Loading
Loading