xenium_processing/xenium_roi_extractor.py at main · scOpenLab/xenium_processing · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
"""
Module for extracting all data for a given list of rectangular ROIs
from Xenium Onboard Analysis output. The ROIs x,y,w,h are provided as part of
filenames in a folder (intended for images, but empty placeholder files can be used).
"""

import json
import os
import re
import argparse
from packaging.version import Version
import pandas as pd
import numpy as np
import tifffile
import h5py
import skimage
import scipy

def get_arguments():
    """
    Builds the command line interface and parses the arguments.

    Exactly three positional arguments are expected, in this order:
    the Xenium folder, the folder with the ROI images and the output folder.

    Returns:
        tuple: (xenium_folder, roi_folder, output_folder) as given on the
        command line.
    """
    # (argument name, help text) pairs, listed in positional order
    positional_help = (
        ("xenium_folder", "path to the xenium folder"),
        ("roi_folder", "path to the folder with the ROI images"),
        ("output_folder", "path to the output folder"),
    )
    cli = argparse.ArgumentParser(
            description = "Split transcripts and images according to the given ROIs")
    for arg_name, help_text in positional_help:
        cli.add_argument(arg_name, help = help_text)
    parsed = cli.parse_args()
    return tuple(getattr(parsed, arg_name) for arg_name, _ in positional_help)

def read_image_v2(xenium_dir):
    """
    Loads the morphology focus image written by
    Xenium Onboard Analysis output v >= 2
    https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-understanding-outputs#images

    Only "morphology_focus/morphology_focus_0000.ome.tif" inside xenium_dir
    is opened; the array returned by tifffile.imread is passed back unchanged.
    """
    focus_path = os.path.join(
        xenium_dir, *("morphology_focus", "morphology_focus_0000.ome.tif"))
    print(f"Reading Morphology Focus Image File: {focus_path}")
    return tifffile.imread(focus_path)

def process_image_v2(mf_image, roi, roi_name,  output_dir):
    """
    Crops and saves the morphology focus images from
    Xenium Onboard Analysis output v >= 2
    https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-understanding-outputs

    Args:
        mf_image: image array; a 3D array is sliced as (channel, y, x),
            anything else is sliced as (y, x).
        roi: sequence (x, y, w, h) in pixels.
        roi_name: prefix used for the output file names.
        output_dir: existing folder the cropped images are written to.

    Writes one LZW-compressed OME-TIFF per channel, named
    "<roi_name>-morphology_focus_NNNN.ome.tif".
    """
    x, y, w, h = roi
    if mf_image.ndim == 3:
        sub_image = mf_image[:, y:y + h, x:x + w]
        for c, channel in enumerate(sub_image):
            # Zero-pad the channel index to four digits so that the suffix
            # keeps the "NNNN" width for channel 10 and above as well.
            outpath = os.path.join(output_dir,
                                   f"{roi_name}-morphology_focus_{c:04d}.ome.tif")
            tifffile.imwrite(outpath, data = channel, ome = True, compression = "lzw")
    else:
        sub_image = mf_image[y:y + h, x:x + w]
        outpath = os.path.join(output_dir, f"{roi_name}-morphology_focus_0000.ome.tif")
        tifffile.imwrite(outpath, data = sub_image, ome = True, compression = "lzw")

def read_image_v1(xenium_dir):
    """
    Loads the morphology mip image written by
    Xenium Onboard Analysis output v < 2
    https://www.10xgenomics.com/support/software/xenium-onboard-analysis/1.9/analysis/xoa-output-understanding-outputs

    Opens "morphology_mip.ome.tif" directly inside xenium_dir and passes back
    the array returned by tifffile.imread unchanged.
    """
    mip_path = os.path.join(xenium_dir, "morphology_mip.ome.tif")
    print(f"Reading Maximum Projection Image File: {mip_path}")
    return tifffile.imread(mip_path)

def process_image_v1(mip_image, roi, roi_name, output_dir):
    """
    Crops and saves the morphology mip image from
    Xenium Onboard Analysis output v < 2
    https://www.10xgenomics.com/support/software/xenium-onboard-analysis/1.9/analysis/xoa-output-understanding-outputs

    Args:
        mip_image: image array whose last two axes are (y, x).
        roi: sequence (x, y, w, h) in pixels.
        roi_name: prefix used for the output file name.
        output_dir: existing folder the cropped image is written to.

    Writes "<roi_name>-morphology_mip.ome.tif" as an LZW-compressed OME-TIFF.
    """
    x, y, w, h = roi
    # The Ellipsis keeps any leading axes, so a plain 2D (y, x) image is
    # cropped as well as a 3D (n, y, x) one; a fixed three-axis index raised
    # IndexError on 2D input.
    # NOTE(review): the mip file is presumably a single 2D plane - confirm.
    sub_image = mip_image[..., y:y + h, x:x + w]
    tifffile.imwrite(os.path.join(output_dir, f"{roi_name}-morphology_mip.ome.tif"),
                     data = sub_image, ome = True, compression = "lzw")

if __name__ == "__main__":
    # Script entry point: reads the Xenium tables, feature matrix and image
    # once, then writes one sub-folder per ROI found in the ROI folder.

    xenium_folder, roi_folder, output_folder = get_arguments()

    # Expects standard xenium filenames:
    # https://www.10xgenomics.com/support/in-situ-gene-expression/documentation/steps/onboard-analysis/at-a-glance-xenium-output-files
    xenium_file_path = os.path.join(xenium_folder, "experiment.xenium")
    # Reads the pixel size in um and the software version
    with open(xenium_file_path, encoding="utf-8") as metadata_file:
        metadata = json.load(metadata_file)
    pixel_size = metadata["pixel_size"]
    version = Version(metadata["instrument_sw_version"])
    # Output before version 2 is read/cropped with the *_v1 image helpers
    is_v1 = version < Version("2")

    transcript_file_path = os.path.join(xenium_folder, "transcripts.parquet")
    cells_file_path = os.path.join(xenium_folder, "cells.parquet")
    cell_boundaries_file_path = os.path.join(xenium_folder, "cell_boundaries.parquet")
    features_file_path = os.path.join(xenium_folder, "cell_feature_matrix.h5")
    print(f"Xenium file: {xenium_file_path}")
    print(f"Transcript file: {transcript_file_path}")
    print(f"Cell file: {cells_file_path}")
    print(f"Cell Boundary file: {cell_boundaries_file_path}")
    print(f"Feature Matrix file: {features_file_path}")

    # Reads the transcripts and adds locations in pixels
    print(f"Reading Transcript file: {transcript_file_path}")
    transcripts = pd.read_parquet(transcript_file_path)
    transcripts["x_location_px"] = transcripts["x_location"] / pixel_size
    transcripts["y_location_px"] = transcripts["y_location"] / pixel_size
    transcripts["z_location_px"] = transcripts["z_location"] / pixel_size

    # Reads the cells and adds centroid locations in pixels
    print(f"Reading Cell file: {cells_file_path}")
    cells = pd.read_parquet(cells_file_path)
    cells["x_centroid_px"] = cells["x_centroid"] / pixel_size
    cells["y_centroid_px"] = cells["y_centroid"] / pixel_size

    # Reads the cell boundaries and adds vertex locations in pixels
    print(f"Reading Cell Boundary file: {cell_boundaries_file_path}")
    cell_boundaries = pd.read_parquet(cell_boundaries_file_path)
    cell_boundaries["vertex_x_px"] = cell_boundaries["vertex_x"] / pixel_size
    cell_boundaries["vertex_y_px"] = cell_boundaries["vertex_y"] / pixel_size

    # Reads ROIs, expects _Xnnn_Ynnnn_Wnnn_Hnnnn_ with n = 0-9 in the file name
    # and makes a list of lists [x, y, w, h]; other files are ignored.
    print(f"Looking for ROIs in: {roi_folder}")
    roi_pattern = re.compile(r".*_X([0-9]+)_Y([0-9]+)_W([0-9]+)_H([0-9]+).*")
    roi_matches = (roi_pattern.match(filename) for filename in os.listdir(roi_folder))
    ROIs = [[int(n) for n in match.groups()] for match in roi_matches if match]
    print(f"Found ROIs: {ROIs}")

    # Read features
    # Features are equivalent to:
    # transcripts.loc[(transcripts["qv"] > 20) & (transcripts["cell_id"] != "UNASSIGNED")]
    print(f"Processing features: {features_file_path}")
    with h5py.File(features_file_path, "r") as f:
        feature_attrs = dict(f.attrs.items())
        matrix_group = f["matrix"]
        features_group = matrix_group["features"]
        feature_h5 = {key: matrix_group[key][:]
                      for key in ("barcodes", "data", "indices", "indptr", "shape")}
        feature_h5["features"] = {
            key: features_group[key][:]
            for key in ("_all_tag_keys", "feature_type", "genome", "id", "name")}
    # One row per barcode (indptr entry), one column per feature name.
    # The shape is given explicitly: when inferred from the largest stored
    # index, trailing features without any count are dropped and the column
    # names no longer fit.
    n_barcodes = len(feature_h5["indptr"]) - 1
    n_features = len(feature_h5["features"]["name"])
    features = scipy.sparse.csr_matrix(
            (feature_h5["data"], feature_h5["indices"], feature_h5["indptr"]),
            shape=(n_barcodes, n_features), dtype=int)
    features = pd.DataFrame(features.toarray(), columns = feature_h5['features']["name"])
    features.columns = features.columns.astype(str)
    # NOTE(review): assumes the matrix rows follow the row order of
    # cells.parquet - confirm against feature_h5["barcodes"].
    features.insert(0, "cell_id", cells["cell_id"])

    # Read the images
    image = read_image_v1(xenium_folder) if is_v1 else read_image_v2(xenium_folder)

    os.makedirs(output_folder, exist_ok = True)

    for r in ROIs:
        roi_x, roi_y, roi_w, roi_h = r
        ROI_STRING = f"X_{roi_x}_Y_{roi_y}_W_{roi_w}_H_{roi_h}"
        print(f"Processing: {ROI_STRING}")

        # Transcripts inside the ROI (bounds inclusive), with ROI-relative
        # pixel coordinates. The copy makes the subset an independent frame,
        # so adding columns does not trigger SettingWithCopy warnings.
        tx_in_roi = ((transcripts["x_location_px"] >= roi_x) &
                     (transcripts["x_location_px"] <= (roi_x + roi_w)) &
                     (transcripts["y_location_px"] >= roi_y) &
                     (transcripts["y_location_px"] <= (roi_y + roi_h)))
        sub_transcripts = transcripts.loc[tx_in_roi].copy()
        sub_transcripts["x_location_px_ROI"] = sub_transcripts["x_location_px"] - roi_x
        sub_transcripts["y_location_px_ROI"] = sub_transcripts["y_location_px"] - roi_y

        # We exclude all cells not fully contained in the ROI:
        # a cell is kept only if every one of its boundary vertices is inside.
        vertex_inside = ((cell_boundaries["vertex_x_px"] >= roi_x) &
                         (cell_boundaries["vertex_x_px"] <= (roi_x + roi_w)) &
                         (cell_boundaries["vertex_y_px"] >= roi_y) &
                         (cell_boundaries["vertex_y_px"] <= (roi_y + roi_h)))
        cell_inside = vertex_inside.groupby(cell_boundaries["cell_id"]).all()
        filter_df = pd.DataFrame(
                {"cell_id": cell_inside.index[cell_inside.to_numpy(dtype=bool)]})

        sub_cell_boundaries = pd.merge(cell_boundaries, filter_df,
                                       on = "cell_id", how = "inner")
        sub_cell_boundaries["vertex_x_px_ROI"] = sub_cell_boundaries["vertex_x_px"] - roi_x
        sub_cell_boundaries["vertex_y_px_ROI"] = sub_cell_boundaries["vertex_y_px"] - roi_y

        sub_cells = pd.merge(cells, filter_df, on = "cell_id", how = "inner")
        sub_cells["x_centroid_px_ROI"] = sub_cells["x_centroid_px"] - roi_x
        sub_cells["y_centroid_px_ROI"] = sub_cells["y_centroid_px"] - roi_y

        # Make a cell mask for the ROI by getting the coordinates of every pixel
        # inside each cell and then drawing them on a mask of the size of the ROI.
        # Cells are numbered 1, 2, ... in groupby order (sorted by cell_id);
        # 0 is reserved for the background so that the first cell is not lost.
        cell_mask = np.zeros((roi_h, roi_w), dtype=np.int32)
        for label, (_, vertices) in enumerate(
                sub_cell_boundaries.groupby("cell_id"), start=1):
            # polygon() is fed x as its first coordinate, so it returns
            # (x indices, y indices), clipped to (width, height).
            xs, ys = skimage.draw.polygon(vertices["vertex_x_px_ROI"].to_numpy(),
                                          vertices["vertex_y_px_ROI"].to_numpy(),
                                          (roi_w, roi_h))
            cell_mask[ys, xs] = label

        # Process features
        sub_features = pd.merge(features, filter_df, on = "cell_id", how = "inner")
        fmatrix = sub_features.drop("cell_id", axis=1)
        fmatrix = scipy.sparse.csr_array(fmatrix)

        # Make transcript table for reading with the Polylux viewer (Resolve Biosciences)
        # astype returns a new frame, coordinates are truncated to int32
        polylux_tx = sub_transcripts[["x_location_px_ROI", "y_location_px_ROI",
                                      "z_location_px", "feature_name", "qv"]].astype(
                {"x_location_px_ROI": np.int32,
                 "y_location_px_ROI": np.int32,
                 "z_location_px": np.int32})

        # Output
        print(f"Writing {ROI_STRING}")
        subfolder = os.path.join(output_folder, ROI_STRING)
        os.makedirs(subfolder, exist_ok = True)

        # Make h5 file
        with h5py.File(os.path.join(subfolder, ROI_STRING + "-cell_feature_matrix.h5"), "w") as f:
            for k, v in feature_attrs.items():
                f.attrs.create(k, v)
            m_to_w = f.create_group("matrix")
            m_to_w.create_dataset("barcodes", data=np.array(sub_features["cell_id"]))
            m_to_w.create_dataset("data", data=fmatrix.data)
            m_to_w.create_dataset("indices", data=fmatrix.indices)
            m_to_w.create_dataset("indptr", data=fmatrix.indptr)
            # fmatrix is (cells, features); the 10x HDF5 layout stores the
            # shape as (features, barcodes), so it is written reversed.
            m_to_w.create_dataset("shape", data=fmatrix.shape[::-1])
            f_to_write = m_to_w.create_group("features")
            for key in ("_all_tag_keys", "feature_type", "genome", "id", "name"):
                f_to_write.create_dataset(key, data=feature_h5["features"][key])

        if is_v1:
            process_image_v1(image, r, ROI_STRING, subfolder)
        else:
            process_image_v2(image, r, ROI_STRING, subfolder)

        # Write other output
        # NOTE: the "transcritps" spelling in the file names is kept on purpose,
        # renaming the outputs would break consumers of earlier runs.
        sub_transcripts.to_parquet(os.path.join(subfolder, ROI_STRING + "-transcritps.parquet"))
        sub_transcripts.to_csv(os.path.join(subfolder, ROI_STRING + "-transcritps.csv"))
        polylux_tx.to_csv(os.path.join(subfolder, ROI_STRING + "-Polylux_transcritps.txt"),
                          header = None, index = False, sep = "\t")
        sub_cells.to_parquet(os.path.join(subfolder, ROI_STRING + "-cells.parquet"))
        sub_cells.to_csv(os.path.join(subfolder, ROI_STRING + "-cells.csv"))
        sub_cell_boundaries.to_parquet(
                os.path.join(subfolder, ROI_STRING + "-cell_boundaries.parquet"))
        sub_cell_boundaries.to_csv(os.path.join(subfolder, ROI_STRING + "-cell_boundaries.csv"))
        sub_features.to_csv(os.path.join(subfolder, ROI_STRING + "-feature_matrix.csv"))
        tifffile.imwrite(os.path.join(subfolder, ROI_STRING + "-xenium_cell_mask.ome.tif"),
                         data = cell_mask, ome = True, compression = "lzw")