From 7eb0b9ad204b1ece5ecfca624961f1066dbb8ac8 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 22 Jun 2026 08:56:43 -0700 Subject: [PATCH] Propagate friction geo-attrs through least_cost_corridor (#3446) The corridor is cd_a + cd_b, and each cost-distance surface carries its source raster's attrs. Source masks usually have no geo-attrs, so the xarray binary sum dropped res/crs/transform/nodatavals even when the friction surface had them. Re-emit friction's attrs and name on the output for the non-precomputed path; leave the precomputed path on its source-derived behaviour. Covers single, threshold, unreachable, and pairwise outputs across all four backends. --- .claude/sweep-metadata-state.csv | 1 + xrspatial/corridor.py | 36 ++++++++- xrspatial/tests/test_corridor.py | 124 +++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 4 deletions(-) diff --git a/.claude/sweep-metadata-state.csv b/.claude/sweep-metadata-state.csv index 4ad93347f..56696a262 100644 --- a/.claude/sweep-metadata-state.csv +++ b/.claude/sweep-metadata-state.csv @@ -1,6 +1,7 @@ module,last_inspected,issue,severity_max,categories_found,notes aspect,2026-05-29,2682,MEDIUM,4;5,"Audited 2026-05-29 (agent-a3b7c82e34312ffcb worktree, branch deep-sweep-metadata-aspect-2026-05-29). CUDA available; all 4 backends (numpy/cupy/dask+numpy/dask+cupy) run live for aspect/northness/eastness across planar and geodesic methods. Cat 1 attrs, Cat 2 coords, Cat 3 dims, and .name all preserved correctly on every backend: the 3 public functions re-emit coords=agg.coords, dims=agg.dims, attrs=agg.attrs at the xr.DataArray constructor. NEW MEDIUM finding #2682 (Cat 4 + Cat 5): the planar dask backends (_run_dask_numpy, _run_dask_cupy) called map_overlap with a default-dtype meta (np.array(()) / cupy.array(())), so the lazy DataArray advertised float64 while the chunk functions _cpu / _run_cupy cast to and return float32. numpy and cupy backends already reported float32, and the geodesic dask paths already passed dtype=np.float32, so only the two planar dask paths were inconsistent: a backend-inconsistent metadata bug where agg.dtype differs by backend and silently flips float64->float32 on .compute(). Fix in PR #2741: pass dtype=np.float32 / dtype=cupy.float32 to the planar dask meta. northness/eastness derive from aspect so they inherit the corrected dtype. 5 new tests (test_dask_numpy_advertised_dtype_matches_computed parametrized over 4 boundary modes, plus test_dask_cupy_advertised_dtype_matches_computed) assert lazy dtype == computed dtype == float32. Full aspect suite 69 passed. slope.py and curvature.py share the same default-dtype meta pattern on their planar dask paths (out of scope for this aspect-only sweep; likely same inconsistency). No CRITICAL/HIGH/LOW findings." contour,2026-05-29,2700,HIGH,1;5,"Audited 2026-05-29 (agent-ab7fff484a8f57de2 worktree, branch deep-sweep-metadata-contour-2026-05-29). CUDA available; cupy and dask+cupy paths exercised live. contours() returns a list of (level, ndarray) tuples or a GeoDataFrame, not a DataArray, so Cat 2/3 DataArray checks reinterpreted as coordinate-transform + CRS propagation. Coordinate transform (np.interp over input dims, descending y respected) is correct and identical across all 4 backends (tracing is host-side via _contours_numpy). Cat 4 N/A: library convention is NaN-as-nodata; slope/aspect/curvature/focal do not read attrs['nodatavals'] either, so contour not reading it is consistent, not a bug. NEW HIGH finding #2700 (Cat 1/Cat 5): contours(return_type='geopandas') crashed with 'Assigning CRS to a GeoDataFrame without a geometry column is not supported' whenever the input had attrs['crs'] but the result was empty (flat raster, levels outside data range) because _to_geopandas built gpd.GeoDataFrame([], crs=crs) with no geometry column; separately the all-NaN early-return passed crs=None and silently dropped the CRS. Fix (PR #2708): _to_geopandas builds an empty frame with an explicit geometry column so the CRS attaches; all-NaN early-return forwards agg.attrs['crs']. Both empty paths now return a well-formed empty GeoDataFrame carrying the CRS. 4 new tests in TestGeoDataFrame cover populated-CRS, empty-with-CRS, all-NaN-with-CRS, and empty-without-CRS. Full contour suite 28 passed. numpy-return path emits no DataArray attrs by design (list of tuples)." +corridor,2026-06-22,3446,HIGH,1;5,"Audited 2026-06-22 (agent-a8b2674b815bdfa3f worktree, branch deep-sweep-metadata-corridor-2026-06-22). CUDA available; all 4 backends (numpy/cupy/dask+numpy/dask+cupy) run live end-to-end for least_cost_corridor across single/threshold/relative/unreachable/pairwise paths. Cat 2 coords (x/y values + float64 dtype) and Cat 3 dims (y,x) preserved on every backend: they flow through cost_distance (coords=raster.coords, dims=raster.dims) and survive xarray's binary intersection. NEW HIGH finding #3446 (Cat 1 + Cat 5): the corridor is cd_a + cd_b where each cost-distance surface carries its SOURCE raster's attrs (cost_distance copies attrs from the source, not friction). xarray's default keep_attrs on binary + keeps only attrs present-and-equal in both operands, so when the source masks are plain marker rasters with no geo-attrs (the common case) the corridor came back with attrs=={} even though the friction surface that defines the grid had res/crs/transform/nodatavals; a downstream slope/clip on the corridor silently lost cellsize/CRS. Secondary Cat 5: .name was None whenever the two sources had different names (cost_distance renames each surface to its source .name; summing differently-named arrays drops the name). Fix (PR on this branch): non-precomputed path re-emits friction.attrs + friction.name on every output via new _apply_geo_metadata helper (single, threshold, all-NaN-unreachable, and pairwise-Dataset paths); precomputed path left on the existing source-derived behaviour since there is no friction to draw from. Only .attrs/.name set -- data values, coords, dims, dtype untouched, dask stays lazy (no compute). 10 new tests (test_corridor_inherits_friction_geo_attrs x4 backends, test_corridor_threshold_keeps_geo_attrs x4 backends, test_corridor_unreachable_keeps_geo_attrs, test_pairwise_inherits_friction_geo_attrs, test_precomputed_keeps_source_attrs_not_friction). Full corridor suite 43 passed. Cat 4 N/A: NaN-as-nodata is the library convention; corridor never reads attrs['nodatavals'] for masking. No CRITICAL/MEDIUM/LOW findings." cost_distance,2026-06-15,3344,MEDIUM,5,"Audited 2026-06-15 (agent-ad0b84e7f7b212360 worktree, branch deep-sweep-metadata-cost_distance-2026-06-15). CUDA available; all 4 backends (numpy/cupy/dask+numpy/dask+cupy) run live end-to-end with a rich attrs set (res/crs/transform/nodatavals/_FillValue/units). Cat 1 attrs, Cat 2 coords (values + float64 dtype), and Cat 3 dims (y,x) all preserved and identical across the 4 backends -- public cost_distance() wraps with xr.DataArray(coords=raster.coords, dims=raster.dims, attrs=raster.attrs). NEW MEDIUM finding #3344 (Cat 5): the dask+numpy and dask+cupy backends leaked the internal dask graph name (_trim- from map_overlap, asarray- from the dask+cupy convert-back path) into result.name while numpy/cupy returned None; .name was a nondeterministic per-run token that breaks .to_dataset() variable keys and any name-keyed pipeline. Same .name-leak class as proximity #2723 and zonal #2611. Fix (PR #3349 on this branch): return result.rename(raster.name) -- a constructor name= kwarg does not override a named dask array, and name=None is treated as infer-from-data, so .rename() is required. supports_dataset path unaffected (keys by var_name, verified live). New parametrized regression test test_result_name_matches_input over 4 backends x {None, named}; full cost_distance suite 63 passed (post-merge with origin/main). LOW (documented, not fixed): output float32 uses NaN as the unreachable sentinel but input nodatavals/_FillValue (e.g. -9999) are carried through verbatim, so a downstream reader masks a value that never appears -- this is the library-wide attrs=raster.attrs convention shared by proximity/slope/aspect/focal, not a cost_distance-specific bug, so fixing it in isolation would diverge this module from every peer. No CRITICAL/HIGH findings." focal,2026-06-10,3217,MEDIUM,4;5,"Re-audited 2026-06-10 (agent-ad0d55a894c6abc60 worktree, branch deep-sweep-metadata-focal-2026-06-10). CUDA available; all 4 backends (numpy/cupy/dask+numpy/dask+cupy) run live for mean, apply, focal_stats, hotspots. Cats 1-3 clean: attrs (res/crs/nodatavals/_FillValue/unit), coords (values, dtype, coord attrs), dims, .name, 3D per-band path, and hotspots unit=% all preserved and identical across the 4 backends. NEW MEDIUM finding #3217 (Cat 4 + Cat 5): (a) mean() hardcoded float32 on the GPU paths (_mean_cupy cupy.asarray(dtype=float32), _mean_dask_cupy astype(float32)) while numpy/dask+numpy returned float64 (mean() casts astype(float) before dispatch), so float64 input silently lost precision on cupy/dask+cupy; dask+cupy also advertised float64 (untyped meta) but computed float32. (b) apply()/focal_stats() dask paths passed untyped meta (np.array(()) / cupy.array(())) to map_overlap, so for float32/int input the lazy DataArray advertised float64 but computed the promoted float32 (#2805 typed the chunk fns but not the meta). Same class as aspect #2682 and proximity #2723. Fix: the mean() GPU dtype half landed on main first via duplicate issue #3214/PR #3221 (_promote_float contract: float dtypes preserved, ints->float32, GPU bit-exact vs CPU in float64); PR #3226 (branch deep-sweep-metadata-focal-2026-06-10-01) types every map_overlap meta with data.dtype and aligns tests to the _promote_float contract; 25 new parametrized regression tests (4 backends x 3 dtypes mean; dask backends x 3 dtypes apply/focal_stats; exact CPU/GPU parity). Full focal suite 258 passed. No other CRITICAL/HIGH/MEDIUM/LOW findings." geotiff,2026-06-09,3116,HIGH,2;3,"Re-audited 2026-06-09 (agent-ae89ff94a64e3ee8f worktree, branch deep-sweep-metadata-geotiff-2026-06-09). CUDA available; all 4 backends (numpy/cupy/dask+numpy/dask+cupy) run live. Focus: surfaces changed since the 2026-05-18 audit (unpack rename + GPU/dask+GPU support #3075, pack=True #3065/#3079, masked int->float promotion #2994, bbox= reads, rioxarray param alignment #2963, no-georef VRT coord synthesis #2824, GeoTransform omission #2971). Live probes: unpack attrs (scale_factor/add_offset/mask_and_scale_dtype/nodata/masked_nodata), masked=True promotion, default masked=False, bbox window+transform shift, multi-band band=N, dims/name/coords (incl. coord dtype) all identical across the 4 backends; nodata_pixels_present absent on dask paths is the documented lazy contract, not a bug. pack->unpack round trips verified on numpy/dask/gpu-write; pack of a cupy-backed read raises via the known cupy+xarray xp.astype incompat (see memory cupy_where_astype_incompat; dependency-pin fix, raises loudly, not a metadata bug). VRT reads (full/masked/window/bbox) and no-georef TIFF reads agree across the 4 backends. NEW HIGH finding #3116 (Cat 2+3): to_geotiff(non_georef_da, out.vrt, tile_size=N) wrote a corrupt index for arrays spanning >1 tile -- write_vrt derives placement from each source GeoTransform and non-georef tiles all carry the identity transform, so rasterX/YSize collapsed to one tile and every DstRect landed at the origin; reads silently returned a single tile (24x32 in -> 16x16 out). Gap left by #2966/#2971 (tests only covered one non-georef source). Fix: _write_vrt_tiled threads per-tile pixel offsets through _build_vrt -> write_vrt via internal dst_offsets kwarg; write_vrt refuses >1 all-non-georef sources without explicit placement and rejects dst_offsets alongside georeferenced sources. 18 new tests in tests/vrt/test_non_georef_placement_3116.py incl. 4-backend round trip, dask-backed and plain-ndarray writes, XML DstRect assertions, georef placement regression, and the write_vrt error contract. Full vrt suite 520 passed; write+round-trip suites 1292 passed." diff --git a/xrspatial/corridor.py b/xrspatial/corridor.py index 83bc16acf..8d34ef517 100644 --- a/xrspatial/corridor.py +++ b/xrspatial/corridor.py @@ -38,11 +38,33 @@ def _scalar_to_float(da_scalar): return float(data) +def _apply_geo_metadata( + result: xr.DataArray, + geo_source: xr.DataArray | None, +) -> xr.DataArray: + """Re-emit the grid-defining raster's attrs/name on the corridor output. + + The corridor is ``cd_a + cd_b``, and each cost-distance surface carries + the attrs of its *source* raster (see ``cost_distance``). Source rasters + are usually plain marker masks with no geo-attrs, so the xarray binary + sum drops ``res``/``crs``/``transform``/``nodatavals`` even when the + friction surface that defines the grid had them. When a grid-defining + raster is available (the non-precomputed case, where *friction* is it), + copy its attrs and name back onto the result. + """ + if geo_source is None: + return result + result = result.copy() + result.attrs = dict(geo_source.attrs) + return result.rename(geo_source.name) + + def _compute_corridor( cd_a: xr.DataArray, cd_b: xr.DataArray, threshold: float | None, relative: bool, + geo_source: xr.DataArray | None = None, ) -> xr.DataArray: """Sum two cost-distance surfaces, normalize, and optionally threshold.""" corridor = cd_a + cd_b @@ -50,7 +72,7 @@ def _compute_corridor( if not np.isfinite(corridor_min): # Sources are mutually unreachable -- return all-NaN. - return corridor * np.nan + return _apply_geo_metadata(corridor * np.nan, geo_source) normalized = corridor - corridor_min @@ -74,7 +96,7 @@ def _compute_corridor( data = np.where(data <= cutoff, data, np.nan) normalized = normalized.copy(data=data) - return normalized + return _apply_geo_metadata(normalized, geo_source) def least_cost_corridor( @@ -185,6 +207,9 @@ def least_cost_corridor( # ------------------------------------------------------------------ if precomputed: cd_surfaces = list(sources) + # No friction surface to draw grid metadata from -- keep the + # existing source-derived attrs/name behaviour. + geo_source = None else: cd_surfaces = [ cost_distance( @@ -193,13 +218,16 @@ def least_cost_corridor( ) for src in sources ] + # The friction surface defines the grid (res/crs/transform), so its + # attrs and name should ride through to the corridor output. + geo_source = friction # ------------------------------------------------------------------ # Two-source case: return a single DataArray # ------------------------------------------------------------------ if len(cd_surfaces) == 2 and not pairwise: return _compute_corridor( - cd_surfaces[0], cd_surfaces[1], threshold, relative + cd_surfaces[0], cd_surfaces[1], threshold, relative, geo_source ) # ------------------------------------------------------------------ @@ -209,7 +237,7 @@ def least_cost_corridor( for i, j in itertools.combinations(range(len(cd_surfaces)), 2): name = f"corridor_{i}_{j}" corridors[name] = _compute_corridor( - cd_surfaces[i], cd_surfaces[j], threshold, relative + cd_surfaces[i], cd_surfaces[j], threshold, relative, geo_source ) return xr.Dataset(corridors) diff --git a/xrspatial/tests/test_corridor.py b/xrspatial/tests/test_corridor.py index 056479018..23eb2a326 100644 --- a/xrspatial/tests/test_corridor.py +++ b/xrspatial/tests/test_corridor.py @@ -374,3 +374,127 @@ def test_single_source_in_list_raises(): src = _make_raster(np.ones((3, 3))) with pytest.raises(ValueError, match="at least 2"): least_cost_corridor(friction, sources=[src]) + + +# ----------------------------------------------------------------------- +# Metadata propagation (issue #3446) +# ----------------------------------------------------------------------- + + +_GEO_ATTRS = { + "res": (1.0, 1.0), + "crs": "EPSG:4326", + "transform": (1.0, 0.0, 0.0, 0.0, -1.0, 0.0), + "nodatavals": (-9999.0,), +} + + +def _make_geo_raster(data, attrs, name, backend="numpy"): + """Build a raster with explicit attrs/name on top of ``_make_raster``.""" + raster = _make_raster(data, backend=backend, chunks=data.shape) + raster.attrs = dict(attrs) + raster.name = name + return raster + + +@pytest.mark.parametrize( + "backend", ["numpy", "dask+numpy", "cupy", "dask+cupy"] +) +def test_corridor_inherits_friction_geo_attrs(backend): + """Corridor carries friction's geo-attrs/name even when sources have none.""" + n = 7 + friction = _make_geo_raster( + np.ones((n, n)), _GEO_ATTRS, "friction", backend=backend + ) + sa_d = np.zeros((n, n)) + sa_d[3, 0] = 1.0 + sb_d = np.zeros((n, n)) + sb_d[3, 6] = 1.0 + # Source masks deliberately carry no attrs and no name. + sa = _make_geo_raster(sa_d, {}, None, backend=backend) + sb = _make_geo_raster(sb_d, {}, None, backend=backend) + + result = least_cost_corridor(friction, sa, sb) + + assert dict(result.attrs) == _GEO_ATTRS + assert result.name == "friction" + assert result.dims == ("y", "x") + assert list(result.coords) == ["y", "x"] + + +@pytest.mark.parametrize( + "backend", ["numpy", "dask+numpy", "cupy", "dask+cupy"] +) +def test_corridor_threshold_keeps_geo_attrs(backend): + """Thresholded corridor still carries friction's geo-attrs.""" + n = 7 + friction = _make_geo_raster( + np.ones((n, n)), _GEO_ATTRS, "friction", backend=backend + ) + sa_d = np.zeros((n, n)) + sa_d[3, 0] = 1.0 + sb_d = np.zeros((n, n)) + sb_d[3, 6] = 1.0 + sa = _make_geo_raster(sa_d, {}, None, backend=backend) + sb = _make_geo_raster(sb_d, {}, None, backend=backend) + + result = least_cost_corridor(friction, sa, sb, threshold=0.5) + + assert dict(result.attrs) == _GEO_ATTRS + assert result.name == "friction" + + +def test_corridor_unreachable_keeps_geo_attrs(): + """All-NaN corridor (unreachable sources) still carries geo-attrs.""" + n = 5 + fr_d = np.ones((n, n)) + fr_d[:, 2] = np.nan # impenetrable wall + friction = _make_geo_raster(fr_d, _GEO_ATTRS, "friction") + sa_d = np.zeros((n, n)) + sa_d[2, 0] = 1.0 + sb_d = np.zeros((n, n)) + sb_d[2, 4] = 1.0 + sa = _make_geo_raster(sa_d, {}, None) + sb = _make_geo_raster(sb_d, {}, None) + + result = least_cost_corridor(friction, sa, sb) + + assert np.all(np.isnan(_compute(result))) + assert dict(result.attrs) == _GEO_ATTRS + + +def test_pairwise_inherits_friction_geo_attrs(): + """Every variable in a pairwise Dataset carries friction's geo-attrs.""" + n = 7 + friction = _make_geo_raster(np.ones((n, n)), _GEO_ATTRS, "friction") + sources = [] + for r, c in [(0, 0), (0, 6), (6, 3)]: + s = np.zeros((n, n)) + s[r, c] = 1.0 + sources.append(_make_geo_raster(s, {}, None)) + + result = least_cost_corridor(friction, sources=sources, pairwise=True) + + assert isinstance(result, xr.Dataset) + for name in result.data_vars: + assert dict(result[name].attrs) == _GEO_ATTRS + + +def test_precomputed_keeps_source_attrs_not_friction(): + """Precomputed path leaves the source-derived attrs alone (no friction).""" + n = 7 + friction = _make_geo_raster(np.ones((n, n)), _GEO_ATTRS, "friction") + sa_d = np.zeros((n, n)) + sa_d[3, 0] = 1.0 + sb_d = np.zeros((n, n)) + sb_d[3, 6] = 1.0 + src_attrs = {"res": (1.0, 1.0), "crs": "EPSG:3857"} + cd_a = _make_geo_raster(sa_d, src_attrs, "cd") + cd_b = _make_geo_raster(sb_d, src_attrs, "cd") + + result = least_cost_corridor(friction, cd_a, cd_b, precomputed=True) + + # Friction's attrs must NOT leak into a precomputed corridor; the + # matching source attrs survive xarray's binary-op intersection. + assert dict(result.attrs) == src_attrs + assert "EPSG:4326" not in result.attrs.get("crs", "")