diff --git a/.github/workflows/test-compile.yml b/.github/workflows/test-compile.yml deleted file mode 100644 index 9f48bb2..0000000 --- a/.github/workflows/test-compile.yml +++ /dev/null @@ -1,187 +0,0 @@ -name: CI - -on: - push: - branches: - - compile - -permissions: - contents: read - -jobs: - linux: - runs-on: ${{ matrix.platform.runner }} - strategy: - fail-fast: false - matrix: - platform: - - runner: ubuntu-22.04 - target: x86_64 - - runner: ubuntu-22.04 - target: aarch64 - - runner: ubuntu-22.04 - target: armv7 - - runner: ubuntu-22.04 - target: ppc64le - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: 3.x - - name: Store RUSTFLAGS in ENV - id: features - run: | - FEATURES=+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt,+cmpxchg16b,+avx,+avx2,+fma,+bmi1,+bmi2,+lzcnt,+pclmulqdq,+movbe - echo "features=$FEATURES" >> $GITHUB_OUTPUT - - name: Set RUSTFLAGS for x86_64 - if: matrix.platform.target == 'x86_64' - env: - FEATURES: ${{ steps.features.outputs.features }} - run: | - echo "RUSTFLAGS=-C target-feature=$FEATURES" >> $GITHUB_ENV - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - target: ${{ matrix.platform.target }} - args: --profile dist-release --out dist --find-interpreter - sccache: "true" - manylinux: manylinux_2_28 - - name: Upload wheels - uses: actions/upload-artifact@v4 - with: - name: wheels-linux-${{ matrix.platform.target }} - path: dist - - musllinux: - runs-on: ${{ matrix.platform.runner }} - strategy: - fail-fast: false - matrix: - platform: - - runner: ubuntu-22.04 - target: x86_64 - - runner: ubuntu-22.04 - target: aarch64 - - runner: ubuntu-22.04 - target: armv7 - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: 3.x - - name: Store RUSTFLAGS in ENV - id: features - run: | - FEATURES=+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt,+cmpxchg16b,+avx,+avx2,+fma,+bmi1,+bmi2,+lzcnt,+pclmulqdq,+movbe - echo "features=$FEATURES" >> $GITHUB_OUTPUT - - name: Set RUSTFLAGS for x86_64 - if: matrix.platform.target == 'x86_64' - env: - FEATURES: ${{ steps.features.outputs.features }} - run: | - echo "RUSTFLAGS=-C target-feature=$FEATURES" >> $GITHUB_ENV - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - target: ${{ matrix.platform.target }} - args: --profile dist-release --out dist --find-interpreter - sccache: "true" - manylinux: manylinux_2_28 - - name: Upload wheels - uses: actions/upload-artifact@v4 - with: - name: wheels-musllinux-${{ matrix.platform.target }} - path: dist - - windows: - runs-on: ${{ matrix.platform.runner }} - strategy: - fail-fast: false - matrix: - platform: - - runner: windows-latest - target: x64 - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: 3.x - architecture: ${{ matrix.platform.target }} - - name: Store RUSTFLAGS in ENV - id: features - shell: bash - run: | - FEATURES=+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt,+cmpxchg16b,+avx,+avx2,+fma,+bmi1,+bmi2,+lzcnt,+pclmulqdq,+movbe - echo "features=$FEATURES" >> $GITHUB_OUTPUT - - name: Set RUSTFLAGS for x64 - if: matrix.platform.target == 'x64' - env: - FEATURES: ${{ steps.features.outputs.features }} - run: | - echo "RUSTFLAGS=-C target-feature=$FEATURES" >> $GITHUB_ENV - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - target: ${{ matrix.platform.target }} - args: --profile dist-release --out dist --find-interpreter - sccache: "true" - - name: Upload wheels - uses: actions/upload-artifact@v4 - with: - name: wheels-windows-${{ matrix.platform.target }} - path: dist - - - macos: - runs-on: ${{ matrix.platform.runner }} - strategy: - fail-fast: false - matrix: - platform: - - runner: macos-13 - target: x86_64 - - runner: macos-14 - target: aarch64 - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: 3.x - - name: Store RUSTFLAGS in ENV - id: features - run: | - FEATURES=+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt,+cmpxchg16b,+avx,+avx2,+fma,+bmi1,+bmi2,+lzcnt,+pclmulqdq,+movbe - echo "features=$FEATURES" >> $GITHUB_OUTPUT - - name: Set RUSTFLAGS for x86_64 - if: matrix.platform.target == 'x86_64' - env: - FEATURES: ${{ steps.features.outputs.features }} - run: | - echo "RUSTFLAGS=-C target-feature=$FEATURES" >> $GITHUB_ENV - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - target: ${{ matrix.platform.target }} - args: --profile dist-release --out dist --find-interpreter - sccache: "true" - - name: Upload wheels - uses: actions/upload-artifact@v4 - with: - name: wheels-macos-${{ matrix.platform.target }} - path: dist - - sdist: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Build sdist - uses: PyO3/maturin-action@v1 - with: - command: sdist - args: --out dist - - name: Upload sdist - uses: actions/upload-artifact@v4 - with: - name: wheels-sdist - path: dist - diff --git a/.gitignore b/.gitignore index 5b68623..05d6926 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ target/ # Virtual environment .venv .env +uv.lock # Shared object **/*.so @@ -23,10 +24,17 @@ target/ # Benchmarks .benchmarks -# other stuff +# Other stuff README_files .ruff_cache benchmarks/data .git -uv.lock -**/*.xml +**/*.aux.xml + +# Temporary maturin development +*.pyd + +# Jupyter Notebooks +.ipynb_checkpoints +*/.ipynb_checkpoints/* +dev_notebooks/ diff --git a/Cargo.lock b/Cargo.lock index d362da4..4994753 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -23,6 +23,12 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -34,9 +40,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.101" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" [[package]] name = "approx" @@ -62,6 +68,7 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70f13d10a41ac8d2ec79ee34178d61e6f47a29c2edfe7ef1721c7383b0359e65" dependencies = [ + "half", "num-traits", ] @@ -130,9 +137,9 @@ dependencies = [ [[package]] name = "atoi_simd" -version = "0.16.1" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2a49e05797ca52e312a0c658938b7d00693ef037799ef7187678f212d7684cf" +checksum = "8ad17c7c205c2c28b527b9845eeb91cf1b4d008b438f98ce0e628227a822758e" dependencies = [ "debug_unsafe", ] @@ -215,9 +222,9 @@ checksum = "36f64beae40a84da1b4b26ff2761a5b895c12adc41dc25aaee1c4f2bbfe97a6e" [[package]] name = "bumpalo" -version = "3.19.1" +version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" [[package]] name = "bytemuck" @@ -289,14 +296,15 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.43" +version = "0.4.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" +checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d" dependencies = [ + "android-tzdata", "iana-time-zone", "num-traits", "serde", - "windows-link", + "windows-link 0.1.3", ] [[package]] @@ -450,6 +458,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + [[package]] name = "crypto-common" version = "0.1.7" @@ -613,6 +627,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -844,6 +864,20 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "bytemuck", + "cfg-if", + "crunchy", + "num-traits", + "serde", + "zerocopy", +] + [[package]] name = "hash32" version = "0.3.1" @@ -861,9 +895,7 @@ checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ "allocator-api2", "equivalent", - "foldhash", - "rayon", - "serde", + "foldhash 0.1.5", ] [[package]] @@ -871,6 +903,14 @@ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", + "rayon", + "serde", + "serde_core", +] [[package]] name = "heapless" @@ -1259,9 +1299,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.85" +version = "0.3.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" +checksum = "14dc6f6450b3f6d4ed5b16327f38fed626d375a886159ca555bd7822c0c3a5a6" dependencies = [ "once_cell", "wasm-bindgen", @@ -1297,9 +1337,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.11.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "litemap" @@ -1419,9 +1459,9 @@ dependencies = [ [[package]] name = "ndarray" -version = "0.16.1" +version = "0.17.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" +checksum = "520080814a7a6b4a6e9070823bb24b4531daac8c4627e08ba5de8c5ef2f2752d" dependencies = [ "matrixmultiply", "num-complex", @@ -1451,6 +1491,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "num-integer" version = "0.1.46" @@ -1494,9 +1545,9 @@ dependencies = [ [[package]] name = "numpy" -version = "0.25.0" +version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29f1dee9aa8d3f6f8e8b9af3803006101bb3653866ef056d530d53ae68587191" +checksum = "7aac2e6a6e4468ffa092ad43c39b81c79196c2bb773b8db4085f695efe3bba17" dependencies = [ "libc", "ndarray", @@ -1519,9 +1570,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.12.5" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" +checksum = "c2858065e55c148d294a9f3aae3b0fa9458edadb41a108397094566f4e3c0dfb" dependencies = [ "async-trait", "base64", @@ -1590,7 +1641,7 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-link", + "windows-link 0.2.1", ] [[package]] @@ -1647,15 +1698,18 @@ dependencies = [ [[package]] name = "polars" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5f7feb5d56b954e691dff22a8b2d78d77433dcc93c35fe21c3777fdc121b697" +checksum = "899852b723e563dc3cbdc7ea833b14ec44e61309f55df29ba86d45cfd6bc141a" dependencies = [ "getrandom 0.2.17", "getrandom 0.3.4", "polars-arrow", + "polars-buffer", + "polars-compute", "polars-core", "polars-error", + "polars-expr", "polars-io", "polars-lazy", "polars-ops", @@ -1668,13 +1722,14 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b4fed2343961b3eea3db2cee165540c3e1ad9d5782350cc55a9e76cf440148" +checksum = "6f672743a042b72ace4f88b29f8205ab200b29c5ac976c0560899680c07d2d09" dependencies = [ "atoi_simd", "bitflags", "bytemuck", + "bytes", "chrono", "chrono-tz", "dyn-clone", @@ -1682,11 +1737,13 @@ dependencies = [ "ethnum", "getrandom 0.2.17", "getrandom 0.3.4", - "hashbrown 0.15.5", + "half", + "hashbrown 0.16.1", "itoa", "lz4", "num-traits", "polars-arrow-format", + "polars-buffer", "polars-error", "polars-schema", "polars-utils", @@ -1708,37 +1765,50 @@ dependencies = [ "serde", ] +[[package]] +name = "polars-buffer" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d7011424c3a79ca9c1272c7b4f5fe98695d3bed45595e37bb23c16a2978c80c" +dependencies = [ + "bytemuck", + "either", + "serde", + "version_check", +] + [[package]] name = "polars-compute" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "138785beda4e4a90a025219f09d0d15a671b2be9091513ede58e05db6ad4413f" +checksum = "42a32eca8e08ac4cc5de2ac3996d2b38567bba72cdb19bbfd94c370193ed51dd" dependencies = [ "atoi_simd", "bytemuck", "chrono", "either", "fast-float2", - "hashbrown 0.15.5", + "half", + "hashbrown 0.16.1", "itoa", "num-traits", "polars-arrow", + "polars-buffer", "polars-error", "polars-utils", "rand", - "ryu", "serde", - "skiplist", "strength_reduce", "strum_macros", "version_check", + "zmij", ] [[package]] name = "polars-core" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e77b1f08ef6dbb032bb1d0d3365464be950df9905f6827a95b24c4ca5518901d" +checksum = "726296966d04268ee9679c2062af2d06c83c7a87379be471defe616b244c5029" dependencies = [ "bitflags", "boxcar", @@ -1747,11 +1817,13 @@ dependencies = [ "chrono-tz", "comfy-table", "either", - "hashbrown 0.15.5", + "getrandom 0.3.4", + "hashbrown 0.16.1", "indexmap", "itoa", "num-traits", "polars-arrow", + "polars-buffer", "polars-compute", "polars-dtype", "polars-error", @@ -1772,12 +1844,12 @@ dependencies = [ [[package]] name = "polars-dtype" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89c43d0ea57168be4546c4d8064479ed8b29a9c79c31a0c7c367ee734b9b7158" +checksum = "51976dc46d42cd1e7ca252a9e3bdc90c63b0bfa7030047ebaf5250c2b7838fa6" dependencies = [ "boxcar", - "hashbrown 0.15.5", + "hashbrown 0.16.1", "polars-arrow", "polars-error", "polars-utils", @@ -1787,9 +1859,9 @@ dependencies = [ [[package]] name = "polars-error" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9cb5d98f59f8b94673ee391840440ad9f0d2170afced95fc98aa86f895563c0" +checksum = "8c13126f8baebc13dadf26a80dcf69a607977fc8a67b18671ad2cefc713a7bdd" dependencies = [ "object_store", "parking_lot", @@ -1802,14 +1874,15 @@ dependencies = [ [[package]] name = "polars-expr" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343931b818cf136349135ba11dbc18c27683b52c3477b1ba8ca606cf5ab1965c" +checksum = "2151f54b0ae5d6b86c3c47df0898ff90edfe774807823f742f36e44973d51ea1" dependencies = [ "bitflags", - "hashbrown 0.15.5", + "hashbrown 0.16.1", "num-traits", "polars-arrow", + "polars-buffer", "polars-compute", "polars-core", "polars-io", @@ -1821,13 +1894,25 @@ dependencies = [ "rand", "rayon", "recursive", + "regex", + "version_check", +] + +[[package]] +name = "polars-ffi" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9526b18335cfddc556eb5c34cdecba3ecf49bba7734470a82728569d44e72a0" +dependencies = [ + "polars-arrow", + "polars-core", ] [[package]] name = "polars-io" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10388c64b8155122488229a881d1c6f4fdc393bc988e764ab51b182fcb2307e4" +checksum = "059724d7762d7332cbc225e6504d996091b28fa1337716e06e5a81d9e54a34ad" dependencies = [ "async-trait", "atoi_simd", @@ -1838,7 +1923,7 @@ dependencies = [ "fs4", "futures", "glob", - "hashbrown 0.15.5", + "hashbrown 0.16.1", "home", "itoa", "memchr", @@ -1847,6 +1932,8 @@ dependencies = [ "object_store", "percent-encoding", "polars-arrow", + "polars-buffer", + "polars-compute", "polars-core", "polars-error", "polars-parquet", @@ -1856,26 +1943,25 @@ dependencies = [ "rayon", "regex", "reqwest", - "ryu", "serde", "serde_json", "simdutf8", "tokio", - "tokio-util", - "url", + "zmij", ] [[package]] name = "polars-lazy" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fb6e2c6c2fa4ea0c660df1c06cf56960c81e7c2683877995bae3d4e3d408147" +checksum = "02e1e24d4db8c349e9576564cfff47a3f08bb831dba9168f6599be178bc725e8" dependencies = [ "bitflags", "chrono", "either", "memchr", "polars-arrow", + "polars-buffer", "polars-compute", "polars-core", "polars-expr", @@ -1892,9 +1978,9 @@ dependencies = [ [[package]] name = "polars-mem-engine" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20a856e98e253587c28d8132a5e7e5a75cb2c44731ca090f1481d45f1d123771" +checksum = "c394e4cd90186043d4051ce118e90794afbe81ac5eb9a51e358a56728e8ebde3" dependencies = [ "memmap2", "polars-arrow", @@ -1912,9 +1998,9 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acf6062173fdc9ba05775548beb66e76643a148d9aeadc9984ed712bc4babd76" +checksum = "7e47b2d9b3627662650da0a8c76ce5101ed1c61b104cb2b3663e0dc711571b12" dependencies = [ "argminmax", "base64", @@ -1922,13 +2008,14 @@ dependencies = [ "chrono", "chrono-tz", "either", - "hashbrown 0.15.5", + "hashbrown 0.16.1", "hex", "indexmap", "libm", "memchr", "num-traits", "polars-arrow", + "polars-buffer", "polars-compute", "polars-core", "polars-error", @@ -1945,22 +2032,24 @@ dependencies = [ [[package]] name = "polars-parquet" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc1d769180dec070df0dc4b89299b364bf2cfe32b218ecc4ddd8f1a49ae60669" +checksum = "436bae3e89438cafe69400e7567057d7d9820d21ac9a4f69a33b413f2666f03d" dependencies = [ "async-stream", "base64", "bytemuck", "ethnum", "futures", - "hashbrown 0.15.5", + "hashbrown 0.16.1", "num-traits", "polars-arrow", + "polars-buffer", "polars-compute", "polars-error", "polars-parquet-format", "polars-utils", + "regex", "serde", "simdutf8", "streaming-decompression", @@ -1978,21 +2067,24 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd3a2e33ae4484fe407ab2d2ba5684f0889d1ccf3ad6b844103c03638e6d0a0" +checksum = "f7930d5ae1d006179e65f01af57c859307b5875a4cc078dc75257250b9ae5162" dependencies = [ "bitflags", + "blake3", "bytemuck", "bytes", "chrono", "chrono-tz", "either", - "hashbrown 0.15.5", + "futures", + "hashbrown 0.16.1", "memmap2", "num-traits", "percent-encoding", "polars-arrow", + "polars-buffer", "polars-compute", "polars-core", "polars-error", @@ -2004,19 +2096,22 @@ dependencies = [ "recursive", "regex", "sha2", + "slotmap", "strum_macros", + "tokio", "version_check", ] [[package]] name = "polars-row" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18734f17e0e348724df3ae65f3ee744c681117c04b041cac969dfceb05edabc0" +checksum = "d29ea1a4554fe06442db1d6229235cd358e8eacba96aed8718f612caf3e3a646" dependencies = [ "bitflags", "bytemuck", "polars-arrow", + "polars-buffer", "polars-compute", "polars-dtype", "polars-error", @@ -2025,9 +2120,9 @@ dependencies = [ [[package]] name = "polars-schema" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e6c1ab13e04d5167661a9854ed1ea0482b2ed9b8a0f1118dabed7cd994a85e3" +checksum = "d688e73f9156f93cb29350be144c8f1e84c1bc705f00ee7f15eb9706a7971273" dependencies = [ "indexmap", "polars-error", @@ -2038,9 +2133,9 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4e7766da02cc1d464994404d3e88a7a0ccd4933df3627c325480fbd9bbc0a11" +checksum = "100415f86069d7e9fbf54737148fc161a7c7316a6a7d375fb6cfc7fc64f570ae" dependencies = [ "bitflags", "hex", @@ -2051,7 +2146,6 @@ dependencies = [ "polars-plan", "polars-time", "polars-utils", - "rand", "regex", "serde", "sqlparser", @@ -2059,24 +2153,30 @@ dependencies = [ [[package]] name = "polars-stream" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f6c6ca1ea01f9dea424d167e4f33f5ec44cd67fbfac9efd40575ed20521f14" +checksum = "65a0c054bdf16efd16bbc587e8d5418ae28464d61afd735513579cd3c338fa70" dependencies = [ "async-channel", "async-trait", "atomic-waker", "bitflags", + "bytes", + "chrono-tz", "crossbeam-channel", "crossbeam-deque", "crossbeam-queue", "crossbeam-utils", "futures", + "memchr", "memmap2", + "num-traits", "parking_lot", "percent-encoding", "pin-project-lite", "polars-arrow", + "polars-buffer", + "polars-compute", "polars-core", "polars-error", "polars-expr", @@ -2085,21 +2185,21 @@ dependencies = [ "polars-ops", "polars-parquet", "polars-plan", + "polars-time", "polars-utils", "rand", "rayon", "recursive", "slotmap", "tokio", - "tokio-util", "version_check", ] [[package]] name = "polars-time" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6a3a6e279a7a984a0b83715660f9e880590c6129ec2104396bfa710bcd76dee" +checksum = "72e80404e1e418c997230e3b2972c3be331f45df8bdd3150fe3bef562c7a332f" dependencies = [ "atoi_simd", "bytemuck", @@ -2120,21 +2220,24 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57b267021b0e5422d7fbc70fd79e51b9f9a8466c585779373a18b0199e973f29" +checksum = "c97cabf53eb8fbf6050cde3fef8f596c51cc25fd7d55fbde108d815ee6674abf" dependencies = [ + "argminmax", "bincode", "bytemuck", "bytes", "compact_str", "either", "flate2", - "foldhash", - "hashbrown 0.15.5", + "foldhash 0.2.0", + "half", + "hashbrown 0.16.1", "indexmap", "libc", "memmap2", + "num-derive", "num-traits", "polars-error", "rand", @@ -2224,9 +2327,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.25.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8970a78afe0628a3e3430376fc5fd76b6b45c4d43360ffd6cdd40bdde72b682a" +checksum = "ab53c047fcd1a1d2a8820fe84f05d6be69e9526be40cb03b73f86b6b03e6d87d" dependencies = [ "indoc", "libc", @@ -2241,20 +2344,19 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.25.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "458eb0c55e7ece017adeba38f2248ff3ac615e53660d7c71a238d7d2a01c7598" +checksum = "b455933107de8642b4487ed26d912c2d899dec6114884214a0b3bb3be9261ea6" dependencies = [ - "once_cell", "python3-dll-a", "target-lexicon", ] [[package]] name = "pyo3-ffi" -version = "0.25.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7114fe5457c61b276ab77c5055f206295b812608083644a5c5b2640c3102565c" +checksum = "1c85c9cbfaddf651b1221594209aed57e9e5cff63c4d11d1feead529b872a089" dependencies = [ "libc", "pyo3-build-config", @@ -2262,9 +2364,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.25.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8725c0a622b374d6cb051d11a0983786448f7785336139c3c94f5aa6bef7e50" +checksum = "0a5b10c9bf9888125d917fb4d2ca2d25c8df94c7ab5a52e13313a07e050a3b02" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -2274,9 +2376,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.25.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4109984c22491085343c05b0dbc54ddc405c3cf7b4374fc533f5c3313a572ccc" +checksum = "03b51720d314836e53327f5871d4c0cfb4fb37cc2c4a11cc71907a86342c40f9" dependencies = [ "heck", "proc-macro2", @@ -2287,9 +2389,9 @@ dependencies = [ [[package]] name = "pyo3-polars" -version = "0.24.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64e7954e98cb5af21afca4b68c73d2e00035b07b06dc2cab9355e99d03d09557" +checksum = "f29248c4baefdfa23a7768341d1e431a5dee7348a757fa74315c810f3b8710d4" dependencies = [ "libc", "once_cell", @@ -2297,6 +2399,7 @@ dependencies = [ "polars-arrow", "polars-core", "polars-error", + "polars-ffi", "pyo3", "thiserror 1.0.69", ] @@ -2518,9 +2621,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.9" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "reqwest" @@ -2622,7 +2725,7 @@ checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" [[package]] name = "rusterize" -version = "0.7.1" +version = "0.8.0" dependencies = [ "bitflags", "fixedbitset", @@ -2639,13 +2742,14 @@ dependencies = [ "rayon", "tikv-jemallocator", "wkb", + "wkt", ] [[package]] name = "rustix" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ "bitflags", "errno", @@ -2656,9 +2760,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.36" +version = "0.23.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" +checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" dependencies = [ "once_cell", "ring", @@ -2739,9 +2843,9 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "security-framework" -version = "3.6.0" +version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d17b898a6d6948c3a8ee4372c17cb384f90d2e6e912ef00895b14fd7ab54ec38" +checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ "bitflags", "core-foundation", @@ -2752,9 +2856,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.16.0" +version = "2.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "321c8673b092a9a42605034a9879d73cb79101ed5fd117bc9a597b89b4e9e61a" +checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" dependencies = [ "core-foundation-sys", "libc", @@ -2851,9 +2955,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "signal-hook" -version = "0.3.18" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d881a16cf4426aa584979d30bd82cb33429027e42122b169753d6ef1085ed6e2" +checksum = "3b57709da74f9ff9f4a27dce9526eec25ca8407c45a7887243b031a58935fb8e" dependencies = [ "libc", "signal-hook-registry", @@ -2887,16 +2991,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" -[[package]] -name = "skiplist" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f354fd282d3177c2951004953e2fdc4cb342fa159bbee8b829852b6a081c8ea1" -dependencies = [ - "rand", - "thiserror 2.0.18", -] - [[package]] name = "slab" version = "0.4.12" @@ -2942,11 +3036,24 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.53.0" +version = "0.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8" +checksum = "505aa16b045c4c1375bf5f125cce3813d0176325bfe9ffc4a903f423de7774ff" dependencies = [ "log", + "recursive", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "028e551d5e270b31b9f3ea271778d9d827148d4287a5d96167b6bb9787f5cc38" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -3015,9 +3122,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.116" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3df424c70518695237746f84cede799c9c58fcb37450d7b23716568cc8bc69cb" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", @@ -3180,7 +3287,6 @@ dependencies = [ "bytes", "futures-core", "futures-sink", - "futures-util", "pin-project-lite", "tokio", ] @@ -3450,9 +3556,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.108" +version = "0.2.113" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" +checksum = "60722a937f594b7fde9adb894d7c092fc1bb6612897c46368d18e7a20208eff2" dependencies = [ "cfg-if", "once_cell", @@ -3463,9 +3569,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.58" +version = "0.4.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" +checksum = "8a89f4650b770e4521aa6573724e2aed4704372151bd0de9d16a3bbabb87441a" dependencies = [ "cfg-if", "futures-util", @@ -3477,9 +3583,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.108" +version = "0.2.113" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" +checksum = "0fac8c6395094b6b91c4af293f4c79371c163f9a6f56184d2c9a85f5a95f3950" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3487,9 +3593,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.108" +version = "0.2.113" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" +checksum = "ab3fabce6159dc20728033842636887e4877688ae94382766e00b180abac9d60" dependencies = [ "bumpalo", "proc-macro2", @@ -3500,9 +3606,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.108" +version = "0.2.113" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" +checksum = "de0e091bdb824da87dc01d967388880d017a0a9bc4f3bdc0d86ee9f9336e3bb5" dependencies = [ "unicode-ident", ] @@ -3556,9 +3662,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.85" +version = "0.3.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" +checksum = "705eceb4ce901230f8625bd1d665128056ccbe4b7408faa625eec1ba80f59a97" dependencies = [ "js-sys", "wasm-bindgen", @@ -3613,7 +3719,7 @@ checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" dependencies = [ "windows-implement", "windows-interface", - "windows-link", + "windows-link 0.2.1", "windows-result", "windows-strings", ] @@ -3640,6 +3746,12 @@ dependencies = [ "syn", ] +[[package]] +name = "windows-link" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" + [[package]] name = "windows-link" version = "0.2.1" @@ -3652,7 +3764,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" dependencies = [ - "windows-link", + "windows-link 0.2.1", ] [[package]] @@ -3661,7 +3773,7 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" dependencies = [ - "windows-link", + "windows-link 0.2.1", ] [[package]] @@ -3697,7 +3809,7 @@ version = "0.61.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" dependencies = [ - "windows-link", + "windows-link 0.2.1", ] [[package]] @@ -3722,7 +3834,7 @@ version = "0.53.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ - "windows-link", + "windows-link 0.2.1", "windows_aarch64_gnullvm 0.53.1", "windows_aarch64_msvc 0.53.1", "windows_i686_gnu 0.53.1", @@ -3938,6 +4050,19 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "wkt" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efb2b923ccc882312e559ffaa832a055ba9d1ac0cc8e86b3e25453247e4b81d7" +dependencies = [ + "geo-traits", + "geo-types", + "log", + "num-traits", + "thiserror 1.0.69", +] + [[package]] name = "writeable" version = "0.6.2" diff --git a/Cargo.toml b/Cargo.toml index 6824bf7..bada9b5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ cargo-features = ["profile-rustflags"] [package] name = "rusterize" -version = "0.7.1" +version = "0.8.0" edition = "2024" resolver = "3" @@ -16,14 +16,15 @@ fixedbitset = "0.5.7" geo = "0.30.0" geo-traits = "0.3.0" geo-types = "0.7.18" -ndarray = { version = "0.16.1", features = ["rayon"] } +ndarray = { version = "0.17.2", features = ["rayon"] } num-traits = "0.2.19" -numpy = "0.25.0" -polars = { version = "0.51.0", features = ["lazy", "simd", "performant", "nightly"] } -pyo3 = { version = "0.25.1", features = ["extension-module", "abi3-py311", "generate-import-lib"] } -pyo3-polars = "0.24.0" +numpy = "0.27.1" +polars = { version = "0.53.0", features = ["lazy", "simd", "performant", "nightly"] } +pyo3 = { version = "0.27.2", features = ["extension-module", "abi3-py311", "generate-import-lib"] } +pyo3-polars = "0.26.0" rayon = "1.11.0" wkb = "0.9.2" +wkt = "0.14.0" # OS-specific allocators [target.'cfg(not(target_family = "unix"))'.dependencies] diff --git a/LICENSE b/LICENSE index 1dc9ea0..b93fcf9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,7 +1,6 @@ MIT License -Copyright (c) 2025 Tommaso Trotto -Copyright (c) 2017 EcoHealth Alliance +Copyright (c) 2024-2026 Tommaso Trotto Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -20,4 +19,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md index 248398c..fca736f 100644 --- a/README.md +++ b/README.md @@ -1,54 +1,68 @@ -# rusterize +

+ +

-High performance rasterization tool for Python built in Rust. This repository is inspired by the [fasterize](https://github.com/ecohealthalliance/fasterize.git) package and ports parts of the logics into Python with a Rust backend, in addition to useful improvements (see [API](#API)). +
-**rusterize** is designed to work on _all_ shapely geometries, even when they are nested inside complex geometry collections. Functionally, it takes an input [geopandas](https://geopandas.org/en/stable/) dataframe and returns a [xarray](https://docs.xarray.dev/en/stable/), a [numpy](https://numpy.org/), or a sparse array in COOrdinate format. +High performance rasterization tool for Python built in Rust, inspired by the [fasterize](https://github.com/ecohealthalliance/fasterize.git) package with lots of useful improvements (see [API](#API)). -# Installation +**rusterize** is designed to work on _all_ shapely geometries, even when they are nested inside complex geometry collections. Functionally, it supports four input types: -`rusterize` is distributed in two flavors. A `core` library that performs the rasterization and returns a bare `numpy` array, or a `xarray` version that returns a georeferenced `xarray`. This latter requires `xarray` and `rioxarray` to be installed. This is the recommended flavor. +- [geopandas](https://geopandas.org/en/stable/) GeoDataFrame +- [polars-st](https://oreilles.github.io/polars-st/) GeoDataFrame +- Python list of geometries in WKB or WKT format +- Numpy array of geometries in WKB or WKT format + +It returns a [xarray](https://docs.xarray.dev/en/stable/), a [numpy](https://numpy.org/), or a sparse array in COOrdinate format. + +### Installation + +`rusterize` comes with numpy as the only required dependency and is distributed in different flavors. A `core` library that performs the rasterization and returns a bare `numpy` array, a `xarray` flavor that returns a georeferenced `xarray` (requires `xarray` and `rioxarray` and is the recommended flavor), or an `all` flavor with dependencies for all supported inputs. Install the current version with pip: ```shell -# Core library +# core library pip install rusterize -# With xarray capabilities +# xarray capabilities pip install 'rusterize[xarray]' + +# support all input types +pip install 'rusterize[all]' ``` -# Contributing +### Contributing -Any contribution is welcome! You can install **rusterize** directly from this repo using [maturin](https://www.maturin.rs/) as an editable package. For this to work, you’ll need to have [Rust](https://www.rust-lang.org/tools/install) and [cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html) -installed. +Any contribution is welcome! You can install **rusterize** directly from this repo using [maturin](https://www.maturin.rs/) as an editable package. For this to work, you’ll need to have [Rust](https://www.rust-lang.org/tools/install) and [cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html) installed. To run the tests you need to have `gdal` installed as well as the `rusterize[all]` flavor. ```shell -# Clone repo +# clone repo git clone https://github.com//rusterize.git cd rusterize -# Install the Rust nightly toolchain +# install Rust nightly toolchain rustup toolchain install nightly-2026-01-09 - # Install maturin +# install maturin pip install maturin -# Install editable version with optmized code +# install editable version with optmized code maturin develop --profile dist-release + +# test the new contribution +pytest ``` -# API +### API **rusterize** has a simple API consisting of a single function `rusterize()`: ```python from rusterize import rusterize -# gdf = - rusterize( - gdf, + data, like=None, res=(30, 30), out_shape=(10, 10), @@ -65,28 +79,57 @@ rusterize( ) ``` -- `gdf`: geopandas dataframe to rasterize -- `like`: xr.DataArray to use as template for `res`, `out_shape`, and `extent`. Mutually exclusive with these parameters (default: `None`) -- `res`: (xres, yres) for desired resolution (default: `None`) -- `out_shape`: (ncols, nrows) for desired output shape (default: `None`) -- `extent`: (xmin, ymin, xmax, ymax) for desired output extent (default: `None`) -- `field`: column to rasterize. Mutually exclusive with `burn` (default: `None` -> a value of `1` is rasterized) -- `by`: column for grouping. Assign each group to a band in the stack. Values are taken from `field` if specified, else `burn` is rasterized (default: `None` -> singleband raster) -- `burn`: a single value to burn. Mutually exclusive with `field` (default: `None`). If no field is found in `gdf` or if `field` is `None`, then `burn=1` -- `fun`: pixel function to use when multiple values overlap. Available options are `sum`, `first`, `last`, `min`, `max`, `count`, or `any` (default: `last`) -- `background`: background value in final raster (default: `np.nan`). A `None` value corresponds to the default of the specified dtype. An illegal value for a dtype will be replaced with the default of that dtype. For example, a `background=np.nan` for `dtype="uint8"` will become `background=0`, where `0` is the default for `uint8`. -- `encoding`: defines the output format of the rasterization. This is either a dense xarray/numpy representing the burned rasterized geometries, or a sparse array in COOrdinate format good for sparse observations and low memory consumption. Available options are `xarray`, `numpy`, `sparse` (default: `xarray` -> will trigger an error if `xarray` and `rioxarray` are not found). -- `all_touched`: whether every pixel touching the geometry should be burned. -- `tap`: target aligned pixel to align the extent to the pixel resolution (defaul: `False`). -- `dtype`: dtype of the final raster. Available options are `uint8`, `uint16`, `uint32`, `uint64`, `int8`, `int16`, `int32`, `int64`, `float32`, `float64` (default: `float64`) +- **data** : `geopandas.GeoDataFrame`, `polars.DataFrame`, `list`, `numpy.ndarray`
+ Input data to rasterize. + - If `polars.DataFrame`, it must be have a "geometry" column with geometries stored in WKB or WKT format. + - If `list` or `numpy.ndarray`, geometries must be in WKT, WKB, or shapely formats (EPSG is not inferred and defaults to None). + +- **like** : `xarray.DataArray` or `xarray.Dataset` (default: None)
+ Template array used as a spatial blueprint (resolution, shape, extent). Mutually exclusive with `res`, `out_shape`, and `extent`. Requires xarray and rioxarray. + +- **res** : `tuple` or `list` (default: None)
+ Pixel resolution defined as (xres, yres). + +- **out_shape** : `tuple` or `list` (default: None)
+ Output raster dimensions defined as (nrows, ncols). + +- **extent** : `tuple` or `list` (default: None)
+ Spatial bounding box defined as (xmin, ymin, xmax, ymax). + +- **field** : `str` (default: None)
+ Column name to use for pixel values. Mutually exclusive with `burn`. Not considered when input is list or numpy.ndarray. + +- **by** : `str` (default: None)
+ Column used for grouping. Each group is rasterized into a distinct band in the output. Not considered when input is list or numpy.ndarray. + +- **burn** : `int` or `float` (default: None)
+ A static value to apply to all geometries. Mutually exclusive with `field`. + +- **fun** : `str` (default: "last")
+ Pixel function to use when burning geometries. Available options: `sum`, `first`, `last`, `min`, `max`, `count`, or `any`. + +- **background** : `int` or `float` (default: numpy.nan)
+ Value assigned to pixels not covered by any geometry. + +- **encoding** : `str` (default: "xarray")
+ The format of the returned object: `"xarray"`, `"numpy"`, or `"sparse"`. + +- **all_touched** : `bool` (default: False)
+ If True, every pixel touched by a geometry is burned. + +- **tap** : `bool` (default: False)
+ Target Aligned Pixel: aligns the extent to the pixel resolution. + +- **dtype** : `str` (default: "float64")
+ Output data type (e.g., `uint8`, `int32`, `float32`). Note that control over the desired extent is not as strict as for resolution and shape. That is, when resolution, output shape, and extent are specified, priority is given to resolution and shape. So, extent is not guaranteed, but resolution and shape are. If extent is not given, it is taken from the polygons and is not modified, unless you specify a resolution value. If you only specify an output shape, the extent is maintained. This mimics the logics of `gdal_rasterize`. -# Encoding +### Encoding -Version 0.5.0 introduced a new `encoding` parameter to control the output format of the rasterization. This means that you can return a `xarray/numpy` with the rasterized geometries, or a new `SparseArray` structure. This `SparseArray` structure stores the band/row/column triplets of where the geometries should be burned onto the final raster, as well as their corresponding values before applying any pixel function. This can be used as an intermediate output to avoid allocating memory before materializing the final raster, or as a final product. `SparseArray` has three convenience functions: `to_xarray()`, `to_numpy()`, and `to_frame()`. The first two return the final `xarray/numpy` with the appropriate pixel function, the last returns a `polars` dataframe with only the coordinates and values of the rasterized geometries. Note that `SparseArray` avoids allocating memory for the array during rasterization until it's actually needed (e.g. calling `to_xarray()`). See below for an example. +`rusterize` offers three encoding options for the rasterization output. You can return a `xarray/numpy` with the rasterized geometries, or a new `SparseArray` structure. This `SparseArray` structure stores the band/row/column triplets of where the geometries should be burned onto the final raster, as well as their corresponding values before applying any pixel function. This can be used as an intermediate output to avoid allocating memory before materializing the final raster, or as a final product. `SparseArray` has three convenience functions: `to_xarray()`, `to_numpy()`, and `to_frame()`. The first two return the final `xarray/numpy` with the appropriate pixel function, the last returns a `polars` dataframe with only the coordinates and values of the rasterized geometries. Note that `SparseArray` avoids allocating memory for the array during rasterization until it's actually needed (e.g. calling `to_xarray()`). See below for an example. -# Usage +### Usage ```python from rusterize import rusterize @@ -103,11 +146,8 @@ geoms = [ "GEOMETRYCOLLECTION (POINT (50 -40), POLYGON ((75 -40, 75 -30, 100 -30, 100 -40, 75 -40)), LINESTRING (60 -40, 80 0), GEOMETRYCOLLECTION (POLYGON ((100 20, 100 30, 110 30, 110 20, 100 20))))" ] -# convert WKT strings to Shapely geometries -geometries = [wkt.loads(geom) for geom in geoms] - -# create a GeoDataFrame -gdf = gpd.GeoDataFrame({'value': range(1, len(geoms) + 1)}, geometry=geometries, crs='EPSG:32619') +# create a GeoDataFrame with shapely geometries from WKT +gdf = gpd.GeoDataFrame({'value': range(1, len(geoms) + 1)}, geometry=wkt.loads(geoms), crs='EPSG:32619') # rusterize to "xarray" -> return a xarray with the burned geometries and spatial reference (default) # will raise a ModuleNotFoundError if xarray and rioxarray are not found @@ -167,25 +207,24 @@ output.to_frame() ![](img/plot.png) -# Benchmarks +### Benchmarks **rusterize** is fast! Let’s try it on small and large datasets in comparison to GDAL ([benchmark_rusterize.py](benchmarks/benchmark_rusterize.py)). You can run this with [pytest](https://docs.pytest.org/en/stable/) and [pytest-benchmark](https://pytest-benchmark.readthedocs.io/en/stable/): ``` pytest --benchmark-min-rounds=10 --benchmark-time-unit='s' -------------------------------------------------------------------- benchmark: 7 tests ------------------------------------------------------------ -Name Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations ---------------------------------------------------------------------------------------------------------------------------------------------------- -test_water_small_f64_numpy 0.0044 0.0221 0.0090 0.0047 0.0064 0.0070 32;0 111.4936 156 1 -test_water_small_f64 0.0054 0.0309 0.0084 0.0039 0.0068 0.0026 14;14 118.7794 149 1 -test_water_large_f64_numpy 1.5214 2.1011 1.8225 0.1610 1.8015 0.1500 3;1 0.5487 10 1 -test_water_large_f64 1.6683 2.2734 1.9249 0.2019 1.9257 0.2782 4;0 0.5195 10 1 -test_water_large_gdal_vsimem_f64 2.4127 2.4414 2.4217 0.0085 2.4203 0.0092 2;1 0.4129 10 1 -test_roads_uint8 3.5505 17.1630 6.5959 5.1604 3.6147 5.9051 2;0 0.1516 10 1 -test_roads_gdal_vsimem_uint8 9.6371 11.0570 10.3093 0.6694 10.2655 1.3462 4;0 0.0970 10 1 ---------------------------------------------------------------------------------------------------------------------------------------------------- - +-------------------------------------------------------------- benchmark: 7 tests ------------------------------------------------- +Name (time in s) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations +----------------------------------------------------------------------------------------------------------------------------------- +test_water_small_f64_numpy 0.0045 0.0074 0.0051 0.0006 0.0050 0.0006 29;13 194.3373 155 1 +test_water_small_f64 0.0058 0.0239 0.0110 0.0040 0.0101 0.0059 38;2 91.2137 133 1 +test_water_large_f64 1.6331 2.2212 1.8923 0.2013 1.9070 0.3507 5;0 0.5285 10 1 +test_water_large_f64_numpy 1.6530 2.3126 1.8641 0.2078 1.8139 0.3054 2;0 0.5365 10 1 +test_water_large_gdal_f64 2.3926 2.6120 2.4650 0.0849 2.4217 0.1024 2;0 0.4057 10 1 +test_roads_uint8 3.7092 17.6956 6.6547 5.5787 3.8136 1.8291 2;2 0.1503 10 1 +test_roads_gdal_uint8 9.2405 9.4942 9.3018 0.0727 9.2785 0.0445 1;1 0.1075 10 1 +----------------------------------------------------------------------------------------------------------------------------------- ``` And fasterize ([benchmark_fasterize.r](benchmarks/benchmark_fasterize.r)). Note that it doesn't support custom `dtype` so the returning raster is `float64`. @@ -197,20 +236,24 @@ Unit: seconds fasterize_large_f64 36.91321005 37.71877265 41.0140303 40.81343803 43.9201820 46.5596799 10 ``` -# Comparison with other tools +### Comparison with other tools While **rusterize** is fast, there are other fast alternatives out there, including `rasterio` and `geocube`. However, **rusterize** allows for a seamless, Rust-native processing with similar or lower memory footprint that doesn't require you to install GDAL and returns the geoinformation you need for downstream processing with ample control over resolution, shape, extent, and data type. The following is a time comparison of 10 runs (median) on the same large water bodies dataset used earlier (dtype is `float64`) ([run_others.py](benchmarks/run_others.py)). ``` -rusterize: 2.1 sec +rusterize: 1.9 sec rasterio: 15.2 sec geocube: 129.2 sec ``` -# Integrations +### Integrations **rusterize** is integrated into the following libraries: - [rasterix](https://github.com/xarray-contrib/rasterix) + +
+ +Disclaimer: Logo originally generated with Nano Banana Pro diff --git a/benchmarks/benchmark_rusterize.py b/benchmarks/benchmark_rusterize.py index 7b49daa..9e8c084 100644 --- a/benchmarks/benchmark_rusterize.py +++ b/benchmarks/benchmark_rusterize.py @@ -5,6 +5,7 @@ from osgeo import gdal from pyogrio import read_dataframe from rusterize import rusterize +import rioxarray # POLYGONS (~468MB) url = "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/canvec/shp/Hydro/canvec_50K_BC_Hydro_shp.zip" @@ -33,9 +34,7 @@ # GDAL gdal.UseExceptions() src_water = gdal.OpenEx("canvec_50K_BC_Hydro/waterbody_2.shp") -dest_water = "/vsimem/output_water.tif" src_roads = gdal.OpenEx("lrnf000r25p_e/lrnf000r25p_e.gpkg") -dest_roads = "/vsimem/output_roads.tif" # BENCHES @@ -59,27 +58,27 @@ def test_roads_uint8(benchmark): benchmark(rusterize, roads, res=(50, 50), dtype="uint8") -def test_water_large_gdal_vsimem_f64(benchmark): +def test_water_large_gdal_f64(benchmark): benchmark( gdal.Rasterize, - dest_water, + "", src_water, xRes=1 / 6, yRes=1 / 6, - format="GTIFF", + format="MEM", outputType=gdal.GDT_Float64, burnValues=1, ) -def test_roads_gdal_vsimem_uint8(benchmark): +def test_roads_gdal_uint8(benchmark): benchmark( gdal.Rasterize, - dest_roads, + "", src_roads, xRes=50, yRes=50, - format="GTIFF", + format="MEM", outputType=gdal.GDT_Byte, burnValues=1, ) diff --git a/img/logo.png b/img/logo.png new file mode 100644 index 0000000..4a4d90e Binary files /dev/null and b/img/logo.png differ diff --git a/pyproject.toml b/pyproject.toml index fbd0d9b..0356405 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,15 +18,11 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dynamic = ["version"] -dependencies = [ - "geopandas>=1.0.1", - "pandas>=2.2.3", - "pyarrow>=18.1.0", - "polars>=1.19.0", -] +dependencies = ["numpy>=2.0.0"] [project.optional-dependencies] xarray = ["xarray>=2025.01.1", "rioxarray>=0.18.2"] +all = ["geopandas>=1.0.0", "pandas>=2.0.0", "pyarrow>=18.1.0", "polars>=1.19.0", "polars-st>=0.4.3", "rioxarray>=0.18.2", "xarray>=2025.01.1"] [project.urls] repository = "https://github.com/ttrotto/rusterize" @@ -35,3 +31,6 @@ repository = "https://github.com/ttrotto/rusterize" python-source = "python" module-name = "rusterize" include = [{ path = "rust-toolchain.toml", format = "sdist" }] + +[tool.ruff] +exclude = ["test/test_many.py", "benchmarks/benchmark_rusterize.py"] diff --git a/python/rusterize/__init__.py b/python/rusterize/__init__.py index 8a07e97..ed593ee 100644 --- a/python/rusterize/__init__.py +++ b/python/rusterize/__init__.py @@ -2,28 +2,33 @@ import importlib.metadata from types import NoneType -from typing import TYPE_CHECKING, List, Tuple +from typing import TYPE_CHECKING import numpy as np -import polars as pl -from geopandas import GeoDataFrame +from ._dependencies import ( + _check_for_geopandas, + _check_for_polars_st, + _polars_available, + _xarray_available, +) +from ._dependencies import geopandas as gpd +from ._dependencies import polars as pl +from ._dependencies import xarray as xr from .rusterize import _rusterize if TYPE_CHECKING: - from xarray import DataArray, Dataset - from .rusterize import SparseArray __version__ = importlib.metadata.version("rusterize") def rusterize( - gdf: GeoDataFrame, - like: DataArray | Dataset | None = None, - res: Tuple | List | None = None, - out_shape: Tuple | List | None = None, - extent: Tuple | List | None = None, + data: gpd.GeoDataFrame | pl.DataFrame | list | np.ndarray, + like: xr.DataArray | xr.Dataset | None = None, + res: tuple | list | None = None, + out_shape: tuple | list | None = None, + extent: tuple | list | None = None, field: str | None = None, by: str | None = None, burn: int | float | None = None, @@ -33,114 +38,218 @@ def rusterize( all_touched: bool = False, tap: bool = False, dtype: str = "float64", -) -> DataArray | np.ndarray | SparseArray: +) -> xr.DataArray | np.ndarray | SparseArray: """ - Fast geopandas rasterization in Rust. - - Args: - :param gdf: geopandas dataframe to rasterize. - :param like: array to use as blueprint for spatial matching (resolution, shape, extent). Mutually exlusive with res, out_shape, and extent. - :param res: (xres, yres) for rasterized data. - :param out_shape: (nrows, ncols) for regularized output shape. - :param extent: (xmin, ymin, xmax, ymax) for regularized extent. - :param field: field to rasterize, mutually exclusive with `burn`. Default is None. - :param by: column to rasterize, assigns each unique value to a layer in the stack based on field. Default is None. - :param burn: burn a value onto the raster, mutually exclusive with `field`. Default is None. - :param fun: pixel function to use. Available options are `sum`, `first`, `last`, `min`, `max`, `count`, or `any`. Default is `last`. - :param background: background value in final raster. Default is np.nan. - :param encoding: return a dense array (burned geometries onto a raster) or a sparse array in COOrdinate format (coordinates and values of the rasterized geometries). Available options are `xarray`, `numpy`, or `sparse`. The `xarray` encoding requires `xarray` and `rioxarray` to be installed. Default is `xarray`. - :param all_touched: if True, every pixel touched by the geometry is burned. Default is `False`. - :param tap: target aligned pixel to align the extent to the pixel resolution. Defaul is `False`. - :param dtype: specify the output dtype. Default is `float64`. - - Returns: + Fast geometry rasterization in Rust. + + Parameters + ---------- + data : geopandas.GeoDataFrame, polars.DataFrame, list, numpy.ndarray + Input data to rasterize. + - If polars.DataFrame, it must be have a "geometry" column with geometries stored in WKB or WKT format. + - If list or numpy.ndarray, geometries must be in WKT, WKB, or shapely formats (EPSG is not inferred and defaults to None). + like : xarray.DataArray or xarray.Dataset (default: None) + Template array used as a spatial blueprint (resolution, shape, extent). Mutually exclusive with `res`, `out_shape`, and `extent`. Requires xarray and rioxarray. + res : tuple or list (default: None) + Pixel resolution defined as (xres, yres). + out_shape : tuple or list (default: None) + Output raster dimensions defined as (nrows, ncols). + extent : `tuple` or `list` (default: None) + Spatial bounding box defined as `(xmin, ymin, xmax, ymax)`. + field : `str` (default: None) + Column name to use for pixel values. Mutually exclusive with `burn`. Not considered when input is list or numpy.ndarray. + by : `str` (default: None) + Column used for grouping. Each group is rasterized into a distinct band in the output. Not considered when input is list or numpy.ndarray. + burn : `int` or `float` (default: None) + A static value to apply to all geometries. Mutually exclusive with `field`. + fun : `str` (default: "last") + Pixel function to use when burning geometries. Available options: `sum`, `first`, `last`, `min`, `max`, `count`, or `any`. + background : `int` or `float` (default: numpy.nan) + Value assigned to pixels not covered by any geometry. + encoding : `str` (default: "xarray") + The format of the returned object: `"xarray"`, `"numpy"`, or `"sparse"`. + all_touched : `bool` (default: False) + If True, every pixel touched by a geometry is burned. + tap : `bool` (default: False) + Target Aligned Pixels: aligns the extent to the pixel resolution. + dtype : `str` (default: "float64") + Output data type (e.g., `uint8`, `int32`, `float32`). + + Returns + ------- xarray.DataArray, numpy.ndarray, or a sparse array in COO format. - Notes: - If `encoding` is `numpy`, the array is returned without any spatial reference. + Notes + ----- + If `encoding` is "numpy" or input is list or numpy.ndarray, the return array is without any spatial reference. When any of `res`, `out_shape`, or `extent` is not provided, it is inferred from the other arguments when applicable. - If `like` is specified, `res`, `out_shape`, and `extent` are inferred from the `like` DataArray. + If `like` is specified, `res`, `out_shape`, and `extent` are inferred from the `like` DataArray or Dataset. Unless `extent` is specified, a half-pixel buffer is applied to avoid missing points on the border. The logics dictating the final spatial properties of the rasterized geometries follow those of GDAL. - If `field` is not in `gdf`, then a default `burn` value of 1 is rasterized. + If `field` is not in `data`, then a default `burn` value of 1 is rasterized. A `None` value for `dtype` corresponds to the default of that dtype. An illegal value for a dtype will be replaced with the default of that dtype. For example, a `background=np.nan` for `dtype="uint8"` will become `background=0`, where `0` is the default for `uint8`. """ - # type checks - if not isinstance(gdf, GeoDataFrame): - raise TypeError("`gdf` must be a geopandas dataframe.") - if type(like).__name__ not in ("DataArray", "Dataset", "NoneType"): - raise TypeError("`like' must be a xarray.DataArray or xarray.Dataset") + data_type = None + if isinstance(data, (list, np.ndarray)): + data_type = "raw" + elif _check_for_geopandas(data) and isinstance(data, gpd.GeoDataFrame): + if data.empty: + raise ValueError("GeoDataFrame is empty.") + data_type = "geopandas" + elif _check_for_polars_st(data) and isinstance(data, pl.DataFrame): + if data.is_empty(): + raise ValueError("GeoDataFrame is empty.") + data_type = "polars" + else: + raise TypeError("`data` must be either geopandas.GeoDataFrame, polars.DataFrame, list, or numpy.ndarray") + if not isinstance(res, (tuple, list, NoneType)): - raise TypeError("`resolution` must be a tuple or list of (x, y).") + raise TypeError("`resolution` must be a tuple or list of (xres, yres).") + if not isinstance(out_shape, (tuple, list, NoneType)): raise TypeError("`out_shape` must be a tuple or list of (nrows, ncols).") + if not isinstance(extent, (tuple, list, NoneType)): raise TypeError("`extent` must be a tuple or list of (xmin, ymin, xmax, ymax).") + if not isinstance(field, (str, NoneType)): raise TypeError("`field` must be a string column name.") + if not isinstance(by, (str, NoneType)): raise TypeError("`by` must be a string column name.") + if not isinstance(burn, (int, float, NoneType)): raise TypeError("`burn` must be an integer or float.") + if not isinstance(fun, str): raise TypeError("`pixel_fn` must be one of sum, first, last, min, max, count, or any.") + if not isinstance(background, (int, float, NoneType)): raise TypeError("`background` must be integer, float, or None.") + if not isinstance(encoding, str): raise TypeError("`encoding` must be one of 'xarray', 'numpy', or 'sparse'.") + if not isinstance(all_touched, bool): raise TypeError("`all_touched` must be a boolean.") + if not isinstance(tap, bool): raise TypeError("`tap` must be a boolean.") + if not isinstance(dtype, str): raise TypeError( "`dtype` must be a one of 'uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', 'int32', 'int64', 'float32', 'float64'" ) - # value checks and defaults + # value checks if field and burn: raise ValueError("Only one of `field` or `burn` can be specified.") + if encoding not in ["xarray", "numpy", "sparse"]: raise ValueError("`encoding` must be one of `xarray`, 'numpy', or `sparse`.") - if encoding == "xarray": - try: - import rioxarray - import xarray - except ModuleNotFoundError as e: - raise ModuleNotFoundError( - "`xarray` and `rioxarray` must be installed if encoding is `xarray`. Install with `pip install xarray rioxarray`." - ) from e + + if encoding == "xarray" and not _xarray_available(): + raise ModuleNotFoundError( + "`xarray` and `rioxarray` must be installed if encoding is `xarray`. Install with `pip install xarray rioxarray`." + ) + + _with_user_extent = False + _bounds = (np.inf, np.inf, np.inf, np.inf) + _res = (0, 0) + _shape = (0, 0) + if like is not None: + if not (_xarray_available() and isinstance(like, (xr.DataArray, xr.Dataset))): + raise TypeError("`like` must be a xarray.DataArray or xarray.Dataset") + if any((res, out_shape, extent)): raise ValueError("`like` is mutually exclusive with `res`, `out_shape`, and `extent`.") - elif hasattr(like, "rio"): + + if not hasattr(like, "rio"): + raise AttributeError("The `like` object must have a 'rio' accessor.") + + try: affine = like.rio.transform() _res = (affine.a, abs(affine.e)) _shape = like.squeeze().shape - _bounds, _has_extent = like.rio.bounds(), True - else: - raise AttributeError("The `like` object must have a `rioxarray` accessor.") + _bounds, _with_user_extent = like.rio.bounds(), True + except Exception as e: + raise AttributeError("No spatial dimension found for like object") from e else: if not res and not out_shape and not extent: raise ValueError("One of `res`, `out_shape`, or `extent` must be provided.") - if extent and not res and not out_shape: - raise ValueError("Must also specify `res` or `out_shape` with extent.") - if res and (len(res) != 2 or any(r <= 0 for r in res) or any(not isinstance(r, (int, float)) for r in res)): - raise ValueError("`res` must be 2 positive numbers.") - if out_shape and ( - len(out_shape) != 2 or any(s <= 0 for s in out_shape) or any(not isinstance(s, int) for s in out_shape) - ): - raise ValueError("`out_shape` must be 2 positive integers.") - if extent and len(extent) != 4: - raise ValueError("`extent` must be a tuple or list of (xmin, ymin, xmax, ymax).") - - # defaults - _res = res if res else (0, 0) - _shape = out_shape if out_shape else (0, 0) - (_bounds, _has_extent) = (extent, True) if extent else (gdf.total_bounds, False) + + if extent: + if not res and not out_shape: + raise ValueError("Must also specify `res` or `out_shape` with extent.") + + if len(extent) != 4 or all(e == 0 for e in extent): + raise ValueError("`extent` must be a tuple or list of (xmin, ymin, xmax, ymax).") + + _bounds = extent + _with_user_extent = True + + if res: + if len(res) != 2 or any(r <= 0 for r in res) or any(not isinstance(r, (int, float)) for r in res): + raise ValueError("`res` must be 2 positive numbers.") + + _res = res + + if out_shape: + if len(out_shape) != 2 or any(s <= 0 for s in out_shape) or any(not isinstance(s, int) for s in out_shape): + raise ValueError("`out_shape` must be 2 positive integers.") + + _shape = out_shape + + # extract columns of interest if dataframe + cols = list(set([col for col in (field, by) if col and col != "geometry"])) + df = None + epsg = None + + # get bounds + if data_type == "geopandas": + if not _polars_available(): + raise ModuleNotFoundError("polars must be installed when data is geopandas.GeoDataFrame.") + + if not _with_user_extent: + _bounds = data.total_bounds + + epsg = data.crs.to_epsg() if data.crs else None + + if cols: + try: + df = pl.from_pandas(data[cols]) + except KeyError as e: + raise KeyError("Column not found in GeoDataFrame.") from e + + geometries = data.geometry + + elif data_type == "polars": + if not _with_user_extent: + try: + _bounds = data.select(pl.col("geometry").st.total_bounds()).item().to_numpy() + except pl.exceptions.ColumnNotFoundError as e: + raise ValueError("If `polars.DataFrame`, a 'geometry' column is expected.") from e + + # check if geometry has SRID. If 0, then None, else assume first SRID is equal for all geometries + srid = data.select(pl.col("geometry").first().st.srid()).item() + epsg = None if srid == 0 else srid + + if cols: + try: + df = data.select(pl.col([*cols, "geometry"])) + except pl.exceptions.ColumnNotFoundError as e: + raise KeyError("Column not found in polars DataFrame.") from e + + # geometries are extracted directly on the Rust side + geometries = data.select(pl.col("geometry")).to_series() + + else: + # list or numpy.ndarray + geometries = data # RawRasterInfo raw_raster_info = { @@ -152,16 +261,21 @@ def rusterize( "ymax": _bounds[3], "xres": _res[0], "yres": _res[1], - "has_extent": _has_extent, + "with_user_extent": _with_user_extent, "tap": tap, - "epsg": gdf.crs.to_epsg() if gdf.crs else None, + "epsg": epsg, } - # extract columns of interest and convert to polars - cols = list(set([col for col in (field, by) if col])) - try: - df = pl.from_pandas(gdf[cols]) if cols else None - except KeyError as e: - raise KeyError("Column not found in GeoDataFrame.") from e - - return _rusterize(gdf.geometry, raw_raster_info, fun, df, field, by, burn, background, all_touched, encoding, dtype) + return _rusterize( + geometries, + raw_raster_info, + fun, + df, + field, + by, + burn, + background, + all_touched, + encoding, + dtype, + ) diff --git a/python/rusterize/_dependencies.py b/python/rusterize/_dependencies.py new file mode 100644 index 0000000..c1adbcd --- /dev/null +++ b/python/rusterize/_dependencies.py @@ -0,0 +1,171 @@ +# Adapted from https://github.com/pola-rs/polars/blob/628f2273373e1d61d68c6b381e2d67702c990a0c/py-polars/src/polars/_dependencies.py + +import re +import sys +from functools import cache +from importlib import import_module +from importlib.util import find_spec +from types import ModuleType +from typing import TYPE_CHECKING, Any, ClassVar + + +class _LazyModule(ModuleType): + """Module that can act both as a lazy-loader and as a proxy.""" + + __lazy__ = True + + _mod_pfx: ClassVar[dict[str, str]] = { + "geopandas": "gpd.", + "polars": "pl.", + "polars_st": "st.", + "xarray": "xr.", + } + + def __init__( + self, + module_name: str, + *, + module_available: bool, + ): + """ + Initialise lazy-loading proxy module. + + Parameters + ---------- + module_name : str + the name of the module to lazy-load (if available). + + module_available : bool + indicate if the referenced module is actually available (we will proxy it + in both cases, but raise a helpful error when invoked if it doesn't exist). + """ + self._module_available = module_available + self._module_name = module_name + self._globals = globals() + super().__init__(module_name) + + def _import(self) -> ModuleType: + # import the referenced module, replacing the proxy in this module's globals + module = import_module(self.__name__) + self._globals[self._module_name] = module + self.__dict__.update(module.__dict__) + return module + + def __getattr__(self, name: str) -> Any: + # have "hasattr('__wrapped__')" return False without triggering import + if name == "__wrapped__": + msg = f"{self._module_name!r} object has no attribute {name!r}" + raise AttributeError(msg) + + # accessing the proxy module's attributes triggers import of the real thing + if self._module_available: + # import the module and return the requested attribute + module = self._import() + return getattr(module, name) + + # user has not installed the proxied/lazy module + elif name == "__name__": + return self._module_name + elif re.match(r"^__\w+__$", name) and name != "__version__": + # allow some minimal introspection on private module + # attrs to avoid unnecessary error-handling elsewhere + return None + else: + # all other attribute access raises a helpful exception + pfx = self._mod_pfx.get(self._module_name, "") + msg = f"{pfx}{name} requires {self._module_name!r} module to be installed" + raise ModuleNotFoundError(msg) from None + + +def _lazy_import(module_name: str) -> tuple[ModuleType, bool]: + """ + Lazy import the given module; avoids up-front import costs. + + Parameters + ---------- + module_name : str + name of the module to import, eg: "pyarrow". + + Notes + ----- + If the requested module is not available (eg: has not been installed), a proxy + module is created in its place, which raises an exception on any attribute + access. This allows for import and use as normal, without requiring explicit + guard conditions - if the module is never used, no exception occurs; if it + is, then a helpful exception is raised. + + Returns + ------- + tuple of (Module, bool) + A lazy-loading module and a boolean indicating if the requested/underlying + module exists (if not, the returned module is a proxy). + """ + # check if module is LOADED + if module_name in sys.modules: + return sys.modules[module_name], True + + # check if module is AVAILABLE + try: + module_spec = find_spec(module_name) + module_available = not (module_spec is None or module_spec.loader is None) + except ModuleNotFoundError: + module_available = False + + # create lazy/proxy module that imports the real one on first use + # (or raises an explanatory ModuleNotFoundError if not available) + return ( + _LazyModule( + module_name=module_name, + module_available=module_available, + ), + module_available, + ) + + +if TYPE_CHECKING: + import geopandas + import polars + import polars_st + import rioxarray + import xarray +else: + geopandas, GEOPANDAS_AVAILABLE = _lazy_import("geopandas") + polars, POLARS_AVAILABLE = _lazy_import("polars") + polars_st, POLARS_ST_AVAILABLE = _lazy_import("polars_st") + xarray, XARRAY_AVAILABLE = _lazy_import("xarray") + rioxarray, RIOXARRAY_AVAILABLE = _lazy_import("rioxarray") + + +def _xarray_available() -> bool: + return XARRAY_AVAILABLE and RIOXARRAY_AVAILABLE + + +def _polars_available() -> bool: + return POLARS_AVAILABLE + + +@cache +def _might_be(cls: type, type_: str) -> bool: + """Infer if a class hierarchy contains a specific module name.""" + try: + return any(f"{type_}." in str(o) for o in cls.mro()) + except TypeError: + return False + + +def _check_for_geopandas(obj: Any) -> bool: + return GEOPANDAS_AVAILABLE and _might_be(type(obj), "geopandas") + + +def _check_for_polars_st(obj: Any) -> bool: + return POLARS_ST_AVAILABLE and _might_be(type(obj), "polars") + + +__all__ = [ + "_check_for_geopandas", + "_check_for_polars_st", + "_xarray_available", + "geopandas", + "polars", + "xarray", +] diff --git a/python/rusterize/rusterize.pyi b/python/rusterize/rusterize.pyi index 8887c7b..bb7661a 100644 --- a/python/rusterize/rusterize.pyi +++ b/python/rusterize/rusterize.pyi @@ -1,16 +1,15 @@ -from typing import List, Tuple - import numpy as np -from geopandas import GeoDataFrame -from polars import DataFrame -from xarray import DataArray, Dataset + +from ._dependencies import geopandas as gpd +from ._dependencies import polars as pl +from ._dependencies import xarray as xr def rusterize( - gdf: GeoDataFrame, - like: DataArray | Dataset | None = None, - res: Tuple | List | None = None, - out_shape: Tuple | List | None = None, - extent: Tuple | List | None = None, + data: gpd.GeoDataFrame | pl.DataFrame | list | np.ndarray, + like: xr.DataArray | xr.Dataset | None = None, + res: tuple | list | None = None, + out_shape: tuple | list | None = None, + extent: tuple | list | None = None, field: str | None = None, by: str | None = None, burn: int | float | None = None, @@ -20,42 +19,62 @@ def rusterize( all_touched: bool = False, tap: bool = False, dtype: str = "float64", -) -> DataArray | np.ndarray | SparseArray: +) -> xr.DataArray | np.ndarray | SparseArray: """ - Fast geopandas rasterization in Rust. - - Args: - :param gdf: geopandas dataframe to rasterize. - :param like: array to use as blueprint for spatial matching (resolution, shape, extent). Mutually exlusive with res, out_shape, and extent. - :param res: (xres, yres) for rasterized data. - :param out_shape: (nrows, ncols) for regularized output shape. - :param extent: (xmin, ymin, xmax, ymax) for regularized extent. - :param field: field to rasterize, mutually exclusive with `burn`. Default is None. - :param by: column to rasterize, assigns each unique value to a layer in the stack based on field. Default is None. - :param burn: burn a value onto the raster, mutually exclusive with `field`. Default is None. - :param fun: pixel function to use. Available options are `sum`, `first`, `last`, `min`, `max`, `count`, or `any`. Default is `last`. - :param background: background value in final raster. Default is np.nan. - :param encoding: return a dense array (burned geometries onto a raster) or a sparse array in COOrdinate format (coordinates and values of the rasterized geometries). Available options are `xarray`, `numpy`, or `sparse`. The `xarray` encoding requires `xarray` and `rioxarray` to be installed. Default is `xarray`. - :param all_touched: if True, every pixel touched by the geometry is burned. Default is `False`. - :param tap: target aligned pixel to align the extent to the pixel resolution. Defaul is `False`. - :param dtype: specify the output dtype. Default is `float64`. - - Returns: + Fast geometry rasterization in Rust. + + Parameters + ---------- + data : geopandas.GeoDataFrame, polars.DataFrame, list, numpy.ndarray + Input data to rasterize. + - If polars.DataFrame, it must be have a "geometry" column with geometries stored in WKB or WKT format. + - If list or numpy.ndarray, geometries must be in WKT, WKB, or shapely formats (EPSG is not inferred and defaults to None). + like : xarray.DataArray or xarray.Dataset (default: None) + Template array used as a spatial blueprint (resolution, shape, extent). Mutually exclusive with `res`, `out_shape`, and `extent`. Requires xarray and rioxarray. + res : tuple or list (default: None) + Pixel resolution defined as (xres, yres). + out_shape : tuple or list (default: None) + Output raster dimensions defined as (nrows, ncols). + extent : `tuple` or `list` (default: None) + Spatial bounding box defined as `(xmin, ymin, xmax, ymax)`. + field : `str` (default: None) + Column name to use for pixel values. Mutually exclusive with `burn`. Not considered when input is list or numpy.ndarray. + by : `str` (default: None) + Column used for grouping. Each group is rasterized into a distinct band in the output. Not considered when input is list or numpy.ndarray. + burn : `int` or `float` (default: None) + A static value to apply to all geometries. Mutually exclusive with `field`. + fun : `str` (default: "last") + Pixel function to use when burning geometries. Available options: `sum`, `first`, `last`, `min`, `max`, `count`, or `any`. + background : `int` or `float` (default: numpy.nan) + Value assigned to pixels not covered by any geometry. + encoding : `str` (default: "xarray") + The format of the returned object: `"xarray"`, `"numpy"`, or `"sparse"`. + all_touched : `bool` (default: False) + If True, every pixel touched by a geometry is burned. + tap : `bool` (default: False) + Target Aligned Pixels: aligns the extent to the pixel resolution. + dtype : `str` (default: "float64") + Output data type (e.g., `uint8`, `int32`, `float32`). + + Returns + ------- xarray.DataArray, numpy.ndarray, or a sparse array in COO format. - Notes: - If `encoding` is `numpy`, the array is returned without any spatial reference. + Notes + ----- + If `encoding` is "numpy" or input is list or numpy.ndarray, the return array is without any spatial reference. When any of `res`, `out_shape`, or `extent` is not provided, it is inferred from the other arguments when applicable. - If `like` is specified, `res`, `out_shape`, and `extent` are inferred from the `like` DataArray. + If `like` is specified, `res`, `out_shape`, and `extent` are inferred from the `like` DataArray or Dataset. Unless `extent` is specified, a half-pixel buffer is applied to avoid missing points on the border. The logics dictating the final spatial properties of the rasterized geometries follow those of GDAL. - If `field` is not in `gdf`, then a default `burn` value of 1 is rasterized. + If `field` is not in `data`, then a default `burn` value of 1 is rasterized. A `None` value for `dtype` corresponds to the default of that dtype. An illegal value for a dtype will be replaced with the default of that dtype. For example, a `background=np.nan` for `dtype="uint8"` will become `background=0`, where `0` is the default for `uint8`. """ class SparseArray: - def to_xarray(self) -> DataArray: ... - def to_frame(self) -> DataFrame: ... + def to_xarray(self) -> xr.DataArray: ... + def to_numpy(self) -> np.ndarray: ... + def to_frame(self) -> pl.DataFrame: ... diff --git a/rustfmt.toml b/rustfmt.toml index 2468c97..69bf4fa 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -1,2 +1,5 @@ +# https://github.com/rust-lang/rustfmt?tab=readme-ov-file#rusts-editions +edition = "2024" +style_edition = "2024" imports_granularity = "Crate" max_width = 120 diff --git a/src/encoding/arrays.rs b/src/encoding/arrays.rs index 255efe9..5088832 100644 --- a/src/encoding/arrays.rs +++ b/src/encoding/arrays.rs @@ -15,6 +15,7 @@ use numpy::{Element, IntoPyArray}; use polars::prelude::*; use pyo3::prelude::*; use pyo3_polars::PyDataFrame; +use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; pub struct DenseArray { raster: Array3, @@ -41,9 +42,7 @@ where let data = self.raster.into_pyarray(py); if opt_flags.with_xarray_output() { - // check dependencies before building the array - let xarray_module = py.import("xarray")?; - let xarray = build_xarray(py, xarray_module, self.raster_info, data, self.band_names)?; + let xarray = build_xarray(py, self.raster_info, data, self.band_names)?; Ok(PyOut::Dense(xarray)) } else { Ok(PyOut::Dense(data.into_any())) @@ -73,7 +72,10 @@ pub struct SparseArray { background: N, } -impl SparseArray { +impl SparseArray +where + N: Num + Copy, +{ pub fn new( band_names: Vec, rows: Vec, @@ -118,9 +120,9 @@ impl SparseArray { } } -impl PySparseArrayTraits for SparseArray +impl PySparseArrayTraits for SparseArray where - N: Num + Element + Copy + PolarsHandler, + T: Num + Element + Copy + PolarsHandler, { // estimated size of the materialized array fn size_str(&self) -> String { @@ -160,21 +162,11 @@ where } fn to_xarray<'py>(&self, py: Python<'py>) -> PyResult> { - // check dependencies before building the array - let xarray_module = py.import("xarray")?; - py.import("rioxarray")?; - let raster = self.build_raster(); let data = raster.into_pyarray(py); - build_xarray( - py, - xarray_module, - self.raster_info.clone(), - data, - self.band_names.clone(), - ) + build_xarray(py, self.raster_info.clone(), data, self.band_names.clone()) } fn to_numpy<'py>(&self, py: Python<'py>) -> PyResult> { @@ -198,23 +190,24 @@ where columns.push(bands_column); } - let rows = self.triplets.rows.iter().map(|v| *v as u64).collect::>(); + let rows = self.triplets.rows.par_iter().map(|v| *v as u64).collect::>(); + let length = rows.len(); columns.push(Column::new("row".into(), rows)); - let cols = self.triplets.cols.iter().map(|v| *v as u64).collect::>(); + let cols = self.triplets.cols.par_iter().map(|v| *v as u64).collect::>(); columns.push(Column::new("col".into(), cols)); - columns.push(N::from_named_vec("data", &self.triplets.data)); + columns.push(T::from_named_vec("data", &self.triplets.data)); - let df = DataFrame::new(columns).unwrap(); + let df = DataFrame::new(length, columns).unwrap(); PyDataFrame(df) } } // conversion to python -impl Pythonize for SparseArray +impl Pythonize for SparseArray where - N: Num + Element + Copy + PolarsHandler + 'static, + T: Num + Element + Copy + PolarsHandler + 'static, { fn pythonize(self, _py: Python, _opt_flags: OptFlags) -> PyResult { Ok(PyOut::Sparse(PySparseArray(Arc::new(self)))) diff --git a/src/encoding/build_xarray.rs b/src/encoding/build_xarray.rs index 0771a1e..5900e53 100644 --- a/src/encoding/build_xarray.rs +++ b/src/encoding/build_xarray.rs @@ -3,7 +3,6 @@ Build xarray object from a dictionary. The xarray module is passed as a function argument to avoid importing it twice for DenseSparse and SparseArray - */ use crate::geo::raster::RasterInfo; @@ -16,7 +15,6 @@ use pyo3::{ pub fn build_xarray<'py, T>( py: Python<'py>, - xarray_module: Bound<'py, PyModule>, raster_info: RasterInfo, data: Bound<'py, PyArray3>, band_names: Vec, @@ -24,6 +22,9 @@ pub fn build_xarray<'py, T>( where T: Num + Element, { + let xarray_module = py.import("xarray")?; + py.import("rioxarray")?; + let (y, x) = raster_info.make_coordinates(py); let bands = PyList::new(py, band_names)?; let dims = PyList::new(py, vec!["bands", "y", "x"])?; diff --git a/src/geo/edges.rs b/src/geo/edges.rs index 2a55c0d..e09a99b 100644 --- a/src/geo/edges.rs +++ b/src/geo/edges.rs @@ -43,7 +43,7 @@ impl PolyEdge { x0: x_top, y0: y_top, dxdy, - x_at_yline: f64::NAN, // dummy + x_at_yline: f64::INFINITY, // dummy } } @@ -106,8 +106,8 @@ pub fn extract_ring(edges: &mut Vec, line: &LineString, raster_in for i in 0..nrows { // world-to-pixel conversion let x0 = (node_array[[i, 0]] - raster_info.xmin) / raster_info.xres; - let x1 = (node_array[[i + 1, 0]] - raster_info.xmin) / raster_info.xres; let y0 = (raster_info.ymax - node_array[[i, 1]]) / raster_info.yres; + let x1 = (node_array[[i + 1, 0]] - raster_info.xmin) / raster_info.xres; let y1 = (raster_info.ymax - node_array[[i + 1, 1]]) / raster_info.yres; // skip horizontal diff --git a/src/geo/from_shapely.rs b/src/geo/from_shapely.rs deleted file mode 100644 index 63028d7..0000000 --- a/src/geo/from_shapely.rs +++ /dev/null @@ -1,60 +0,0 @@ -/* -Serialize geopandas geoemetries into WKB for Rust and deserialize into geo::Geometry -This is faster than parsing geometries directly via __geo_interface__ -Adapted from https://github.com/geoarrow/geoarrow-rs/blob/main/python/geoarrow-core/src/interop/shapely/from_shapely.rs - */ - -use geo_traits::to_geo::ToGeoGeometry; -use geo_types::Geometry; -use pyo3::{ - exceptions::PyValueError, - intern, - prelude::*, - pybacked::PyBackedBytes, - types::{PyAny, PyDict}, -}; -use wkb::reader::read_wkb; - -fn parse_wkb_to_geometry(wkb: &[u8]) -> Option> { - let wkb_result = read_wkb(wkb).unwrap(); - ToGeoGeometry::try_to_geometry(&wkb_result) -} - -fn import_shapely(py: Python) -> PyResult> { - let shapely_mod = py.import(intern!(py, "shapely"))?; - let shapely_version_string = shapely_mod.getattr(intern!(py, "__version__"))?.extract::()?; - if !shapely_version_string.starts_with('2') { - Err(PyValueError::new_err("Shapely version 2 required")) - } else { - Ok(shapely_mod) - } -} - -fn to_wkb<'a>(py: Python<'a>, shapely_mod: &'a Bound, input: &'a Bound) -> PyResult> { - let args = (input,); - - let kwargs = PyDict::new(py); - kwargs.set_item("output_dimension", 2)?; - kwargs.set_item("include_srid", false)?; - kwargs.set_item("flavor", "iso")?; - - shapely_mod.call_method(intern!(py, "to_wkb"), args, Some(&kwargs)) -} - -pub fn from_shapely(py: Python, input: &Bound) -> PyResult>> { - // call `shapely.to_wkb` - let shapely_mod = import_shapely(py)?; - let wkb_result = to_wkb(py, &shapely_mod, input)?; - - // build vector of binary geometries - let mut wkb_output = Vec::with_capacity(wkb_result.len()?); - for item in wkb_result.try_iter()? { - // extract bytes and deserialize - let buf = item?.extract::()?; - if let Some(parsed) = parse_wkb_to_geometry(&buf) { - wkb_output.push(parsed); - } - } - - Ok(wkb_output) -} diff --git a/src/geo/parse_geometry.rs b/src/geo/parse_geometry.rs new file mode 100644 index 0000000..717b87b --- /dev/null +++ b/src/geo/parse_geometry.rs @@ -0,0 +1,177 @@ +/* +Serialize geopandas geoemetries into WKB for Rust and deserialize into geo_types::Geometry +This is faster than parsing geometries directly via __geo_interface__ + */ + +use geo_traits::to_geo::ToGeoGeometry; +use geo_types::Geometry; +use polars::{datatypes::DataType, error::PolarsError, prelude::*}; +use pyo3::{ + Bound, + exceptions::{PyTypeError, PyValueError}, + intern, + prelude::*, + pybacked::PyBackedBytes, + types::{PyAny, PyBytes, PyDict, PyList, PyString}, +}; +use pyo3_polars::PySeries; +use rayon::iter::ParallelIterator; +use std::ops::Deref; +use wkb::reader::read_wkb; +use wkt::TryFromWkt; + +pub struct ParsedGeometry(pub Vec>); + +impl ParsedGeometry { + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn get(&self, index: usize) -> Option<&Geometry> { + self.0.get(index) + } +} + +impl<'a> IntoIterator for &'a ParsedGeometry { + type Item = &'a Geometry; + type IntoIter = std::slice::Iter<'a, Geometry>; + + fn into_iter(self) -> Self::IntoIter { + self.0.iter() + } +} + +impl Deref for ParsedGeometry { + type Target = [Geometry]; + + fn deref(&self) -> &Self::Target { + self.0.as_slice() + } +} + +impl FromPyObject<'_, '_> for ParsedGeometry { + type Error = PyErr; + + fn extract(obj: Borrowed<'_, '_, PyAny>) -> PyResult { + // geopandas.GeoDataFrame + if obj.hasattr("geom_type")? { + let py = obj.py(); + + // shapely >= 2.0.0 + let shapely_mod = py.import(intern!(py, "shapely"))?; + let shapely_version_string = shapely_mod.getattr(intern!(py, "__version__"))?.extract::()?; + if !shapely_version_string.starts_with('2') { + return Err(PyValueError::new_err("Shapely version 2 required")); + } + + let wkb_result = to_wkb(py, &shapely_mod, &obj)?; + + return parse_iterable_wkb(&wkb_result); + } + + if obj.is_instance_of::() || obj.get_type().name()? == "ndarray" { + if obj.is_empty()? { + return Err(PyValueError::new_err("No geometries found.")); + } + + // check first item to determine parsing strategy + let first = obj.get_item(0)?; + if first.is_instance_of::() { + return parse_iterable_wkb(&obj); + } else if first.is_instance_of::() { + return parse_iterable_wkt(&obj); + } else { + return Err(PyValueError::new_err( + "Iterable must contain geometries as bytes (WKB) or string (WKT).", + )); + } + } + + if let Ok(pyseries) = obj.extract::() { + let series: Series = pyseries.into(); + return parse_polars_series(series).map_err(|e| PyTypeError::new_err(e.to_string())); + } + + Err(PyTypeError::new_err("Unsupported geometry input type.")) + } +} + +fn try_parse_wkb_to_geometry(wkb: &[u8]) -> Option> { + let wkb_result = read_wkb(wkb).expect( + "Cannot parse geometry. Check that the WKB bytes are valid. \ + This may happen when you convert a list of WKB stored as python 'object' into a numpy array.", + ); + ToGeoGeometry::try_to_geometry(&wkb_result) +} + +fn try_parse_wkt_to_geometry(wkt: &str) -> Option> { + Some(Geometry::try_from_wkt_str(wkt).unwrap()) +} + +fn to_wkb<'a>( + py: Python<'a>, + shapely_mod: &'a Bound, + input: &Bound<'a, PyAny>, +) -> PyResult> { + let args = (input,); + + let kwargs = PyDict::new(py); + kwargs.set_item("output_dimension", 2)?; + kwargs.set_item("include_srid", false)?; + kwargs.set_item("flavor", "iso")?; + + shapely_mod.call_method(intern!(py, "to_wkb"), args, Some(&kwargs)) +} + +fn parse_iterable_wkb(input: &Bound) -> PyResult { + let mut geoms = Vec::with_capacity(input.len()?); + for item in input.try_iter()? { + let buf = item?.extract::()?; + if let Some(parsed) = try_parse_wkb_to_geometry(&buf) { + geoms.push(parsed); + } + } + + if geoms.is_empty() { + return Err(PyValueError::new_err( + "Could not parse geometry. Only WKT or WKB formats are supported.", + )); + } + + Ok(ParsedGeometry(geoms)) +} + +fn parse_iterable_wkt(input: &Bound<'_, PyAny>) -> PyResult { + let mut geoms = Vec::with_capacity(input.len().unwrap_or(0)); + for item in input.try_iter()? { + let s = item?.extract::()?; + if let Some(parsed) = try_parse_wkt_to_geometry(&s) { + geoms.push(parsed); + } + } + + if geoms.is_empty() { + return Err(PyValueError::new_err( + "Could not parse geometry. Only WKT or WKB formats are supported.", + )); + } + + Ok(ParsedGeometry(geoms)) +} + +fn parse_polars_series(input: Series) -> Result { + let wkb_output = match input.dtype() { + DataType::Binary => input + .binary()? + .iter() + .filter_map(|item| item.and_then(try_parse_wkb_to_geometry)) + .collect(), + DataType::String => input + .str()? + .par_iter() + .filter_map(|item| item.and_then(try_parse_wkt_to_geometry)) + .collect(), + _ => unimplemented!("Unsupported dtype for geometry column"), + }; + Ok(ParsedGeometry(wkb_output)) +} diff --git a/src/geo/raster.rs b/src/geo/raster.rs index 617741b..7f186dc 100644 --- a/src/geo/raster.rs +++ b/src/geo/raster.rs @@ -1,5 +1,7 @@ /* Structure to contain information on raster data */ +use geo::BoundingRect; +use geo_types::{Geometry, Rect, coord}; use num_traits::Num; use numpy::{ IntoPyArray, PyArray1, @@ -22,31 +24,22 @@ pub struct RasterInfo { #[derive(FromPyObject)] #[pyo3(from_item_all)] -struct RawRasterInfo { +pub struct RawRasterInfo { ncols: usize, nrows: usize, xmin: f64, - xmax: f64, ymin: f64, + xmax: f64, ymax: f64, xres: f64, yres: f64, - has_extent: bool, + with_user_extent: bool, tap: bool, epsg: Option, } -impl<'py> FromPyObject<'py> for RasterInfo { - fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult { - let raw: RawRasterInfo = ob.extract()?; - let info = RasterInfo::from(raw); - Ok(info) - } -} - impl RasterInfo { - #[inline] - fn from(raw: RawRasterInfo) -> Self { + pub fn from(raw: RawRasterInfo, geoms: &[Geometry]) -> Self { let mut info = RasterInfo { ncols: raw.ncols, nrows: raw.nrows, @@ -59,11 +52,36 @@ impl RasterInfo { epsg: raw.epsg, }; + if info.xmin.is_infinite() { + // list or numpy.ndarray do not carry bounding information + let bounds = geoms.iter().fold(None, |acc, geom| { + let bounds = geom.bounding_rect(); + + match (acc, bounds) { + (None, None) => None, + (None, Some(r)) | (Some(r), None) => Some(r), + (Some(r1), Some(r2)) => Some(Rect::new( + coord! { x: r1.min().x.min(r2.min().x), y: r1.min().y.min(r2.min().y) }, + coord! { x: r1.max().x.max(r2.max().x), y: r1.max().y.max(r2.max().y) }, + )), + } + }); + + if let Some(b) = bounds { + info.xmin = b.min().x; + info.ymin = b.min().y; + info.xmax = b.max().x; + info.ymax = b.max().y; + } else { + panic!("Cannot infer bounding box from geometry.") + } + } + let has_res = info.xres != 0.0; let has_shape = info.nrows != 0; // extent by half pixel if custom extent not provided - if !raw.has_extent && !raw.tap && has_res { + if !raw.with_user_extent && !raw.tap && has_res { info.xmin -= info.xres / 2.0; info.xmax += info.xres / 2.0; info.ymin -= info.yres / 2.0; @@ -71,7 +89,7 @@ impl RasterInfo { } if !has_res { - info.resolution(); + info.assign_resolution(); } else if raw.tap && has_res { info.xmin = (info.xmin / info.xres).floor() * info.xres; info.xmax = (info.xmax / info.xres).ceil() * info.xres; @@ -80,20 +98,20 @@ impl RasterInfo { } if !has_shape { - info.shape(); + info.assign_shape(); } info } #[inline] - fn shape(&mut self) { + fn assign_shape(&mut self) { self.nrows = (0.5 + (self.ymax - self.ymin) / self.yres) as usize; self.ncols = (0.5 + (self.xmax - self.xmin) / self.xres) as usize } #[inline] - fn resolution(&mut self) { + fn assign_resolution(&mut self) { self.xres = (self.xmax - self.xmin) / self.ncols as f64; self.yres = (self.ymax - self.ymin) / self.nrows as f64; } diff --git a/src/lib.rs b/src/lib.rs index 3dce591..476342a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,7 @@ mod allocator; mod geo { pub mod edges; - pub mod from_shapely; + pub mod parse_geometry; pub mod raster; } mod encoding { @@ -21,23 +21,37 @@ mod prelude; use crate::{ encoding::pyarrays::{PyOut, Pythonize}, - geo::from_shapely::from_shapely, + geo::parse_geometry::ParsedGeometry, prelude::*, rasterization::{ pixel_functions::set_pixel_function, - rusterize_impl::{Rasterize, rusterize_impl}, + prepare_dataframe::cast_df, + rusterize_impl::{Rasterize, RasterizeContext}, }, }; -use geo::raster::RasterInfo; -use geo_types::Geometry; +use geo::raster::{RasterInfo, RawRasterInfo}; use num_traits::Num; -use numpy::Element; use polars::prelude::DataFrame; -use pyo3::{prelude::*, types::PyAny}; +use pyo3::{conversion::FromPyObject, prelude::*, types::PyAny}; use pyo3_polars::PyDataFrame; +macro_rules! dispatch_rusterize { + ( + $dtype:expr, $encoding:expr, $py:expr, $ctx:expr, + [ $( ($str_val:pat, $rust_type:ty) ),* ] + ) => { + match ($dtype, $encoding) { + $( + ($str_val, "xarray" | "numpy") => rusterize_impl::<$rust_type, Dense>($py, $ctx), + ($str_val, "sparse") => rusterize_impl::<$rust_type, Sparse>($py, $ctx), + )* + _ => unimplemented!("Invalid dtype or encoding combination provided."), + } + }; +} + struct Context<'py> { - geometry: Vec, + geometry: ParsedGeometry, raster_info: RasterInfo, pypixel_fn: &'py str, pybackground: Option<&'py Bound<'py, PyAny>>, @@ -48,10 +62,9 @@ struct Context<'py> { opt_flags: OptFlags, } -#[allow(clippy::too_many_arguments)] -fn execute_rusterize<'py, T, R>(py: Python<'py>, ctx: Context<'py>) -> PyResult> +fn rusterize_impl<'py, T, R>(py: Python<'py>, ctx: Context<'py>) -> PyResult> where - T: Num + Copy + PixelOps + PolarsHandler + FromPyObject<'py> + Element + Default + 'static, + T: Num + Copy + PolarsHandler + Default + PixelOps + for<'a> FromPyObject<'a, 'py>, R: Rasterize, R::Output: Pythonize, { @@ -60,38 +73,41 @@ where .and_then(|inner| inner.extract().ok()) .unwrap_or_default(); let burn = ctx.pyburn.and_then(|inner| inner.extract().ok()).unwrap_or(T::one()); - let pixel_fn = set_pixel_function(ctx.pypixel_fn); - // rusterize - let ret = rusterize_impl::( - ctx.geometry, - ctx.raster_info, + // extract column from dataframe (cloning is cheap) + let casted = cast_df(ctx.df, ctx.pyfield, ctx.pyby, burn, ctx.geometry.len()); + let field = casted.column("field_casted").unwrap().clone(); + let by = casted.column("by_str").ok().and_then(|by| by.str().ok()); + + // rasterize + let rctx = RasterizeContext { + raster_info: ctx.raster_info, + geometry: ctx.geometry, + field, pixel_fn, background, - ctx.df, - ctx.pyfield, - ctx.pyby, - burn, - ctx.opt_flags, - ); + opt_flags: ctx.opt_flags, + }; + + let ret = R::rasterize(rctx, by); ret.pythonize(py, ctx.opt_flags) } #[pyfunction] #[pyo3(name = "_rusterize")] -#[pyo3(signature = (pygeometry, pyinfo, pypixel_fn, pydf=None, pyfield=None, pyby=None, pyburn=None, pybackground=None, pytouched=false, pyencoding="xarray", pydtype="float64"))] +#[pyo3(signature = (geometry, raw_raster_info, pypixel_fn, pydf=None, pyfield=None, pyby=None, pyburn=None, pybackground=None, pytouched=false, pyencoding="xarray", pydtype="float64"))] #[allow(clippy::too_many_arguments)] fn rusterize_py<'py>( py: Python<'py>, - pygeometry: &Bound<'py, PyAny>, - pyinfo: &Bound<'py, PyAny>, + geometry: ParsedGeometry, + raw_raster_info: RawRasterInfo, pypixel_fn: &'py str, pydf: Option, pyfield: Option<&'py str>, pyby: Option<&'py str>, - pyburn: Option<&'py Bound<'py, PyAny>>, - pybackground: Option<&'py Bound<'py, PyAny>>, + pyburn: Option<&'py Bound>, + pybackground: Option<&'py Bound>, pytouched: bool, pyencoding: &str, pydtype: &str, @@ -99,11 +115,8 @@ fn rusterize_py<'py>( // extract dataframe let df: Option = pydf.map(|inner| inner.into()); - // parse geometries - let geometry = from_shapely(py, pygeometry)?; - - // extract raster information - let raster_info = RasterInfo::extract_bound(pyinfo)?; + // construct raster info + let raster_info = RasterInfo::from(raw_raster_info, &geometry); // optional runtime flags let opt_flags = OptFlags::new(pytouched, pyencoding, pypixel_fn); @@ -120,46 +133,28 @@ fn rusterize_py<'py>( opt_flags, }; - match (pydtype, pyencoding) { - ("uint8", "xarray" | "numpy") => execute_rusterize::(py, ctx), - ("uint8", "sparse") => execute_rusterize::(py, ctx), - - ("uint16", "xarray" | "numpy") => execute_rusterize::(py, ctx), - ("uint16", "sparse") => execute_rusterize::(py, ctx), - - ("uint32", "xarray" | "numpy") => execute_rusterize::(py, ctx), - ("uint32", "sparse") => execute_rusterize::(py, ctx), - - ("uint64", "xarray" | "numpy") => execute_rusterize::(py, ctx), - ("uint64", "sparse") => execute_rusterize::(py, ctx), - - ("int8", "xarray" | "numpy") => execute_rusterize::(py, ctx), - ("int8", "sparse") => execute_rusterize::(py, ctx), - - ("int16", "xarray" | "numpy") => execute_rusterize::(py, ctx), - ("int16", "sparse") => execute_rusterize::(py, ctx), - - ("int32", "xarray" | "numpy") => execute_rusterize::(py, ctx), - ("int32", "sparse") => execute_rusterize::(py, ctx), - - ("int64", "xarray" | "numpy") => execute_rusterize::(py, ctx), - ("int64", "sparse") => execute_rusterize::(py, ctx), - - ("float32", "xarray" | "numpy") => execute_rusterize::(py, ctx), - ("float32", "sparse") => execute_rusterize::(py, ctx), - - ("float64", "xarray" | "numpy") => execute_rusterize::(py, ctx), - ("float64", "sparse") => execute_rusterize::(py, ctx), - - _ => unimplemented!( - "`dtype` must be one of uint8, uint16, uint32, uint64, int8, int16, int32, int64, float32, float64; \ - and `encoding` must be either 'xarray', 'numpy', or 'sparse'" - ), - } + dispatch_rusterize!( + pydtype, + pyencoding, + py, + ctx, + [ + ("uint8", u8), + ("uint16", u16), + ("uint32", u32), + ("uint64", u64), + ("int8", i8), + ("int16", i16), + ("int32", i32), + ("int64", i64), + ("float32", f32), + ("float64", f64) + ] + ) } #[pymodule] -fn rusterize(m: &Bound<'_, PyModule>) -> PyResult<()> { +fn rusterize(m: &Bound) -> PyResult<()> { m.add_function(wrap_pyfunction!(rusterize_py, m)?)?; Ok(()) } diff --git a/src/rasterization/burn_geometry.rs b/src/rasterization/burn_geometry.rs index fae4351..762600d 100644 --- a/src/rasterization/burn_geometry.rs +++ b/src/rasterization/burn_geometry.rs @@ -6,7 +6,6 @@ use crate::{ edges::{LineEdge, PolyEdge, extract_line, extract_point, extract_ring}, raster::RasterInfo, }, - prelude::OptFlags, rasterization::{ burners::{LineBurnStrategy, burn_point, burn_polygon}, rusterize_impl::PixelCache, @@ -20,37 +19,22 @@ where T: Num + Copy, W: PixelWriter, { - fn burn( - &self, - raster_info: &RasterInfo, - field_value: T, - writer: &mut W, - background: T, - opt_flags: &OptFlags, - ); + fn burn(&self, raster_info: &RasterInfo, field_value: T, writer: &mut W, background: T); } -#[rustfmt::skip] impl Burn for Geometry where T: Num + Copy, W: PixelWriter, { - fn burn( - &self, - raster_info: &RasterInfo, - field_value: T, - writer: &mut W, - background: T, - opt_flags: &OptFlags, - ) { + fn burn(&self, raster_info: &RasterInfo, field_value: T, writer: &mut W, background: T) { match self { Geometry::Point(geom) => { let mut pointedge = Vec::new(); extract_point(&mut pointedge, geom, raster_info); burn_point(pointedge, field_value, writer, background); - }, + } Geometry::MultiPoint(geom) => { let mut pointedge = Vec::new(); for point in geom { @@ -58,23 +42,13 @@ where } burn_point(pointedge, field_value, writer, background); - }, - Geometry::Polygon(geom) => { - geom.burn::(raster_info, field_value, writer, background, opt_flags) - } - Geometry::MultiPolygon(geom) => { - geom.burn::(raster_info, field_value, writer, background, opt_flags) - } - Geometry::LineString(geom) => { - geom.burn::(raster_info, field_value, writer, background, opt_flags) - } - Geometry::MultiLineString(geom) => { - geom.burn::(raster_info, field_value, writer, background, opt_flags) } - Geometry::GeometryCollection(geom) => { - geom.burn::(raster_info, field_value, writer, background, opt_flags) - }, - _ => () // not a shapely geometry + Geometry::Polygon(geom) => geom.burn::(raster_info, field_value, writer, background), + Geometry::MultiPolygon(geom) => geom.burn::(raster_info, field_value, writer, background), + Geometry::LineString(geom) => geom.burn::(raster_info, field_value, writer, background), + Geometry::MultiLineString(geom) => geom.burn::(raster_info, field_value, writer, background), + Geometry::GeometryCollection(geom) => geom.burn::(raster_info, field_value, writer, background), + _ => (), // not a shapely geometry } } } @@ -84,34 +58,19 @@ where T: Num + Copy, W: PixelWriter, { - fn burn( - &self, - raster_info: &RasterInfo, - field_value: T, - writer: &mut W, - background: T, - opt_flags: &OptFlags, - ) { + fn burn(&self, raster_info: &RasterInfo, field_value: T, writer: &mut W, background: T) { for geom in self { - geom.burn::(raster_info, field_value, writer, background, opt_flags) + geom.burn::(raster_info, field_value, writer, background) } } } -#[rustfmt::skip] impl Burn for Polygon where T: Num + Copy, W: PixelWriter, { - fn burn( - &self, - raster_info: &RasterInfo, - field_value: T, - writer: &mut W, - background: T, - opt_flags: &OptFlags, - ) { + fn burn(&self, raster_info: &RasterInfo, field_value: T, writer: &mut W, background: T) { // extract edges let mut polyedges = Vec::new(); extract_ring(&mut polyedges, self.exterior(), raster_info); @@ -127,7 +86,7 @@ where extract_line(&mut linedges, hole, raster_info); } - let pixel_cache = if opt_flags.requires_deduplication() { + let pixel_cache = if S::REQUIRES_DEDUPLICATION { Some(PixelCache::new(&linedges)) } else { None @@ -138,24 +97,24 @@ where (None, None) }; - handle_polygon::(raster_info, polyedges, linedges, &mut pixel_cache, field_value, writer, background) + handle_polygon::( + raster_info, + polyedges, + linedges, + &mut pixel_cache, + field_value, + writer, + background, + ) } } -#[rustfmt::skip] impl Burn for MultiPolygon where T: Num + Copy, W: PixelWriter, { - fn burn( - &self, - raster_info: &RasterInfo, - field_value: T, - writer: &mut W, - background: T, - opt_flags: &OptFlags, - ) { + fn burn(&self, raster_info: &RasterInfo, field_value: T, writer: &mut W, background: T) { // extract edges for all polygon let mut polyedges = Vec::new(); for polygon in self { @@ -175,7 +134,7 @@ where } } - let pixel_cache = if opt_flags.requires_deduplication() { + let pixel_cache = if S::REQUIRES_DEDUPLICATION { Some(PixelCache::new(&linedges)) } else { None @@ -186,7 +145,15 @@ where (None, None) }; - handle_polygon::(raster_info, polyedges, linedges, &mut pixel_cache, field_value, writer, background) + handle_polygon::( + raster_info, + polyedges, + linedges, + &mut pixel_cache, + field_value, + writer, + background, + ) } } @@ -195,20 +162,13 @@ where T: Num + Copy, W: PixelWriter, { - fn burn( - &self, - raster_info: &RasterInfo, - field_value: T, - writer: &mut W, - background: T, - opt_flags: &OptFlags, - ) { + fn burn(&self, raster_info: &RasterInfo, field_value: T, writer: &mut W, background: T) { // extract exterior and interior lines let mut linedges = Vec::new(); extract_line(&mut linedges, self, raster_info); // handle cases when pixels are not squares - if raster_info.xres != raster_info.yres || opt_flags.requires_deduplication() { + if raster_info.xres != raster_info.yres || S::REQUIRES_DEDUPLICATION { let mut cache = PixelCache::new(&linedges); let mut line_writer = LineWriter::new(writer, &mut cache); S::burn_line(linedges, raster_info, field_value, &mut line_writer, background) @@ -223,14 +183,7 @@ where T: Num + Copy, W: PixelWriter, { - fn burn( - &self, - raster_info: &RasterInfo, - field_value: T, - writer: &mut W, - background: T, - opt_flags: &OptFlags, - ) { + fn burn(&self, raster_info: &RasterInfo, field_value: T, writer: &mut W, background: T) { // extract all edges first to avoid overlaps when a line ends at the beginning of another let mut linedges = Vec::new(); for line in self { @@ -238,7 +191,7 @@ where } // handle cases when pixels are not squares - if raster_info.xres != raster_info.yres || opt_flags.requires_deduplication() { + if raster_info.xres != raster_info.yres || S::REQUIRES_DEDUPLICATION { let mut cache = PixelCache::new(&linedges); let mut line_writer = LineWriter::new(writer, &mut cache); S::burn_line(linedges, raster_info, field_value, &mut line_writer, background) diff --git a/src/rasterization/burners.rs b/src/rasterization/burners.rs index 6214286..7c95366 100644 --- a/src/rasterization/burners.rs +++ b/src/rasterization/burners.rs @@ -13,14 +13,17 @@ use crate::{ use num_traits::Num; use rayon::prelude::*; -pub struct Standard; -pub struct AllTouched; - const EPSILON_INTERSECT: f64 = 1e-4; const TOLERANCE: f64 = 1e-9; +pub struct Standard; +pub struct AllTouchedBase; +pub type AllTouched = AllTouchedBase; +pub type AllTouchedCached = AllTouchedBase; + pub trait LineBurnStrategy { const IS_ALL_TOUCHED: bool; + const REQUIRES_DEDUPLICATION: bool; fn burn_line( linedges: Vec, @@ -33,8 +36,9 @@ pub trait LineBurnStrategy { W: PixelWriter; } -impl LineBurnStrategy for Standard { +impl LineBurnStrategy for Standard { const IS_ALL_TOUCHED: bool = false; + const REQUIRES_DEDUPLICATION: bool = DEDUP; fn burn_line(linedges: Vec, raster_info: &RasterInfo, field_value: T, writer: &mut W, background: T) where @@ -91,8 +95,9 @@ impl LineBurnStrategy for Standard { } } -impl LineBurnStrategy for AllTouched { +impl LineBurnStrategy for AllTouchedBase { const IS_ALL_TOUCHED: bool = true; + const REQUIRES_DEDUPLICATION: bool = DEDUP; fn burn_line(linedges: Vec, raster_info: &RasterInfo, field_value: T, writer: &mut W, background: T) where @@ -276,9 +281,8 @@ pub fn burn_polygon( // start with first y line let mut yline = polyedges.first().unwrap().ystart; - let mut active_edges: Vec = Vec::new(); + let mut active_edges = Vec::new(); - // rasterize loop let ncols = raster_info.ncols as f64; while yline < raster_info.nrows && (!active_edges.is_empty() || !polyedges.is_empty()) { // transfer current edges to active edges @@ -305,7 +309,7 @@ pub fn burn_polygon( let x1 = &chunk[0].x_at_yline; let x2 = &chunk[1].x_at_yline; - // round down like GDAL + // round down let xstart = (x1 + 0.5).floor().clamp(0.0, ncols) as usize; let xend = (x2 + 0.5).floor().clamp(0.0, ncols) as usize; diff --git a/src/rasterization/prepare_dataframe.rs b/src/rasterization/prepare_dataframe.rs index 574d2fd..d3dbe6f 100644 --- a/src/rasterization/prepare_dataframe.rs +++ b/src/rasterization/prepare_dataframe.rs @@ -18,7 +18,7 @@ where match df { None => { // case 1: create a dummy `field` - DataFrame::new(vec![burn_value.into_column("field_casted", burn_length)]).unwrap() + DataFrame::new(burn_length, vec![burn_value.into_column("field_casted", burn_length)]).unwrap() } Some(df) => { let mut lf = df.lazy(); diff --git a/src/rasterization/rusterize_impl.rs b/src/rasterization/rusterize_impl.rs index 01df22c..21c9053 100644 --- a/src/rasterization/rusterize_impl.rs +++ b/src/rasterization/rusterize_impl.rs @@ -3,20 +3,17 @@ use crate::{ encoding::{ arrays::{DenseArray, SparseArray}, - pyarrays::Pythonize, writers::{DenseArrayWriter, PixelWriter, SparseArrayWriter, ToSparseArray}, }, - geo::{edges::LineEdge, raster::RasterInfo}, + geo::{edges::LineEdge, parse_geometry::ParsedGeometry, raster::RasterInfo}, prelude::{Dense, OptFlags, PolarsHandler, Sparse}, rasterization::{ burn_geometry::Burn, - burners::{AllTouched, LineBurnStrategy, Standard}, + burners::{AllTouched, AllTouchedCached, LineBurnStrategy, Standard}, pixel_functions::PixelFn, - prepare_dataframe::cast_df, }, }; use fixedbitset::FixedBitSet; -use geo_types::Geometry; use ndarray::Axis; use num_traits::Num; use numpy::Element; @@ -29,39 +26,39 @@ use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterato pub struct PixelCache { bits: FixedBitSet, width: usize, - min_x: isize, - min_y: isize, + xmin: isize, + ymin: isize, } impl PixelCache { pub fn new(linedges: &[LineEdge]) -> Self { - let (min_x, max_x, min_y, max_y) = linedges.iter().fold( - (f64::MAX, f64::MIN, f64::MAX, f64::MIN), - |(min_x, max_x, min_y, max_y), edge| { + let (xmin, ymin, xmax, ymax) = linedges.iter().fold( + (f64::MAX, f64::MAX, f64::MIN, f64::MIN), + |(xmin, ymin, xmax, ymax), edge| { ( - min_x.min(edge.x0).min(edge.x1), - max_x.max(edge.x0).max(edge.x1), - min_y.min(edge.y0).min(edge.y1), - max_y.max(edge.y0).max(edge.y1), + xmin.min(edge.x0).min(edge.x1), + ymin.min(edge.y0).min(edge.y1), + xmax.max(edge.x0).max(edge.x1), + ymax.max(edge.y0).max(edge.y1), ) }, ); - let width = (max_x.floor() - min_x.floor()) as usize + 1; - let length = (max_y.floor() - min_y.floor()) as usize + 1; + let width = (xmax.floor() - xmin.floor()) as usize + 1; + let length = (ymax.floor() - ymin.floor()) as usize + 1; Self { bits: FixedBitSet::with_capacity(width * length), width, - min_x: min_x as isize, - min_y: min_y as isize, + xmin: xmin as isize, + ymin: ymin as isize, } } #[inline] fn unravel_index(&self, x: usize, y: usize) -> usize { - let local_x = (x as isize - self.min_x) as usize; - let local_y = (y as isize - self.min_y) as usize; + let local_x = (x as isize - self.xmin) as usize; + let local_y = (y as isize - self.ymin) as usize; local_y * self.width + local_x } @@ -82,46 +79,21 @@ impl PixelCache { pub struct RasterizeContext { pub raster_info: RasterInfo, - geometry: Vec, - field: Column, + pub geometry: ParsedGeometry, + pub field: Column, pub pixel_fn: PixelFn, pub background: N, - opt_flags: OptFlags, + pub opt_flags: OptFlags, } -#[allow(clippy::too_many_arguments)] -pub fn rusterize_impl( - geometry: Vec, - raster_info: RasterInfo, - pixel_fn: PixelFn, - background: T, - df: Option, - field_name: Option<&str>, - by_name: Option<&str>, - burn_value: T, - opt_flags: OptFlags, -) -> R::Output -where - T: Num + PolarsHandler, - R: Rasterize, - R::Output: Pythonize, -{ - // extract column from dataframe (cloning is cheap) - let casted = cast_df(df, field_name, by_name, burn_value, geometry.len()); - let field = casted.column("field_casted").unwrap().clone(); - let by = casted.column("by_str").ok().and_then(|by| by.str().ok()); - - // main - let ctx = RasterizeContext { - raster_info, - geometry, - field, - pixel_fn, - background, - opt_flags, +macro_rules! dispatch_burn { + ($all_touched:expr, $dedup:expr, $func:ident, $ctx:expr, $writer:expr $(, $ext:expr)*) => { + match ($all_touched, $dedup) { + (true, true) => $func::($ctx, $writer $(, $ext)*), + (true, false) => $func::($ctx, $writer $(, $ext)*), + (false, _) => $func::($ctx, $writer $(, $ext)*), + } }; - - R::rasterize(ctx, by) } // rasterization logics @@ -138,6 +110,9 @@ where type Output = DenseArray; fn rasterize(ctx: RasterizeContext, by: Option<&ChunkedArray>) -> Self::Output { + let all_touched = ctx.opt_flags.with_all_touched(); + let dedup = ctx.opt_flags.requires_deduplication(); + match by { Some(by) => { let (n_groups, group_idx) = get_groups(by); @@ -151,11 +126,7 @@ where .map(|(band, (group_idx, idxs))| { let mut writer = DenseArrayWriter::new(band, ctx.pixel_fn); - if ctx.opt_flags.with_all_touched() { - process_multi::(&ctx, &mut writer, &idxs); - } else { - process_multi::(&ctx, &mut writer, &idxs); - } + dispatch_burn!(all_touched, dedup, process_multi, &ctx, &mut writer, &idxs); by.get(group_idx as usize).unwrap().to_string() }) @@ -168,11 +139,7 @@ where let mut raster = ctx.raster_info.build_raster(1, ctx.background); let mut writer = DenseArrayWriter::new(raster.index_axis_mut(Axis(0), 0), ctx.pixel_fn); - if ctx.opt_flags.with_all_touched() { - process_single::(&ctx, &mut writer); - } else { - process_single::(&ctx, &mut writer); - } + dispatch_burn!(all_touched, dedup, process_single, &ctx, &mut writer); DenseArray::new(raster, band_names, ctx.raster_info) } @@ -187,6 +154,9 @@ where type Output = SparseArray; fn rasterize(ctx: RasterizeContext, by: Option<&ChunkedArray>) -> Self::Output { + let all_touched = ctx.opt_flags.with_all_touched(); + let dedup = ctx.opt_flags.requires_deduplication(); + match by { Some(by) => { let (n_groups, group_idx) = get_groups(by); @@ -198,11 +168,7 @@ where let band_name = by.get(group_idx as usize).unwrap().to_string(); let mut writer = SparseArrayWriter::new(band_name); - if ctx.opt_flags.with_all_touched() { - process_multi::(&ctx, &mut writer, &idxs); - } else { - process_multi::(&ctx, &mut writer, &idxs); - } + dispatch_burn!(all_touched, dedup, process_multi, &ctx, &mut writer, &idxs); writer }) @@ -213,11 +179,8 @@ where None => { let mut writer = SparseArrayWriter::new(String::from("band_1")); - if ctx.opt_flags.with_all_touched() { - process_single::(&ctx, &mut writer); - } else { - process_single::(&ctx, &mut writer); - } + dispatch_burn!(all_touched, dedup, process_single, &ctx, &mut writer); + writer.finish(ctx) } } @@ -241,7 +204,7 @@ where .zip(&ctx.geometry) .for_each(|(field_value, geom)| { if let Some(fv) = N::from_anyvalue(field_value) { - geom.burn::(&ctx.raster_info, fv, writer, ctx.background, &ctx.opt_flags) + geom.burn::(&ctx.raster_info, fv, writer, ctx.background) } }); } @@ -257,7 +220,7 @@ where let anyvalue = ctx.field.get(i as usize).unwrap(); (N::from_anyvalue(anyvalue), ctx.geometry.get(i as usize)) } { - geom.burn::(&ctx.raster_info, fv, writer, ctx.background, &ctx.opt_flags) + geom.burn::(&ctx.raster_info, fv, writer, ctx.background) } } } diff --git a/test/data/standard_output_sum.tif b/test/data/standard_output_sum.tif new file mode 100644 index 0000000..46a71fc Binary files /dev/null and b/test/data/standard_output_sum.tif differ diff --git a/test/data/standard_output_sum_custom_shape.tif b/test/data/standard_output_sum_custom_shape.tif new file mode 100644 index 0000000..c6b40d7 Binary files /dev/null and b/test/data/standard_output_sum_custom_shape.tif differ diff --git a/test/test_many.py b/test/test_many.py new file mode 100644 index 0000000..43447d4 --- /dev/null +++ b/test/test_many.py @@ -0,0 +1,408 @@ +from osgeo import gdal + +import os +import re +import warnings +from tempfile import NamedTemporaryFile +from unittest.mock import patch + +import geopandas as gpd +import numpy as np +import polars_st as st +import pytest +import xarray as xr +from rusterize import rusterize +from shapely import wkt + +gdal.UseExceptions() + +GEOMS = [ + "POLYGON ((-180 -20, -140 55, 10 0, -140 -60, -180 -20), (-150 -20, -100 -10, -110 20, -150 -20))", + "POLYGON ((-10 0, 140 60, 160 0, 140 -55, -10 0))", + "POLYGON ((-125 0, 0 60, 40 5, 15 -45, -125 0))", + "MULTILINESTRING ((-180 -70, -140 -50), (-140 -50, -100 -70), (-100 -70, -60 -50), (-60 -50, -20 -70), (-20 -70, 20 -50), (20 -50, 60 -70), (60 -70, 100 -50), (100 -50, 140 -70), (140 -70, 180 -50))", + "GEOMETRYCOLLECTION (POINT (50 -40), POLYGON ((75 -40, 75 -30, 100 -30, 100 -40, 75 -40)), LINESTRING (60 -40, 80 0), GEOMETRYCOLLECTION (POLYGON ((100 20, 100 30, 110 30, 110 20, 100 20))))", +] + +geometries = [wkt.loads(geom) for geom in GEOMS] +GDF = gpd.GeoDataFrame({"value": range(1, len(GEOMS) + 1)}, geometry=geometries) + + +@pytest.fixture(scope="module") +def exploded_gpkg(): + """Temporary GPKG with exploded geometries for GDAL""" + with NamedTemporaryFile(suffix=".gpkg", delete=False) as tmp: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # GDAL struggles with nested collections, so we explode all of them to be safe + GDF.explode().explode().to_file(tmp.name, driver="GPKG", layer="test") + path = tmp.name + yield path + + if os.path.exists(path): + os.remove(path) + + +class TestTypeChecks: + @pytest.mark.parametrize( + "kwargs, expected_match", + [ + ({"data": "not_a_dataframe", "res": (1, 1)}, "`data` must be either geopandas"), + ({"like": "not_an_xarray", "res": (1, 1)}, "`like` must be a xarray.DataArray"), + ({"res": "1x1"}, "`resolution` must be a tuple or list"), + ({"out_shape": "100x100"}, "`out_shape` must be a tuple or list"), + ({"extent": "0,0,10,10"}, "`extent` must be a tuple or list"), + ({"field": 123}, "`field` must be a string"), + ({"by": 123}, "`by` must be a string"), + ({"burn": "hot"}, "`burn` must be an integer or float"), + ({"fun": 1}, "`pixel_fn` must be one of"), + ({"background": "black"}, "`background` must be integer, float, or None"), + ({"encoding": 1}, "`encoding` must be one of 'xarray'"), + ({"all_touched": "yes"}, "`all_touched` must be a boolean"), + ({"tap": "yes"}, "`tap` must be a boolean"), + ({"dtype": 64}, "`dtype` must be a one of"), + ], + ) + def test_type_errors(self, kwargs, expected_match): + args = {"data": GDF, "res": (1, 1)} + args.update(kwargs) + + with pytest.raises(TypeError, match=expected_match): + rusterize(**args) + + +class TestMissingDependencies: + def test_geopandas_missing(self): + import geopandas as gpd + from shapely import wkt + + gdf = gpd.GeoDataFrame(geometry=wkt.loads(GEOMS)) + + with patch("rusterize._check_for_geopandas", return_value=False): + with pytest.raises(TypeError, match="`data` must be either geopandas.GeoDataFrame"): + rusterize(gdf, res=(1, 1), encoding="numpy") + + def test_polars_missing(self): + import geopandas as gpd + from shapely import wkt + + gdf = gpd.GeoDataFrame(geometry=wkt.loads(GEOMS)) + + with patch("rusterize._check_for_geopandas", return_value=True): + with patch("rusterize._polars_available", return_value=False): + with pytest.raises(ModuleNotFoundError, match="polars must be installed when data is geopandas.GeoDataFrame."): + rusterize(gdf, res=(1, 1), encoding="numpy") + + def test_polars_st_missing(self): + import polars_st as st + + plst = st.GeoDataFrame({"geometry": GEOMS}) + + with patch("rusterize._check_for_polars_st", return_value=False): + with pytest.raises(TypeError, match="`data` must be either geopandas.GeoDataFrame, polars.DataFrame"): + rusterize(plst, res=(1, 1), encoding="numpy") + + def test_xarray_encoding_missing(self): + with patch("rusterize._xarray_available", return_value=False): + with pytest.raises(ModuleNotFoundError, match="`xarray` and `rioxarray` must be installed"): + rusterize(GEOMS, res=(1, 1), encoding="xarray") + + def test_xarray_like_missing(self): + with patch("rusterize._xarray_available", return_value=False): + import xarray as xr + like = xr.DataArray() + + with pytest.raises(TypeError, match="`like` must be a xarray.DataArray or xarray.Dataset"): + rusterize(GEOMS, like=like, encoding="numpy") + + +class TestArguments: + def test_burn_parameter(self): + r = rusterize(GDF, res=(1, 1), burn=99, encoding="numpy").squeeze() + assert np.nanmax(r) == 99 + assert np.nanmin(r[r > 0]) == 99 + + def test_background_parameter(self): + bg_value = -1 + r = rusterize(GDF, res=(1, 1), burn=1, background=bg_value, encoding="numpy").squeeze() + assert r[0, 0] == bg_value + + def test_mutually_exclusive_field_burn(self): + with pytest.raises(ValueError, match="Only one of `field` or `burn` can be specified"): + rusterize(GDF, res=(1, 1), field="value", burn=5) + + def test_missing_spatial_metadata_error(self): + with pytest.raises(ValueError, match="One of `res`, `out_shape`, or `extent` must be provided"): + rusterize(GDF) + + def test_invalid_resolution_error(self): + with pytest.raises(ValueError, match="`res` must be 2 positive numbers"): + rusterize(GDF, res=(-1, 1)) + + def test_invalid_shape_error(self): + with pytest.raises(ValueError, match="`out_shape` must be 2 positive integers"): + rusterize(GDF, out_shape=(-1, 1)) + + def test_invalid_extent_error1(self): + with pytest.raises(ValueError, match="Must also specify `res` or `out_shape` with extent."): + rusterize(GDF, extent=(1, 2, 3, 4)) + + def test_invalid_extent_error2(self): + expected_msg = "`extent` must be a tuple or list of (xmin, ymin, xmax, ymax)." + with pytest.raises(ValueError, match=re.escape(expected_msg)): + rusterize(GDF, res=(1, 1), extent=(0, 0, 0, 0)) + + def test_mutually_exclusive_like(self): + like = rusterize(GDF, res=(1, 1), field="value", encoding="xarray") + with pytest.raises(ValueError, match="`like` is mutually exclusive with `res`, `out_shape`, and `extent`."): + rusterize(GDF, like=like, res=(1, 1)) + + +class TestFormats: + def test_inputs(self): + # geopandas + r_gpd = rusterize(GDF, res=(1, 1), dtype="uint8", fun="sum", encoding="numpy") + + # list or numpy WKT + r_list = rusterize(GEOMS, res=(1, 1), dtype="uint8", fun="sum", encoding="numpy") + r_numpy = rusterize(np.asarray(GEOMS), res=(1, 1), dtype="uint8", fun="sum", encoding="numpy") + + # list or numpy WKB + r_list_wkb = rusterize(GDF.to_wkb().geometry.tolist(), res=(1, 1), dtype="uint8", fun="sum", encoding="numpy") + r_numpy_wkb = rusterize( + np.asarray(GDF.to_wkb().geometry), res=(1, 1), dtype="uint8", fun="sum", encoding="numpy" + ) + + # polars ST WKT + plst = st.GeoDataFrame({"value": list(range(1, len(GEOMS) + 1)), "geometry": GEOMS}) + r_plst = rusterize(plst, res=(1, 1), dtype="uint8", fun="sum", encoding="numpy") + + # polars ST WKB + plst_wkb = plst.st.to_wkb() + r_plst_wkb = rusterize(plst_wkb, res=(1, 1), dtype="uint8", fun="sum", encoding="numpy") + + assert np.allclose(r_gpd, r_list) + assert np.allclose(r_gpd, r_numpy) + assert np.allclose(r_gpd, r_plst) + assert np.allclose(r_gpd, r_list_wkb) + assert np.allclose(r_gpd, r_numpy_wkb) + assert np.allclose(r_gpd, r_plst_wkb) + + def test_outputs(self): + r_numpy = rusterize(GDF, res=(1, 1), dtype="uint8", field="value", encoding="numpy") + r_xarray = rusterize(GDF, res=(1, 1), dtype="uint8", field="value") + r_sparse1 = rusterize(GDF, res=(1, 1), dtype="uint8", field="value", encoding="sparse").to_numpy() + r_sparse2 = rusterize(GDF, res=(1, 1), dtype="uint8", field="value", encoding="sparse").to_xarray() + + assert np.allclose(r_numpy, r_xarray.data) + assert np.allclose(r_numpy, r_sparse1) + assert np.allclose(r_numpy, r_sparse2.data) + + +class TestCoherence: + def test_standard(self): + # comparing against a known-good static file + r = rusterize(GDF, res=(1, 1), dtype="uint8", field="value", fun="sum", encoding="numpy").squeeze() + + data_path = "test/data/standard_output_sum.tif" + with gdal.Open(data_path) as src: + gdal_array = src.ReadAsArray() + assert np.allclose(r, gdal_array) + + def test_alltouched(self, exploded_gpkg): + src_gdal = gdal.OpenEx(exploded_gpkg) + out_ds = gdal.Rasterize( + "", + src_gdal, + format="MEM", + outputType=gdal.GDT_Byte, + xRes=0.5, + yRes=0.5, + attribute="value", + layers=["test"], + allTouched=True, + add=True, + ) + + r = rusterize( + GDF.explode().explode(), + res=(0.5, 0.5), + dtype="uint8", + field="value", + fun="sum", + encoding="numpy", + all_touched=True, + ).squeeze() + + assert np.allclose(r, out_ds.ReadAsArray()) + + +class TestCustomRaster: + def test_extent_standard(self, exploded_gpkg): + extent = [-349, -507, 1, 0] + src_gdal = gdal.OpenEx(exploded_gpkg) + out_ds = gdal.Rasterize( + "", + src_gdal, + format="MEM", + outputType=gdal.GDT_Byte, + xRes=1, + yRes=1, + outputBounds=extent, + attribute="value", + layers=["test"], + add=True, + ) + + r = rusterize( + GDF.explode().explode(), + res=(1, 1), + dtype="uint8", + field="value", + extent=extent, + fun="sum", + encoding="numpy", + ).squeeze() + + assert np.allclose(r, out_ds.ReadAsArray()) + + def test_extent_alltouched(self, exploded_gpkg): + extent = [-349, -507, 1, 0] + src_gdal = gdal.OpenEx(exploded_gpkg) + out_ds = gdal.Rasterize( + "", + src_gdal, + format="MEM", + outputType=gdal.GDT_Byte, + xRes=1, + yRes=1, + outputBounds=extent, + attribute="value", + layers=["test"], + allTouched=True, + add=True, + ) + + r = rusterize( + GDF.explode().explode(), + res=(1, 1), + dtype="uint8", + field="value", + extent=extent, + fun="sum", + encoding="numpy", + all_touched=True, + ).squeeze() + + assert np.allclose(r, out_ds.ReadAsArray()) + + def test_shape_standard(self, exploded_gpkg): + shape = (47, 319) # (height, width) + + # interestingly, GDAL cuts the end/start of the lines with this custom shape + data_path = "test/data/standard_output_sum_custom_shape.tif" + with gdal.Open(data_path) as src: + gdal_array = src.ReadAsArray() + + r = rusterize( + GDF.explode().explode(), + dtype="uint8", + field="value", + out_shape=shape, + fun="sum", + encoding="numpy", + ).squeeze() + + assert np.allclose(r, gdal_array) + + def test_shape_alltouched(self, exploded_gpkg): + shape = (47, 319) # (height, width) + src_gdal = gdal.OpenEx(exploded_gpkg) + out_ds = gdal.Rasterize( + "", + src_gdal, + format="MEM", + outputType=gdal.GDT_Byte, + width=shape[1], + height=shape[0], + attribute="value", + layers=["test"], + allTouched=True, + add=True, + ) + + r = rusterize( + GDF.explode().explode(), + dtype="uint8", + field="value", + out_shape=shape, + fun="sum", + encoding="numpy", + all_touched=True, + ).squeeze() + + assert np.allclose(r, out_ds.ReadAsArray()) + + def test_some_user_inputs_standard(self, exploded_gpkg): + # GDAL doesn't directly support res + shape as input parameters here + extent = [-349, -507, 1, 0] + shape = (47, 319) + + src_gdal = gdal.OpenEx(exploded_gpkg) + out_ds = gdal.Rasterize( + "", + src_gdal, + format="MEM", + outputType=gdal.GDT_Byte, + width=shape[1], + height=shape[0], + outputBounds=extent, + attribute="value", + layers=["test"], + add=True, + ) + + r = rusterize( + GDF.explode().explode(), + dtype="uint8", + field="value", + out_shape=shape, + extent=extent, + fun="sum", + encoding="numpy", + ).squeeze() + + assert np.allclose(r, out_ds.ReadAsArray()) + + def test_some_user_inputs_alltouched(self, exploded_gpkg): + # GDAL doesn't directly support res + shape as input parameters here + extent = [-349, -507, 1, 0] + shape = (47, 319) + + src_gdal = gdal.OpenEx(exploded_gpkg) + out_ds = gdal.Rasterize( + "", + src_gdal, + format="MEM", + outputType=gdal.GDT_Byte, + width=shape[1], + height=shape[0], + outputBounds=extent, + attribute="value", + layers=["test"], + allTouched=True, + add=True, + ) + + r = rusterize( + GDF.explode().explode(), + dtype="uint8", + field="value", + out_shape=shape, + extent=extent, + fun="sum", + encoding="numpy", + all_touched=True, + ).squeeze() + + assert np.allclose(r, out_ds.ReadAsArray())