diff --git a/.docker/Api.Dockerfile b/.docker/Api.Dockerfile deleted file mode 100644 index 775391cf..00000000 --- a/.docker/Api.Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM python:3.11-slim -WORKDIR /app -COPY requirements.txt /app/ -RUN apt-get update && apt-get install -y --fix-missing \ - libexpat1 \ - libgdal-dev \ - g++ \ - && pip install --no-cache-dir gdal==$(gdal-config --version) \ - && apt-get purge -y g++ \ - && apt-get autoremove -y \ - && rm -rf /var/lib/apt/lists/* -RUN pip install --no-cache-dir -r requirements.txt -COPY . /app - -EXPOSE 8000 \ No newline at end of file diff --git a/.github/workflows/publish-api.yml b/.github/workflows/publish-api.yml deleted file mode 100644 index 6bb5140d..00000000 --- a/.github/workflows/publish-api.yml +++ /dev/null @@ -1,132 +0,0 @@ -name: Publish APIs - -on: - pull_request: - types: [ closed ] - workflow_dispatch: - - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - id-token: write - contents: read - -jobs: - detect-changes: - name: Detect changed paths - runs-on: ubuntu-latest - if: github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true - permissions: - contents: read - pull-requests: read - outputs: - api: ${{ steps.filter.outputs.api || (github.event_name == 'workflow_dispatch' && 'true') }} - steps: - - uses: actions/checkout@v4 - - - uses: dorny/paths-filter@v3 - id: filter - if: github.event_name == 'pull_request' - with: - filters: | - api: - - 'src/**' - - '!src/presentation/entrypoints/**' - - '!src/presentation/databricks/**' - - '.docker/Api.Dockerfile' - - 'requirements.txt' - - 'docker-compose.yml' - - '.github/workflows/publish-api.yml' - - build-and-push-api-images-to-acr: - name: Build & Push API Images to ACR - needs: detect-changes - if: needs.detect-changes.outputs.api == 'true' - runs-on: ubuntu-latest - strategy: - matrix: - include: - - service: vmt-api-server - display_name: VMT API Server - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Azure Login - uses: azure/login@v2 - with: - client-id: ${{ vars.AZURE_CLIENT_ID }} - tenant-id: ${{ vars.AZURE_TENANT_ID }} - subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }} - - - name: Log in to ACR - run: az acr login --name ${{ vars.ACR_NAME }} - - - name: Build ${{ matrix.display_name }} from docker-compose - run: docker compose build ${{ matrix.service }} - - - name: Tag ${{ matrix.display_name }} as latest - run: docker tag ${{ matrix.service }}:latest ${{ vars.ACR_LOGIN_SERVER }}/${{ matrix.service }}:latest - - - name: Tag ${{ matrix.display_name }} with commit SHA - run: docker tag ${{ matrix.service }}:latest ${{ vars.ACR_LOGIN_SERVER }}/${{ matrix.service }}:${{ github.sha }} - - - name: Push ${{ matrix.display_name }} (latest) - run: docker push ${{ vars.ACR_LOGIN_SERVER }}/${{ matrix.service }}:latest - - - name: Push ${{ matrix.display_name }} (commit SHA) - run: docker push ${{ vars.ACR_LOGIN_SERVER }}/${{ matrix.service }}:${{ github.sha }} - - deploy-vmt-api: - name: Deploy VMT API to Azure Web App - runs-on: ubuntu-latest - needs: build-and-push-api-images-to-acr - strategy: - matrix: - include: - - service: vmt-api-server - display_name: VMT API Server - webapp_name: doppa-vmt - - steps: - - name: Azure Login - uses: azure/login@v2 - with: - client-id: ${{ vars.AZURE_CLIENT_ID }} - tenant-id: ${{ vars.AZURE_TENANT_ID }} - subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }} - - - name: Configure app settings - uses: azure/cli@v2 - env: - WEBAPP_NAME: ${{ matrix.webapp_name }} - RESOURCE_GROUP: ${{ vars.AZURE_RESOURCE_GROUP }} - POSTGRES_USERNAME: ${{ secrets.POSTGRES_USERNAME }} - POSTGRES_PASSWORD: ${{ secrets.POSTGRES_PASSWORD }} - POSTGRES_SERVER_NAME: ${{ vars.POSTGRES_SERVER_NAME }} - BLOB_CONN_STRING: ${{ secrets.AZURE_BLOB_STORAGE_CONNECTION_STRING }} - BLOB_BENCHMARK: ${{ vars.AZURE_BLOB_STORAGE_BENCHMARK_CONTAINER }} - BLOB_METADATA: ${{ vars.AZURE_BLOB_STORAGE_METADATA_CONTAINER }} - with: - azcliversion: latest - inlineScript: | - az webapp config appsettings set \ - --name "$WEBAPP_NAME" \ - --resource-group "$RESOURCE_GROUP" \ - --settings \ - POSTGRES_USERNAME="$POSTGRES_USERNAME" \ - POSTGRES_PASSWORD="$POSTGRES_PASSWORD" \ - POSTGRES_SERVER_NAME="$POSTGRES_SERVER_NAME" \ - AZURE_BLOB_STORAGE_CONNECTION_STRING="$BLOB_CONN_STRING" \ - AZURE_BLOB_STORAGE_BENCHMARK_CONTAINER="$BLOB_BENCHMARK" \ - AZURE_BLOB_STORAGE_METADATA_CONTAINER="$BLOB_METADATA" - - - name: Deploy ${{ matrix.display_name }} - uses: azure/webapps-deploy@v3 - with: - app-name: ${{ matrix.webapp_name }} - images: ${{ vars.ACR_LOGIN_SERVER }}/${{ matrix.service }}:latest \ No newline at end of file diff --git a/.github/workflows/pull-request-tests.yml b/.github/workflows/pull-request-tests.yml index b6f61e6c..f651ad2b 100644 --- a/.github/workflows/pull-request-tests.yml +++ b/.github/workflows/pull-request-tests.yml @@ -31,7 +31,6 @@ jobs: pull-requests: read outputs: orchestrator: ${{ steps.filter.outputs.orchestrator }} - api: ${{ steps.filter.outputs.api }} benchmarks: ${{ steps.filter.outputs.benchmarks }} steps: - uses: actions/checkout@v4 @@ -57,13 +56,6 @@ jobs: - '.docker/Setup.Dockerfile' - 'requirements.txt' - 'docker-compose.yml' - api: - - 'src/**' - - '!src/presentation/entrypoints/**' - - '!src/presentation/databricks/**' - - '.docker/Api.Dockerfile' - - 'requirements.txt' - - 'docker-compose.yml' compile: name: Check Python syntax @@ -109,24 +101,6 @@ jobs: - name: Build Container Orchestrator from docker-compose run: docker compose build container-orchestrator - build-api-image: - name: Build VMT API Server - runs-on: ubuntu-latest - needs: - - compile - - detect-changes - if: needs.detect-changes.outputs.api == 'true' - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Build VMT API Server from docker-compose - run: docker compose build vmt-api-server - build-benchmark-images: name: Build ${{ matrix.display_name }} runs-on: ubuntu-latest diff --git a/CLAUDE.md b/CLAUDE.md index 04649798..a4c619af 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,14 +4,14 @@ Reproducible benchmarking framework comparing cloud-native (DuckDB + GeoParquet, ## Stack -Python, DuckDB (spatial), PostGIS on Azure Database for PostgreSQL, Apache Sedona on Databricks, Azure Blob Storage, Azure Container Instances, `dependency_injector`, FastAPI, PMTiles/MVT. See `requirements.txt` for versions. +Python, DuckDB (spatial), PostGIS on Azure Database for PostgreSQL, Apache Sedona on Databricks, Azure Blob Storage, Azure Container Instances, `dependency_injector`. See `requirements.txt` for versions. ## Layout (Clean Architecture) - `src/domain/` — enums only; no dependencies on other layers. - `src/application/` — `contracts/` (service interfaces), `dtos/`, `common/` (logger, monitor). - `src/infra/` — `infrastructure/services/` (contract impls), `infrastructure/containers.py` (DI wiring), `persistence/context/` (DuckDB, Postgres, Blob clients). -- `src/presentation/` — `entrypoints/` (one file per benchmark), `configuration/app_config.py` (`initialize_dependencies`), `databricks/` (notebook script), `endpoints/tile_server.py` (FastAPI VMT server). +- `src/presentation/` — `entrypoints/` (one file per benchmark), `configuration/app_config.py` (`initialize_dependencies`), `databricks/` (notebook script). - `main.py` — outside-ACI orchestrator. Reads `benchmarks.yml`, launches one ACI per experiment. - `benchmark_runner.py` — in-container dispatcher. Matches `--script-id` to a function in `src/presentation/entrypoints/`. - `benchmarks.yml` — experiment manifest. Each entry: `id`, `image`, `cpu`, `memory_gb`, `related_script_ids`. diff --git a/README.md b/README.md index 39eee7d8..24a8ceaa 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ doppa is a reproducible benchmarking framework for evaluating traditional geospatial query stacks (PostGIS, shapefiles) against cloud-native geospatial (CNG) alternatives (DuckDB over GeoParquet in -blob storage, PMTiles/MVT vector tiles, and Apache Sedona on Databricks) across a range of real-world +blob storage and Apache Sedona on Databricks) across a range of real-world spatial query patterns: point-in-polygon lookups, k-nearest-neighbour search, bounding-box filtering, and a national-scale spatial join. @@ -13,7 +13,7 @@ measurable and reproducible on identical datasets and hardware.
-[![Push containers to Azure Container Registry](https://github.com/kartAI/doppa-data/actions/workflows/push-containers-to-acr.yml/badge.svg)](https://github.com/kartAI/doppa-data/actions/workflows/push-containers-to-acr.yml) [![Publish APIs](https://github.com/kartAI/doppa-data/actions/workflows/publish-api.yml/badge.svg)](https://github.com/kartAI/doppa-data/actions/workflows/publish-api.yml) +[![Push containers to Azure Container Registry](https://github.com/kartAI/doppa-data/actions/workflows/push-containers-to-acr.yml/badge.svg)](https://github.com/kartAI/doppa-data/actions/workflows/push-containers-to-acr.yml)
@@ -60,7 +60,7 @@ format internals to client-observed cost is measured end to end. **Cloud-native vector formats vs. traditional formats on cloud storage.** Empirical comparisons in the literature (Holmes 2023; Flatgeobuf 2024) measure write times and file sizes on local disk and do not place cloud-native and traditional formats side by side on cloud storage. doppa benchmarks GeoParquet over Azure Blob Storage (via DuckDB) -against PostGIS on Azure Database for PostgreSQL, and PMTiles against WMS-style vector tiles, across the active +against PostGIS on Azure Database for PostgreSQL, across the active catalog of query patterns: point-in-polygon lookups, k-nearest-neighbour search, bounding-box filtering, and a national-scale spatial join. The local-Shapefile entrypoints sit on the side as a laptop-workflow reference, with the Shapefile downloaded ahead of the timed scope to emulate that workflow rather than to bench the format on cloud @@ -167,8 +167,6 @@ to the elapsed-time distribution. | PostGIS | Single-node, managed service | Azure Database for PostgreSQL Flexible Server | | GeoPandas + Shapefile | Single-node, local-disk baseline | Shapefile pre-downloaded to the container before the timed scope | | Apache Sedona | Distributed | Azure Databricks, 2 / 4 / 8 / 12 / 16 `Standard_D4s_v3` workers, reading GeoParquet via ABFS | -| PMTiles | Cloud-native vector tiles | PMTiles archive in blob storage, accessed via HTTP range reads | -| WMS-style vector tiles | Traditional vector tiles | `doppa-vmt` web app for containers, tiles assembled on demand | DuckDB and PostGIS each run inside an Azure Container Instance with 4 vCPU and 16 GB RAM, so CPU and memory baselines match between the single-node engines. @@ -348,7 +346,7 @@ so. #### Resource naming The resource names used throughout this section (`doppa`, `doppabs`, `doppaacr`, `doppa-uami`, -`doppa-db`, `doppa-vmt`, `doppa-databricks`) are baked into source and configuration. Keep them +`doppa-db`, `doppa-databricks`) are baked into source and configuration. Keep them as-is for the simplest setup; this is also what the thesis deployment uses, so reproducing the published results requires these exact names. @@ -356,9 +354,8 @@ If you need to rename a resource, the following references must be updated toget | Location | What is hardcoded | |-------------------------------------|--------------------------------------------------------------------------------| -| `src/config.py` | Default values for resource group, blob URL/account, VMT URL, STAC container | +| `src/config.py` | Default values for resource group, blob URL/account, STAC container | | `benchmarks.yml` | ACR image references (`doppaacr.azurecr.io/:latest`) for every benchmark | -| `.github/workflows/publish-api.yml` | `webapp_name: doppa-vmt` | `src/config.py` defaults can also be overridden via the corresponding environment variables (see [Local development](#local-development) and [GitHub Actions](#github-actions)) without @@ -464,34 +461,6 @@ same setting change the following: - `effective_cache_size`: `6291456` - `work_mem`: `65536` -#### Web app for containers - -Create -a [web app for containers](https://portal.azure.com/#view/Microsoft_Azure_Marketplace/GalleryItemDetailsBladeNopdl/id/Microsoft.AppSvcLinux/selectionMode~/false/resourceGroupId//resourceGroupLocation//dontDiscardJourney~/false/selectedMenuId/home/launchingContext~/%7B%22galleryItemId%22%3A%22Microsoft.AppSvcLinux%22%2C%22source%22%3A%5B%22GalleryFeaturedMenuItemPart%22%2C%22VirtualizedTileDetails%22%5D%2C%22menuItemId%22%3A%22home%22%2C%22subMenuItemId%22%3A%22Search%20results%22%2C%22telemetryId%22%3A%22135c4e97-6a92-446e-aa0a-3f2201ddfdb1%22%7D/searchTelemetryId/c154ee0a-06d6-49e4-a17f-3820937e6335) -The process is the same for each of the following API servers: - -- `doppa-vmt` - -Under *Basics*: - -- Resource group: `doppa` -- Name: `` -- Publish: `Container` -- Operating system: `Linux` -- Pricing plan: `Premium V4 P0V4` - -Under *Container*: - -- Image source: `Azure Container Registry` -- Registry: `doppaacr` -- Authentication: `Managed identity` -- Identity: `doppa-uami` -- Image: `