39 changes: 31 additions & 8 deletions .github/workflows/ci.yml
@@ -16,29 +16,52 @@ on:
jobs:
build:
runs-on: ubuntu-latest

strategy:
fail-fast: false
matrix:
include:
- spark_profile: spark34
java_version: "11"
label: "Spark 3.4 / Java 11"
- spark_profile: spark35
java_version: "11"
label: "Spark 3.5 / Java 11"
- spark_profile: spark40
java_version: "17"
label: "Spark 4.0 / Java 17"

name: "Build (${{ matrix.label }})"

env:
SPARK_PROFILE: ${{ matrix.spark_profile }}

steps:
- uses: actions/checkout@v4

- uses: actions/setup-java@v3
- uses: actions/setup-java@v4
with:
distribution: "temurin"
java-version: 8
java-version: ${{ matrix.java_version }}
cache: "sbt"

- uses: sbt/setup-sbt@v1
with:
sbt-runner-version: 1.10.11

- run: sbt compile
- name: Compile
run: sbt compile

- run: sbt test
- name: Test
run: sbt test

- run: sbt package
- name: Package
run: sbt package

- run: tar cf artefacts.tar target/ */target/
- name: Bundle artifacts
run: tar cf artefacts.tar target/ */target/

- uses: actions/upload-artifact@v4
with:
name: Artefacts
name: Artefacts-${{ matrix.spark_profile }}
path: artefacts.tar

87 changes: 82 additions & 5 deletions .github/workflows/publish.yml
@@ -11,8 +11,13 @@ on:
- MINOR
- MAJOR

run-name: ${{ format('Publishing new version - {0}', inputs.release_type) }}

jobs:
publish:
# ────────────────────────────────────────────────────────────────────
# Phase 1: Create release tag (runs once, on default Spark profile)
# ────────────────────────────────────────────────────────────────────
release-tag:
runs-on: ubuntu-latest
permissions:
contents: write
@@ -22,6 +27,10 @@ jobs:
env:
GITHUB_REGISTRY_TOKEN: ${{ secrets.GITHUB_TOKEN }}
RELEASE_TYPE: ${{ inputs.release_type }}
SPARK_PROFILE: spark35
outputs:
release_tag: ${{ steps.get_tag.outputs.tag }}
release_base_version: ${{ steps.extract_version.outputs.base_version }}
steps:
- uses: actions/checkout@v4

@@ -30,17 +39,85 @@ jobs:
git config user.name "github-actions[bot]"
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"

- uses: actions/setup-java@v3
- uses: actions/setup-java@v4
with:
java-version: 8
java-version: 11
distribution: "temurin"
cache: "sbt"

- uses: sbt/setup-sbt@v1
with:
sbt-runner-version: 1.10.11

- name: Run SBT release
- name: Run SBT release (tag + publish default profile)
run: sbt 'release with-defaults'

run-name: ${{ format('Publishing new version - {0}', inputs.release_type) }}
- name: Extract release tag
id: get_tag
run: echo "tag=$(git describe --tags --abbrev=0)" >> "$GITHUB_OUTPUT"

- name: Extract base version (without Spark suffix)
id: extract_version
run: |
# Tag format is e.g. "v1.2.0-spark3.5.3" — extract "1.2.0"
TAG=$(git describe --tags --abbrev=0)
BASE_VERSION=$(echo "$TAG" | sed 's/^v//' | sed 's/-spark.*//')
echo "base_version=$BASE_VERSION" >> "$GITHUB_OUTPUT"

# ────────────────────────────────────────────────────────────────────
# Phase 2: Publish all other Spark profiles from the release tag
# ────────────────────────────────────────────────────────────────────
publish-profiles:
needs: release-tag
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
attestations: write
id-token: write

strategy:
fail-fast: false
matrix:
include:
- spark_profile: spark34
java_version: "11"
label: "Spark 3.4"
- spark_profile: spark35
java_version: "11"
label: "Spark 3.5"
- spark_profile: spark40
java_version: "17"
label: "Spark 4.0"

name: "Publish (${{ matrix.label }})"

env:
GITHUB_REGISTRY_TOKEN: ${{ secrets.GITHUB_TOKEN }}
SPARK_PROFILE: ${{ matrix.spark_profile }}

steps:
- uses: actions/checkout@v4
with:
ref: ${{ needs.release-tag.outputs.release_tag }}

- uses: actions/setup-java@v4
with:
java-version: ${{ matrix.java_version }}
distribution: "temurin"
cache: "sbt"

- uses: sbt/setup-sbt@v1
with:
sbt-runner-version: 1.10.11

- name: Set version for this Spark profile
env:
BASE_VERSION: ${{ needs.release-tag.outputs.release_base_version }}
run: |
# sbt-release wrote a hardcoded version at the tag (e.g. 1.2.0-spark3.5.3).
# We need to re-derive it for this profile's Spark version.
echo "ThisBuild / version := s\"${BASE_VERSION}-spark\${SparkProfiles.active.sparkVersion}\"" > version.sbt

- name: Publish JARs for ${{ matrix.label }}
run: sbt publish
153 changes: 153 additions & 0 deletions BUILDING.md
@@ -0,0 +1,153 @@
# Building Data I/O

This document describes how the multi-Spark-version build and release system works.

## Architecture Overview

Data I/O publishes artifacts for multiple Apache Spark versions from a **single codebase**.
All version-coupled values are defined in one place:

```
project/SparkProfiles.scala ← Single source of truth for ALL versioned dependencies
build.sbt ← References SparkProfiles.active (no hardcoded versions)
version.sbt ← Derives the Spark tag dynamically from the active profile
```

### How It Works

1. **`project/SparkProfiles.scala`** defines a `SparkProfile` case class (see the sketch after this list) containing:
- Spark version (e.g., `3.5.3`)
- Scala version (e.g., `2.12.15`)
- Java target (e.g., `11`)
- All connector library versions (Snowflake, Elasticsearch, Embedded Kafka, etc.)
- Feature flags (`supportsSnowflake`, `supportsElasticsearch`)

2. **Profile selection** is driven by the `SPARK_PROFILE` environment variable:
```bash
SPARK_PROFILE=spark34 sbt compile
```
If unset, it defaults to `spark35`.

3. **`build.sbt`** reads `SparkProfiles.active` and uses it for all `scalaVersion`,
`javacOptions`, and `libraryDependencies` settings. Connector modules (Snowflake,
Elasticsearch) are conditionally included based on profile feature flags.

4. **`version.sbt`** embeds the Spark version in the artifact version string:
```
1.1.1-spark3.5.3-SNAPSHOT
```
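To make the pieces above concrete, here is a minimal sketch of `project/SparkProfiles.scala`, assuming the field names shown in the examples later in this document; the real file may differ in details such as the exact profile contents and helper methods:

```scala
// project/SparkProfiles.scala (sketch). Field names follow the examples in this
// document; values and helpers are illustrative, not the repository's exact code.
object SparkProfiles {

  case class SparkProfile(
    sparkVersion: String,
    scalaVersion: String,
    javaTarget: String,
    sparkSnowflakeVersion: Option[String],
    elasticsearchSparkVersion: Option[String],
    embeddedKafkaVersion: String
  ) {
    // Feature flags derived from the optional connector versions.
    def supportsSnowflake: Boolean     = sparkSnowflakeVersion.isDefined
    def supportsElasticsearch: Boolean = elasticsearchSparkVersion.isDefined
  }

  val profiles: Map[String, SparkProfile] = Map(
    "spark35" -> SparkProfile(
      sparkVersion              = "3.5.3",
      scalaVersion              = "2.12.15",
      javaTarget                = "11",
      sparkSnowflakeVersion     = Some("3.1.1"),
      elasticsearchSparkVersion = Some("8.17.4"),
      embeddedKafkaVersion      = "x.y.z" // placeholder
    )
    // spark34 and spark40 are defined the same way.
  )

  // SPARK_PROFILE selects the profile; when unset, spark35 is used.
  lazy val active: SparkProfile =
    profiles(sys.env.getOrElse("SPARK_PROFILE", "spark35"))
}
```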
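And a sketch of how `build.sbt` and `version.sbt` (items 3 and 4 above) consume the active profile. This is illustrative only; whether Spark is marked `Provided`, and which modules exist, is not prescribed here:

```scala
// build.sbt (sketch): every version-coupled setting is read from SparkProfiles.active.
val profile = SparkProfiles.active

ThisBuild / scalaVersion := profile.scalaVersion
ThisBuild / javacOptions ++= Seq("--release", profile.javaTarget)

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % profile.sparkVersion % Provided,
  "org.apache.spark" %% "spark-sql"  % profile.sparkVersion % Provided
)

// version.sbt (sketch): the Spark tag is derived dynamically, yielding e.g.
// 1.1.1-spark3.5.3-SNAPSHOT under the spark35 profile.
ThisBuild / version := s"1.1.1-spark${SparkProfiles.active.sparkVersion}-SNAPSHOT"
```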

## Supported Profiles

| Profile | Spark | Scala | Java | Snowflake | Elasticsearch |
|------------|-------|---------|------|-------------------|------------------|
| `spark34` | 3.4.4 | 2.12.15 | 11 | 2.16.0-spark_3.4 | 8.17.4 |
| `spark35` | 3.5.3 | 2.12.15 | 11 | 3.1.1 | 8.17.4 |
| `spark40` | 4.0.2 | 2.13.17 | 17 | — | — |

## Local Development

```bash
# Default profile (Spark 3.5)
sbt compile test package

# Specific profile
SPARK_PROFILE=spark35 sbt compile test

# Build all profiles locally (shell loop)
for profile in spark34 spark35 spark40; do
echo "=== Building $profile ==="
SPARK_PROFILE=$profile sbt clean compile test
done
```

## CI Pipeline

The GitHub Actions CI workflow (`.github/workflows/ci.yml`) uses a **matrix strategy**
to build, test, and package all supported Spark profiles in parallel:

```yaml
strategy:
matrix:
include:
- spark_profile: spark34
java_version: "11"
- spark_profile: spark35
java_version: "11"
- spark_profile: spark40
java_version: "17"
```

Each matrix leg runs with the appropriate Java version and `SPARK_PROFILE` env var.
Artifacts are uploaded with profile-specific names (e.g., `Artefacts-spark34`).

## Release Process

The publish workflow (`.github/workflows/publish.yml`) uses a **two-phase approach**:

### Phase 1: Release Tag (runs once)
1. Runs `sbt 'release with-defaults'` with the default Spark profile (`spark35`)
2. This creates the git tag, commits version bumps, and publishes the Spark 3.5 artifacts
3. The release tag is captured as an output for Phase 2
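Phase 1 is also what makes Phase 2 necessary: sbt-release's default steps write the concrete release version into `version.sbt` and commit it before tagging, so the tagged commit carries a hardcoded version string for the default profile. For context, the stock sbt-release process looks roughly like this (illustrative; the repository may customize its `releaseProcess`):

```scala
// build.sbt (sketch): sbt-release's standard steps, shown only to explain why the
// tag commit contains a hardcoded version. Not necessarily this repository's setup.
import ReleaseTransformations._

releaseProcess := Seq[ReleaseStep](
  checkSnapshotDependencies,
  inquireVersions,
  runClean,
  runTest,
  setReleaseVersion,    // writes e.g. 1.2.0-spark3.5.3 into version.sbt
  commitReleaseVersion, // commits that hardcoded version
  tagRelease,           // creates the tag that Phase 2 checks out
  publishArtifacts,     // publishes the default (spark35) artifacts
  setNextVersion,       // writes the next -SNAPSHOT version
  commitNextVersion,
  pushChanges
)
```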

### Phase 2: Publish Other Profiles (matrix, runs in parallel)
1. Checks out the release tag from Phase 1
2. Regenerates `version.sbt` with the correct Spark-tagged version for the profile
(needed because sbt-release hardcodes the version string at the tag commit)
3. For each additional profile (`spark34`, `spark40`, ...), runs `sbt publish`
4. Each profile publishes artifacts with its own Spark-tagged version string
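Concretely, the "Set version for this Spark profile" step overwrites `version.sbt` at the checked-out tag. Assuming a base version of `1.2.0`, the regenerated file contains a single dynamic assignment:

```scala
// version.sbt as rewritten by the publish-profiles job (base version 1.2.0 assumed)
ThisBuild / version := s"1.2.0-spark${SparkProfiles.active.sparkVersion}"
```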

### Artifact Naming

Published artifacts follow this convention:
```
com.amadeus.dataio:dataio-core_2.13:1.2.0-spark4.0.2
com.amadeus.dataio:dataio-core_2.12:1.2.0-spark3.5.3
com.amadeus.dataio:dataio-core_2.12:1.2.0-spark3.4.4
```
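Consuming one of these from sbt is a standard dependency declaration; `%%` appends the `_2.12` or `_2.13` suffix from the consuming project's own `scalaVersion`, which must match the Spark line chosen (snippet is illustrative):

```scala
// Depend on the Spark 3.5 build of dataio-core from a downstream project.
libraryDependencies += "com.amadeus.dataio" %% "dataio-core" % "1.2.0-spark3.5.3"
```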

## Adding a New Spark Version

To add support for a new Spark version:

1. **Edit `project/SparkProfiles.scala`** — add a new entry to the `profiles` map:
```scala
"spark41" -> SparkProfile(
sparkVersion = "4.1.1",
scalaVersion = "2.13.18",
javaTarget = "17",
sparkSnowflakeVersion = Some("x.y.z"),
elasticsearchSparkVersion = Some("8.x.y"),
embeddedKafkaVersion = "3.6.0"
)
```

2. **Update CI matrix** — add the profile to `.github/workflows/ci.yml`:
```yaml
- spark_profile: spark41
java_version: "17"
label: "Spark 4.0 / Java 17"
```

3. **Update Publish matrix** — add to `.github/workflows/publish.yml`

4. **Update README.md** — add badge and compatibility table entry

That's it. No changes to `build.sbt` or any source code are needed.

## Removing a Spark Version

1. Remove the entry from `project/SparkProfiles.scala`
2. Remove from CI and Publish workflow matrices
3. Update README

## Connector Availability

Some connectors may not support all Spark versions (e.g., Spark 4.0). The `SparkProfile`
uses `Option[String]` for connector versions:
- `Some("x.y.z")` → connector is included in the build and published
- `None` → connector module is excluded from aggregation and `publish/skip := true`

This is handled automatically — no manual module toggling needed.
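A minimal sketch of how this can look in `build.sbt`, assuming the connector lives in its own sub-module (module and dependency names are illustrative, not the repository's real ones):

```scala
// build.sbt (sketch): wire a connector module in only when the active profile supports it.
lazy val core = project

lazy val snowflake = (project in file("snowflake"))
  .dependsOn(core)
  .settings(
    publish / skip := !SparkProfiles.active.supportsSnowflake,
    libraryDependencies ++= SparkProfiles.active.sparkSnowflakeVersion.toSeq
      .map(v => "net.snowflake" %% "spark-snowflake" % v)
  )

// Exclude the module from the root aggregate on unsupported profiles, so plain
// `sbt test` / `sbt publish` never touch it.
lazy val root = (project in file("."))
  .aggregate(
    (Seq[ProjectReference](core) ++
      (if (SparkProfiles.active.supportsSnowflake) Seq[ProjectReference](snowflake) else Nil)): _*
  )
```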


33 changes: 32 additions & 1 deletion README.md
@@ -1,7 +1,8 @@
# Data I/O

[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
[![Spark](https://img.shields.io/badge/Spark-3.5.0-blue)](https://spark.apache.org/releases/spark-release-3-4-1.html)
[![Spark 3.4](https://img.shields.io/badge/Spark-3.4.4-blue)](https://spark.apache.org/releases/spark-release-3-4-4.html)
[![Spark 3.5](https://img.shields.io/badge/Spark-3.5.3-blue)](https://spark.apache.org/releases/spark-release-3-5-3.html)
[![Scala](https://img.shields.io/badge/Scala-2.12.15-red)](https://www.scala-lang.org/)
[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)][contributing]

@@ -14,6 +15,36 @@ Data I/O is an open source project that provides a flexible and scalable framewo
- Support for batch and streaming data processing
- Extensible architecture for custom data processors and pipelines
- Scalable and fault-tolerant processing using Apache Spark
- **Multi-Spark version support** — builds and publishes for Spark 3.4, 3.5, and 4.0 from a single codebase

## Supported Spark Versions

| Profile | Spark | Scala | Java | Snowflake | Elasticsearch |
|------------|-------|-------|------|-----------|---------------|
| `spark34` | 3.4.4 | 2.12 | 11 | ✅ | ✅ |
| `spark35` | 3.5.3 | 2.12 | 11 | ✅ | ✅ |
| `spark40` | 4.0.2 | 2.13 | 17 | ❌ | ❌ |

> **Note:** Spark 4.0 support is experimental. Snowflake and Elasticsearch connectors
> do not yet have Spark 4.0–compatible releases.

## Building Locally

Select a Spark profile via the `SPARK_PROFILE` environment variable (defaults to `spark35`):

```bash
# Build for Spark 3.4
SPARK_PROFILE=spark34 sbt compile

# Run tests for Spark 3.4
SPARK_PROFILE=spark34 sbt test

# Package for Spark 3.5 (default)
sbt package
```

All version-coupled dependencies (Spark, Scala, Java target, connectors) are defined in
[`project/SparkProfiles.scala`](project/SparkProfiles.scala) — the single source of truth.

## Getting Started
To get started with Data I/O, please refer to the [documentation][gettingstarted] for installation instructions, usage examples, and API references.