From 76a0740c79ca2cf6b5ba379c2b6a9c0752ef1000 Mon Sep 17 00:00:00 2001 From: MatloaItumeleng Date: Wed, 8 Apr 2026 16:33:36 +0200 Subject: [PATCH 1/5] spark and commons upgrade , target 1.11 java and scala 2.12/13 --- .github/workflows/build.yml | 4 ++-- .github/workflows/format_check.yml | 2 +- .github/workflows/jacoco_check.yml | 2 +- .github/workflows/licence_check.yml | 2 +- .github/workflows/release.yml | 2 +- README.md | 20 +++++--------------- build.sbt | 9 ++++++--- project/Dependencies.scala | 8 ++++---- project/plugins.sbt | 1 - 9 files changed, 21 insertions(+), 29 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5718486..a128d4a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -28,7 +28,7 @@ jobs: strategy: fail-fast: false matrix: - scala: [2.11.12, 2.12.18, 2.13.11] + scala: [2.12.18, 2.13.11] name: Scala ${{matrix.scala}} steps: - name: Checkout code @@ -41,7 +41,7 @@ jobs: - name: Setup Scala uses: olafurpg/setup-scala@32ffa16635ff8f19cc21ea253a987f0fdf29844c with: - java-version: "adopt@1.8" + java-version: "adopt@1.11" - name: Build and run tests run: sbt ++${{matrix.scala}} test doc diff --git a/.github/workflows/format_check.yml b/.github/workflows/format_check.yml index 3456b72..5a24a99 100644 --- a/.github/workflows/format_check.yml +++ b/.github/workflows/format_check.yml @@ -37,7 +37,7 @@ jobs: - name: Setup Scala uses: olafurpg/setup-scala@32ffa16635ff8f19cc21ea253a987f0fdf29844c with: - java-version: "adopt@1.8" + java-version: "adopt@1.11" - name: Run scalafmt And Print Diff continue-on-error: true diff --git a/.github/workflows/jacoco_check.yml b/.github/workflows/jacoco_check.yml index 73fc4c2..faa1360 100644 --- a/.github/workflows/jacoco_check.yml +++ b/.github/workflows/jacoco_check.yml @@ -47,7 +47,7 @@ jobs: - name: Setup Scala uses: olafurpg/setup-scala@32ffa16635ff8f19cc21ea253a987f0fdf29844c with: - java-version: "adopt@1.8" + java-version: "adopt@1.11" - 
name: Build and run tests run: sbt ++${{matrix.scala}} jacoco diff --git a/.github/workflows/licence_check.yml b/.github/workflows/licence_check.yml index 6285b4d..c46f1cc 100644 --- a/.github/workflows/licence_check.yml +++ b/.github/workflows/licence_check.yml @@ -34,7 +34,7 @@ jobs: - name: Setup Scala uses: olafurpg/setup-scala@32ffa16635ff8f19cc21ea253a987f0fdf29844c with: - java-version: "adopt@1.8" + java-version: "adopt@1.11" # note, that task "headerCheck" defaults to just "compile:headerCheck" - see https://github.com/sbt/sbt-header/issues/14 - name: SBT src licence header check diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5734818..40b5b95 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -31,7 +31,7 @@ jobs: - name: Setup Scala uses: olafurpg/setup-scala@32ffa16635ff8f19cc21ea253a987f0fdf29844c with: - java-version: "adopt@1.8" + java-version: "adopt@1.11" - run: sbt ci-release env: diff --git a/README.md b/README.md index 57bbea1..976891d 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![License](http://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html) [![Release](https://github.com/AbsaOSS/spark-data-standardization/actions/workflows/release.yml/badge.svg)](https://github.com/AbsaOSS/spark-data-standardization/actions/workflows/release.yml) -![Java 8](https://img.shields.io/badge/Java_1.8-ED8B00?style=flat&logo=openjdk&logoColor=black) +![Java 11](https://img.shields.io/badge/Java_11-ED8B00?style=flat&logo=openjdk&logoColor=black) - Dataframe in - Standardized Dataframe out @@ -16,7 +16,7 @@ The library needs following dependencies to be included in your project ```sbt "org.apache.spark" %% "spark-core" % SPARK_VERSION, "org.apache.spark" %% "spark-sql" % SPARK_VERSION, -"za.co.absa" %% s"spark-commons-spark${SPARK_MAJOR}.${SPARK_MINOR}" % "0.6.1", +"za.co.absa" %% s"spark-commons-spark${SPARK_MAJOR}.${SPARK_MINOR}" % "0.6.3", ``` ### 
Usage in SBT: @@ -26,16 +26,6 @@ The library needs following dependencies to be included in your project ### Usage in Maven -### Scala 2.11 [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa/spark-data-standardization_2.11/badge.svg)](https://maven-badges.herokuapp.com/maven-central/za.co.absa/spark-data-standardization_2.11) - -```xml - - za.co.absa - spark-data-standardization_2.11 - ${latest_version} - -``` - ### Scala 2.12 [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa/spark-data-standardization_2.12/badge.svg)](https://maven-badges.herokuapp.com/maven-central/za.co.absa/spark-data-standardization_2.12) ```xml @@ -57,9 +47,9 @@ The library needs following dependencies to be included in your project ``` Spark and Scala compatibility ->| | Scala 2.11 | Scala 2.12 | Scala 2.13 | ->|---|---|---|---| ->|Spark| 2.4.7 | 3.2.1 | 3.2.1 | +>| | Scala 2.12 | Scala 2.13 | +>|---|---|---| +>|Spark| 3.5.x | 3.5.x | ## How to Release diff --git a/build.sbt b/build.sbt index f4ea43f..e57db7d 100644 --- a/build.sbt +++ b/build.sbt @@ -20,15 +20,18 @@ import Dependencies._ ThisBuild / name := "spark-data-standardization" ThisBuild / organization := "za.co.absa" -lazy val scala211 = "2.11.12" lazy val scala212 = "2.12.20" lazy val scala213 = "2.13.16" -ThisBuild / crossScalaVersions := Seq(scala211, scala212, scala213) -ThisBuild / scalaVersion := scala211 +ThisBuild / crossScalaVersions := Seq(scala212, scala213) +ThisBuild / scalaVersion := scala212 ThisBuild / versionScheme := Some("early-semver") +// Java 11 +ThisBuild / javacOptions ++= Seq("-source", "11", "-target", "11") +ThisBuild / scalacOptions ++= Seq("-release", "11") + libraryDependencies ++= dependencyList(scalaVersion.value) lazy val printSparkScalaVersion = taskKey[Unit]("Print Spark and Scala versions for standardization") diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 55e3468..7fb67b4 100644 --- a/project/Dependencies.scala 
+++ b/project/Dependencies.scala @@ -23,9 +23,9 @@ object Dependencies { s"$major.$minor" } - private def sparkFastTestsVersion(scalaVersion: String): String = if (scalaVersion.startsWith("2.11")) "0.23.0" else "1.1.0" + private def sparkFastTestsVersion(scalaVersion: String): String = "1.1.0" - def getSparkVersion(scalaVersion: String): String = if (scalaVersion.startsWith("2.11")) "2.4.7" else "3.2.1" + def getSparkVersion(scalaVersion: String): String = "3.5.7" def dependencyList(scalaVersion: String): Seq[ModuleID] = { val sparkVersion = getSparkVersion(scalaVersion) @@ -33,8 +33,8 @@ object Dependencies { List( "org.apache.spark" %% "spark-core" % sparkVersion % Provided, "org.apache.spark" %% "spark-sql" % sparkVersion % Provided, - "za.co.absa" %% s"spark-commons-spark$sparkVersionUpToMinor" % "0.6.1" % Provided, - "za.co.absa" %% "spark-commons-test" % "0.6.1" % Test, + "za.co.absa" %% s"spark-commons-spark$sparkVersionUpToMinor" % "0.6.3" % Provided, + "za.co.absa" %% "spark-commons-test" % "0.6.3" % Test, "com.typesafe" % "config" % "1.4.2", "com.github.mrpowers" %% "spark-fast-tests" % sparkFastTestsVersion(scalaVersion) % Test, "org.scalatest" %% "scalatest" % "3.2.15" % Test diff --git a/project/plugins.sbt b/project/plugins.sbt index 3f52147..4e3fb02 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -26,7 +26,6 @@ lazy val ow2Version = "9.5" def ow2Url(artifactName: String): String = s"https://repo1.maven.org/maven2/org/ow2/asm/$artifactName/$ow2Version/$artifactName-$ow2Version.jar" -addSbtPlugin("com.jsuereth" %% "scala-arm" % "2.0" from "https://repo1.maven.org/maven2/com/jsuereth/scala-arm_2.11/2.0/scala-arm_2.11-2.0.jar") addSbtPlugin("com.jsuereth" %% "scala-arm" % "2.0" from "https://repo1.maven.org/maven2/com/jsuereth/scala-arm_2.12/2.0/scala-arm_2.12-2.0.jar") addSbtPlugin("org.ow2.asm" % "asm" % ow2Version from ow2Url("asm")) addSbtPlugin("org.ow2.asm" % "asm-commons" % ow2Version from ow2Url("asm-commons")) From 
9fb2231f144639fa63e99645a2a1f68c3efc61e0 Mon Sep 17 00:00:00 2001 From: MatloaItumeleng Date: Fri, 10 Apr 2026 13:24:02 +0200 Subject: [PATCH 2/5] handling spark expressions error for empty array, changed null case & Java 11 normalised ClassCastException message format --- .../standardization/StandardizationCsvSuite.scala | 2 +- .../stages/TypeParserSuiteTemplate.scala | 5 +++-- .../types/TypedStructFieldSuite.scala | 14 ++++++++++++-- .../field/ScalarFieldValidatorSuite.scala | 5 ++++- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/test/scala/za/co/absa/standardization/StandardizationCsvSuite.scala b/src/test/scala/za/co/absa/standardization/StandardizationCsvSuite.scala index c3fff87..bf623fd 100644 --- a/src/test/scala/za/co/absa/standardization/StandardizationCsvSuite.scala +++ b/src/test/scala/za/co/absa/standardization/StandardizationCsvSuite.scala @@ -74,7 +74,7 @@ class StandardizationCsvSuite extends AnyFunSuite with SparkTestBase { ||201|202|2 |2019-05-05|2019-05-05|[] | ||301|302|1 |2019-05-06|2019-05-06|[] | ||401|402|1 |2019-05-07|2019-05-07|[] | - ||501|502|null|2019-05-08|2019-05-08|[] | + ||501|502|NULL|2019-05-08|2019-05-08|[] | |+---+---+----+----------+----------+------+ | |""".stripMargin.replace("\r\n", "\n") diff --git a/src/test/scala/za/co/absa/standardization/interpreter/stages/TypeParserSuiteTemplate.scala b/src/test/scala/za/co/absa/standardization/interpreter/stages/TypeParserSuiteTemplate.scala index 8b00c9e..c9b5bd2 100644 --- a/src/test/scala/za/co/absa/standardization/interpreter/stages/TypeParserSuiteTemplate.scala +++ b/src/test/scala/za/co/absa/standardization/interpreter/stages/TypeParserSuiteTemplate.scala @@ -291,12 +291,13 @@ trait TypeParserSuiteTemplate extends AnyFunSuite with SparkTestBase { private def assembleErrorExpression(srcField: String, target: StructField, castS: String, fromType: DataType, toType: String, pattern: String): String = { val errCond = createErrorCondition(srcField, target, castS)
val patternExpr = if (pattern.isEmpty) "NULL" else pattern + val emptyArr = if (SPARK_VERSION.startsWith("3.")) "ARRAY()" else "[]" if (target.nullable) { - s"CASE WHEN (($srcField IS NOT NULL) AND ($errCond)) THEN array(stdCastErr($srcField, CAST($srcField AS STRING), ${fromType.typeName}, $toType, $patternExpr)) ELSE [] END" + s"CASE WHEN (($srcField IS NOT NULL) AND ($errCond)) THEN array(stdCastErr($srcField, CAST($srcField AS STRING), ${fromType.typeName}, $toType, $patternExpr)) ELSE $emptyArr END" } else { s"CASE WHEN ($srcField IS NULL) THEN array(stdNullErr($srcField)) ELSE " + - s"CASE WHEN ($errCond) THEN array(stdCastErr($srcField, CAST($srcField AS STRING), ${fromType.typeName}, $toType, $patternExpr)) ELSE [] END END" + s"CASE WHEN ($errCond) THEN array(stdCastErr($srcField, CAST($srcField AS STRING), ${fromType.typeName}, $toType, $patternExpr)) ELSE $emptyArr END END" } } diff --git a/src/test/scala/za/co/absa/standardization/types/TypedStructFieldSuite.scala b/src/test/scala/za/co/absa/standardization/types/TypedStructFieldSuite.scala index 7b75345..d0ad042 100644 --- a/src/test/scala/za/co/absa/standardization/types/TypedStructFieldSuite.scala +++ b/src/test/scala/za/co/absa/standardization/types/TypedStructFieldSuite.scala @@ -53,6 +53,9 @@ class TypedStructFieldSuite extends AnyFunSuite { StructField(fieldName, dataType, nullable,metadata) } + private def normalizeCastMsg(msg: String): String = + msg.replaceAll("\\bclass ", "").replaceAll(" \\(.*\\)", "") + def checkField(field: TypedStructField, dataType: DataType, ownDefaultValue: Try[Option[Option[Any]]], @@ -68,7 +71,8 @@ class TypedStructFieldSuite extends AnyFunSuite { got.get } assert(caught.getClass == e.getClass) - assert(caught.getMessage == e.getMessage) + assert(normalizeCastMsg(caught.getMessage) == normalizeCastMsg(e.getMessage), + s"\nExpected: ${e.getMessage}\nActual : ${caught.getMessage}") } } @@ -95,7 +99,13 @@ class TypedStructFieldSuite extends AnyFunSuite { 
assert(field.nullable == nullable) assertTry(field.ownDefaultValue, ownDefaultValue) assertTry(field.defaultValueWithGlobal, defaultValueWithGlobal) - assert(field.validate() == validationIssues) + val normalizeIssue: ValidationIssue => ValidationIssue = { + case ValidationError(msg) => ValidationError(normalizeCastMsg(msg)) + case ValidationWarning(msg) => ValidationWarning(normalizeCastMsg(msg)) + } + val normalizedActualIssues = field.validate().map(normalizeIssue) + val normalizedExpectedIssues = validationIssues.map(normalizeIssue) + assert(normalizedActualIssues == normalizedExpectedIssues) } test("String type without default defined") { diff --git a/src/test/scala/za/co/absa/standardization/validation/field/ScalarFieldValidatorSuite.scala b/src/test/scala/za/co/absa/standardization/validation/field/ScalarFieldValidatorSuite.scala index c4b2fdd..87c7b54 100644 --- a/src/test/scala/za/co/absa/standardization/validation/field/ScalarFieldValidatorSuite.scala +++ b/src/test/scala/za/co/absa/standardization/validation/field/ScalarFieldValidatorSuite.scala @@ -44,9 +44,12 @@ class ScalarFieldValidatorSuite extends AnyFunSuite { assert(testResult.isEmpty) } + private def normalizeCastMsg(msg: String): String = + msg.replaceAll("\\bclass ", "").replaceAll(" \\(.*\\)", "") + test("Default value is set to non string value fails") { val field = StructField("test_field", StringType, nullable = false, new MetadataBuilder().putBoolean(MetadataKeys.DefaultValue, value = true).build()) val testResult = ScalarFieldValidator.validate(TypedStructField(field)) - assert(testResult == Seq(ValidationError("java.lang.Boolean cannot be cast to java.lang.String"))) + assert(testResult.map(e => ValidationError(normalizeCastMsg(e.msg))) == Seq(ValidationError("java.lang.Boolean cannot be cast to java.lang.String"))) } } From 60faf6ab1e29aee48689ea2813d2937b13b2fff9 Mon Sep 17 00:00:00 2001 From: MatloaItumeleng Date: Fri, 10 Apr 2026 14:20:49 +0200 Subject: [PATCH 3/5] jacoco check 
update and removal for script blind check --- .github/workflows/jacoco_check.yml | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/.github/workflows/jacoco_check.yml b/.github/workflows/jacoco_check.yml index faa1360..a4a960f 100644 --- a/.github/workflows/jacoco_check.yml +++ b/.github/workflows/jacoco_check.yml @@ -27,12 +27,12 @@ jobs: strategy: matrix: include: - # The project supports Scala 2.11, 2.12, 2.13 + # The project supports Scala 2.12, 2.13 # The CI runs all tests suites for all supported Scala versions at build.yml # The codebase for all Scala versions is the same, so the coverage is calculated only once # Scala 2.12 is chosen since it is supported by the most wide range of Spark versions and # vendor distributions. - - scala: 2.12.12 + - scala: 2.12.20 scalaShort: "2.12" overall: 80.0 changed: 80.0 @@ -69,9 +69,3 @@ jobs: echo "Total coverage ${{ steps.jacoco.outputs.coverage-overall }}" echo "Changed Files coverage ${{ steps.jacoco.outputs.coverage-changed-files }}" - - name: Fail PR if changed files coverage is less than ${{ matrix.changed }}% - if: ${{ steps.jacoco.outputs.coverage-changed-files < 80.0 }} - uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd - with: - script: | - core.setFailed('Changed files coverage is less than ${{ matrix.changed }}%!') From d82304815120928f90cdf32c0c777c727769da17 Mon Sep 17 00:00:00 2001 From: MatloaItumeleng Date: Tue, 14 Apr 2026 11:16:20 +0200 Subject: [PATCH 4/5] review recommendations , using contains instead --- .../validation/field/ScalarFieldValidatorSuite.scala | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/test/scala/za/co/absa/standardization/validation/field/ScalarFieldValidatorSuite.scala b/src/test/scala/za/co/absa/standardization/validation/field/ScalarFieldValidatorSuite.scala index 87c7b54..96783bd 100644 --- a/src/test/scala/za/co/absa/standardization/validation/field/ScalarFieldValidatorSuite.scala +++ 
b/src/test/scala/za/co/absa/standardization/validation/field/ScalarFieldValidatorSuite.scala @@ -18,7 +18,6 @@ package za.co.absa.standardization.validation.field import org.apache.spark.sql.types.{MetadataBuilder, StringType, StructField} import org.scalatest.funsuite.AnyFunSuite -import za.co.absa.standardization.ValidationError import za.co.absa.standardization.schema.MetadataKeys import za.co.absa.standardization.types.{TypeDefaults, CommonTypeDefaults, TypedStructField} @@ -44,12 +43,10 @@ class ScalarFieldValidatorSuite extends AnyFunSuite { assert(testResult.isEmpty) } - private def normalizeCastMsg(msg: String): String = - msg.replaceAll("\\bclass ", "").replaceAll(" \\(.*\\)", "") - test("Default value is set to non string value fails") { val field = StructField("test_field", StringType, nullable = false, new MetadataBuilder().putBoolean(MetadataKeys.DefaultValue, value = true).build()) val testResult = ScalarFieldValidator.validate(TypedStructField(field)) - assert(testResult.map(e => ValidationError(normalizeCastMsg(e.msg))) == Seq(ValidationError("java.lang.Boolean cannot be cast to java.lang.String"))) + // Using `contains` because newer JVMs (11+) prefix type names with "class " and append module/classloader info in parentheses to ClassCastException messages + assert(testResult.head.msg.contains("java.lang.Boolean cannot be cast to") && testResult.head.msg.contains("java.lang.String")) } } From c447015d4f6ac27ffa2af803fa56909766a797d6 Mon Sep 17 00:00:00 2001 From: MatloaItumeleng Date: Fri, 17 Apr 2026 21:32:01 +0200 Subject: [PATCH 5/5] review recommendation, using matchers for handling ClassCastException --- .../validation/field/ScalarFieldValidatorSuite.scala | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/test/scala/za/co/absa/standardization/validation/field/ScalarFieldValidatorSuite.scala b/src/test/scala/za/co/absa/standardization/validation/field/ScalarFieldValidatorSuite.scala index 96783bd..a399fae 100644 
--- a/src/test/scala/za/co/absa/standardization/validation/field/ScalarFieldValidatorSuite.scala +++ b/src/test/scala/za/co/absa/standardization/validation/field/ScalarFieldValidatorSuite.scala @@ -18,10 +18,11 @@ package za.co.absa.standardization.validation.field import org.apache.spark.sql.types.{MetadataBuilder, StringType, StructField} import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers import za.co.absa.standardization.schema.MetadataKeys import za.co.absa.standardization.types.{TypeDefaults, CommonTypeDefaults, TypedStructField} -class ScalarFieldValidatorSuite extends AnyFunSuite { +class ScalarFieldValidatorSuite extends AnyFunSuite with Matchers { private implicit val defaults: TypeDefaults = CommonTypeDefaults @@ -46,7 +47,11 @@ class ScalarFieldValidatorSuite extends AnyFunSuite { test("Default value is set to non string value fails") { val field = StructField("test_field", StringType, nullable = false, new MetadataBuilder().putBoolean(MetadataKeys.DefaultValue, value = true).build()) val testResult = ScalarFieldValidator.validate(TypedStructField(field)) - // Using `contains` because newer JVMs (11+) prefix type names with "class " and append module/classloader info in parentheses to ClassCastException messages - assert(testResult.head.msg.contains("java.lang.Boolean cannot be cast to") && testResult.head.msg.contains("java.lang.String")) + + testResult should have size 1 + testResult.head.msg should ( + startWith("java.lang.Boolean cannot be cast to java.lang.String") or // Java 8 + startWith("class java.lang.Boolean cannot be cast to class java.lang.String") // Java 11+ + ) } }