4 changes: 2 additions & 2 deletions .github/workflows/build.yml
@@ -28,7 +28,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- scala: [2.11.12, 2.12.18, 2.13.11]
+ scala: [2.12.18, 2.13.11]
name: Scala ${{matrix.scala}}
steps:
- name: Checkout code
@@ -41,7 +41,7 @@ jobs:
- name: Setup Scala
uses: olafurpg/setup-scala@32ffa16635ff8f19cc21ea253a987f0fdf29844c
with:
java-version: "adopt@1.8"
java-version: "adopt@1.11"

- name: Build and run tests
run: sbt ++${{matrix.scala}} test doc
2 changes: 1 addition & 1 deletion .github/workflows/format_check.yml
@@ -37,7 +37,7 @@ jobs:
- name: Setup Scala
uses: olafurpg/setup-scala@32ffa16635ff8f19cc21ea253a987f0fdf29844c
with:
java-version: "adopt@1.8"
java-version: "adopt@1.11"

- name: Run scalafmt And Print Diff
continue-on-error: true
12 changes: 3 additions & 9 deletions .github/workflows/jacoco_check.yml
@@ -27,12 +27,12 @@ jobs:
strategy:
matrix:
include:
- # The project supports Scala 2.11, 2.12, 2.13
+ # The project supports Scala 2.12, 2.13
# The CI runs all tests suites for all supported Scala versions at build.yml
# The codebase for all Scala versions is the same, so the coverage is calculated only once
# Scala 2.12 is chosen since it is supported by the most wide range of Spark versions and
# vendor distributions.
- - scala: 2.12.12
+ - scala: 2.12.20
scalaShort: "2.12"
overall: 80.0
changed: 80.0
@@ -47,7 +47,7 @@ jobs:
- name: Setup Scala
uses: olafurpg/setup-scala@32ffa16635ff8f19cc21ea253a987f0fdf29844c
with:
java-version: "adopt@1.8"
java-version: "adopt@1.11"

- name: Build and run tests
run: sbt ++${{matrix.scala}} jacoco
@@ -69,9 +69,3 @@ jobs:
echo "Total coverage ${{ steps.jacoco.outputs.coverage-overall }}"
echo "Changed Files coverage ${{ steps.jacoco.outputs.coverage-changed-files }}"

- - name: Fail PR if changed files coverage is less than ${{ matrix.changed }}%
- if: ${{ steps.jacoco.outputs.coverage-changed-files < 80.0 }}
- uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd
- with:
- script: |
- core.setFailed('Changed files coverage is less than ${{ matrix.changed }}%!')
2 changes: 1 addition & 1 deletion .github/workflows/licence_check.yml
@@ -34,7 +34,7 @@ jobs:
- name: Setup Scala
uses: olafurpg/setup-scala@32ffa16635ff8f19cc21ea253a987f0fdf29844c
with:
java-version: "adopt@1.8"
java-version: "adopt@1.11"

# note, that task "headerCheck" defaults to just "compile:headerCheck" - see https://github.com/sbt/sbt-header/issues/14
- name: SBT src licence header check
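As the note above says, a bare `headerCheck` only covers `Compile` sources (see sbt/sbt-header#14), so the truncated step presumably names the test configuration explicitly as well. A minimal illustration of such an invocation (assumed, not necessarily the exact command this workflow runs):

```sbt
sbt headerCheck Test/headerCheck
```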
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
@@ -31,7 +31,7 @@ jobs:
- name: Setup Scala
uses: olafurpg/setup-scala@32ffa16635ff8f19cc21ea253a987f0fdf29844c
with:
java-version: "adopt@1.8"
java-version: "adopt@1.11"

- run: sbt ci-release
env:
20 changes: 5 additions & 15 deletions README.md
@@ -2,7 +2,7 @@

[![License](http://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html)
[![Release](https://github.com/AbsaOSS/spark-data-standardization/actions/workflows/release.yml/badge.svg)](https://github.com/AbsaOSS/spark-data-standardization/actions/workflows/release.yml)
- ![Java 8](https://img.shields.io/badge/Java_1.8-ED8B00?style=flat&logo=openjdk&logoColor=black)
+ ![Java 11](https://img.shields.io/badge/Java_11-ED8B00?style=flat&logo=openjdk&logoColor=black)

- Dataframe in
- Standardized Dataframe out
@@ -16,7 +16,7 @@ The library needs following dependencies to be included in your project
```sbt
"org.apache.spark" %% "spark-core" % SPARK_VERSION,
"org.apache.spark" %% "spark-sql" % SPARK_VERSION,
"za.co.absa" %% s"spark-commons-spark${SPARK_MAJOR}.${SPARK_MINOR}" % "0.6.1",
"za.co.absa" %% s"spark-commons-spark${SPARK_MAJOR}.${SPARK_MINOR}" % "0.6.3",
```

### Usage in SBT:
@@ -26,16 +26,6 @@ The library needs following dependencies to be included in your project

### Usage in Maven

- ### Scala 2.11 [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa/spark-data-standardization_2.11/badge.svg)](https://maven-badges.herokuapp.com/maven-central/za.co.absa/spark-data-standardization_2.11)
-
- ```xml
- <dependency>
- <groupId>za.co.absa</groupId>
- <artifactId>spark-data-standardization_2.11</artifactId>
- <version>${latest_version}</version>
- </dependency>
- ```
-
### Scala 2.12 [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa/spark-data-standardization_2.12/badge.svg)](https://maven-badges.herokuapp.com/maven-central/za.co.absa/spark-data-standardization_2.12)

```xml
@@ -57,9 +47,9 @@ The library needs following dependencies to be included in your project
```

Spark and Scala compatibility
- >| | Scala 2.11 | Scala 2.12 | Scala 2.13 |
- >|---|---|---|---|
- >|Spark| 2.4.7 | 3.2.1 | 3.2.1 |
+ >| | Scala 2.12 | Scala 2.13 |
+ >|---|---|---|
+ >|Spark| 3.5.x | 3.5.x |

## How to Release

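For illustration, with Spark 3.5.x on Scala 2.12 the placeholders in the README's dependency snippet would resolve roughly to the following (the Spark patch version is only an example):

```sbt
"org.apache.spark" %% "spark-core" % "3.5.1",
"org.apache.spark" %% "spark-sql" % "3.5.1",
"za.co.absa" %% "spark-commons-spark3.5" % "0.6.3",
```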
9 changes: 6 additions & 3 deletions build.sbt
@@ -20,15 +20,18 @@ import Dependencies._
ThisBuild / name := "spark-data-standardization"
ThisBuild / organization := "za.co.absa"

- lazy val scala211 = "2.11.12"
lazy val scala212 = "2.12.20"
lazy val scala213 = "2.13.16"

- ThisBuild / crossScalaVersions := Seq(scala211, scala212, scala213)
- ThisBuild / scalaVersion := scala211
+ ThisBuild / crossScalaVersions := Seq(scala212, scala213)
+ ThisBuild / scalaVersion := scala212

ThisBuild / versionScheme := Some("early-semver")

+ // Java 11
+ ThisBuild / javacOptions ++= Seq("-source", "11", "-target", "11")
+ ThisBuild / scalacOptions ++= Seq("-release", "11")

libraryDependencies ++= dependencyList(scalaVersion.value)

lazy val printSparkScalaVersion = taskKey[Unit]("Print Spark and Scala versions for standardization")
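With the cross-build trimmed to Scala 2.12.20 and 2.13.16 and the compiler targeting Java 11, the standard sbt cross-building commands apply. For illustration, the first line below tests every entry in `crossScalaVersions`, while the second pins a single version, which is how the CI matrix invokes the build (`sbt ++${{matrix.scala}} test`):

```sbt
sbt +test
sbt ++2.13.16 test
```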
8 changes: 4 additions & 4 deletions project/Dependencies.scala
@@ -23,18 +23,18 @@ object Dependencies {
s"$major.$minor"
}

- private def sparkFastTestsVersion(scalaVersion: String): String = if (scalaVersion.startsWith("2.11")) "0.23.0" else "1.1.0"
+ private def sparkFastTestsVersion(scalaVersion: String): String = "1.1.0"

- def getSparkVersion(scalaVersion: String): String = if (scalaVersion.startsWith("2.11")) "2.4.7" else "3.2.1"
+ def getSparkVersion(scalaVersion: String): String = "3.5.7"

def dependencyList(scalaVersion: String): Seq[ModuleID] = {
val sparkVersion = getSparkVersion(scalaVersion)
val sparkVersionUpToMinor = getSparkVersionUpToMinor(sparkVersion)
List(
"org.apache.spark" %% "spark-core" % sparkVersion % Provided,
"org.apache.spark" %% "spark-sql" % sparkVersion % Provided,
"za.co.absa" %% s"spark-commons-spark$sparkVersionUpToMinor" % "0.6.1" % Provided,
"za.co.absa" %% "spark-commons-test" % "0.6.1" % Test,
"za.co.absa" %% s"spark-commons-spark$sparkVersionUpToMinor" % "0.6.3" % Provided,
"za.co.absa" %% "spark-commons-test" % "0.6.3" % Test,
"com.typesafe" % "config" % "1.4.2",
"com.github.mrpowers" %% "spark-fast-tests" % sparkFastTestsVersion(scalaVersion) % Test,
"org.scalatest" %% "scalatest" % "3.2.15" % Test
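The hunk above shows only the tail of `getSparkVersionUpToMinor`, the helper that turns the full Spark version into the suffix of the spark-commons artifact name. A plausible reconstruction, for readers without the full file (a sketch, not necessarily the exact code):

```scala
// Keeps only the major.minor part of a Spark version, e.g. "3.5.7" -> "3.5",
// which then selects the matching spark-commons artifact (spark-commons-spark3.5).
private def getSparkVersionUpToMinor(sparkVersion: String): String = {
  val Array(major, minor, _*) = sparkVersion.split("\\.")
  s"$major.$minor"
}
```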
1 change: 0 additions & 1 deletion project/plugins.sbt
@@ -26,7 +26,6 @@ lazy val ow2Version = "9.5"

def ow2Url(artifactName: String): String = s"https://repo1.maven.org/maven2/org/ow2/asm/$artifactName/$ow2Version/$artifactName-$ow2Version.jar"

addSbtPlugin("com.jsuereth" %% "scala-arm" % "2.0" from "https://repo1.maven.org/maven2/com/jsuereth/scala-arm_2.11/2.0/scala-arm_2.11-2.0.jar")
addSbtPlugin("com.jsuereth" %% "scala-arm" % "2.0" from "https://repo1.maven.org/maven2/com/jsuereth/scala-arm_2.12/2.0/scala-arm_2.12-2.0.jar")
addSbtPlugin("org.ow2.asm" % "asm" % ow2Version from ow2Url("asm"))
addSbtPlugin("org.ow2.asm" % "asm-commons" % ow2Version from ow2Url("asm-commons"))
@@ -74,7 +74,7 @@ class StandardizationCsvSuite extends AnyFunSuite with SparkTestBase {
||201|202|2 |2019-05-05|2019-05-05|[] |
||301|302|1 |2019-05-06|2019-05-06|[] |
||401|402|1 |2019-05-07|2019-05-07|[] |
- ||501|502|null|2019-05-08|2019-05-08|[] |
+ ||501|502|NULL|2019-05-08|2019-05-08|[] |
|+---+---+----+----------+----------+------+
|
|""".stripMargin.replace("\r\n", "\n")
@@ -291,12 +291,13 @@ trait TypeParserSuiteTemplate extends AnyFunSuite with SparkTestBase {
private def assembleErrorExpression(srcField: String, target: StructField, castS: String, fromType: DataType, toType: String, pattern: String): String = {
val errCond = createErrorCondition(srcField, target, castS)
val patternExpr = if (pattern.isEmpty) "NULL" else pattern
+ val emptyArr = if (SPARK_VERSION.startsWith("3.")) "ARRAY()" else "[]"

if (target.nullable) {
s"CASE WHEN (($srcField IS NOT NULL) AND ($errCond)) THEN array(stdCastErr($srcField, CAST($srcField AS STRING), ${fromType.typeName}, $toType, $patternExpr)) ELSE [] END"
s"CASE WHEN (($srcField IS NOT NULL) AND ($errCond)) THEN array(stdCastErr($srcField, CAST($srcField AS STRING), ${fromType.typeName}, $toType, $patternExpr)) ELSE $emptyArr END"
} else {
s"CASE WHEN ($srcField IS NULL) THEN array(stdNullErr($srcField)) ELSE " +
s"CASE WHEN ($errCond) THEN array(stdCastErr($srcField, CAST($srcField AS STRING), ${fromType.typeName}, $toType, $patternExpr)) ELSE [] END END"
s"CASE WHEN ($errCond) THEN array(stdCastErr($srcField, CAST($srcField AS STRING), ${fromType.typeName}, $toType, $patternExpr)) ELSE $emptyArr END END"
}
}

@@ -53,6 +53,9 @@ class TypedStructFieldSuite extends AnyFunSuite {
StructField(fieldName, dataType, nullable,metadata)
}

+ private def normalizeCastMsg(msg: String): String =
+ msg.replaceAll("\\bclass ", "").replaceAll(" \\(.*\\)", "")

def checkField(field: TypedStructField,
dataType: DataType,
ownDefaultValue: Try[Option[Option[Any]]],
@@ -68,7 +71,8 @@
got.get
}
assert(caught.getClass == e.getClass)
- assert(caught.getMessage == e.getMessage)
+ assert(normalizeCastMsg(caught.getMessage) == normalizeCastMsg(e.getMessage),
+ s"\nExpected: ${e.getMessage}\nActual : ${caught.getMessage}")
}
}

@@ -95,7 +99,13 @@
assert(field.nullable == nullable)
assertTry(field.ownDefaultValue, ownDefaultValue)
assertTry(field.defaultValueWithGlobal, defaultValueWithGlobal)
- assert(field.validate() == validationIssues)
+ val normalizeIssue: ValidationIssue => ValidationIssue = {
+ case ValidationError(msg) => ValidationError(normalizeCastMsg(msg))
+ case ValidationWarning(msg) => ValidationWarning(normalizeCastMsg(msg))
+ }
+ val normalizedActualIssues = field.validate().map(normalizeIssue)
+ val normalizedExpectedIssues = validationIssues.map(normalizeIssue)
+ assert(normalizedActualIssues == normalizedExpectedIssues)
}

test("String type without default defined") {
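The `normalizeCastMsg` helper introduced above exists because Java 11+ prefixes `ClassCastException` messages with `class ` and appends module details in parentheses, while Java 8 does not. A small self-contained illustration of the normalisation (the Java 11 message text is an example; the module suffix can vary by runtime):

```scala
def normalizeCastMsg(msg: String): String =
  msg.replaceAll("\\bclass ", "").replaceAll(" \\(.*\\)", "")

val java8Msg = "java.lang.Boolean cannot be cast to java.lang.String"
val java11Msg = "class java.lang.Boolean cannot be cast to class java.lang.String " +
  "(java.lang.Boolean and java.lang.String are in module java.base of loader 'bootstrap')"

// Both forms normalise to the same string, so one expected message covers both JDKs.
assert(normalizeCastMsg(java8Msg) == normalizeCastMsg(java11Msg))
```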
@@ -18,11 +18,11 @@ package za.co.absa.standardization.validation.field

import org.apache.spark.sql.types.{MetadataBuilder, StringType, StructField}
import org.scalatest.funsuite.AnyFunSuite
- import za.co.absa.standardization.ValidationError
+ import org.scalatest.matchers.should.Matchers
import za.co.absa.standardization.schema.MetadataKeys
import za.co.absa.standardization.types.{TypeDefaults, CommonTypeDefaults, TypedStructField}

- class ScalarFieldValidatorSuite extends AnyFunSuite {
+ class ScalarFieldValidatorSuite extends AnyFunSuite with Matchers {

private implicit val defaults: TypeDefaults = CommonTypeDefaults

@@ -47,6 +47,11 @@
test("Default value is set to non string value fails") {
val field = StructField("test_field", StringType, nullable = false, new MetadataBuilder().putBoolean(MetadataKeys.DefaultValue, value = true).build())
val testResult = ScalarFieldValidator.validate(TypedStructField(field))
- assert(testResult == Seq(ValidationError("java.lang.Boolean cannot be cast to java.lang.String")))
+
+ testResult should have size 1
+ testResult.head.msg should (
+ startWith("java.lang.Boolean cannot be cast to java.lang.String") or // Java 8
+ startWith("class java.lang.Boolean cannot be cast to class java.lang.String") // Java 11+
+ )
}
}