diff --git a/.github/workflows/onpush.yml b/.github/workflows/onpush.yml
index 98cf18d..736d9ca 100644
--- a/.github/workflows/onpush.yml
+++ b/.github/workflows/onpush.yml
@@ -20,10 +20,15 @@ jobs:
env:
DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }}
DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
+ BRANCH: ${{ github.head_ref || github.ref_name }}
+ DEVELOPER: ${{ github.actor }}
steps:
- uses: actions/checkout@v1
+      - name: Dump GitHub context
+        env: { GITHUB_CONTEXT: '${{ toJson(github) }}' }
+        run: echo "$GITHUB_CONTEXT"
- name: Set up Python
uses: actions/setup-python@v5
with:
@@ -47,14 +52,9 @@ jobs:
- name: Deploy on staging
run: |
- BRANCH_NAME="${{ github.head_ref || github.ref_name }}"
- PR_NUMBER="${{ github.event.pull_request.number }}"
- DEVELOPER="${{ github.actor }}"
-
- uv run python ./scripts/generate_template_workflow.py staging --serverless \
- --branch "$BRANCH_NAME" \
- --developer "$DEVELOPER" \
- $(if [ -n "$PR_NUMBER" ]; then echo "--pr-number $PR_NUMBER"; fi)
+ uv run python ./scripts/generate_template_workflow.py staging \
+ --branch "$BRANCH" \
+ --developer "$DEVELOPER"
uv run databricks bundle deploy --target staging
@@ -64,13 +64,8 @@ jobs:
- name: Deploy on prod
run: |
- BRANCH_NAME="${{ github.head_ref || github.ref_name }}"
- PR_NUMBER="${{ github.event.pull_request.number }}"
- DEVELOPER="${{ github.actor }}"
-
- uv run python ./scripts/generate_template_workflow.py prod --serverless \
- --branch "$BRANCH_NAME" \
- --developer "$DEVELOPER" \
- $(if [ -n "$PR_NUMBER" ]; then echo "--pr-number $PR_NUMBER"; fi)
+ uv run python ./scripts/generate_template_workflow.py prod \
+ --branch "$BRANCH" \
+ --developer "$DEVELOPER"
uv run databricks bundle deploy --target prod
diff --git a/Makefile b/Makefile
index 490a572..3f41ba8 100644
--- a/Makefile
+++ b/Makefile
@@ -10,8 +10,8 @@ pre-commit:
pre-commit autoupdate
pre-commit run --all-files
-deploy-serverless:
- uv run python ./scripts/generate_template_workflow.py $(env) --serverless
+deploy:
+ uv run python ./scripts/generate_template_workflow.py $(env)
uv run databricks bundle deploy --target $(env)
run:
diff --git a/README.md b/README.md
index 4a77e42..1ec32d2 100644
--- a/README.md
+++ b/README.md
@@ -33,15 +33,17 @@ Interested in bringing these principles in your own project? Let's [connect o
This project template demonstrates how to:
-- structure PySpark code inside classes/packages.
-- run unit tests on transformations with [pytest package](https://pypi.org/project/pytest/) - set up VSCode to run unit tests on your local machine.
-- structure integration tests to be executed on different environments / catalogs.
+- structure PySpark code inside classes/packages, instead of notebooks.
+- package and deploy code to different environments (dev, staging, prod).
+- use a CI/CD pipeline with [Github Actions](https://docs.github.com/en/actions).
+- run unit tests on transformations with [pytest package](https://pypi.org/project/pytest/). Set up VSCode to run unit tests on your local machine.
+- run integration tests setting the input data and validating the output data.
+- isolate "dev" environments / catalogs to avoid concurrency issues between developer tests.
+- show developer name and branch as job tags to track issues.
- utilize [coverage package](https://pypi.org/project/coverage/) to generate test coverage reports.
-- package and deploy code to different environments (dev, staging, prod) using a CI/CD pipeline with [Github Actions](https://docs.github.com/en/actions).
-- isolate "dev" environments / catalogs to avoid concurrency issues between developers testing jobs.
- utilize [uv](https://docs.astral.sh/uv/) as a project/package manager.
-- configure the workflow to run in different environments with different parameters with [jinja package](https://pypi.org/project/jinja2/).
-- configure the workflow to run tasks selectively.
+- configure the job to run in different environments with different parameters with [jinja package](https://pypi.org/project/jinja2/).
+- configure the job to run tasks selectively.
- use [medallion architecture](https://www.databricks.com/glossary/medallion-architecture) pattern.
- lint and format code with [ruff](https://docs.astral.sh/ruff/) and [pre-commit](https://pre-commit.com/).
- use a Make file to automate repetitive tasks.
@@ -52,9 +54,9 @@ This project template demonstrates how to:
- utilize [Databricks Asset Bundles](https://docs.databricks.com/en/dev-tools/bundles/index.html) to package/deploy/run a Python wheel package on Databricks.
- utilize [Databricks DQX](https://databrickslabs.github.io/dqx/) to define and enforce data quality rules, such as null checks, uniqueness, thresholds, and schema validation, and filter bad data on quarantine tables.
- utilize [Databricks SDK for Python](https://docs.databricks.com/en/dev-tools/sdk-python.html) to manage workspaces and accounts and analyse costs. Refer to 'scripts' folder for some examples.
-- utilize [Databricks Unity Catalog](https://www.databricks.com/product/unity-catalog) and get data lineage for your tables and columns and a simplified permission model for your data.
+- utilize [Databricks Unity Catalog](https://www.databricks.com/product/unity-catalog) and get data lineage for your tables and columns.
- utilize [Databricks Lakeflow Jobs](https://docs.databricks.com/en/workflows/index.html) to execute a DAG and [task parameters](https://docs.databricks.com/en/workflows/jobs/parameter-value-references.html) to share context information between tasks (see [Task Parameters section](#task-parameters)). Yes, you don't need Airflow to manage your DAGs here!!!
-- utilize serverless job clusters on [Databricks Free Edition](https://docs.databricks.com/aws/en/getting-started/free-edition ) to deploy your pipelines.
+- utilize serverless job clusters on [Databricks Free Edition](https://docs.databricks.com/aws/en/getting-started/free-edition) to deploy your pipelines.
## π§ Resources
@@ -78,17 +80,17 @@ Other:
```
databricks-template/
β
-βββ .github/ # CI/CD automation
+βββ .github/ # CI/CD automation
β βββ workflows/
-β βββ onpush.yml # GitHub Actions pipeline
+β βββ onpush.yml # GitHub Actions pipeline
β
-βββ src/ # Main source code
-β βββ template/ # Python package
-β βββ main.py # Entry point with CLI (argparse)
-β βββ config.py # Configuration management
-β βββ baseTask.py # Base class for all tasks
-β βββ commonSchemas.py # Shared PySpark schemas
-β βββ job1/ # Job-specific tasks
+βββ src/ # Main source code
+β βββ template/ # Python package
+β βββ main.py # Entry point with CLI (argparse)
+β βββ config.py # Configuration management
+β βββ baseTask.py # Base class for all tasks
+β βββ commonSchemas.py # Shared PySpark schemas
+β βββ job1/ # Job-specific tasks
β βββ extract_source1.py
β βββ extract_source2.py
β βββ generate_orders.py
@@ -96,21 +98,21 @@ databricks-template/
β βββ integration_setup.py
β βββ integration_validate.py
β
-βββ tests/ # Unit tests
+βββ tests/ # Unit tests
β βββ job1/
-β βββ unit_test.py # Pytest unit tests
+β βββ unit_test.py # Pytest unit tests
β
-βββ resources/ # Databricks workflow templates
+βββ resources/ # Databricks workflow templates
β βββ wf_template_serverless.yml # Jinja2 template for serverless
β βββ wf_template.yml # Jinja2 template for job clusters
β βββ workflow.yml # Generated workflow (auto-created)
β
-βββ scripts/ # Helper scripts
+βββ scripts/ # Helper scripts
β βββ generate_template_workflow.py # Workflow generator (Jinja2)
-β βββ sdk_analyze_job_costs.py # Cost analysis script
-β βββ sdk_workspace_and_account.py # Workspace and account management
-β print("SUMMARY")
-βββ docs/ # Documentation assets
+β βββ sdk_analyze_job_costs.py # Cost analysis script
+β βββ sdk_workspace_and_account.py # Workspace and account management
+β
+βββ docs/ # Documentation assets
β βββ dag.png
β βββ task_output.png
β βββ data_lineage.png
@@ -127,6 +129,14 @@ databricks-template/
βββ README.md # This file
```
+## CI/CD pipeline
+
+
+
+
+
+
+
## Jobs
@@ -161,31 +171,22 @@ databricks-template/
-## CI/CD pipeline
-
-
-
-
-
-
-
-
## Instructions
1) Create a workspace. Use a [Databricks Free Edition](https://docs.databricks.com/aws/en/getting-started/free-edition) workspace.
-2) Install and configure Databricks CLI on your local machine. Follow instructions [here](https://docs.databricks.com/en/dev-tools/cli/install.html). Check the current version on databricks.yaml.
+2) Install and configure Databricks CLI on your local machine. Check the current version in databricks.yaml. Follow instructions [here](https://docs.databricks.com/en/dev-tools/cli/install.html).
-3) Build Python env and execute unit tests on your local machine
+3) Build Python env and execute unit tests on your local machine.
make sync & make test
4) Deploy and execute on the dev workspace.
- make deploy-serverless env=dev
+ make deploy env=dev
5) configure CI/CD automation. Configure [Github Actions repository secrets](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions) (DATABRICKS_HOST and DATABRICKS_TOKEN).
diff --git a/scripts/generate_template_workflow.py b/scripts/generate_template_workflow.py
index 6623dd8..a9010ed 100644
--- a/scripts/generate_template_workflow.py
+++ b/scripts/generate_template_workflow.py
@@ -34,41 +34,33 @@ def main():
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
- python generate_template_workflow.py dev --serverless
- python generate_template_workflow.py staging --serverless --branch main --developer john --pr-number 123
+ python generate_template_workflow.py dev
+ python generate_template_workflow.py staging --branch main --developer john
""",
)
parser.add_argument("environment", help="Target environment (dev, staging, prod)")
- parser.add_argument("--serverless", action="store_true", help="Use serverless workflow template")
parser.add_argument("--branch", help="Git branch name (auto-detected if not provided)")
parser.add_argument("--developer", help="Developer/deployer name (auto-detected if not provided)")
- parser.add_argument("--pr-number", help="Pull request number (optional)")
args = parser.parse_args()
- # Get or auto-detect git metadata
+ # Auto-detect git metadata in local environments, use provided values in CI
branch = args.branch if args.branch else get_git_branch()
developer = args.developer if args.developer else get_git_user()
- pr_number = args.pr_number if args.pr_number else ""
print(f"Environment: {args.environment}")
- print(f"Serverless mode: {args.serverless}")
print(f"Git branch: {branch}")
print(f"Developer: {developer}")
- print(f"PR number: {pr_number if pr_number else 'N/A'}")
# Load and render template
file_loader = FileSystemLoader(".")
env = Environment(loader=file_loader)
- if args.serverless:
- template = env.get_template("/resources/wf_template_serverless.yml")
- else:
- template = env.get_template("/resources/wf_template.yml")
+ template = env.get_template("/resources/wf_template_serverless.yml")
# Render the template with all variables
- output = template.render(environment=args.environment, branch=branch, developer=developer, pr_number=pr_number)
+ output = template.render(environment=args.environment, branch=branch, developer=developer)
# Save the rendered YAML to a file
output_file = "./resources/workflow.yml"