diff --git a/.github/workflows/onpush.yml b/.github/workflows/onpush.yml
index 98cf18d..736d9ca 100644
--- a/.github/workflows/onpush.yml
+++ b/.github/workflows/onpush.yml
@@ -20,10 +20,15 @@ jobs:
     env:
       DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }}
       DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
+      BRANCH: ${{ github.head_ref || github.ref_name }}
+      DEVELOPER: ${{ github.actor }}
 
     steps:
       - uses: actions/checkout@v1
 
+      - name: Dump GitHub context
+        run: echo '${{ toJson(github) }}'
+
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
@@ -47,14 +52,9 @@
 
      - name: Deploy on staging
        run: |
-          BRANCH_NAME="${{ github.head_ref || github.ref_name }}"
-          PR_NUMBER="${{ github.event.pull_request.number }}"
-          DEVELOPER="${{ github.actor }}"
-
-          uv run python ./scripts/generate_template_workflow.py staging --serverless \
-            --branch "$BRANCH_NAME" \
-            --developer "$DEVELOPER" \
-            $(if [ -n "$PR_NUMBER" ]; then echo "--pr-number $PR_NUMBER"; fi)
+          uv run python ./scripts/generate_template_workflow.py staging \
+            --branch "$BRANCH" \
+            --developer "$DEVELOPER"
 
           uv run databricks bundle deploy --target staging
 
@@ -64,13 +64,8 @@
 
      - name: Deploy on prod
        run: |
-          BRANCH_NAME="${{ github.head_ref || github.ref_name }}"
-          PR_NUMBER="${{ github.event.pull_request.number }}"
-          DEVELOPER="${{ github.actor }}"
-
-          uv run python ./scripts/generate_template_workflow.py prod --serverless \
-            --branch "$BRANCH_NAME" \
-            --developer "$DEVELOPER" \
-            $(if [ -n "$PR_NUMBER" ]; then echo "--pr-number $PR_NUMBER"; fi)
+          uv run python ./scripts/generate_template_workflow.py prod \
+            --branch "$BRANCH" \
+            --developer "$DEVELOPER"
 
           uv run databricks bundle deploy --target prod
diff --git a/Makefile b/Makefile
index 490a572..3f41ba8 100644
--- a/Makefile
+++ b/Makefile
@@ -10,8 +10,8 @@ pre-commit:
 	pre-commit autoupdate
 	pre-commit run --all-files
 
-deploy-serverless:
-	uv run python ./scripts/generate_template_workflow.py $(env) --serverless
+deploy:
+	uv run python 
./scripts/generate_template_workflow.py $(env)
 	uv run databricks bundle deploy --target $(env)
 
 run:
diff --git a/README.md b/README.md
index 4a77e42..1ec32d2 100644
--- a/README.md
+++ b/README.md
@@ -33,15 +33,17 @@ Interested in bringing these principles in your own project? Let’s [connect o
 This project template demonstrates how to:
 
-- structure PySpark code inside classes/packages.
-- run unit tests on transformations with [pytest package](https://pypi.org/project/pytest/)
 - set up VSCode to run unit tests on your local machine.
-- structure integration tests to be executed on different environments / catalogs.
+- structure PySpark code inside classes/packages, instead of notebooks.
+- package and deploy code to different environments (dev, staging, prod).
+- use a CI/CD pipeline with [Github Actions](https://docs.github.com/en/actions).
+- run unit tests on transformations with [pytest package](https://pypi.org/project/pytest/). Set up VSCode to run unit tests on your local machine.
+- run integration tests setting the input data and validating the output data.
+- isolate "dev" environments / catalogs to avoid concurrency issues between developer tests.
+- show developer name and branch as job tags to track issues.
 - utilize [coverage package](https://pypi.org/project/coverage/) to generate test coverage reports.
-- package and deploy code to different environments (dev, staging, prod) using a CI/CD pipeline with [Github Actions](https://docs.github.com/en/actions).
-- isolate "dev" environments / catalogs to avoid concurrency issues between developers testing jobs.
 - utilize [uv](https://docs.astral.sh/uv/) as a project/package manager.
-- configure the workflow to run in different environments with different parameters with [jinja package](https://pypi.org/project/jinja2/).
-- configure the workflow to run tasks selectively.
+- configure job to run in different environments with different parameters with [jinja package](https://pypi.org/project/jinja2/).
+- configure job to run tasks selectively.
 - use [medallion architecture](https://www.databricks.com/glossary/medallion-architecture) pattern.
 - lint and format code with [ruff](https://docs.astral.sh/ruff/) and [pre-commit](https://pre-commit.com/).
 - use a Make file to automate repetitive tasks.
@@ -52,9 +54,9 @@ This project template demonstrates how to:
 - utilize [Databricks Asset Bundles](https://docs.databricks.com/en/dev-tools/bundles/index.html) to package/deploy/run a Python wheel package on Databricks.
 - utilize [Databricks DQX](https://databrickslabs.github.io/dqx/) to define and enforce data quality rules, such as null checks, uniqueness, thresholds, and schema validation, and filter bad data on quarantine tables.
 - utilize [Databricks SDK for Python](https://docs.databricks.com/en/dev-tools/sdk-python.html) to manage workspaces and accounts and analyse costs. Refer to 'scripts' folder for some examples.
-- utilize [Databricks Unity Catalog](https://www.databricks.com/product/unity-catalog) and get data lineage for your tables and columns and a simplified permission model for your data.
+- utilize [Databricks Unity Catalog](https://www.databricks.com/product/unity-catalog) and get data lineage for your tables and columns.
 - utilize [Databricks Lakeflow Jobs](https://docs.databricks.com/en/workflows/index.html) to execute a DAG and [task parameters](https://docs.databricks.com/en/workflows/jobs/parameter-value-references.html) to share context information between tasks (see [Task Parameters section](#task-parameters)). Yes, you don't need Airflow to manage your DAGs here!!!
-- utilize serverless job clusters on [Databricks Free Edition](https://docs.databricks.com/aws/en/getting-started/free-edition ) to deploy your pipelines.
+- utilize serverless job clusters on [Databricks Free Edition](https://docs.databricks.com/aws/en/getting-started/free-edition) to deploy your pipelines.
## 🧠 Resources @@ -78,17 +80,17 @@ Other: ``` databricks-template/ β”‚ -β”œβ”€β”€ .github/ # CI/CD automation +β”œβ”€β”€ .github/ # CI/CD automation β”‚ └── workflows/ -β”‚ └── onpush.yml # GitHub Actions pipeline +β”‚ └── onpush.yml # GitHub Actions pipeline β”‚ -β”œβ”€β”€ src/ # Main source code -β”‚ └── template/ # Python package -β”‚ β”œβ”€β”€ main.py # Entry point with CLI (argparse) -β”‚ β”œβ”€β”€ config.py # Configuration management -β”‚ β”œβ”€β”€ baseTask.py # Base class for all tasks -β”‚ β”œβ”€β”€ commonSchemas.py # Shared PySpark schemas -β”‚ └── job1/ # Job-specific tasks +β”œβ”€β”€ src/ # Main source code +β”‚ └── template/ # Python package +β”‚ β”œβ”€β”€ main.py # Entry point with CLI (argparse) +β”‚ β”œβ”€β”€ config.py # Configuration management +β”‚ β”œβ”€β”€ baseTask.py # Base class for all tasks +β”‚ β”œβ”€β”€ commonSchemas.py # Shared PySpark schemas +β”‚ └── job1/ # Job-specific tasks β”‚ β”œβ”€β”€ extract_source1.py β”‚ β”œβ”€β”€ extract_source2.py β”‚ β”œβ”€β”€ generate_orders.py @@ -96,21 +98,21 @@ databricks-template/ β”‚ β”œβ”€β”€ integration_setup.py β”‚ └── integration_validate.py β”‚ -β”œβ”€β”€ tests/ # Unit tests +β”œβ”€β”€ tests/ # Unit tests β”‚ └── job1/ -β”‚ └── unit_test.py # Pytest unit tests +β”‚ └── unit_test.py # Pytest unit tests β”‚ -β”œβ”€β”€ resources/ # Databricks workflow templates +β”œβ”€β”€ resources/ # Databricks workflow templates β”‚ β”œβ”€β”€ wf_template_serverless.yml # Jinja2 template for serverless β”‚ β”œβ”€β”€ wf_template.yml # Jinja2 template for job clusters β”‚ └── workflow.yml # Generated workflow (auto-created) β”‚ -β”œβ”€β”€ scripts/ # Helper scripts +β”œβ”€β”€ scripts/ # Helper scripts β”‚ β”œβ”€β”€ generate_template_workflow.py # Workflow generator (Jinja2) -β”‚ β”œβ”€β”€ sdk_analyze_job_costs.py # Cost analysis script -β”‚ └── sdk_workspace_and_account.py # Workspace and account management -β”‚ print("SUMMARY") -β”œβ”€β”€ docs/ # Documentation assets +β”‚ β”œβ”€β”€ sdk_analyze_job_costs.py # Cost 
analysis script +β”‚ └── sdk_workspace_and_account.py # Workspace and account management +β”‚ +β”œβ”€β”€ docs/ # Documentation assets β”‚ β”œβ”€β”€ dag.png β”‚ β”œβ”€β”€ task_output.png β”‚ β”œβ”€β”€ data_lineage.png @@ -127,6 +129,14 @@ databricks-template/ └── README.md # This file ``` +## CI/CD pipeline + +
+ + + +
+ ## Jobs
@@ -161,31 +171,22 @@ databricks-template/
-## CI/CD pipeline - -
- - - -
- - ## Instructions 1) Create a workspace. Use a [Databricks Free Edition](https://docs.databricks.com/aws/en/getting-started/free-edition) workspace. -2) Install and configure Databricks CLI on your local machine. Follow instructions [here](https://docs.databricks.com/en/dev-tools/cli/install.html). Check the current version on databricks.yaml. +2) Install and configure Databricks CLI on your local machine. Check the current version on databricks.yaml. Follow instructions [here](https://docs.databricks.com/en/dev-tools/cli/install.html). -3) Build Python env and execute unit tests on your local machine +3) Build Python env and execute unit tests on your local machine. make sync & make test 4) Deploy and execute on the dev workspace. - make deploy-serverless env=dev + make deploy env=dev 5) configure CI/CD automation. Configure [Github Actions repository secrets](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions) (DATABRICKS_HOST and DATABRICKS_TOKEN). diff --git a/scripts/generate_template_workflow.py b/scripts/generate_template_workflow.py index 6623dd8..a9010ed 100644 --- a/scripts/generate_template_workflow.py +++ b/scripts/generate_template_workflow.py @@ -34,41 +34,33 @@ def main(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - python generate_template_workflow.py dev --serverless - python generate_template_workflow.py staging --serverless --branch main --developer john --pr-number 123 + python generate_template_workflow.py dev + python generate_template_workflow.py staging --branch main --developer john """, ) parser.add_argument("environment", help="Target environment (dev, staging, prod)") - parser.add_argument("--serverless", action="store_true", help="Use serverless workflow template") parser.add_argument("--branch", help="Git branch name (auto-detected if not provided)") parser.add_argument("--developer", help="Developer/deployer name (auto-detected if not provided)") - 
-    parser.add_argument("--pr-number", help="Pull request number (optional)")
 
     args = parser.parse_args()
 
-    # Get or auto-detect git metadata
+    # Auto-detect git metadata in local environments, use provided values in CI
     branch = args.branch if args.branch else get_git_branch()
     developer = args.developer if args.developer else get_git_user()
-    pr_number = args.pr_number if args.pr_number else ""
 
     print(f"Environment: {args.environment}")
-    print(f"Serverless mode: {args.serverless}")
     print(f"Git branch: {branch}")
     print(f"Developer: {developer}")
-    print(f"PR number: {pr_number if pr_number else 'N/A'}")
 
     # Load and render template
     file_loader = FileSystemLoader(".")
     env = Environment(loader=file_loader)
 
-    if args.serverless:
-        template = env.get_template("/resources/wf_template_serverless.yml")
-    else:
-        template = env.get_template("/resources/wf_template.yml")
+    template = env.get_template("/resources/wf_template_serverless.yml")
 
     # Render the template with all variables
-    output = template.render(environment=args.environment, branch=branch, developer=developer, pr_number=pr_number)
+    output = template.render(environment=args.environment, branch=branch, developer=developer)
 
     # Save the rendered YAML to a file
     output_file = "./resources/workflow.yml"