From 1f27f62bf0da8431617bbdbf1f4693d95d92cf14 Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Mon, 17 Nov 2025 20:29:54 +0100 Subject: [PATCH 01/29] pydabs for job with task values and job with for each --- .../pydabs_job_with_for_each/README.md | 69 +++++++++++++++++++ .../pydabs_job_with_for_each/databricks.yml | 20 ++++++ .../pydabs_job_with_for_each/pyproject.toml | 26 +++++++ .../resources/__init__.py | 16 +++++ .../resources/for_each_simple.py | 38 ++++++++++ .../src/notebook_extract.py | 16 +++++ .../src/notebook_process_item.py | 16 +++++ .../src/pydabs_job_with_for_each/__init__.py | 0 .../src/pydabs_job_with_for_each/main.py | 5 ++ .../pydabs_job_with_task_values/README.md | 69 +++++++++++++++++++ .../databricks.yml | 20 ++++++ .../pyproject.toml | 26 +++++++ .../resources/__init__.py | 16 +++++ .../resources/task_values_simple.py | 19 +++++ .../src/notebook_task_a.py | 3 + .../src/notebook_task_b.py | 3 + .../pydabs_job_with_task_values/__init__.py | 0 .../src/pydabs_job_with_task_values/main.py | 5 ++ 18 files changed, 367 insertions(+) create mode 100644 knowledge_base/pydabs_job_with_for_each/README.md create mode 100644 knowledge_base/pydabs_job_with_for_each/databricks.yml create mode 100644 knowledge_base/pydabs_job_with_for_each/pyproject.toml create mode 100644 knowledge_base/pydabs_job_with_for_each/resources/__init__.py create mode 100644 knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py create mode 100644 knowledge_base/pydabs_job_with_for_each/src/notebook_extract.py create mode 100644 knowledge_base/pydabs_job_with_for_each/src/notebook_process_item.py create mode 100644 knowledge_base/pydabs_job_with_for_each/src/pydabs_job_with_for_each/__init__.py create mode 100644 knowledge_base/pydabs_job_with_for_each/src/pydabs_job_with_for_each/main.py create mode 100644 knowledge_base/pydabs_job_with_task_values/README.md create mode 100644 knowledge_base/pydabs_job_with_task_values/databricks.yml create mode 100644 
knowledge_base/pydabs_job_with_task_values/pyproject.toml create mode 100644 knowledge_base/pydabs_job_with_task_values/resources/__init__.py create mode 100644 knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py create mode 100644 knowledge_base/pydabs_job_with_task_values/src/notebook_task_a.py create mode 100644 knowledge_base/pydabs_job_with_task_values/src/notebook_task_b.py create mode 100644 knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/__init__.py create mode 100644 knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/main.py diff --git a/knowledge_base/pydabs_job_with_for_each/README.md b/knowledge_base/pydabs_job_with_for_each/README.md new file mode 100644 index 00000000..474327be --- /dev/null +++ b/knowledge_base/pydabs_job_with_for_each/README.md @@ -0,0 +1,69 @@ +# pydabs_job_with_for_each + +This example demonstrates a simple Databricks job that uses a foreach task. + +* `src/`: Python source code for this project. + * `src/pydabs_job_with_for_each/`: Shared Python code that can be used by jobs and pipelines. +* `resources/`: Resource configurations (jobs, pipelines, etc.) + + +## Getting started + +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +If you're developing with an IDE, dependencies for this project should be installed using uv: + +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. 
+ + +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` + +2. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + + This deploys everything that's defined for this project. + For example, the default template would deploy a job called + `[dev yourname] pydabs_airflow_job` to your workspace. + You can find that resource by opening your workpace and clicking on **Jobs & Pipelines**. + +3. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + Note that the default job from the template has a schedule that runs every day + (defined in resources/sample_job.job.yml). The schedule + is paused when deploying in development mode (see + https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). + +4. To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` + +5. Finally, to run tests locally, use `pytest`: + ``` + $ uv run pytest + ``` + diff --git a/knowledge_base/pydabs_job_with_for_each/databricks.yml b/knowledge_base/pydabs_job_with_for_each/databricks.yml new file mode 100644 index 00000000..690fc324 --- /dev/null +++ b/knowledge_base/pydabs_job_with_for_each/databricks.yml @@ -0,0 +1,20 @@ +# This is a Databricks asset bundle definition for pydabs_airflow. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: pydabs_job_with_foreach + uuid: 3874a19c-7ea5-401d-bca2-9bd1f9d3efbf + +python: + venv_path: .venv + # Functions called to load resources defined in Python. 
See resources/__init__.py + resources: + - "resources:load_resources" + +include: + - resources/*.yml + - resources/*/*.yml + +targets: + dev: + mode: development + default: true \ No newline at end of file diff --git a/knowledge_base/pydabs_job_with_for_each/pyproject.toml b/knowledge_base/pydabs_job_with_for_each/pyproject.toml new file mode 100644 index 00000000..b6439cee --- /dev/null +++ b/knowledge_base/pydabs_job_with_for_each/pyproject.toml @@ -0,0 +1,26 @@ +[project] +name = "pydabs_job_with_for_each" +version = "0.0.1" +authors = [{ name = "Databricks Field Engineering" }] +requires-python = ">=3.10,<=3.13" +dependencies = [ + # Any dependencies for jobs and pipelines in this project can be added here + # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies + # + # LIMITATION: for pipelines, dependencies are cached during development; + # add dependencies to the 'environment' section of pipeline.yml file instead +] + +[dependency-groups] +dev = [ + "pytest", + "databricks-connect>=15.4,<15.5", + "databricks-bundles==0.275.0", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.black] +line-length = 125 diff --git a/knowledge_base/pydabs_job_with_for_each/resources/__init__.py b/knowledge_base/pydabs_job_with_for_each/resources/__init__.py new file mode 100644 index 00000000..fbcb9dc5 --- /dev/null +++ b/knowledge_base/pydabs_job_with_for_each/resources/__init__.py @@ -0,0 +1,16 @@ +from databricks.bundles.core import ( + Bundle, + Resources, + load_resources_from_current_package_module, +) + + +def load_resources(bundle: Bundle) -> Resources: + """ + 'load_resources' function is referenced in databricks.yml and is responsible for loading + bundle resources defined in Python code. This function is called by Databricks CLI during + bundle deployment. After deployment, this function is not used. 
+ """ + + # the default implementation loads all Python files in 'resources' directory + return load_resources_from_current_package_module() diff --git a/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py b/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py new file mode 100644 index 00000000..fad42159 --- /dev/null +++ b/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py @@ -0,0 +1,38 @@ +from databricks.bundles.jobs import Job, Task, NotebookTask, ForEachTask, TaskDependency + +extract = Task( + task_key="extract", + notebook_task=NotebookTask(notebook_path="src/notebook_extract.py"), +) +process_item_iteration = Task( + task_key="process_item_iteration", + notebook_task=NotebookTask( + notebook_path="src/notebook_process_item.py", + base_parameters={ + "index": "{{input}}", + }, + ), +) +process_item = Task( + task_key='process_item', + depends_on=[TaskDependency(task_key="extract")], + for_each_task=ForEachTask( + inputs='{{tasks.extract.values.indexes}}', + task=process_item_iteration, + concurrency=10 + ) +) + +for_each_example = Job( + name="for_each_example", + tasks=[ + extract, + process_item, + ], + parameters=[ + { + "name": "lookup_file_name", + "default": "/Volumes/main/for_each_example/hotchpotch/my_file.json", + }, + ], +) diff --git a/knowledge_base/pydabs_job_with_for_each/src/notebook_extract.py b/knowledge_base/pydabs_job_with_for_each/src/notebook_extract.py new file mode 100644 index 00000000..16c8b1ec --- /dev/null +++ b/knowledge_base/pydabs_job_with_for_each/src/notebook_extract.py @@ -0,0 +1,16 @@ +# Databricks notebook source +lookup_file_name = dbutils.widgets.get('lookup_file_name') + +# COMMAND ---------- + +import json +from datetime import datetime, timedelta + +indexes = range(0,10) +start_date = datetime.today() +data = [{"date": (start_date + timedelta(days=index)).strftime("%Y-%m-%d")} for index in indexes] +dbutils.fs.put(lookup_file_name, json.dumps(data), overwrite=True) 
# Databricks notebook source
# Read this iteration's inputs: the shared lookup file path comes from the
# job-level parameter, and "index" is the current for-each item.
lookup_file_name = dbutils.widgets.get("lookup_file_name")
index = int(dbutils.widgets.get("index"))

# COMMAND ----------

import json

# Load the lookup table written by the extract task and select the record
# matching this iteration's index.
with open(lookup_file_name) as handle:
    data = json.load(handle)
date = data[index].get("date")

print(date)

# COMMAND ----------
+ + +## Getting started + +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +If you're developing with an IDE, dependencies for this project should be installed using uv: + +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. + + +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` + +2. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + + This deploys everything that's defined for this project. + For example, the default template would deploy a job called + `[dev yourname] pydabs_airflow_job` to your workspace. + You can find that resource by opening your workpace and clicking on **Jobs & Pipelines**. + +3. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + Note that the default job from the template has a schedule that runs every day + (defined in resources/sample_job.job.yml). The schedule + is paused when deploying in development mode (see + https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). + +4. To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` + +5. 
# This is a Databricks asset bundle definition for pydabs_job_with_task_values.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: pydabs_job_with_task_values
  # Regenerated: the original uuid was copy-pasted from pydabs_job_with_for_each;
  # every bundle must carry its own unique uuid.
  uuid: b5d3f8e2-4a6c-49d1-9e37-2f8c0a1d5b42

python:
  venv_path: .venv
  # Functions called to load resources defined in Python. See resources/__init__.py
  resources:
    - "resources:load_resources"

include:
  - resources/*.yml
  - resources/*/*.yml

targets:
  dev:
    mode: development
    default: true
from databricks.bundles.jobs import Job, Task, NotebookTask, TaskDependency

# Producer task: publishes a task value under key "my_key".
task_a = Task(
    task_key="task_a",
    notebook_task=NotebookTask(notebook_path="src/notebook_task_a.py"),
)

# Consumer task: runs after task_a and reads the value it published.
task_b = Task(
    task_key="task_b",
    depends_on=[TaskDependency(task_key="task_a")],
    notebook_task=NotebookTask(notebook_path="src/notebook_task_b.py"),
)

# Two-task job demonstrating information exchange via task values.
task_values_simple = Job(
    name="task_values_simple",
    tasks=[task_a, task_b],
)
a/knowledge_base/pydabs_job_with_task_values/src/notebook_task_b.py b/knowledge_base/pydabs_job_with_task_values/src/notebook_task_b.py new file mode 100644 index 00000000..578de83b --- /dev/null +++ b/knowledge_base/pydabs_job_with_task_values/src/notebook_task_b.py @@ -0,0 +1,3 @@ +# Databricks notebook source +val = dbutils.jobs.taskValues.get(taskKey="task_a", key="my_key") +print(val) \ No newline at end of file diff --git a/knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/__init__.py b/knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/main.py b/knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/main.py new file mode 100644 index 00000000..3339ef24 --- /dev/null +++ b/knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/main.py @@ -0,0 +1,5 @@ +def main(): + pass + +if __name__ == "__main__": + main() From 46422feaf8638151b32580593bc0fbf2e6320a6e Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Tue, 18 Nov 2025 13:29:50 +0100 Subject: [PATCH 02/29] adding conditional execution and file arrival scripts --- .../README.md | 64 ++++++++++++++++ .../databricks.yml | 21 +++++ .../pyproject.toml | 26 +++++++ .../resources/__init__.py | 16 ++++ .../resources/conditional_execution.py | 54 +++++++++++++ .../src/check_quality.ipynb | 56 ++++++++++++++ .../src/process_bad_data.ipynb | 25 ++++++ .../src/process_good_data.ipynb | 25 ++++++ .../pydabs_job_file_arrival/README.md | 76 +++++++++++++++++++ .../pydabs_job_file_arrival/databricks.yml | 21 +++++ .../pydabs_job_file_arrival/pyproject.toml | 26 +++++++ .../resources/__init__.py | 16 ++++ .../resources/file_arrival.py | 27 +++++++ .../src/process_files.ipynb | 40 ++++++++++ 14 files changed, 493 insertions(+) create mode 100644 
knowledge_base/pydabs_job_conditional_execution/README.md create mode 100644 knowledge_base/pydabs_job_conditional_execution/databricks.yml create mode 100644 knowledge_base/pydabs_job_conditional_execution/pyproject.toml create mode 100644 knowledge_base/pydabs_job_conditional_execution/resources/__init__.py create mode 100644 knowledge_base/pydabs_job_conditional_execution/resources/conditional_execution.py create mode 100644 knowledge_base/pydabs_job_conditional_execution/src/check_quality.ipynb create mode 100644 knowledge_base/pydabs_job_conditional_execution/src/process_bad_data.ipynb create mode 100644 knowledge_base/pydabs_job_conditional_execution/src/process_good_data.ipynb create mode 100644 knowledge_base/pydabs_job_file_arrival/README.md create mode 100644 knowledge_base/pydabs_job_file_arrival/databricks.yml create mode 100644 knowledge_base/pydabs_job_file_arrival/pyproject.toml create mode 100644 knowledge_base/pydabs_job_file_arrival/resources/__init__.py create mode 100644 knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py create mode 100644 knowledge_base/pydabs_job_file_arrival/src/process_files.ipynb diff --git a/knowledge_base/pydabs_job_conditional_execution/README.md b/knowledge_base/pydabs_job_conditional_execution/README.md new file mode 100644 index 00000000..c10e4a4f --- /dev/null +++ b/knowledge_base/pydabs_job_conditional_execution/README.md @@ -0,0 +1,64 @@ +# pydabs_job_conditional_execution + +This example demonstrates a Lakeflow Job that uses conditional task execution based on data quality checks. + +The Lakeflow Job consists of following tasks: +1. Checks data quality and calculates bad records +2. Evaluates if bad records exceed a threshold (100 records) +3. Routes to different processing paths based on the condition: + - If bad records > 100: runs `handle_bad_data` task + - If bad records ≤ 100: runs `continue_pipeline` task + +* `src/`: Notebook source code for this project. 
+ * `src/check_quality.ipynb`: Checks data quality and outputs bad record count + * `src/process_bad_data.ipynb`: Handles cases with high bad record count + * `src/process_good_data.ipynb`: Continues normal pipeline for good data +* `resources/`: Resource configurations (jobs, pipelines, etc.) + * `resources/conditional_execution.py`: PyDABs job definition with conditional tasks + + +## Getting started + +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +If you're developing with an IDE, dependencies for this project should be installed using uv: + +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. + + +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` + +2. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + + This deploys everything that's defined for this project. + For example, this project will deploy a job called + `[dev yourname] pydabs_job_conditional_execution` to your workspace. + You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**. + +3. 
# This is a Databricks asset bundle definition for pydabs_job_conditional_execution.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: pydabs_job_conditional_execution

python:
  venv_path: .venv
  # Functions called to load resources defined in Python. See resources/__init__.py
  resources:
    - "resources:load_resources"

include:
  - resources/*.yml
  - resources/*/*.yml

targets:
  dev:
    mode: development
    default: true
    workspace:
      # Placeholder — replace with your actual workspace URL before deploying.
      host: https://myworkspace.databricks.com
from databricks.bundles.jobs import (
    ConditionTask,
    Job,
    NotebookTask,
    Task,
    TaskDependency,
)

# Job demonstrating conditional branching: a quality-check notebook publishes
# a "bad_records" task value, a condition task compares it to a threshold,
# and exactly one of the two downstream tasks runs based on the outcome.
pydabs_job_conditional_execution = Job(
    name="pydabs_job_conditional_execution",
    tasks=[
        # Step 1: compute the bad-record count and publish it as a task value.
        Task(
            task_key="check_data_quality",
            notebook_task=NotebookTask(notebook_path="src/check_quality.ipynb"),
        ),
        # Step 2: compare the published count against the threshold (100).
        Task(
            task_key="evaluate_quality",
            depends_on=[TaskDependency(task_key="check_data_quality")],
            condition_task=ConditionTask(
                left="{{tasks.check_data_quality.values.bad_records}}",
                op="GREATER_THAN",
                right="100",
            ),
        ),
        # Step 3a: runs only when the condition evaluated to true (> 100).
        Task(
            task_key="handle_bad_data",
            depends_on=[TaskDependency(task_key="evaluate_quality", outcome="true")],
            notebook_task=NotebookTask(notebook_path="src/process_bad_data.ipynb"),
        ),
        # Step 3b: runs only when the condition evaluated to false (<= 100).
        Task(
            task_key="continue_pipeline",
            depends_on=[TaskDependency(task_key="evaluate_quality", outcome="false")],
            notebook_task=NotebookTask(notebook_path="src/process_good_data.ipynb"),
        ),
    ],
)
"python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/knowledge_base/pydabs_job_conditional_execution/src/process_good_data.ipynb b/knowledge_base/pydabs_job_conditional_execution/src/process_good_data.ipynb new file mode 100644 index 00000000..66e15fa2 --- /dev/null +++ b/knowledge_base/pydabs_job_conditional_execution/src/process_good_data.ipynb @@ -0,0 +1,25 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "2ed3f14d", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "print(\"Processing Files...\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/knowledge_base/pydabs_job_file_arrival/README.md b/knowledge_base/pydabs_job_file_arrival/README.md new file mode 100644 index 00000000..8ce71fe2 --- /dev/null +++ b/knowledge_base/pydabs_job_file_arrival/README.md @@ -0,0 +1,76 @@ +# pydabs_job_file_arrival + +This example demonstrates a Lakeflow Job that uses file arrival triggers to automatically process new files when they arrive in a Unity Catalog Volume. + +The Lakeflow Job is configured with: +- **File arrival trigger**: Monitors a Unity Catalog Volume (root or subpath) for new files, recursively. +- **Configurable wait times**: + - Minimum time between triggers: 60 seconds + - Wait after last file change: 90 seconds (ensures file write is complete) +- **Automatic processing**: When files are detected, the job automatically runs and processes them + +* `src/`: Notebook source code for this project. + * `src/process_files.ipynb`: Processes newly arrived files from the volume path. +* `resources/`: Resource configurations (jobs, pipelines, etc.) + * `resources/file_arrival.py`: PyDABs job with file arrival trigger configuration. 
+ + +## Getting started + +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +If you're developing with an IDE, dependencies for this project should be installed using uv: + +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. + + +# Using this project using the CLI + +## Development vs. Production behavior + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` + +2. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + + This deploys everything that's defined for this project. + For example, this project will deploy a job called + `[dev yourname] pydabs_job_file_arrival` to your workspace. + You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**. + +3. To configure the volume location: + - Edit `resources/file_arrival.py` and update the `url` parameter to point to your Unity Catalog Volume: + + ```python + url="/Volumes/your_catalog/your_schema/your_volume/" + ``` + +4. Development vs. Production behavior + - Dev target (mode: development): Schedules and automatic triggers are disabled by design, so the job will not auto-fire on file arrival. Use manual runs to test the logic. 
+ You can also manually run it with: + + ``` + $ databricks bundle run pydabs_job_file_arrival + ``` + - Prod target (mode: production): Automatic triggers are active. Uploading a file to the configured Unity Catalog Volume path will trigger the job run when the trigger evaluates. + \ No newline at end of file diff --git a/knowledge_base/pydabs_job_file_arrival/databricks.yml b/knowledge_base/pydabs_job_file_arrival/databricks.yml new file mode 100644 index 00000000..80dff3ca --- /dev/null +++ b/knowledge_base/pydabs_job_file_arrival/databricks.yml @@ -0,0 +1,21 @@ +# This is a Databricks asset bundle definition for pydabs_airflow. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: pydabs_job_file_arrival + +python: + venv_path: .venv + # Functions called to load resources defined in Python. See resources/__init__.py + resources: + - "resources:load_resources" + +include: + - resources/*.yml + - resources/*/*.yml + +targets: + dev: + mode: development + default: true + workspace: + host: https://myworkspace.databricks.com \ No newline at end of file diff --git a/knowledge_base/pydabs_job_file_arrival/pyproject.toml b/knowledge_base/pydabs_job_file_arrival/pyproject.toml new file mode 100644 index 00000000..cf126d21 --- /dev/null +++ b/knowledge_base/pydabs_job_file_arrival/pyproject.toml @@ -0,0 +1,26 @@ +[project] +name = "pydabs_job_file_arrival" +version = "0.0.1" +authors = [{ name = "Databricks Field Engineering" }] +requires-python = ">=3.10,<=3.13" +dependencies = [ + # Any dependencies for jobs and pipelines in this project can be added here + # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies + # + # LIMITATION: for pipelines, dependencies are cached during development; + # add dependencies to the 'environment' section of pipeline.yml file instead +] + +[dependency-groups] +dev = [ + "pytest", + "databricks-connect>=15.4,<15.5", + "databricks-bundles==0.275.0", +] + +[build-system] 
+requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.black] +line-length = 125 diff --git a/knowledge_base/pydabs_job_file_arrival/resources/__init__.py b/knowledge_base/pydabs_job_file_arrival/resources/__init__.py new file mode 100644 index 00000000..fbcb9dc5 --- /dev/null +++ b/knowledge_base/pydabs_job_file_arrival/resources/__init__.py @@ -0,0 +1,16 @@ +from databricks.bundles.core import ( + Bundle, + Resources, + load_resources_from_current_package_module, +) + + +def load_resources(bundle: Bundle) -> Resources: + """ + 'load_resources' function is referenced in databricks.yml and is responsible for loading + bundle resources defined in Python code. This function is called by Databricks CLI during + bundle deployment. After deployment, this function is not used. + """ + + # the default implementation loads all Python files in 'resources' directory + return load_resources_from_current_package_module() diff --git a/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py b/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py new file mode 100644 index 00000000..58ddb45b --- /dev/null +++ b/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py @@ -0,0 +1,27 @@ +from databricks.bundles.jobs import ( + FileArrivalTriggerConfiguration, + Job, + NotebookTask, + Task, + TriggerSettings, +) + +pydabs_job_file_arrival = Job( + name="pydabs_job_file_arrival", + tasks=[ + Task( + task_key="process_new_files", + notebook_task=NotebookTask( + notebook_path="src/process_files.ipynb", + base_parameters={"file_arrival_location": "{{job.trigger.file_arrival.location}}"}, + ), + ) + ], + trigger=TriggerSettings( + file_arrival=FileArrivalTriggerConfiguration( + url="/Volumes/your_catalog/your_schema/your_volume/", + min_time_between_triggers_seconds=60, + wait_after_last_change_seconds=90, + ), + ), +) \ No newline at end of file diff --git a/knowledge_base/pydabs_job_file_arrival/src/process_files.ipynb 
b/knowledge_base/pydabs_job_file_arrival/src/process_files.ipynb new file mode 100644 index 00000000..ac291495 --- /dev/null +++ b/knowledge_base/pydabs_job_file_arrival/src/process_files.ipynb @@ -0,0 +1,40 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "2ed3f14d", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "file_arrival_location = dbutils.widgets.get(\"file_arrival_location\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17d7a5e3", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "print(\"Processing Files...\")\n", + "print(f\"File location: {file_arrival_location}\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From f841d1ce19a1e09b20fdd2e9351bd3eaa5f921d7 Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Tue, 18 Nov 2025 13:34:39 +0100 Subject: [PATCH 03/29] adding conditional execution and file arrival scripts --- knowledge_base/pydabs_job_file_arrival/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/knowledge_base/pydabs_job_file_arrival/README.md b/knowledge_base/pydabs_job_file_arrival/README.md index 8ce71fe2..c0d8db6b 100644 --- a/knowledge_base/pydabs_job_file_arrival/README.md +++ b/knowledge_base/pydabs_job_file_arrival/README.md @@ -36,8 +36,6 @@ If you're developing with an IDE, dependencies for this project should be instal # Using this project using the CLI -## Development vs. Production behavior - The Databricks workspace and IDE extensions provide a graphical interface for working with this project. 
It's also possible to interact with it directly using the CLI: From 0d789ab31437f26ca825b1574166b77ca98f73af Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Fri, 21 Nov 2025 19:31:58 +0100 Subject: [PATCH 04/29] pydabs programmatic generation -initial version --- .../README.md | 69 +++++++++++++++++++ .../databricks.yml | 22 ++++++ .../mutators.py | 21 ++++++ .../pyproject.toml | 26 +++++++ .../resources/__init__.py | 40 +++++++++++ .../src/notebook_extract.py | 16 +++++ .../src/notebook_process_item.py | 16 +++++ .../__init__.py | 0 .../main.py | 5 ++ 9 files changed, 215 insertions(+) create mode 100644 knowledge_base/pydabs_job_programmatic_generation/README.md create mode 100644 knowledge_base/pydabs_job_programmatic_generation/databricks.yml create mode 100644 knowledge_base/pydabs_job_programmatic_generation/mutators.py create mode 100644 knowledge_base/pydabs_job_programmatic_generation/pyproject.toml create mode 100644 knowledge_base/pydabs_job_programmatic_generation/resources/__init__.py create mode 100644 knowledge_base/pydabs_job_programmatic_generation/src/notebook_extract.py create mode 100644 knowledge_base/pydabs_job_programmatic_generation/src/notebook_process_item.py create mode 100644 knowledge_base/pydabs_job_programmatic_generation/src/pydabs_job_programmatic_generation/__init__.py create mode 100644 knowledge_base/pydabs_job_programmatic_generation/src/pydabs_job_programmatic_generation/main.py diff --git a/knowledge_base/pydabs_job_programmatic_generation/README.md b/knowledge_base/pydabs_job_programmatic_generation/README.md new file mode 100644 index 00000000..7e45b143 --- /dev/null +++ b/knowledge_base/pydabs_job_programmatic_generation/README.md @@ -0,0 +1,69 @@ +# pydabs_job_with_for_each + +This example demonstrates a simple Databricks job with programmatic generation and customization. + +* `src/`: Python source code for this project. 
+ * `src/pydabs_job_programmatic_generation/`: Shared Python code that can be used by jobs and pipelines. +* `resources/`: Resource configurations (jobs, pipelines, etc.) + + +## Getting started + +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +If you're developing with an IDE, dependencies for this project should be installed using uv: + +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. + + +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` + +2. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + + This deploys everything that's defined for this project. + For example, the default template would deploy a job called + `[dev yourname] pydabs_airflow_job` to your workspace. + You can find that resource by opening your workpace and clicking on **Jobs & Pipelines**. + +3. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + Note that the default job from the template has a schedule that runs every day + (defined in resources/sample_job.job.yml). 
The schedule + is paused when deploying in development mode (see + https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). + +4. To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` + +5. Finally, to run tests locally, use `pytest`: + ``` + $ uv run pytest + ``` + diff --git a/knowledge_base/pydabs_job_programmatic_generation/databricks.yml b/knowledge_base/pydabs_job_programmatic_generation/databricks.yml new file mode 100644 index 00000000..a4f07de8 --- /dev/null +++ b/knowledge_base/pydabs_job_programmatic_generation/databricks.yml @@ -0,0 +1,22 @@ +# This is a Databricks asset bundle definition for pydabs_airflow. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: pydabs_job_programmatic_generation + uuid: 3874a19c-7ea5-401d-bca2-9bd1f9d3efbf + +python: + venv_path: .venv + # Functions called to load resources defined in Python. See resources/__init__.py + resources: + - "resources:load_resources" + mutators: + - 'mutators:add_email_notifications' + +include: + - resources/*.yml + - resources/*/*.yml + +targets: + dev: + mode: development + default: true \ No newline at end of file diff --git a/knowledge_base/pydabs_job_programmatic_generation/mutators.py b/knowledge_base/pydabs_job_programmatic_generation/mutators.py new file mode 100644 index 00000000..201b6f42 --- /dev/null +++ b/knowledge_base/pydabs_job_programmatic_generation/mutators.py @@ -0,0 +1,21 @@ +from dataclasses import replace + + +from databricks.bundles.core import Bundle, job_mutator, m +from databricks.bundles.jobs import Job, JobEmailNotifications + + +@job_mutator +def add_email_notifications(bundle: Bundle, job: Job) -> Job: + if job.email_notifications: + return job + + + email_notifications = JobEmailNotifications.from_dict( + { + "on_failure": ["${workspace.current_user.userName}"], + } + ) + + + return replace(job, email_notifications=email_notifications) diff --git 
a/knowledge_base/pydabs_job_programmatic_generation/pyproject.toml b/knowledge_base/pydabs_job_programmatic_generation/pyproject.toml new file mode 100644 index 00000000..6bb9e9fb --- /dev/null +++ b/knowledge_base/pydabs_job_programmatic_generation/pyproject.toml @@ -0,0 +1,26 @@ +[project] +name = "pydabs_job_programmatic_generation" +version = "0.0.1" +authors = [{ name = "Databricks Field Engineering" }] +requires-python = ">=3.10,<=3.13" +dependencies = [ + # Any dependencies for jobs and pipelines in this project can be added here + # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies + # + # LIMITATION: for pipelines, dependencies are cached during development; + # add dependencies to the 'environment' section of pipeline.yml file instead +] + +[dependency-groups] +dev = [ + "pytest", + "databricks-connect>=15.4,<15.5", + "databricks-bundles==0.275.0", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.black] +line-length = 125 diff --git a/knowledge_base/pydabs_job_programmatic_generation/resources/__init__.py b/knowledge_base/pydabs_job_programmatic_generation/resources/__init__.py new file mode 100644 index 00000000..d72e7d2a --- /dev/null +++ b/knowledge_base/pydabs_job_programmatic_generation/resources/__init__.py @@ -0,0 +1,40 @@ +import os +import glob + +from databricks.bundles.core import ( + Bundle, + Resources, + load_resources_from_current_package_module, +) + + +def load_resources(bundle: Bundle) -> Resources: + """ + 'load_resources' function is referenced in databricks.yml and is responsible for loading + bundle resources defined in Python code. This function is called by Databricks CLI during + bundle deployment. After deployment, this function is not used. 
+ """ + + # the default implementation loads all Python files in 'resources' directory + # return load_resources_from_current_package_module() + + """ + load_resources() is called during bundle deployment + Here a job is created for every notebook in the src folder + """ + resources = Resources() + for file in glob.glob("src/notebook*.py", recursive=True): + resources.add_job( + resource_name=os.path.basename(file).removesuffix(".py"), + job={ + "name": file, + "tasks": [ + { + "task_key": "notebook_task", + "notebook_task": {"notebook_path": file}, + }, + ], + }, + ) + + return resources diff --git a/knowledge_base/pydabs_job_programmatic_generation/src/notebook_extract.py b/knowledge_base/pydabs_job_programmatic_generation/src/notebook_extract.py new file mode 100644 index 00000000..16c8b1ec --- /dev/null +++ b/knowledge_base/pydabs_job_programmatic_generation/src/notebook_extract.py @@ -0,0 +1,16 @@ +# Databricks notebook source +lookup_file_name = dbutils.widgets.get('lookup_file_name') + +# COMMAND ---------- + +import json +from datetime import datetime, timedelta + +indexes = range(0,10) +start_date = datetime.today() +data = [{"date": (start_date + timedelta(days=index)).strftime("%Y-%m-%d")} for index in indexes] +dbutils.fs.put(lookup_file_name, json.dumps(data), overwrite=True) +dbutils.jobs.taskValues.set("indexes", list(indexes)) + +# COMMAND ---------- + diff --git a/knowledge_base/pydabs_job_programmatic_generation/src/notebook_process_item.py b/knowledge_base/pydabs_job_programmatic_generation/src/notebook_process_item.py new file mode 100644 index 00000000..a878dcc4 --- /dev/null +++ b/knowledge_base/pydabs_job_programmatic_generation/src/notebook_process_item.py @@ -0,0 +1,16 @@ +# Databricks notebook source +lookup_file_name = dbutils.widgets.get('lookup_file_name') +index = int(dbutils.widgets.get('index')) + +# COMMAND ---------- + +import json + +with open(lookup_file_name, "r") as f: + data = json.load(f) +date = data[index].get("date") + 
+print(date) + +# COMMAND ---------- + diff --git a/knowledge_base/pydabs_job_programmatic_generation/src/pydabs_job_programmatic_generation/__init__.py b/knowledge_base/pydabs_job_programmatic_generation/src/pydabs_job_programmatic_generation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/knowledge_base/pydabs_job_programmatic_generation/src/pydabs_job_programmatic_generation/main.py b/knowledge_base/pydabs_job_programmatic_generation/src/pydabs_job_programmatic_generation/main.py new file mode 100644 index 00000000..3339ef24 --- /dev/null +++ b/knowledge_base/pydabs_job_programmatic_generation/src/pydabs_job_programmatic_generation/main.py @@ -0,0 +1,5 @@ +def main(): + pass + +if __name__ == "__main__": + main() From e66e1b697b72ac809e5bb39661ac469f1bc985fd Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Mon, 24 Nov 2025 18:02:06 +0100 Subject: [PATCH 05/29] programmatic customization of pydabs --- .../README.md | 2 +- .../databricks.yml | 14 +++- .../mutators.py | 8 +-- .../resources/__init__.py | 47 ++++++++---- .../src/notebook_extract.py | 16 ----- .../src/notebook_process_item.py | 16 ----- .../src/tpcds_query1.sql | 42 +++++++++++ .../src/tpcds_query2.sql | 72 +++++++++++++++++++ 8 files changed, 165 insertions(+), 52 deletions(-) delete mode 100644 knowledge_base/pydabs_job_programmatic_generation/src/notebook_extract.py delete mode 100644 knowledge_base/pydabs_job_programmatic_generation/src/notebook_process_item.py create mode 100644 knowledge_base/pydabs_job_programmatic_generation/src/tpcds_query1.sql create mode 100644 knowledge_base/pydabs_job_programmatic_generation/src/tpcds_query2.sql diff --git a/knowledge_base/pydabs_job_programmatic_generation/README.md b/knowledge_base/pydabs_job_programmatic_generation/README.md index 7e45b143..e124406a 100644 --- a/knowledge_base/pydabs_job_programmatic_generation/README.md +++ b/knowledge_base/pydabs_job_programmatic_generation/README.md @@ -1,4 +1,4 @@ -# 
pydabs_job_with_for_each +# pydabs_job_programmatic_generation This example demonstrates a simple Databricks job with programmatic generation and customization. diff --git a/knowledge_base/pydabs_job_programmatic_generation/databricks.yml b/knowledge_base/pydabs_job_programmatic_generation/databricks.yml index a4f07de8..263c2023 100644 --- a/knowledge_base/pydabs_job_programmatic_generation/databricks.yml +++ b/knowledge_base/pydabs_job_programmatic_generation/databricks.yml @@ -16,7 +16,19 @@ include: - resources/*.yml - resources/*/*.yml +resources: + sql_warehouses: + twoxs_warehouse: + name: Serverless Starter Warehouse + cluster_size: 2X-Small + auto_stop_mins: 10 + targets: dev: mode: development - default: true \ No newline at end of file + default: true + + prod: + mode: production + workspace: + root_path: /Workspace/Users/${workspace.current_user.userName}/.bundle/${bundle.name}/${bundle.target} diff --git a/knowledge_base/pydabs_job_programmatic_generation/mutators.py b/knowledge_base/pydabs_job_programmatic_generation/mutators.py index 201b6f42..2e73300d 100644 --- a/knowledge_base/pydabs_job_programmatic_generation/mutators.py +++ b/knowledge_base/pydabs_job_programmatic_generation/mutators.py @@ -1,21 +1,17 @@ from dataclasses import replace - -from databricks.bundles.core import Bundle, job_mutator, m +from databricks.bundles.core import Bundle, job_mutator, mu from databricks.bundles.jobs import Job, JobEmailNotifications - @job_mutator def add_email_notifications(bundle: Bundle, job: Job) -> Job: - if job.email_notifications: + if bundle.target == 'dev': return job - email_notifications = JobEmailNotifications.from_dict( { "on_failure": ["${workspace.current_user.userName}"], } ) - return replace(job, email_notifications=email_notifications) diff --git a/knowledge_base/pydabs_job_programmatic_generation/resources/__init__.py b/knowledge_base/pydabs_job_programmatic_generation/resources/__init__.py index d72e7d2a..e903c000 100644 --- 
a/knowledge_base/pydabs_job_programmatic_generation/resources/__init__.py +++ b/knowledge_base/pydabs_job_programmatic_generation/resources/__init__.py @@ -4,8 +4,8 @@ from databricks.bundles.core import ( Bundle, Resources, - load_resources_from_current_package_module, ) +from databricks.bundles.schemas import Schema def load_resources(bundle: Bundle) -> Resources: @@ -13,26 +13,49 @@ def load_resources(bundle: Bundle) -> Resources: 'load_resources' function is referenced in databricks.yml and is responsible for loading bundle resources defined in Python code. This function is called by Databricks CLI during bundle deployment. After deployment, this function is not used. - """ - # the default implementation loads all Python files in 'resources' directory - # return load_resources_from_current_package_module() + the default implementation loads all Python files in 'resources' directory + return load_resources_from_current_package_module() - """ - load_resources() is called during bundle deployment - Here a job is created for every notebook in the src folder + Here a job is created for every notebook in the src folder + Plus a schema for the dev environment, to have one schema per user deploying the Job """ resources = Resources() - for file in glob.glob("src/notebook*.py", recursive=True): + + target_schema_name = "target_prod_schema" # this is the schema name for prod - should be deployed with Terraform + + if bundle.target == "dev": + # create 1 schema per user in other environments + # note databricks.yml: the target dev is mode "development" + schema = Schema( + catalog_name="main", + name="prog_gen_target", + comment="Schema for output data" + ) + resources.add_schema( + resource_name="project_schema", + schema=schema + ) + target_schema_name = "${resources.schemas.project_schema.name}" + + for file in glob.glob("src/*.sql", recursive=True): resources.add_job( - resource_name=os.path.basename(file).removesuffix(".py"), + 
resource_name=os.path.basename(file).removesuffix(".sql"), job={ "name": file, "tasks": [ { - "task_key": "notebook_task", - "notebook_task": {"notebook_path": file}, - }, + "task_key": "create_table", + "sql_task": { + "parameters": { + "target_schema": target_schema_name + }, + "file": { + "path": file, + }, + "warehouse_id": "${resources.sql_warehouses.twoxs_warehouse.id}" + }, + } ], }, ) diff --git a/knowledge_base/pydabs_job_programmatic_generation/src/notebook_extract.py b/knowledge_base/pydabs_job_programmatic_generation/src/notebook_extract.py deleted file mode 100644 index 16c8b1ec..00000000 --- a/knowledge_base/pydabs_job_programmatic_generation/src/notebook_extract.py +++ /dev/null @@ -1,16 +0,0 @@ -# Databricks notebook source -lookup_file_name = dbutils.widgets.get('lookup_file_name') - -# COMMAND ---------- - -import json -from datetime import datetime, timedelta - -indexes = range(0,10) -start_date = datetime.today() -data = [{"date": (start_date + timedelta(days=index)).strftime("%Y-%m-%d")} for index in indexes] -dbutils.fs.put(lookup_file_name, json.dumps(data), overwrite=True) -dbutils.jobs.taskValues.set("indexes", list(indexes)) - -# COMMAND ---------- - diff --git a/knowledge_base/pydabs_job_programmatic_generation/src/notebook_process_item.py b/knowledge_base/pydabs_job_programmatic_generation/src/notebook_process_item.py deleted file mode 100644 index a878dcc4..00000000 --- a/knowledge_base/pydabs_job_programmatic_generation/src/notebook_process_item.py +++ /dev/null @@ -1,16 +0,0 @@ -# Databricks notebook source -lookup_file_name = dbutils.widgets.get('lookup_file_name') -index = int(dbutils.widgets.get('index')) - -# COMMAND ---------- - -import json - -with open(lookup_file_name, "r") as f: - data = json.load(f) -date = data[index].get("date") - -print(date) - -# COMMAND ---------- - diff --git a/knowledge_base/pydabs_job_programmatic_generation/src/tpcds_query1.sql b/knowledge_base/pydabs_job_programmatic_generation/src/tpcds_query1.sql 
new file mode 100644 index 00000000..7dfcf368 --- /dev/null +++ b/knowledge_base/pydabs_job_programmatic_generation/src/tpcds_query1.sql @@ -0,0 +1,42 @@ +USE CATALOG samples; +USE SCHEMA tpcds_sf1; + +CREATE OR REPLACE TABLE IDENTIFIER( + 'main.' || :target_schema || '.tpcds_query1' +) AS + WITH customer_total_return AS ( + SELECT + sr_customer_sk AS ctr_customer_sk, + sr_store_sk AS ctr_store_sk, + SUM(sr_return_amt) AS ctr_total_return + FROM + store_returns, + date_dim + WHERE + sr_returned_date_sk = d_date_sk + AND d_year = 2001 + GROUP BY + sr_customer_sk, + sr_store_sk + ) + SELECT + c_customer_id + FROM + customer_total_return ctr1, + store, + customer + WHERE + ctr1.ctr_total_return > ( + SELECT + AVG(ctr_total_return) * 1.2 + FROM + customer_total_return ctr2 + WHERE + ctr1.ctr_store_sk = ctr2.ctr_store_sk + ) + AND s_store_sk = ctr1.ctr_store_sk + AND s_state = 'TN' + AND ctr1.ctr_customer_sk = c_customer_sk + ORDER BY + c_customer_id + LIMIT 100; \ No newline at end of file diff --git a/knowledge_base/pydabs_job_programmatic_generation/src/tpcds_query2.sql b/knowledge_base/pydabs_job_programmatic_generation/src/tpcds_query2.sql new file mode 100644 index 00000000..b7b10ac9 --- /dev/null +++ b/knowledge_base/pydabs_job_programmatic_generation/src/tpcds_query2.sql @@ -0,0 +1,72 @@ +USE CATALOG samples; +USE SCHEMA tpcds_sf1; + +CREATE OR REPLACE TABLE IDENTIFIER( + 'main.' 
|| :target_schema || '.tpcds_query2' +) +TBLPROPERTIES ( + 'delta.columnMapping.mode' = 'name' +) AS +WITH wscs AS ( + SELECT sold_date_sk, sales_price + FROM ( + SELECT ws_sold_date_sk AS sold_date_sk, ws_ext_sales_price AS sales_price + FROM web_sales + UNION ALL + SELECT cs_sold_date_sk AS sold_date_sk, cs_ext_sales_price AS sales_price + FROM catalog_sales + ) +), +wswscs AS ( + SELECT + d_week_seq, + SUM(CASE WHEN d_day_name = 'Sunday' THEN sales_price ELSE NULL END) AS sun_sales, + SUM(CASE WHEN d_day_name = 'Monday' THEN sales_price ELSE NULL END) AS mon_sales, + SUM(CASE WHEN d_day_name = 'Tuesday' THEN sales_price ELSE NULL END) AS tue_sales, + SUM(CASE WHEN d_day_name = 'Wednesday' THEN sales_price ELSE NULL END) AS wed_sales, + SUM(CASE WHEN d_day_name = 'Thursday' THEN sales_price ELSE NULL END) AS thu_sales, + SUM(CASE WHEN d_day_name = 'Friday' THEN sales_price ELSE NULL END) AS fri_sales, + SUM(CASE WHEN d_day_name = 'Saturday' THEN sales_price ELSE NULL END) AS sat_sales + FROM wscs + JOIN date_dim ON d_date_sk = sold_date_sk + GROUP BY d_week_seq +) +SELECT + d_week_seq1, + ROUND(sun_sales1 / sun_sales2, 2), + ROUND(mon_sales1 / mon_sales2, 2), + ROUND(tue_sales1 / tue_sales2, 2), + ROUND(wed_sales1 / wed_sales2, 2), + ROUND(thu_sales1 / thu_sales2, 2), + ROUND(fri_sales1 / fri_sales2, 2), + ROUND(sat_sales1 / sat_sales2, 2) +FROM ( + SELECT + wswscs.d_week_seq AS d_week_seq1, + sun_sales AS sun_sales1, + mon_sales AS mon_sales1, + tue_sales AS tue_sales1, + wed_sales AS wed_sales1, + thu_sales AS thu_sales1, + fri_sales AS fri_sales1, + sat_sales AS sat_sales1 + FROM wswscs + JOIN date_dim ON date_dim.d_week_seq = wswscs.d_week_seq + WHERE d_year = 1998 +) y +JOIN ( + SELECT + wswscs.d_week_seq AS d_week_seq2, + sun_sales AS sun_sales2, + mon_sales AS mon_sales2, + tue_sales AS tue_sales2, + wed_sales AS wed_sales2, + thu_sales AS thu_sales2, + fri_sales AS fri_sales2, + sat_sales AS sat_sales2 + FROM wswscs + JOIN date_dim ON date_dim.d_week_seq 
= wswscs.d_week_seq + WHERE d_year = 1999 +) z +ON d_week_seq1 = d_week_seq2 - 53 +ORDER BY d_week_seq1; \ No newline at end of file From d9b3b3b7ecc86e164ec00670ef12802bf45a9d4d Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Mon, 24 Nov 2025 19:09:34 +0100 Subject: [PATCH 06/29] typo --- knowledge_base/pydabs_job_programmatic_generation/mutators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/knowledge_base/pydabs_job_programmatic_generation/mutators.py b/knowledge_base/pydabs_job_programmatic_generation/mutators.py index 2e73300d..b928905a 100644 --- a/knowledge_base/pydabs_job_programmatic_generation/mutators.py +++ b/knowledge_base/pydabs_job_programmatic_generation/mutators.py @@ -1,6 +1,6 @@ from dataclasses import replace -from databricks.bundles.core import Bundle, job_mutator, mu +from databricks.bundles.core import Bundle, job_mutator from databricks.bundles.jobs import Job, JobEmailNotifications @job_mutator From 01e32caceb54f992b8e54ad4968e3a152a3c8969 Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Thu, 11 Dec 2025 15:44:32 +0100 Subject: [PATCH 07/29] ruff reformatting --- .../resources/conditional_execution.py | 36 +++++-------------- .../resources/file_arrival.py | 6 ++-- .../mutators.py | 3 +- .../resources/__init__.py | 19 ++++------ .../main.py | 1 + .../resources/for_each_simple.py | 18 +++++----- .../src/notebook_extract.py | 10 +++--- .../src/notebook_process_item.py | 5 ++- .../src/pydabs_job_with_for_each/main.py | 1 + .../resources/task_values_simple.py | 13 ++++++- .../src/notebook_task_a.py | 2 +- .../src/notebook_task_b.py | 2 +- .../src/pydabs_job_with_task_values/main.py | 1 + 13 files changed, 56 insertions(+), 61 deletions(-) diff --git a/knowledge_base/pydabs_job_conditional_execution/resources/conditional_execution.py b/knowledge_base/pydabs_job_conditional_execution/resources/conditional_execution.py index f4242ec4..7034b775 100644 --- 
a/knowledge_base/pydabs_job_conditional_execution/resources/conditional_execution.py +++ b/knowledge_base/pydabs_job_conditional_execution/resources/conditional_execution.py @@ -11,44 +11,26 @@ tasks=[ Task( task_key="check_data_quality", - notebook_task=NotebookTask( - notebook_path="src/check_quality.ipynb" - ), + notebook_task=NotebookTask(notebook_path="src/check_quality.ipynb"), ), Task( task_key="evaluate_quality", condition_task=ConditionTask( left="{{tasks.check_data_quality.values.bad_records}}", op="GREATER_THAN", - right="100" + right="100", ), - depends_on=[ - TaskDependency(task_key="check_data_quality") - ], + depends_on=[TaskDependency(task_key="check_data_quality")], ), Task( task_key="handle_bad_data", - notebook_task=NotebookTask( - notebook_path="src/process_bad_data.ipynb" - ), - depends_on=[ - TaskDependency( - task_key="evaluate_quality", - outcome="true" - ) - ], + notebook_task=NotebookTask(notebook_path="src/process_bad_data.ipynb"), + depends_on=[TaskDependency(task_key="evaluate_quality", outcome="true")], ), Task( task_key="continue_pipeline", - notebook_task=NotebookTask( - notebook_path="src/process_good_data.ipynb" - ), - depends_on=[ - TaskDependency( - task_key="evaluate_quality", - outcome="false" - ) - ], - ) - ] + notebook_task=NotebookTask(notebook_path="src/process_good_data.ipynb"), + depends_on=[TaskDependency(task_key="evaluate_quality", outcome="false")], + ), + ], ) diff --git a/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py b/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py index 58ddb45b..022b5ce6 100644 --- a/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py +++ b/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py @@ -13,7 +13,9 @@ task_key="process_new_files", notebook_task=NotebookTask( notebook_path="src/process_files.ipynb", - base_parameters={"file_arrival_location": "{{job.trigger.file_arrival.location}}"}, + base_parameters={ + 
"file_arrival_location": "{{job.trigger.file_arrival.location}}" + }, ), ) ], @@ -24,4 +26,4 @@ wait_after_last_change_seconds=90, ), ), -) \ No newline at end of file +) diff --git a/knowledge_base/pydabs_job_programmatic_generation/mutators.py b/knowledge_base/pydabs_job_programmatic_generation/mutators.py index b928905a..0ef61721 100644 --- a/knowledge_base/pydabs_job_programmatic_generation/mutators.py +++ b/knowledge_base/pydabs_job_programmatic_generation/mutators.py @@ -3,9 +3,10 @@ from databricks.bundles.core import Bundle, job_mutator from databricks.bundles.jobs import Job, JobEmailNotifications + @job_mutator def add_email_notifications(bundle: Bundle, job: Job) -> Job: - if bundle.target == 'dev': + if bundle.target == "dev": return job email_notifications = JobEmailNotifications.from_dict( diff --git a/knowledge_base/pydabs_job_programmatic_generation/resources/__init__.py b/knowledge_base/pydabs_job_programmatic_generation/resources/__init__.py index e903c000..c6b6516b 100644 --- a/knowledge_base/pydabs_job_programmatic_generation/resources/__init__.py +++ b/knowledge_base/pydabs_job_programmatic_generation/resources/__init__.py @@ -22,20 +22,17 @@ def load_resources(bundle: Bundle) -> Resources: """ resources = Resources() - target_schema_name = "target_prod_schema" # this is the schema name for prod - should be deployed with Terraform + target_schema_name = "target_prod_schema" # this is the schema name for prod - should be deployed with Terraform if bundle.target == "dev": # create 1 schema per user in other environments # note databricks.yml: the target dev is mode "development" schema = Schema( - catalog_name="main", - name="prog_gen_target", - comment="Schema for output data" - ) - resources.add_schema( - resource_name="project_schema", - schema=schema + catalog_name="main", + name="prog_gen_target", + comment="Schema for output data", ) + resources.add_schema(resource_name="project_schema", schema=schema) target_schema_name = 
"${resources.schemas.project_schema.name}" for file in glob.glob("src/*.sql", recursive=True): @@ -47,13 +44,11 @@ def load_resources(bundle: Bundle) -> Resources: { "task_key": "create_table", "sql_task": { - "parameters": { - "target_schema": target_schema_name - }, + "parameters": {"target_schema": target_schema_name}, "file": { "path": file, }, - "warehouse_id": "${resources.sql_warehouses.twoxs_warehouse.id}" + "warehouse_id": "${resources.sql_warehouses.twoxs_warehouse.id}", }, } ], diff --git a/knowledge_base/pydabs_job_programmatic_generation/src/pydabs_job_programmatic_generation/main.py b/knowledge_base/pydabs_job_programmatic_generation/src/pydabs_job_programmatic_generation/main.py index 3339ef24..cd9ac480 100644 --- a/knowledge_base/pydabs_job_programmatic_generation/src/pydabs_job_programmatic_generation/main.py +++ b/knowledge_base/pydabs_job_programmatic_generation/src/pydabs_job_programmatic_generation/main.py @@ -1,5 +1,6 @@ def main(): pass + if __name__ == "__main__": main() diff --git a/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py b/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py index fad42159..ea3b7b9d 100644 --- a/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py +++ b/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py @@ -10,17 +10,17 @@ notebook_path="src/notebook_process_item.py", base_parameters={ "index": "{{input}}", - }, + }, ), ) process_item = Task( - task_key='process_item', + task_key="process_item", depends_on=[TaskDependency(task_key="extract")], for_each_task=ForEachTask( - inputs='{{tasks.extract.values.indexes}}', + inputs="{{tasks.extract.values.indexes}}", task=process_item_iteration, - concurrency=10 - ) + concurrency=10, + ), ) for_each_example = Job( @@ -30,9 +30,9 @@ process_item, ], parameters=[ - { - "name": "lookup_file_name", - "default": "/Volumes/main/for_each_example/hotchpotch/my_file.json", - }, + { + "name": "lookup_file_name", + 
"default": "/Volumes/main/for_each_example/hotchpotch/my_file.json", + }, ], ) diff --git a/knowledge_base/pydabs_job_with_for_each/src/notebook_extract.py b/knowledge_base/pydabs_job_with_for_each/src/notebook_extract.py index 16c8b1ec..ebda7133 100644 --- a/knowledge_base/pydabs_job_with_for_each/src/notebook_extract.py +++ b/knowledge_base/pydabs_job_with_for_each/src/notebook_extract.py @@ -1,16 +1,18 @@ # Databricks notebook source -lookup_file_name = dbutils.widgets.get('lookup_file_name') +lookup_file_name = dbutils.widgets.get("lookup_file_name") # COMMAND ---------- import json from datetime import datetime, timedelta -indexes = range(0,10) +indexes = range(0, 10) start_date = datetime.today() -data = [{"date": (start_date + timedelta(days=index)).strftime("%Y-%m-%d")} for index in indexes] +data = [ + {"date": (start_date + timedelta(days=index)).strftime("%Y-%m-%d")} + for index in indexes +] dbutils.fs.put(lookup_file_name, json.dumps(data), overwrite=True) dbutils.jobs.taskValues.set("indexes", list(indexes)) # COMMAND ---------- - diff --git a/knowledge_base/pydabs_job_with_for_each/src/notebook_process_item.py b/knowledge_base/pydabs_job_with_for_each/src/notebook_process_item.py index a878dcc4..ba439a8c 100644 --- a/knowledge_base/pydabs_job_with_for_each/src/notebook_process_item.py +++ b/knowledge_base/pydabs_job_with_for_each/src/notebook_process_item.py @@ -1,6 +1,6 @@ # Databricks notebook source -lookup_file_name = dbutils.widgets.get('lookup_file_name') -index = int(dbutils.widgets.get('index')) +lookup_file_name = dbutils.widgets.get("lookup_file_name") +index = int(dbutils.widgets.get("index")) # COMMAND ---------- @@ -13,4 +13,3 @@ print(date) # COMMAND ---------- - diff --git a/knowledge_base/pydabs_job_with_for_each/src/pydabs_job_with_for_each/main.py b/knowledge_base/pydabs_job_with_for_each/src/pydabs_job_with_for_each/main.py index 3339ef24..cd9ac480 100644 --- 
a/knowledge_base/pydabs_job_with_for_each/src/pydabs_job_with_for_each/main.py +++ b/knowledge_base/pydabs_job_with_for_each/src/pydabs_job_with_for_each/main.py @@ -1,5 +1,6 @@ def main(): pass + if __name__ == "__main__": main() diff --git a/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py b/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py index 45e91f48..e47787dc 100644 --- a/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py +++ b/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py @@ -9,11 +9,22 @@ depends_on=[TaskDependency(task_key="task_a")], notebook_task=NotebookTask(notebook_path="src/notebook_task_b.py"), ) +task_c = Task( + task_key="task_c", + notebook_task=NotebookTask(notebook_path="src/notebook_task_c.py"), +) +task_d = Task( + task_key="task_d", + depends_on=[TaskDependency(task_key="task_c")], + notebook_task=NotebookTask(notebook_path="src/notebook_task_d.py"), +) -task_values_simple=Job( +task_values_simple = Job( name="task_values_simple", tasks=[ task_a, task_b, + task_c, + task_d, ], ) diff --git a/knowledge_base/pydabs_job_with_task_values/src/notebook_task_a.py b/knowledge_base/pydabs_job_with_task_values/src/notebook_task_a.py index c3715d0d..63e4e2eb 100644 --- a/knowledge_base/pydabs_job_with_task_values/src/notebook_task_a.py +++ b/knowledge_base/pydabs_job_with_task_values/src/notebook_task_a.py @@ -1,3 +1,3 @@ # Databricks notebook source val = [42, 12, 1812] -dbutils.jobs.taskValues.set(key="my_key", value=val) \ No newline at end of file +dbutils.jobs.taskValues.set(key="my_key", value=val) diff --git a/knowledge_base/pydabs_job_with_task_values/src/notebook_task_b.py b/knowledge_base/pydabs_job_with_task_values/src/notebook_task_b.py index 578de83b..e49eeb34 100644 --- a/knowledge_base/pydabs_job_with_task_values/src/notebook_task_b.py +++ b/knowledge_base/pydabs_job_with_task_values/src/notebook_task_b.py @@ -1,3 +1,3 @@ # 
Databricks notebook source val = dbutils.jobs.taskValues.get(taskKey="task_a", key="my_key") -print(val) \ No newline at end of file +print(val) diff --git a/knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/main.py b/knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/main.py index 3339ef24..cd9ac480 100644 --- a/knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/main.py +++ b/knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/main.py @@ -1,5 +1,6 @@ def main(): pass + if __name__ == "__main__": main() From 236c6888ff10214061fe4d8a93adec726602fa39 Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Fri, 13 Feb 2026 13:25:06 +0100 Subject: [PATCH 08/29] update task value example --- .../pydabs_job_with_task_values/README.md | 2 +- .../resources/task_values_simple.py | 37 ++++++------------- .../src/notebook_task_a.py | 3 -- .../src/notebook_task_b.py | 3 -- .../src/pydabs_job_with_task_values/main.py | 6 --- .../__init__.py | 0 .../src/xcom/consumer.py | 4 ++ .../src/xcom/producer.py | 5 +++ 8 files changed, 22 insertions(+), 38 deletions(-) delete mode 100644 knowledge_base/pydabs_job_with_task_values/src/notebook_task_a.py delete mode 100644 knowledge_base/pydabs_job_with_task_values/src/notebook_task_b.py delete mode 100644 knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/main.py rename knowledge_base/pydabs_job_with_task_values/src/{pydabs_job_with_task_values => xcom}/__init__.py (100%) create mode 100644 knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py create mode 100644 knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py diff --git a/knowledge_base/pydabs_job_with_task_values/README.md b/knowledge_base/pydabs_job_with_task_values/README.md index d2149f1f..71506515 100644 --- a/knowledge_base/pydabs_job_with_task_values/README.md +++ b/knowledge_base/pydabs_job_with_task_values/README.md @@ -4,7 +4,7 @@ This 
example demonstrates a simple Databricks job that uses tasks values to exch * `src/`: Python source code for this project. * `src/pydabs_job_with_task_values/`: Shared Python code that can be used by jobs and pipelines. -* `resources/`: Resource configurations (jobs, pipelines, etc.) +* `resources/`: Resource configurations, in this case a Lakeflow Job with two tasks exchanging information. ## Getting started diff --git a/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py b/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py index e47787dc..61248604 100644 --- a/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py +++ b/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py @@ -1,30 +1,17 @@ from databricks.bundles.jobs import Job, Task, NotebookTask, TaskDependency -task_a = Task( - task_key="task_a", - notebook_task=NotebookTask(notebook_path="src/notebook_task_a.py"), -) -task_b = Task( - task_key="task_b", - depends_on=[TaskDependency(task_key="task_a")], - notebook_task=NotebookTask(notebook_path="src/notebook_task_b.py"), -) -task_c = Task( - task_key="task_c", - notebook_task=NotebookTask(notebook_path="src/notebook_task_c.py"), -) -task_d = Task( - task_key="task_d", - depends_on=[TaskDependency(task_key="task_c")], - notebook_task=NotebookTask(notebook_path="src/notebook_task_d.py"), +producer = Task( + task_key="producer", + notebook_task=NotebookTask(notebook_path="src/xcom/producer.py"), ) -task_values_simple = Job( - name="task_values_simple", - tasks=[ - task_a, - task_b, - task_c, - task_d, - ], +consumer = Task( + task_key="consumer", + depends_on=[TaskDependency(task_key="producer")], + notebook_task=NotebookTask(notebook_path="src/xcom/consumer.py"), ) + +job = Job( + name="xcom_to_task_values_example", + tasks=[producer, consumer], +) \ No newline at end of file diff --git a/knowledge_base/pydabs_job_with_task_values/src/notebook_task_a.py 
b/knowledge_base/pydabs_job_with_task_values/src/notebook_task_a.py deleted file mode 100644 index 63e4e2eb..00000000 --- a/knowledge_base/pydabs_job_with_task_values/src/notebook_task_a.py +++ /dev/null @@ -1,3 +0,0 @@ -# Databricks notebook source -val = [42, 12, 1812] -dbutils.jobs.taskValues.set(key="my_key", value=val) diff --git a/knowledge_base/pydabs_job_with_task_values/src/notebook_task_b.py b/knowledge_base/pydabs_job_with_task_values/src/notebook_task_b.py deleted file mode 100644 index e49eeb34..00000000 --- a/knowledge_base/pydabs_job_with_task_values/src/notebook_task_b.py +++ /dev/null @@ -1,3 +0,0 @@ -# Databricks notebook source -val = dbutils.jobs.taskValues.get(taskKey="task_a", key="my_key") -print(val) diff --git a/knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/main.py b/knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/main.py deleted file mode 100644 index cd9ac480..00000000 --- a/knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/main.py +++ /dev/null @@ -1,6 +0,0 @@ -def main(): - pass - - -if __name__ == "__main__": - main() diff --git a/knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/__init__.py b/knowledge_base/pydabs_job_with_task_values/src/xcom/__init__.py similarity index 100% rename from knowledge_base/pydabs_job_with_task_values/src/pydabs_job_with_task_values/__init__.py rename to knowledge_base/pydabs_job_with_task_values/src/xcom/__init__.py diff --git a/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py b/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py new file mode 100644 index 00000000..89a27a68 --- /dev/null +++ b/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py @@ -0,0 +1,4 @@ +from databricks.sdk.runtime import dbutils + +val = dbutils.jobs.taskValues.get(taskKey="producer", key="answer", debugValue=None) +print(f"Got value: {val}") \ No newline at end of file diff --git 
a/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py b/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py new file mode 100644 index 00000000..5d56be9e --- /dev/null +++ b/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py @@ -0,0 +1,5 @@ +from databricks.sdk.runtime import dbutils + +value = 42 +dbutils.jobs.taskValues.set(key="answer", value=value) +print(f"Produced value: {value}") \ No newline at end of file From 54a2dfaa884fc7889ffec35c68f37445f9c3ec4a Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Fri, 13 Feb 2026 13:31:24 +0100 Subject: [PATCH 09/29] update file arrival trigger example --- .../pydabs_job_file_arrival/README.md | 2 +- .../resources/file_arrival.py | 37 ++++++----------- .../src/process_files.ipynb | 40 ------------------- .../src/process_files.py | 5 +++ 4 files changed, 18 insertions(+), 66 deletions(-) delete mode 100644 knowledge_base/pydabs_job_file_arrival/src/process_files.ipynb create mode 100644 knowledge_base/pydabs_job_file_arrival/src/process_files.py diff --git a/knowledge_base/pydabs_job_file_arrival/README.md b/knowledge_base/pydabs_job_file_arrival/README.md index c0d8db6b..a734022f 100644 --- a/knowledge_base/pydabs_job_file_arrival/README.md +++ b/knowledge_base/pydabs_job_file_arrival/README.md @@ -10,7 +10,7 @@ The Lakeflow Job is configured with: - **Automatic processing**: When files are detected, the job automatically runs and processes them * `src/`: Notebook source code for this project. - * `src/process_files.ipynb`: Processes newly arrived files from the volume path. + * `src/process_files.py`: Processes newly arrived files from the volume path. * `resources/`: Resource configurations (jobs, pipelines, etc.) * `resources/file_arrival.py`: PyDABs job with file arrival trigger configuration. 
diff --git a/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py b/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py index 022b5ce6..00f61535 100644 --- a/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py +++ b/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py @@ -1,29 +1,16 @@ -from databricks.bundles.jobs import ( - FileArrivalTriggerConfiguration, - Job, - NotebookTask, - Task, - TriggerSettings, +from databricks.bundles.jobs import FileArrivalTriggerConfiguration, Job, Task, NotebookTask + +process_files = Task( + task_key="process_files", + notebook_task=NotebookTask(notebook_path="src/files/process_files.py"), ) -pydabs_job_file_arrival = Job( - name="pydabs_job_file_arrival", - tasks=[ - Task( - task_key="process_new_files", - notebook_task=NotebookTask( - notebook_path="src/process_files.ipynb", - base_parameters={ - "file_arrival_location": "{{job.trigger.file_arrival.location}}" - }, - ), - ) - ], - trigger=TriggerSettings( - file_arrival=FileArrivalTriggerConfiguration( - url="/Volumes/your_catalog/your_schema/your_volume/", - min_time_between_triggers_seconds=60, - wait_after_last_change_seconds=90, - ), +job = Job( + name="file_arrival_example", + trigger=FileArrivalTriggerConfiguration( + url="Volumes/main.raw.incoming", # UC volume or external location + min_time_between_triggers_seconds=60, + wait_after_last_change_seconds=90, ), + tasks=[process_files], ) diff --git a/knowledge_base/pydabs_job_file_arrival/src/process_files.ipynb b/knowledge_base/pydabs_job_file_arrival/src/process_files.ipynb deleted file mode 100644 index ac291495..00000000 --- a/knowledge_base/pydabs_job_file_arrival/src/process_files.ipynb +++ /dev/null @@ -1,40 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "2ed3f14d", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [ - "file_arrival_location = 
dbutils.widgets.get(\"file_arrival_location\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17d7a5e3", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [ - "print(\"Processing Files...\")\n", - "print(f\"File location: {file_arrival_location}\")" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/knowledge_base/pydabs_job_file_arrival/src/process_files.py b/knowledge_base/pydabs_job_file_arrival/src/process_files.py new file mode 100644 index 00000000..929d34b9 --- /dev/null +++ b/knowledge_base/pydabs_job_file_arrival/src/process_files.py @@ -0,0 +1,5 @@ +from pyspark.sql import functions as F + +df = spark.read.format("cloudFiles") \ + .option("cloudFiles.format", "csv") \ + .load("/Volumes/main/raw/incoming") \ No newline at end of file From 02cad2996d52966f25bb030b58d1234470d08e93 Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Fri, 13 Feb 2026 13:48:08 +0100 Subject: [PATCH 10/29] add table update trigger example --- .../pydabs_job_table_update_trigger/README.md | 68 +++++++++++++++++++ .../databricks.yml | 21 ++++++ .../pyproject.toml | 26 +++++++ .../resources/__init__.py | 16 +++++ .../resources/table_update.py | 16 +++++ .../src/assets/consume_table.py | 5 ++ 6 files changed, 152 insertions(+) create mode 100644 knowledge_base/pydabs_job_table_update_trigger/README.md create mode 100644 knowledge_base/pydabs_job_table_update_trigger/databricks.yml create mode 100644 knowledge_base/pydabs_job_table_update_trigger/pyproject.toml create mode 100644 knowledge_base/pydabs_job_table_update_trigger/resources/__init__.py create mode 100644 knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py create mode 100644 knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py diff --git a/knowledge_base/pydabs_job_table_update_trigger/README.md 
b/knowledge_base/pydabs_job_table_update_trigger/README.md new file mode 100644 index 00000000..d0061a6a --- /dev/null +++ b/knowledge_base/pydabs_job_table_update_trigger/README.md @@ -0,0 +1,68 @@ +# pydabs_job_table_update_trigger + +This example demonstrates a workflow when producers write to Unity Catalog tables, consumers can trigger on table updates instead of time‑based schedules. + + +The Lakeflow Job is configured with: +- **Table update trigger**: To run a job when new data is ready without the need for a continuously running cluster or knowledge of the processes that update a table. +- **Configurable wait times**: + - Minimum time between triggers: 0 seconds + - Wait after last file change: 3600 seconds +- **Automatic processing**: When updates are detected, the job automatically runs and processes them + +* `src/`: Notebook source code for this project. + * `src/assets/consume_table.py`: +* `resources/`: Resource configurations (jobs, pipelines, etc.) + * `resources/table_update.py`: PyDABs job with table update trigger configuration. + + +## Getting started + +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +If you're developing with an IDE, dependencies for this project should be installed using uv: + +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. + + +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. 
Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` + +2. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + + This deploys everything that's defined for this project. + For example, this project will deploy a job called + `[dev yourname] table_update_example` to your workspace. + You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**. + +3. Development vs. Production behavior + - Dev target (mode: development): Schedules and automatic triggers are disabled by design, so the job will not auto-fire on file arrival. Use manual runs to test the logic. + You can also manually run it with: + + ``` + $ databricks bundle run table_update_example + ``` + - Prod target (mode: production): Automatic triggers are active. Uploading a file to the configured Unity Catalog Volume path will trigger the job run when the trigger evaluates. + \ No newline at end of file diff --git a/knowledge_base/pydabs_job_table_update_trigger/databricks.yml b/knowledge_base/pydabs_job_table_update_trigger/databricks.yml new file mode 100644 index 00000000..6e24521b --- /dev/null +++ b/knowledge_base/pydabs_job_table_update_trigger/databricks.yml @@ -0,0 +1,21 @@ +# This is a Databricks asset bundle definition for pydabs_airflow. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: pydabs_job_table_update_trigger + +python: + venv_path: .venv + # Functions called to load resources defined in Python. 
See resources/__init__.py + resources: + - "resources:load_resources" + +include: + - resources/*.yml + - resources/*/*.yml + +targets: + dev: + mode: development + default: true + workspace: + host: https://myworkspace.databricks.com \ No newline at end of file diff --git a/knowledge_base/pydabs_job_table_update_trigger/pyproject.toml b/knowledge_base/pydabs_job_table_update_trigger/pyproject.toml new file mode 100644 index 00000000..898124d4 --- /dev/null +++ b/knowledge_base/pydabs_job_table_update_trigger/pyproject.toml @@ -0,0 +1,26 @@ +[project] +name = "pydabs_job_table_update_trigger" +version = "0.0.1" +authors = [{ name = "Databricks Field Engineering" }] +requires-python = ">=3.10,<=3.13" +dependencies = [ + # Any dependencies for jobs and pipelines in this project can be added here + # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies + # + # LIMITATION: for pipelines, dependencies are cached during development; + # add dependencies to the 'environment' section of pipeline.yml file instead +] + +[dependency-groups] +dev = [ + "pytest", + "databricks-connect>=15.4,<15.5", + "databricks-bundles==0.275.0", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.black] +line-length = 125 diff --git a/knowledge_base/pydabs_job_table_update_trigger/resources/__init__.py b/knowledge_base/pydabs_job_table_update_trigger/resources/__init__.py new file mode 100644 index 00000000..fbcb9dc5 --- /dev/null +++ b/knowledge_base/pydabs_job_table_update_trigger/resources/__init__.py @@ -0,0 +1,16 @@ +from databricks.bundles.core import ( + Bundle, + Resources, + load_resources_from_current_package_module, +) + + +def load_resources(bundle: Bundle) -> Resources: + """ + 'load_resources' function is referenced in databricks.yml and is responsible for loading + bundle resources defined in Python code. This function is called by Databricks CLI during + bundle deployment. 
After deployment, this function is not used. + """ + + # the default implementation loads all Python files in 'resources' directory + return load_resources_from_current_package_module() diff --git a/knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py b/knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py new file mode 100644 index 00000000..3a6e13ff --- /dev/null +++ b/knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py @@ -0,0 +1,16 @@ +from databricks.bundles.jobs import Job, NotebookTask, Task, TableUpdateTriggerConfiguration + +consume_table = Task( + task_key="consume_table", + notebook_task=NotebookTask(notebook_path="src/assets/consume_table.py"), +) + +job = Job( + name="table_update_example", + trigger=TableUpdateTriggerConfiguration( + table_names=["main.analytics.daily_events"], + min_time_between_triggers_seconds=0, + wait_after_last_change_seconds=3600, + ), + tasks=[consume_table], +) diff --git a/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py b/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py new file mode 100644 index 00000000..92ebdd80 --- /dev/null +++ b/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py @@ -0,0 +1,5 @@ +from pyspark.sql import functions as F + +source_table = "main.analytics.daily_events" +# Insert consumer logic here +df = spark.read.table(source_table) From 545e1dda55c547fe28571ff143205c0a33277e76 Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Fri, 13 Feb 2026 13:50:20 +0100 Subject: [PATCH 11/29] update read me with new file names --- knowledge_base/pydabs_job_file_arrival/README.md | 13 +++---------- .../pydabs_job_with_task_values/README.md | 2 +- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/knowledge_base/pydabs_job_file_arrival/README.md b/knowledge_base/pydabs_job_file_arrival/README.md index a734022f..f5bace77 100644 --- 
a/knowledge_base/pydabs_job_file_arrival/README.md +++ b/knowledge_base/pydabs_job_file_arrival/README.md @@ -53,22 +53,15 @@ with this project. It's also possible to interact with it directly using the CLI This deploys everything that's defined for this project. For example, this project will deploy a job called - `[dev yourname] pydabs_job_file_arrival` to your workspace. + `[dev yourname] file_arrival_example` to your workspace. You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**. -3. To configure the volume location: - - Edit `resources/file_arrival.py` and update the `url` parameter to point to your Unity Catalog Volume: - - ```python - url="/Volumes/your_catalog/your_schema/your_volume/" - ``` - -4. Development vs. Production behavior +3. Development vs. Production behavior - Dev target (mode: development): Schedules and automatic triggers are disabled by design, so the job will not auto-fire on file arrival. Use manual runs to test the logic. You can also manually run it with: ``` - $ databricks bundle run pydabs_job_file_arrival + $ databricks bundle run file_arrival_example ``` - Prod target (mode: production): Automatic triggers are active. Uploading a file to the configured Unity Catalog Volume path will trigger the job run when the trigger evaluates. \ No newline at end of file diff --git a/knowledge_base/pydabs_job_with_task_values/README.md b/knowledge_base/pydabs_job_with_task_values/README.md index 71506515..30097bf0 100644 --- a/knowledge_base/pydabs_job_with_task_values/README.md +++ b/knowledge_base/pydabs_job_with_task_values/README.md @@ -45,7 +45,7 @@ with this project. It's also possible to interact with it directly using the CLI This deploys everything that's defined for this project. For example, the default template would deploy a job called - `[dev yourname] pydabs_airflow_job` to your workspace. + `[dev yourname] xcom_to_task_values_example` to your workspace. 
You can find that resource by opening your workpace and clicking on **Jobs & Pipelines**. 3. Similarly, to deploy a production copy, type: From 79304c3aa4984ef3b22e854920eac20a8451f503 Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Fri, 13 Feb 2026 14:27:07 +0100 Subject: [PATCH 12/29] update conditional execution example --- .../README.md | 12 ++-- .../resources/conditional_execution.py | 65 ++++++++++--------- ...{process_bad_data.ipynb => fix_path.ipynb} | 0 ...rocess_good_data.ipynb => skip_path.ipynb} | 0 4 files changed, 42 insertions(+), 35 deletions(-) rename knowledge_base/pydabs_job_conditional_execution/src/{process_bad_data.ipynb => fix_path.ipynb} (100%) rename knowledge_base/pydabs_job_conditional_execution/src/{process_good_data.ipynb => skip_path.ipynb} (100%) diff --git a/knowledge_base/pydabs_job_conditional_execution/README.md b/knowledge_base/pydabs_job_conditional_execution/README.md index c10e4a4f..8d1082b9 100644 --- a/knowledge_base/pydabs_job_conditional_execution/README.md +++ b/knowledge_base/pydabs_job_conditional_execution/README.md @@ -6,13 +6,13 @@ The Lakeflow Job consists of following tasks: 1. Checks data quality and calculates bad records 2. Evaluates if bad records exceed a threshold (100 records) 3. Routes to different processing paths based on the condition: - - If bad records > 100: runs `handle_bad_data` task - - If bad records ≤ 100: runs `continue_pipeline` task + - If bad records > 100: runs `fix_path` task + - If bad records ≤ 100: runs `skip_path` task * `src/`: Notebook source code for this project. 
* `src/check_quality.ipynb`: Checks data quality and outputs bad record count - * `src/process_bad_data.ipynb`: Handles cases with high bad record count - * `src/process_good_data.ipynb`: Continues normal pipeline for good data + * `src/fix_path.ipynb`: Handles cases with high bad record count + * `src/skip_path.ipynb`: Continues normal pipeline for good data * `resources/`: Resource configurations (jobs, pipelines, etc.) * `resources/conditional_execution.py`: PyDABs job definition with conditional tasks @@ -55,10 +55,10 @@ with this project. It's also possible to interact with it directly using the CLI This deploys everything that's defined for this project. For example, this project will deploy a job called - `[dev yourname] pydabs_job_conditional_execution` to your workspace. + `[dev yourname] conditional_execution_example` to your workspace. You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**. 3. To run the job, use the "run" command: ``` - $ databricks bundle run pydabs_job_conditional_execution + $ databricks bundle run conditional_execution_example ``` \ No newline at end of file diff --git a/knowledge_base/pydabs_job_conditional_execution/resources/conditional_execution.py b/knowledge_base/pydabs_job_conditional_execution/resources/conditional_execution.py index 7034b775..fa9041ef 100644 --- a/knowledge_base/pydabs_job_conditional_execution/resources/conditional_execution.py +++ b/knowledge_base/pydabs_job_conditional_execution/resources/conditional_execution.py @@ -1,36 +1,43 @@ from databricks.bundles.jobs import ( - ConditionTask, Job, - NotebookTask, Task, + NotebookTask, + ConditionTask, + ConditionTaskOp, TaskDependency, ) -pydabs_job_conditional_execution = Job( - name="pydabs_job_conditional_execution", - tasks=[ - Task( - task_key="check_data_quality", - notebook_task=NotebookTask(notebook_path="src/check_quality.ipynb"), - ), - Task( - task_key="evaluate_quality", - condition_task=ConditionTask( - 
left="{{tasks.check_data_quality.values.bad_records}}", - op="GREATER_THAN", - right="100", - ), - depends_on=[TaskDependency(task_key="check_data_quality")], - ), - Task( - task_key="handle_bad_data", - notebook_task=NotebookTask(notebook_path="src/process_bad_data.ipynb"), - depends_on=[TaskDependency(task_key="evaluate_quality", outcome="true")], - ), - Task( - task_key="continue_pipeline", - notebook_task=NotebookTask(notebook_path="src/process_good_data.ipynb"), - depends_on=[TaskDependency(task_key="evaluate_quality", outcome="false")], - ), - ], +# 1) Producer task: runs a notebook and emits a task value +check_quality = Task( + task_key="check_quality", + notebook_task=NotebookTask(notebook_path="src/branch/check_quality.ipynb"), +) + +# 2) Branch task: evaluates an expression using an upstream task value +branch = Task( + task_key="branch", + condition_task=ConditionTask( + left="{{tasks.check_quality.values.bad_records}}", + op=ConditionTaskOp.GREATER_THAN, + right="100", + ), + depends_on=[TaskDependency(task_key="check_quality")], +) + +# 3) Downstream tasks: gated on the condition outcome +fix_path = Task( + task_key="fix_path", + notebook_task=NotebookTask(notebook_path="src/branch/fix_path.ipynb"), + depends_on=[TaskDependency(task_key="branch", outcome="true")], +) + +skip_path = Task( + task_key="skip_path", + notebook_task=NotebookTask(notebook_path="src/branch/skip_path.ipynb"), + depends_on=[TaskDependency(task_key="branch", outcome="false")], +) + +job = Job( + name="conditional_execution_example", + tasks=[check_quality, branch, fix_path, skip_path], ) diff --git a/knowledge_base/pydabs_job_conditional_execution/src/process_bad_data.ipynb b/knowledge_base/pydabs_job_conditional_execution/src/fix_path.ipynb similarity index 100% rename from knowledge_base/pydabs_job_conditional_execution/src/process_bad_data.ipynb rename to knowledge_base/pydabs_job_conditional_execution/src/fix_path.ipynb diff --git 
a/knowledge_base/pydabs_job_conditional_execution/src/process_good_data.ipynb b/knowledge_base/pydabs_job_conditional_execution/src/skip_path.ipynb similarity index 100% rename from knowledge_base/pydabs_job_conditional_execution/src/process_good_data.ipynb rename to knowledge_base/pydabs_job_conditional_execution/src/skip_path.ipynb From fdb1bf690f36406c7563df152a355be3b4575fde Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Fri, 13 Feb 2026 14:30:02 +0100 Subject: [PATCH 13/29] update check quality notebook --- .../src/check_quality.ipynb | 36 +++---------------- 1 file changed, 4 insertions(+), 32 deletions(-) diff --git a/knowledge_base/pydabs_job_conditional_execution/src/check_quality.ipynb b/knowledge_base/pydabs_job_conditional_execution/src/check_quality.ipynb index 4de98129..d3cf1860 100644 --- a/knowledge_base/pydabs_job_conditional_execution/src/check_quality.ipynb +++ b/knowledge_base/pydabs_job_conditional_execution/src/check_quality.ipynb @@ -11,38 +11,10 @@ }, "outputs": [], "source": [ - "bad_records_count = 150" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "97594830", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [ - "dbutils.jobs.taskValues.set(\n", - " key='bad_records',\n", - " value=bad_records_count\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f48c4bc8", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [ - "print(f\"Found {bad_records_count} bad records\")" + "from databricks.sdk.runtime import dbutils\n", + "\n", + "bad_records = 123 # result of a data quality check\n", + "dbutils.jobs.taskValues.set(key=\"bad_records\", value=bad_records)" ] } ], From 66d3de8888fb1ccc3fb27fd89896ccccb7063ecf Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Fri, 13 Feb 2026 15:45:47 +0100 Subject: [PATCH 14/29] update for each example --- .../pydabs_job_with_for_each/README.md | 5 +- 
.../resources/for_each_simple.py | 55 +++++++++---------- .../src/foreach/generate_items.ipynb | 24 ++++++++ .../src/foreach/process_item.ipynb | 25 +++++++++ .../src/notebook_extract.py | 18 ------ .../src/notebook_process_item.py | 15 ----- .../src/pydabs_job_with_for_each/__init__.py | 0 .../src/pydabs_job_with_for_each/main.py | 6 -- 8 files changed, 78 insertions(+), 70 deletions(-) create mode 100644 knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.ipynb create mode 100644 knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.ipynb delete mode 100644 knowledge_base/pydabs_job_with_for_each/src/notebook_extract.py delete mode 100644 knowledge_base/pydabs_job_with_for_each/src/notebook_process_item.py delete mode 100644 knowledge_base/pydabs_job_with_for_each/src/pydabs_job_with_for_each/__init__.py delete mode 100644 knowledge_base/pydabs_job_with_for_each/src/pydabs_job_with_for_each/main.py diff --git a/knowledge_base/pydabs_job_with_for_each/README.md b/knowledge_base/pydabs_job_with_for_each/README.md index 474327be..17867619 100644 --- a/knowledge_base/pydabs_job_with_for_each/README.md +++ b/knowledge_base/pydabs_job_with_for_each/README.md @@ -3,7 +3,8 @@ This example demonstrates a simple Databricks job that uses a foreach task. * `src/`: Python source code for this project. - * `src/pydabs_job_with_for_each/`: Shared Python code that can be used by jobs and pipelines. + * `foreach/generate_items.ipynb`: A notebook which returns a list of items to be used for task generation. + * `foreach/process_item.ipynb`: A notebook which will process an item. * `resources/`: Resource configurations (jobs, pipelines, etc.) @@ -45,7 +46,7 @@ with this project. It's also possible to interact with it directly using the CLI This deploys everything that's defined for this project. For example, the default template would deploy a job called - `[dev yourname] pydabs_airflow_job` to your workspace. 
+ `[dev yourname] for_each_task_example` to your workspace. You can find that resource by opening your workpace and clicking on **Jobs & Pipelines**. 3. Similarly, to deploy a production copy, type: diff --git a/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py b/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py index ea3b7b9d..bc37d509 100644 --- a/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py +++ b/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py @@ -1,38 +1,35 @@ -from databricks.bundles.jobs import Job, Task, NotebookTask, ForEachTask, TaskDependency - -extract = Task( - task_key="extract", - notebook_task=NotebookTask(notebook_path="src/notebook_extract.py"), +from databricks.bundles.jobs import ( + Job, + Task, + NotebookTask, + ForEachTask, + TaskDependency, + JobEnvironment, + Environment, ) -process_item_iteration = Task( - task_key="process_item_iteration", - notebook_task=NotebookTask( - notebook_path="src/notebook_process_item.py", - base_parameters={ - "index": "{{input}}", - }, - ), + +generate_items = Task( + task_key="generate_items", + notebook_task=NotebookTask(notebook_path="src/foreach/generate_items.ipynb"), ) + process_item = Task( task_key="process_item", - depends_on=[TaskDependency(task_key="extract")], for_each_task=ForEachTask( - inputs="{{tasks.extract.values.indexes}}", - task=process_item_iteration, + inputs="{{tasks.generate_items.values.items}}", + task=Task( + task_key="process_item_iteration", + notebook_task=NotebookTask( + notebook_path="src/foreach/process_item.ipynb", + base_parameters={"item": "{{input}}"}, + ), + ), concurrency=10, ), + depends_on=[TaskDependency(task_key="generate_items")], ) -for_each_example = Job( - name="for_each_example", - tasks=[ - extract, - process_item, - ], - parameters=[ - { - "name": "lookup_file_name", - "default": "/Volumes/main/for_each_example/hotchpotch/my_file.json", - }, - ], -) +for_each_task_example = Job( 
+ name="for_each_task_example", + tasks=[generate_items, process_item], +) \ No newline at end of file diff --git a/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.ipynb b/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.ipynb new file mode 100644 index 00000000..669f7149 --- /dev/null +++ b/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.ipynb @@ -0,0 +1,24 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e4c15846", + "metadata": {}, + "outputs": [], + "source": [ + "from databricks.sdk.runtime import dbutils\n", + "\n", + "items = [1, 2, 3]\n", + "dbutils.jobs.taskValues.set(key=\"items\", value=items)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.ipynb b/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.ipynb new file mode 100644 index 00000000..4127a086 --- /dev/null +++ b/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.ipynb @@ -0,0 +1,25 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Runs once per item in the for-each. 
Do not call dbutils.jobs.taskValues.set() here.\n", + "from databricks.sdk.runtime import dbutils\n", + "\n", + "# Current iteration value passed from the for-each task (base_parameters: item = {{input}})\n", + "current_item = dbutils.widgets.get(\"item\")\n", + "print(f\"Processing item: {current_item}\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/knowledge_base/pydabs_job_with_for_each/src/notebook_extract.py b/knowledge_base/pydabs_job_with_for_each/src/notebook_extract.py deleted file mode 100644 index ebda7133..00000000 --- a/knowledge_base/pydabs_job_with_for_each/src/notebook_extract.py +++ /dev/null @@ -1,18 +0,0 @@ -# Databricks notebook source -lookup_file_name = dbutils.widgets.get("lookup_file_name") - -# COMMAND ---------- - -import json -from datetime import datetime, timedelta - -indexes = range(0, 10) -start_date = datetime.today() -data = [ - {"date": (start_date + timedelta(days=index)).strftime("%Y-%m-%d")} - for index in indexes -] -dbutils.fs.put(lookup_file_name, json.dumps(data), overwrite=True) -dbutils.jobs.taskValues.set("indexes", list(indexes)) - -# COMMAND ---------- diff --git a/knowledge_base/pydabs_job_with_for_each/src/notebook_process_item.py b/knowledge_base/pydabs_job_with_for_each/src/notebook_process_item.py deleted file mode 100644 index ba439a8c..00000000 --- a/knowledge_base/pydabs_job_with_for_each/src/notebook_process_item.py +++ /dev/null @@ -1,15 +0,0 @@ -# Databricks notebook source -lookup_file_name = dbutils.widgets.get("lookup_file_name") -index = int(dbutils.widgets.get("index")) - -# COMMAND ---------- - -import json - -with open(lookup_file_name, "r") as f: - data = json.load(f) -date = data[index].get("date") - -print(date) - -# COMMAND ---------- diff --git a/knowledge_base/pydabs_job_with_for_each/src/pydabs_job_with_for_each/__init__.py b/knowledge_base/pydabs_job_with_for_each/src/pydabs_job_with_for_each/__init__.py 
deleted file mode 100644 index e69de29b..00000000 diff --git a/knowledge_base/pydabs_job_with_for_each/src/pydabs_job_with_for_each/main.py b/knowledge_base/pydabs_job_with_for_each/src/pydabs_job_with_for_each/main.py deleted file mode 100644 index cd9ac480..00000000 --- a/knowledge_base/pydabs_job_with_for_each/src/pydabs_job_with_for_each/main.py +++ /dev/null @@ -1,6 +0,0 @@ -def main(): - pass - - -if __name__ == "__main__": - main() From c4cddde20e910ac3d5a4d9c7f8465a2c83fd0a2b Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Fri, 13 Feb 2026 16:22:13 +0100 Subject: [PATCH 15/29] add job backfill example --- .../pydabs_job_backfill_data/README.md | 67 +++++++++++++++++++ .../pydabs_job_backfill_data/databricks.yml | 21 ++++++ .../pydabs_job_backfill_data/pyproject.toml | 26 +++++++ .../resources/__init__.py | 16 +++++ .../resources/backfill_data.py | 24 +++++++ .../pydabs_job_backfill_data/src/my_query.sql | 5 ++ 6 files changed, 159 insertions(+) create mode 100644 knowledge_base/pydabs_job_backfill_data/README.md create mode 100644 knowledge_base/pydabs_job_backfill_data/databricks.yml create mode 100644 knowledge_base/pydabs_job_backfill_data/pyproject.toml create mode 100644 knowledge_base/pydabs_job_backfill_data/resources/__init__.py create mode 100644 knowledge_base/pydabs_job_backfill_data/resources/backfill_data.py create mode 100644 knowledge_base/pydabs_job_backfill_data/src/my_query.sql diff --git a/knowledge_base/pydabs_job_backfill_data/README.md b/knowledge_base/pydabs_job_backfill_data/README.md new file mode 100644 index 00000000..0bdbf89e --- /dev/null +++ b/knowledge_base/pydabs_job_backfill_data/README.md @@ -0,0 +1,67 @@ +# pydabs_job_backfill_data + +This example demonstrates a Databricks Asset Bundle (DABs) Job that runs a SQL task with a date parameter for backfilling data. + +The Job consists of: + +1. **run_daily_sql** — A SQL task that runs `src/my_query.sql` with a `run_date` job parameter. 
The query inserts data from a source table into a target table filtered by `event_date = run_date`, so you can backfill or reprocess specific dates. + +* `src/`: SQL and notebook source code for this project. + * `src/my_query.sql`: Daily insert query that uses the `:run_date` parameter to filter by event date. +* `resources/`: Resource configurations (jobs, pipelines, etc.) + * `resources/backfill_data.py`: PyDABs job definition with a parameterized SQL task. + +## Job parameters + +| Parameter | Default | Description | +|------------|-------------|--------------------------------------| +| `run_date` | `2024-01-01` | Date used to filter data (e.g. `event_date`). | + +Before deploying, set `warehouse_id` in `resources/backfill_data.py` to your SQL warehouse ID, and adjust the catalog/schema/table names in `src/my_query.sql` to match your environment. + +## Getting started + +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +If you're developing with an IDE, dependencies for this project should be installed using uv: + +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. + +## Using this project with the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. You can also use the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` + +2. 
To deploy a development copy of this project, run: + ``` + $ databricks bundle deploy --target dev + ``` + (Note: "dev" is the default target, so `--target` is optional.) + + This deploys everything defined for this project, including the job + `[dev yourname] sql_backfill_example`. You can find it under **Workflows** (or **Jobs & Pipelines**) in your workspace. + +3. To run the job with the default `run_date`: + ``` + $ databricks bundle run sql_backfill_example + ``` + +4. To run the job for a specific date (e.g. backfill): + ``` + $ databricks bundle run sql_backfill_example --parameters run_date=2024-02-01 + ``` diff --git a/knowledge_base/pydabs_job_backfill_data/databricks.yml b/knowledge_base/pydabs_job_backfill_data/databricks.yml new file mode 100644 index 00000000..55aa0938 --- /dev/null +++ b/knowledge_base/pydabs_job_backfill_data/databricks.yml @@ -0,0 +1,21 @@ +# This is a Databricks asset bundle definition for pydabs_job_backfill_data. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: pydabs_job_backfill_data + +python: + venv_path: .venv + # Functions called to load resources defined in Python. 
See resources/__init__.py + resources: + - "resources:load_resources" + +include: + - resources/*.yml + - resources/*/*.yml + +targets: + dev: + mode: development + default: true + workspace: + host: https://myworkspace.databricks.com \ No newline at end of file diff --git a/knowledge_base/pydabs_job_backfill_data/pyproject.toml b/knowledge_base/pydabs_job_backfill_data/pyproject.toml new file mode 100644 index 00000000..15f1f3e0 --- /dev/null +++ b/knowledge_base/pydabs_job_backfill_data/pyproject.toml @@ -0,0 +1,26 @@ +[project] +name = "pydabs_job_backfill_data" +version = "0.0.1" +authors = [{ name = "Databricks Field Engineering" }] +requires-python = ">=3.10,<=3.13" +dependencies = [ + # Any dependencies for jobs and pipelines in this project can be added here + # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies + # + # LIMITATION: for pipelines, dependencies are cached during development; + # add dependencies to the 'environment' section of pipeline.yml file instead +] + +[dependency-groups] +dev = [ + "pytest", + "databricks-connect>=15.4,<15.5", + "databricks-bundles==0.275.0", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.black] +line-length = 125 diff --git a/knowledge_base/pydabs_job_backfill_data/resources/__init__.py b/knowledge_base/pydabs_job_backfill_data/resources/__init__.py new file mode 100644 index 00000000..fbcb9dc5 --- /dev/null +++ b/knowledge_base/pydabs_job_backfill_data/resources/__init__.py @@ -0,0 +1,16 @@ +from databricks.bundles.core import ( + Bundle, + Resources, + load_resources_from_current_package_module, +) + + +def load_resources(bundle: Bundle) -> Resources: + """ + 'load_resources' function is referenced in databricks.yml and is responsible for loading + bundle resources defined in Python code. This function is called by Databricks CLI during + bundle deployment. After deployment, this function is not used. 
+ """ + + # the default implementation loads all Python files in 'resources' directory + return load_resources_from_current_package_module() diff --git a/knowledge_base/pydabs_job_backfill_data/resources/backfill_data.py b/knowledge_base/pydabs_job_backfill_data/resources/backfill_data.py new file mode 100644 index 00000000..5a8a2b88 --- /dev/null +++ b/knowledge_base/pydabs_job_backfill_data/resources/backfill_data.py @@ -0,0 +1,24 @@ +from databricks.bundles.jobs import ( + Job, + Task, + SqlTask, + SqlTaskFile, + JobParameterDefinition, +) + +run_daily_sql = Task( + task_key="run_daily_sql", + sql_task=SqlTask( + warehouse_id="", + file=SqlTaskFile(path="src/my_query.sql"), + parameters={"run_date": "{{job.parameters.run_date}}"}, + ), +) + +sql_backfill_example = Job( + name="sql_backfill_example", + tasks=[run_daily_sql], + parameters=[ + JobParameterDefinition(name="run_date", default="2024-01-01"), + ], +) diff --git a/knowledge_base/pydabs_job_backfill_data/src/my_query.sql b/knowledge_base/pydabs_job_backfill_data/src/my_query.sql new file mode 100644 index 00000000..a71393e4 --- /dev/null +++ b/knowledge_base/pydabs_job_backfill_data/src/my_query.sql @@ -0,0 +1,5 @@ +-- referenced by sql_task +INSERT INTO catalog.schema.target_table +SELECT * +FROM catalog.schema.source_table +WHERE event_date = date(:run_date); \ No newline at end of file From 285ffdeedd2c08dc905f0080d82579735d472e69 Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Sun, 15 Feb 2026 18:13:16 +0100 Subject: [PATCH 16/29] second last review --- .../pydabs_job_with_for_each/databricks.yml | 1 - .../resources/for_each_simple.py | 14 +++-------- .../src/foreach/generate_items.ipynb | 24 ------------------ .../src/foreach/generate_items.py | 5 ++++ .../src/foreach/process_item.ipynb | 25 ------------------- .../src/foreach/process_item.py | 8 ++++++ .../databricks.yml | 1 - .../src/xcom/consumer.py | 1 + .../src/xcom/producer.py | 1 + 9 files changed, 18 insertions(+), 62 deletions(-) delete 
mode 100644 knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.ipynb create mode 100644 knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.py delete mode 100644 knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.ipynb create mode 100644 knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.py diff --git a/knowledge_base/pydabs_job_with_for_each/databricks.yml b/knowledge_base/pydabs_job_with_for_each/databricks.yml index 690fc324..e5d5f882 100644 --- a/knowledge_base/pydabs_job_with_for_each/databricks.yml +++ b/knowledge_base/pydabs_job_with_for_each/databricks.yml @@ -2,7 +2,6 @@ # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. bundle: name: pydabs_job_with_foreach - uuid: 3874a19c-7ea5-401d-bca2-9bd1f9d3efbf python: venv_path: .venv diff --git a/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py b/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py index bc37d509..759c5d95 100644 --- a/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py +++ b/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py @@ -1,16 +1,8 @@ -from databricks.bundles.jobs import ( - Job, - Task, - NotebookTask, - ForEachTask, - TaskDependency, - JobEnvironment, - Environment, -) +from databricks.bundles.jobs import Job, Task, NotebookTask, ForEachTask, TaskDependency generate_items = Task( task_key="generate_items", - notebook_task=NotebookTask(notebook_path="src/foreach/generate_items.ipynb"), + notebook_task=NotebookTask(notebook_path="src/foreach/generate_items.py"), ) process_item = Task( @@ -20,7 +12,7 @@ task=Task( task_key="process_item_iteration", notebook_task=NotebookTask( - notebook_path="src/foreach/process_item.ipynb", + notebook_path="src/foreach/process_item.py", base_parameters={"item": "{{input}}"}, ), ), diff --git a/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.ipynb 
b/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.ipynb deleted file mode 100644 index 669f7149..00000000 --- a/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.ipynb +++ /dev/null @@ -1,24 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "e4c15846", - "metadata": {}, - "outputs": [], - "source": [ - "from databricks.sdk.runtime import dbutils\n", - "\n", - "items = [1, 2, 3]\n", - "dbutils.jobs.taskValues.set(key=\"items\", value=items)" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.py b/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.py new file mode 100644 index 00000000..32443485 --- /dev/null +++ b/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.py @@ -0,0 +1,5 @@ +# Databricks notebook source +from databricks.sdk.runtime import dbutils + +items = [1, 2, 3] +dbutils.jobs.taskValues.set(key="items", value=items) diff --git a/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.ipynb b/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.ipynb deleted file mode 100644 index 4127a086..00000000 --- a/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.ipynb +++ /dev/null @@ -1,25 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Runs once per item in the for-each. 
Do not call dbutils.jobs.taskValues.set() here.\n", - "from databricks.sdk.runtime import dbutils\n", - "\n", - "# Current iteration value passed from the for-each task (base_parameters: item = {{input}})\n", - "current_item = dbutils.widgets.get(\"item\")\n", - "print(f\"Processing item: {current_item}\")" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.py b/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.py new file mode 100644 index 00000000..1481a890 --- /dev/null +++ b/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.py @@ -0,0 +1,8 @@ +# Databricks notebook source + +# Runs once per item in the for-each. Do not call dbutils.jobs.taskValues.set() here. +from databricks.sdk.runtime import dbutils + +# Current iteration value passed from the for-each task (base_parameters: item = {{input}}) +current_item = dbutils.widgets.get("item") +print(f"Processing item: {current_item}") diff --git a/knowledge_base/pydabs_job_with_task_values/databricks.yml b/knowledge_base/pydabs_job_with_task_values/databricks.yml index 82e52a5b..fe2f64b4 100644 --- a/knowledge_base/pydabs_job_with_task_values/databricks.yml +++ b/knowledge_base/pydabs_job_with_task_values/databricks.yml @@ -2,7 +2,6 @@ # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 
bundle: name: pydabs_job_with_task_values - uuid: 3874a19c-7ea5-401d-bca2-9bd1f9d3efbf python: venv_path: .venv diff --git a/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py b/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py index 89a27a68..614f4f3e 100644 --- a/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py +++ b/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py @@ -1,3 +1,4 @@ +# Databricks notebook source from databricks.sdk.runtime import dbutils val = dbutils.jobs.taskValues.get(taskKey="producer", key="answer", debugValue=None) diff --git a/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py b/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py index 5d56be9e..429bf99f 100644 --- a/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py +++ b/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py @@ -1,3 +1,4 @@ +# Databricks notebook source from databricks.sdk.runtime import dbutils value = 42 From 8caf45df2e9210e7716cd542a3cd09b7facd0535 Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Mon, 16 Feb 2026 11:33:16 +0100 Subject: [PATCH 17/29] update job file arrival --- .../pydabs_job_file_arrival/README.md | 2 +- .../resources/file_arrival.py | 16 +++++----- .../src/files/process_files.ipynb | 30 +++++++++++++++++++ .../src/process_files.py | 5 ---- 4 files changed, 40 insertions(+), 13 deletions(-) create mode 100644 knowledge_base/pydabs_job_file_arrival/src/files/process_files.ipynb delete mode 100644 knowledge_base/pydabs_job_file_arrival/src/process_files.py diff --git a/knowledge_base/pydabs_job_file_arrival/README.md b/knowledge_base/pydabs_job_file_arrival/README.md index f5bace77..09d63991 100644 --- a/knowledge_base/pydabs_job_file_arrival/README.md +++ b/knowledge_base/pydabs_job_file_arrival/README.md @@ -10,7 +10,7 @@ The Lakeflow Job is configured with: - **Automatic processing**: When files are detected, the job automatically runs and 
processes them * `src/`: Notebook source code for this project. - * `src/process_files.py`: Processes newly arrived files from the volume path. + * `src/files/process_files.ipynb`: Processes newly arrived files from the volume path. * `resources/`: Resource configurations (jobs, pipelines, etc.) * `resources/file_arrival.py`: PyDABs job with file arrival trigger configuration. diff --git a/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py b/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py index 00f61535..46ce92c6 100644 --- a/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py +++ b/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py @@ -1,16 +1,18 @@ -from databricks.bundles.jobs import FileArrivalTriggerConfiguration, Job, Task, NotebookTask +from databricks.bundles.jobs import FileArrivalTriggerConfiguration, Job, Task, TriggerSettings, NotebookTask process_files = Task( task_key="process_files", - notebook_task=NotebookTask(notebook_path="src/files/process_files.py"), + notebook_task=NotebookTask(notebook_path="src/files/process_files.ipynb"), ) job = Job( name="file_arrival_example", - trigger=FileArrivalTriggerConfiguration( - url="Volumes/main.raw.incoming", # UC volume or external location - min_time_between_triggers_seconds=60, - wait_after_last_change_seconds=90, + trigger=TriggerSettings( + file_arrival=FileArrivalTriggerConfiguration( + url="/Volumes/main/raw/incoming/", # UC volume or external location + min_time_between_triggers_seconds=60, + wait_after_last_change_seconds=90, + ) ), tasks=[process_files], -) +) \ No newline at end of file diff --git a/knowledge_base/pydabs_job_file_arrival/src/files/process_files.ipynb b/knowledge_base/pydabs_job_file_arrival/src/files/process_files.ipynb new file mode 100644 index 00000000..4e46e9ca --- /dev/null +++ b/knowledge_base/pydabs_job_file_arrival/src/files/process_files.ipynb @@ -0,0 +1,30 @@ +{ + "cells": [ + { + "cell_type": "code", + 
"execution_count": null, + "id": "84bc51ae", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql import functions as F\n", + "\n", + "df = spark.readStream.format(\"cloudFiles\") \\\n", + " .option(\"cloudFiles.format\", \"csv\") \\\n", + " .option(\"cloudFiles.schemaLocation\", \"/tmp/autoloader/_checkpoint/my_stream\") \\\n", + " .load(\"/Volumes/main/raw/incoming\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/knowledge_base/pydabs_job_file_arrival/src/process_files.py b/knowledge_base/pydabs_job_file_arrival/src/process_files.py deleted file mode 100644 index 929d34b9..00000000 --- a/knowledge_base/pydabs_job_file_arrival/src/process_files.py +++ /dev/null @@ -1,5 +0,0 @@ -from pyspark.sql import functions as F - -df = spark.read.format("cloudFiles") \ - .option("cloudFiles.format", "csv") \ - .load("/Volumes/main/raw/incoming") \ No newline at end of file From 19a9ef57d7e93df99d1ae5797c58488b96b04e12 Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Mon, 16 Feb 2026 11:42:03 +0100 Subject: [PATCH 18/29] update the table update trigger example --- .../resources/table_update.py | 10 ++++--- .../src/assets/consume_table.ipynb | 29 +++++++++++++++++++ .../src/assets/consume_table.py | 5 ---- 3 files changed, 35 insertions(+), 9 deletions(-) create mode 100644 knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.ipynb delete mode 100644 knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py diff --git a/knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py b/knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py index 3a6e13ff..ed310ff7 100644 --- a/knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py +++ b/knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py @@ -1,16 +1,18 @@ -from 
databricks.bundles.jobs import Job, NotebookTask, Task, TableUpdateTriggerConfiguration +from databricks.bundles.jobs import Job, NotebookTask, Task, TriggerSettings, TableUpdateTriggerConfiguration consume_table = Task( task_key="consume_table", - notebook_task=NotebookTask(notebook_path="src/assets/consume_table.py"), + notebook_task=NotebookTask(notebook_path="src/assets/consume_table.ipynb"), ) job = Job( name="table_update_example", - trigger=TableUpdateTriggerConfiguration( + trigger=TriggerSettings( + table_update=TableUpdateTriggerConfiguration( table_names=["main.analytics.daily_events"], min_time_between_triggers_seconds=0, wait_after_last_change_seconds=3600, + ) ), tasks=[consume_table], -) +) \ No newline at end of file diff --git a/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.ipynb b/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.ipynb new file mode 100644 index 00000000..777b28cb --- /dev/null +++ b/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.ipynb @@ -0,0 +1,29 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e459f2ca", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql import functions as F\n", + "\n", + "source_table = \"main.analytics.daily_events\"\n", + "# Insert consumer logic here\n", + "df = spark.read.table(source_table)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py b/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py deleted file mode 100644 index 92ebdd80..00000000 --- a/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py +++ /dev/null @@ -1,5 +0,0 @@ -from pyspark.sql import functions as F - -source_table = "main.analytics.daily_events" -# Insert consumer 
logic here -df = spark.read.table(source_table) From 8e2e90791445cf1c022dd9ca22c14e261324b952 Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Mon, 16 Feb 2026 11:42:41 +0100 Subject: [PATCH 19/29] quick fix: typo in read me --- knowledge_base/pydabs_job_table_update_trigger/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/knowledge_base/pydabs_job_table_update_trigger/README.md b/knowledge_base/pydabs_job_table_update_trigger/README.md index d0061a6a..5092889a 100644 --- a/knowledge_base/pydabs_job_table_update_trigger/README.md +++ b/knowledge_base/pydabs_job_table_update_trigger/README.md @@ -11,7 +11,7 @@ The Lakeflow Job is configured with: - **Automatic processing**: When updates are detected, the job automatically runs and processes them * `src/`: Notebook source code for this project. - * `src/assets/consume_table.py`: + * `src/assets/consume_table.ipynb`: * `resources/`: Resource configurations (jobs, pipelines, etc.) * `resources/table_update.py`: PyDABs job with table update trigger configuration. 
From 3bca825fd5570bea5cf651fff4de92124ff4cf3f Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Mon, 16 Feb 2026 11:50:52 +0100 Subject: [PATCH 20/29] update task values example --- .../resources/task_values_simple.py | 4 +-- .../src/xcom/consumer.ipynb | 28 ++++++++++++++++++ .../src/xcom/consumer.py | 4 --- .../src/xcom/producer.ipynb | 29 +++++++++++++++++++ .../src/xcom/producer.py | 5 ---- 5 files changed, 59 insertions(+), 11 deletions(-) create mode 100644 knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.ipynb delete mode 100644 knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py create mode 100644 knowledge_base/pydabs_job_with_task_values/src/xcom/producer.ipynb delete mode 100644 knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py diff --git a/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py b/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py index 61248604..1f562ca8 100644 --- a/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py +++ b/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py @@ -2,13 +2,13 @@ producer = Task( task_key="producer", - notebook_task=NotebookTask(notebook_path="src/xcom/producer.py"), + notebook_task=NotebookTask(notebook_path="src/xcom/producer.ipynb"), ) consumer = Task( task_key="consumer", depends_on=[TaskDependency(task_key="producer")], - notebook_task=NotebookTask(notebook_path="src/xcom/consumer.py"), + notebook_task=NotebookTask(notebook_path="src/xcom/consumer.ipynb"), ) job = Job( diff --git a/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.ipynb b/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.ipynb new file mode 100644 index 00000000..02ccc642 --- /dev/null +++ b/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.ipynb @@ -0,0 +1,28 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "2f1b0352", + "metadata": { + 
"vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "from databricks.sdk.runtime import dbutils\n", + "\n", + "val = dbutils.jobs.taskValues.get(taskKey=\"producer\", key=\"answer\", debugValue=None)\n", + "print(f\"Got value: {val}\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py b/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py deleted file mode 100644 index 89a27a68..00000000 --- a/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py +++ /dev/null @@ -1,4 +0,0 @@ -from databricks.sdk.runtime import dbutils - -val = dbutils.jobs.taskValues.get(taskKey="producer", key="answer", debugValue=None) -print(f"Got value: {val}") \ No newline at end of file diff --git a/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.ipynb b/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.ipynb new file mode 100644 index 00000000..771092ea --- /dev/null +++ b/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.ipynb @@ -0,0 +1,29 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "461d9f21", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "from databricks.sdk.runtime import dbutils\n", + "\n", + "value = 42\n", + "dbutils.jobs.taskValues.set(key=\"answer\", value=value)\n", + "print(f\"Produced value: {value}\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py b/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py deleted file mode 100644 index 5d56be9e..00000000 --- a/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py +++ /dev/null @@ -1,5 +0,0 @@ -from databricks.sdk.runtime import dbutils - 
-value = 42 -dbutils.jobs.taskValues.set(key="answer", value=value) -print(f"Produced value: {value}") \ No newline at end of file From 47e1d1dfd0c51b6997f444680d31b3041d719a8b Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Mon, 16 Feb 2026 14:45:13 +0100 Subject: [PATCH 21/29] change ipynb to py scripts --- .../resources/conditional_execution.py | 6 ++-- .../src/check_quality.ipynb | 28 ------------------- .../src/check_quality.py | 5 ++++ .../src/fix_path.ipynb | 25 ----------------- .../src/fix_path.py | 2 ++ .../src/skip_path.ipynb | 25 ----------------- .../src/skip_path.py | 2 ++ 7 files changed, 12 insertions(+), 81 deletions(-) delete mode 100644 knowledge_base/pydabs_job_conditional_execution/src/check_quality.ipynb create mode 100644 knowledge_base/pydabs_job_conditional_execution/src/check_quality.py delete mode 100644 knowledge_base/pydabs_job_conditional_execution/src/fix_path.ipynb create mode 100644 knowledge_base/pydabs_job_conditional_execution/src/fix_path.py delete mode 100644 knowledge_base/pydabs_job_conditional_execution/src/skip_path.ipynb create mode 100644 knowledge_base/pydabs_job_conditional_execution/src/skip_path.py diff --git a/knowledge_base/pydabs_job_conditional_execution/resources/conditional_execution.py b/knowledge_base/pydabs_job_conditional_execution/resources/conditional_execution.py index fa9041ef..4598fe31 100644 --- a/knowledge_base/pydabs_job_conditional_execution/resources/conditional_execution.py +++ b/knowledge_base/pydabs_job_conditional_execution/resources/conditional_execution.py @@ -10,7 +10,7 @@ # 1) Producer task: runs a notebook and emits a task value check_quality = Task( task_key="check_quality", - notebook_task=NotebookTask(notebook_path="src/branch/check_quality.ipynb"), + notebook_task=NotebookTask(notebook_path="src/branch/check_quality.py"), ) # 2) Branch task: evaluates an expression using an upstream task value @@ -27,13 +27,13 @@ # 3) Downstream tasks: gated on the condition outcome fix_path = Task( 
task_key="fix_path", - notebook_task=NotebookTask(notebook_path="src/branch/fix_path.ipynb"), + notebook_task=NotebookTask(notebook_path="src/branch/fix_path.py"), depends_on=[TaskDependency(task_key="branch", outcome="true")], ) skip_path = Task( task_key="skip_path", - notebook_task=NotebookTask(notebook_path="src/branch/skip_path.ipynb"), + notebook_task=NotebookTask(notebook_path="src/branch/skip_path.py"), depends_on=[TaskDependency(task_key="branch", outcome="false")], ) diff --git a/knowledge_base/pydabs_job_conditional_execution/src/check_quality.ipynb b/knowledge_base/pydabs_job_conditional_execution/src/check_quality.ipynb deleted file mode 100644 index d3cf1860..00000000 --- a/knowledge_base/pydabs_job_conditional_execution/src/check_quality.ipynb +++ /dev/null @@ -1,28 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "cdf58588", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [ - "from databricks.sdk.runtime import dbutils\n", - "\n", - "bad_records = 123 # result of a data quality check\n", - "dbutils.jobs.taskValues.set(key=\"bad_records\", value=bad_records)" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/knowledge_base/pydabs_job_conditional_execution/src/check_quality.py b/knowledge_base/pydabs_job_conditional_execution/src/check_quality.py new file mode 100644 index 00000000..2eb30a96 --- /dev/null +++ b/knowledge_base/pydabs_job_conditional_execution/src/check_quality.py @@ -0,0 +1,5 @@ +# Databricks notebook source +from databricks.sdk.runtime import dbutils + +bad_records = 123 # result of a data quality check +dbutils.jobs.taskValues.set(key="bad_records", value=bad_records) \ No newline at end of file diff --git a/knowledge_base/pydabs_job_conditional_execution/src/fix_path.ipynb b/knowledge_base/pydabs_job_conditional_execution/src/fix_path.ipynb deleted file mode 100644 
index 66e15fa2..00000000 --- a/knowledge_base/pydabs_job_conditional_execution/src/fix_path.ipynb +++ /dev/null @@ -1,25 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "2ed3f14d", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [ - "print(\"Processing Files...\")" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/knowledge_base/pydabs_job_conditional_execution/src/fix_path.py b/knowledge_base/pydabs_job_conditional_execution/src/fix_path.py new file mode 100644 index 00000000..d30767b3 --- /dev/null +++ b/knowledge_base/pydabs_job_conditional_execution/src/fix_path.py @@ -0,0 +1,2 @@ +# Databricks notebook source +print("Processing Files...") \ No newline at end of file diff --git a/knowledge_base/pydabs_job_conditional_execution/src/skip_path.ipynb b/knowledge_base/pydabs_job_conditional_execution/src/skip_path.ipynb deleted file mode 100644 index 66e15fa2..00000000 --- a/knowledge_base/pydabs_job_conditional_execution/src/skip_path.ipynb +++ /dev/null @@ -1,25 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "2ed3f14d", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [ - "print(\"Processing Files...\")" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/knowledge_base/pydabs_job_conditional_execution/src/skip_path.py b/knowledge_base/pydabs_job_conditional_execution/src/skip_path.py new file mode 100644 index 00000000..82ffb5d4 --- /dev/null +++ b/knowledge_base/pydabs_job_conditional_execution/src/skip_path.py @@ -0,0 +1,2 @@ +# Databricks notebook source +print("Skipping Files...") \ No newline at end of file From 34a8b85091eb1cfc9fa4b7f29279800659b87c02 Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Mon, 16 Feb 2026 14:48:21 
+0100 Subject: [PATCH 22/29] change ipynb to py scripts --- .../resources/task_values_simple.py | 4 +-- .../src/xcom/consumer.ipynb | 28 ------------------ .../src/xcom/consumer.py | 5 ++++ .../src/xcom/producer.ipynb | 29 ------------------- .../src/xcom/producer.py | 6 ++++ 5 files changed, 13 insertions(+), 59 deletions(-) delete mode 100644 knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.ipynb create mode 100644 knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py delete mode 100644 knowledge_base/pydabs_job_with_task_values/src/xcom/producer.ipynb create mode 100644 knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py diff --git a/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py b/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py index 1f562ca8..61248604 100644 --- a/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py +++ b/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py @@ -2,13 +2,13 @@ producer = Task( task_key="producer", - notebook_task=NotebookTask(notebook_path="src/xcom/producer.ipynb"), + notebook_task=NotebookTask(notebook_path="src/xcom/producer.py"), ) consumer = Task( task_key="consumer", depends_on=[TaskDependency(task_key="producer")], - notebook_task=NotebookTask(notebook_path="src/xcom/consumer.ipynb"), + notebook_task=NotebookTask(notebook_path="src/xcom/consumer.py"), ) job = Job( diff --git a/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.ipynb b/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.ipynb deleted file mode 100644 index 02ccc642..00000000 --- a/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.ipynb +++ /dev/null @@ -1,28 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "2f1b0352", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [ - "from databricks.sdk.runtime import 
dbutils\n", - "\n", - "val = dbutils.jobs.taskValues.get(taskKey=\"producer\", key=\"answer\", debugValue=None)\n", - "print(f\"Got value: {val}\")" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py b/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py new file mode 100644 index 00000000..614f4f3e --- /dev/null +++ b/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py @@ -0,0 +1,5 @@ +# Databricks notebook source +from databricks.sdk.runtime import dbutils + +val = dbutils.jobs.taskValues.get(taskKey="producer", key="answer", debugValue=None) +print(f"Got value: {val}") \ No newline at end of file diff --git a/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.ipynb b/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.ipynb deleted file mode 100644 index 771092ea..00000000 --- a/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.ipynb +++ /dev/null @@ -1,29 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "461d9f21", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [ - "from databricks.sdk.runtime import dbutils\n", - "\n", - "value = 42\n", - "dbutils.jobs.taskValues.set(key=\"answer\", value=value)\n", - "print(f\"Produced value: {value}\")" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py b/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py new file mode 100644 index 00000000..429bf99f --- /dev/null +++ b/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py @@ -0,0 +1,6 @@ +# Databricks notebook source +from databricks.sdk.runtime import dbutils + +value = 42 +dbutils.jobs.taskValues.set(key="answer", value=value) 
+print(f"Produced value: {value}") \ No newline at end of file From 51e25ac2e4fce27158581c3c55bfafa23579ef92 Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Mon, 16 Feb 2026 14:50:45 +0100 Subject: [PATCH 23/29] change ipynb to py scripts file arrival --- .../resources/file_arrival.py | 2 +- .../src/files/process_files.ipynb | 30 ------------------- .../src/files/process_files.py | 7 +++++ 3 files changed, 8 insertions(+), 31 deletions(-) delete mode 100644 knowledge_base/pydabs_job_file_arrival/src/files/process_files.ipynb create mode 100644 knowledge_base/pydabs_job_file_arrival/src/files/process_files.py diff --git a/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py b/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py index 46ce92c6..5397c859 100644 --- a/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py +++ b/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py @@ -2,7 +2,7 @@ process_files = Task( task_key="process_files", - notebook_task=NotebookTask(notebook_path="src/files/process_files.ipynb"), + notebook_task=NotebookTask(notebook_path="src/files/process_files.py"), ) job = Job( diff --git a/knowledge_base/pydabs_job_file_arrival/src/files/process_files.ipynb b/knowledge_base/pydabs_job_file_arrival/src/files/process_files.ipynb deleted file mode 100644 index 4e46e9ca..00000000 --- a/knowledge_base/pydabs_job_file_arrival/src/files/process_files.ipynb +++ /dev/null @@ -1,30 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "84bc51ae", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [ - "from pyspark.sql import functions as F\n", - "\n", - "df = spark.readStream.format(\"cloudFiles\") \\\n", - " .option(\"cloudFiles.format\", \"csv\") \\\n", - " .option(\"cloudFiles.schemaLocation\", \"/tmp/autoloader/_checkpoint/my_stream\") \\\n", - " .load(\"/Volumes/main/raw/incoming\")" - ] - } - ], - "metadata": { - 
"language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/knowledge_base/pydabs_job_file_arrival/src/files/process_files.py b/knowledge_base/pydabs_job_file_arrival/src/files/process_files.py new file mode 100644 index 00000000..fd9203ad --- /dev/null +++ b/knowledge_base/pydabs_job_file_arrival/src/files/process_files.py @@ -0,0 +1,7 @@ +# Databricks notebook source +from pyspark.sql import functions as F + +df = spark.readStream.format("cloudFiles") \ + .option("cloudFiles.format", "csv") \ + .option("cloudFiles.schemaLocation", "/tmp/autoloader/_checkpoint/my_stream") \ + .load("/Volumes/main/raw/incoming") \ No newline at end of file From 1584bfe7e6bad554c4c6ce2b0cf013386ae2045b Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Mon, 16 Feb 2026 14:51:38 +0100 Subject: [PATCH 24/29] change ipynb to py scripts table update --- .../resources/table_update.py | 2 +- .../src/assets/consume_table.ipynb | 29 ------------------- .../src/assets/consume_table.py | 6 ++++ 3 files changed, 7 insertions(+), 30 deletions(-) delete mode 100644 knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.ipynb create mode 100644 knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py diff --git a/knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py b/knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py index ed310ff7..353feda3 100644 --- a/knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py +++ b/knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py @@ -2,7 +2,7 @@ consume_table = Task( task_key="consume_table", - notebook_task=NotebookTask(notebook_path="src/assets/consume_table.ipynb"), + notebook_task=NotebookTask(notebook_path="src/assets/consume_table.py"), ) job = Job( diff --git a/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.ipynb 
b/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.ipynb deleted file mode 100644 index 777b28cb..00000000 --- a/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.ipynb +++ /dev/null @@ -1,29 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "e459f2ca", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [ - "from pyspark.sql import functions as F\n", - "\n", - "source_table = \"main.analytics.daily_events\"\n", - "# Insert consumer logic here\n", - "df = spark.read.table(source_table)" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py b/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py new file mode 100644 index 00000000..4cccdce6 --- /dev/null +++ b/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py @@ -0,0 +1,6 @@ +# Databricks notebook source +from pyspark.sql import functions as F + +source_table = "main.analytics.daily_events" +# Insert consumer logic here +df = spark.read.table(source_table) \ No newline at end of file From 3af5209a84cbf8badaa56372c07a57dcc7ff44ea Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Mon, 16 Feb 2026 14:54:36 +0100 Subject: [PATCH 25/29] change ipynb to py scripts for each --- .../resources/for_each_simple.py | 6 ++--- .../src/foreach/generate_items.ipynb | 24 ------------------ .../src/foreach/generate_items.py | 5 ++++ .../src/foreach/process_item.ipynb | 25 ------------------- .../src/foreach/process_item.py | 6 +++++ 5 files changed, 14 insertions(+), 52 deletions(-) delete mode 100644 knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.ipynb create mode 100644 knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.py delete mode 100644 
knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.ipynb create mode 100644 knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.py diff --git a/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py b/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py index bc37d509..529170e9 100644 --- a/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py +++ b/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py @@ -10,7 +10,7 @@ generate_items = Task( task_key="generate_items", - notebook_task=NotebookTask(notebook_path="src/foreach/generate_items.ipynb"), + notebook_task=NotebookTask(notebook_path="src/foreach/generate_items.py"), ) process_item = Task( @@ -20,7 +20,7 @@ task=Task( task_key="process_item_iteration", notebook_task=NotebookTask( - notebook_path="src/foreach/process_item.ipynb", + notebook_path="src/foreach/process_item.py", base_parameters={"item": "{{input}}"}, ), ), @@ -29,7 +29,7 @@ depends_on=[TaskDependency(task_key="generate_items")], ) -for_each_task_example = Job( +job = Job( name="for_each_task_example", tasks=[generate_items, process_item], ) \ No newline at end of file diff --git a/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.ipynb b/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.ipynb deleted file mode 100644 index 669f7149..00000000 --- a/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.ipynb +++ /dev/null @@ -1,24 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "e4c15846", - "metadata": {}, - "outputs": [], - "source": [ - "from databricks.sdk.runtime import dbutils\n", - "\n", - "items = [1, 2, 3]\n", - "dbutils.jobs.taskValues.set(key=\"items\", value=items)" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.py 
b/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.py new file mode 100644 index 00000000..af49e40a --- /dev/null +++ b/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.py @@ -0,0 +1,5 @@ +# Databricks notebook source +from databricks.sdk.runtime import dbutils + +items = [1, 2, 3] +dbutils.jobs.taskValues.set(key="items", value=items) \ No newline at end of file diff --git a/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.ipynb b/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.ipynb deleted file mode 100644 index 4127a086..00000000 --- a/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.ipynb +++ /dev/null @@ -1,25 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Runs once per item in the for-each. Do not call dbutils.jobs.taskValues.set() here.\n", - "from databricks.sdk.runtime import dbutils\n", - "\n", - "# Current iteration value passed from the for-each task (base_parameters: item = {{input}})\n", - "current_item = dbutils.widgets.get(\"item\")\n", - "print(f\"Processing item: {current_item}\")" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.py b/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.py new file mode 100644 index 00000000..fe247ecf --- /dev/null +++ b/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.py @@ -0,0 +1,6 @@ +# Databricks notebook source +from databricks.sdk.runtime import dbutils + +# Current iteration value passed from the for-each task (base_parameters: item = {{input}}) +current_item = dbutils.widgets.get("item") +print(f"Processing item: {current_item}") \ No newline at end of file From 5304fbce87ed3782b5627986d0235061583f1002 Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Mon, 16 Feb 
2026 14:56:19 +0100 Subject: [PATCH 26/29] rename for consistency --- .../pydabs_job_backfill_data/resources/backfill_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/knowledge_base/pydabs_job_backfill_data/resources/backfill_data.py b/knowledge_base/pydabs_job_backfill_data/resources/backfill_data.py index 5a8a2b88..2526a12b 100644 --- a/knowledge_base/pydabs_job_backfill_data/resources/backfill_data.py +++ b/knowledge_base/pydabs_job_backfill_data/resources/backfill_data.py @@ -15,7 +15,7 @@ ), ) -sql_backfill_example = Job( +job = Job( name="sql_backfill_example", tasks=[run_daily_sql], parameters=[ From 77279176568470ed542d3701b3ae6c82c9a92566 Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Mon, 16 Feb 2026 15:03:03 +0100 Subject: [PATCH 27/29] update readme references for ipynb files --- knowledge_base/pydabs_job_conditional_execution/README.md | 6 +++--- knowledge_base/pydabs_job_file_arrival/README.md | 2 +- knowledge_base/pydabs_job_table_update_trigger/README.md | 2 +- knowledge_base/pydabs_job_with_for_each/README.md | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/knowledge_base/pydabs_job_conditional_execution/README.md b/knowledge_base/pydabs_job_conditional_execution/README.md index 8d1082b9..c73e613f 100644 --- a/knowledge_base/pydabs_job_conditional_execution/README.md +++ b/knowledge_base/pydabs_job_conditional_execution/README.md @@ -10,9 +10,9 @@ The Lakeflow Job consists of following tasks: - If bad records ≤ 100: runs `skip_path` task * `src/`: Notebook source code for this project. 
- * `src/check_quality.ipynb`: Checks data quality and outputs bad record count - * `src/fix_path.ipynb`: Handles cases with high bad record count - * `src/skip_path.ipynb`: Continues normal pipeline for good data + * `src/check_quality.py`: Checks data quality and outputs bad record count + * `src/fix_path.py`: Handles cases with high bad record count + * `src/skip_path.py`: Continues normal pipeline for good data * `resources/`: Resource configurations (jobs, pipelines, etc.) * `resources/conditional_execution.py`: PyDABs job definition with conditional tasks diff --git a/knowledge_base/pydabs_job_file_arrival/README.md b/knowledge_base/pydabs_job_file_arrival/README.md index 09d63991..da1fdc41 100644 --- a/knowledge_base/pydabs_job_file_arrival/README.md +++ b/knowledge_base/pydabs_job_file_arrival/README.md @@ -10,7 +10,7 @@ The Lakeflow Job is configured with: - **Automatic processing**: When files are detected, the job automatically runs and processes them * `src/`: Notebook source code for this project. - * `src/files/process_files.ipynb`: Processes newly arrived files from the volume path. + * `src/files/process_files.py`: Processes newly arrived files from the volume path. * `resources/`: Resource configurations (jobs, pipelines, etc.) * `resources/file_arrival.py`: PyDABs job with file arrival trigger configuration. diff --git a/knowledge_base/pydabs_job_table_update_trigger/README.md b/knowledge_base/pydabs_job_table_update_trigger/README.md index 5092889a..d0061a6a 100644 --- a/knowledge_base/pydabs_job_table_update_trigger/README.md +++ b/knowledge_base/pydabs_job_table_update_trigger/README.md @@ -11,7 +11,7 @@ The Lakeflow Job is configured with: - **Automatic processing**: When updates are detected, the job automatically runs and processes them * `src/`: Notebook source code for this project. - * `src/assets/consume_table.ipynb`: + * `src/assets/consume_table.py`: * `resources/`: Resource configurations (jobs, pipelines, etc.)
* `resources/table_update.py`: PyDABs job with table update trigger configuration. diff --git a/knowledge_base/pydabs_job_with_for_each/README.md b/knowledge_base/pydabs_job_with_for_each/README.md index 17867619..ba958dfd 100644 --- a/knowledge_base/pydabs_job_with_for_each/README.md +++ b/knowledge_base/pydabs_job_with_for_each/README.md @@ -3,8 +3,8 @@ This example demonstrates a simple Databricks job that uses a foreach task. * `src/`: Python source code for this project. - * `foreach/generate_items.ipynb`: A notebook which returns a list of items to be used for task generation. - * `foreach/process_item.ipynb`: A notebook which will process an item. + * `foreach/generate_items.py`: A notebook which returns a list of items to be used for task generation. + * `foreach/process_item.py`: A notebook which will process an item. * `resources/`: Resource configurations (jobs, pipelines, etc.) From 6231d6f0d167420347165e5ef7e92a85be7c4f95 Mon Sep 17 00:00:00 2001 From: Zanita Rahimi Date: Mon, 16 Feb 2026 16:42:33 +0100 Subject: [PATCH 28/29] ruff formatting --- .../src/check_quality.py | 2 +- .../src/fix_path.py | 2 +- .../src/skip_path.py | 2 +- .../resources/file_arrival.py | 12 +++++++++--- .../src/files/process_files.py | 10 ++++++---- .../resources/table_update.py | 16 +++++++++++----- .../src/assets/consume_table.py | 2 +- .../resources/for_each_simple.py | 2 +- .../src/foreach/generate_items.py | 2 +- .../src/foreach/process_item.py | 2 +- .../resources/task_values_simple.py | 2 +- .../src/xcom/consumer.py | 2 +- .../src/xcom/producer.py | 2 +- 13 files changed, 36 insertions(+), 22 deletions(-) diff --git a/knowledge_base/pydabs_job_conditional_execution/src/check_quality.py b/knowledge_base/pydabs_job_conditional_execution/src/check_quality.py index 2eb30a96..f583e5f3 100644 --- a/knowledge_base/pydabs_job_conditional_execution/src/check_quality.py +++ b/knowledge_base/pydabs_job_conditional_execution/src/check_quality.py @@ -2,4 +2,4 @@ from 
databricks.sdk.runtime import dbutils bad_records = 123 # result of a data quality check -dbutils.jobs.taskValues.set(key="bad_records", value=bad_records) \ No newline at end of file +dbutils.jobs.taskValues.set(key="bad_records", value=bad_records) diff --git a/knowledge_base/pydabs_job_conditional_execution/src/fix_path.py b/knowledge_base/pydabs_job_conditional_execution/src/fix_path.py index d30767b3..46c7e621 100644 --- a/knowledge_base/pydabs_job_conditional_execution/src/fix_path.py +++ b/knowledge_base/pydabs_job_conditional_execution/src/fix_path.py @@ -1,2 +1,2 @@ # Databricks notebook source -print("Processing Files...") \ No newline at end of file +print("Processing Files...") diff --git a/knowledge_base/pydabs_job_conditional_execution/src/skip_path.py b/knowledge_base/pydabs_job_conditional_execution/src/skip_path.py index 82ffb5d4..d63152dd 100644 --- a/knowledge_base/pydabs_job_conditional_execution/src/skip_path.py +++ b/knowledge_base/pydabs_job_conditional_execution/src/skip_path.py @@ -1,2 +1,2 @@ # Databricks notebook source -print("Skipping Files...") \ No newline at end of file +print("Skipping Files...") diff --git a/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py b/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py index 5397c859..c67f3923 100644 --- a/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py +++ b/knowledge_base/pydabs_job_file_arrival/resources/file_arrival.py @@ -1,4 +1,10 @@ -from databricks.bundles.jobs import FileArrivalTriggerConfiguration, Job, Task, TriggerSettings, NotebookTask +from databricks.bundles.jobs import ( + FileArrivalTriggerConfiguration, + Job, + Task, + TriggerSettings, + NotebookTask, +) process_files = Task( task_key="process_files", @@ -9,10 +15,10 @@ name="file_arrival_example", trigger=TriggerSettings( file_arrival=FileArrivalTriggerConfiguration( - url="/Volumes/main/raw/incoming/", # UC volume or external location + url="/Volumes/main/raw/incoming/", # UC 
volume or external location min_time_between_triggers_seconds=60, wait_after_last_change_seconds=90, ) ), tasks=[process_files], -) \ No newline at end of file +) diff --git a/knowledge_base/pydabs_job_file_arrival/src/files/process_files.py b/knowledge_base/pydabs_job_file_arrival/src/files/process_files.py index fd9203ad..d657db1f 100644 --- a/knowledge_base/pydabs_job_file_arrival/src/files/process_files.py +++ b/knowledge_base/pydabs_job_file_arrival/src/files/process_files.py @@ -1,7 +1,9 @@ # Databricks notebook source from pyspark.sql import functions as F -df = spark.readStream.format("cloudFiles") \ - .option("cloudFiles.format", "csv") \ - .option("cloudFiles.schemaLocation", "/tmp/autoloader/_checkpoint/my_stream") \ - .load("/Volumes/main/raw/incoming") \ No newline at end of file +df = ( + spark.readStream.format("cloudFiles") + .option("cloudFiles.format", "csv") + .option("cloudFiles.schemaLocation", "/tmp/autoloader/_checkpoint/my_stream") + .load("/Volumes/main/raw/incoming") +) diff --git a/knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py b/knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py index 353feda3..d04f77fa 100644 --- a/knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py +++ b/knowledge_base/pydabs_job_table_update_trigger/resources/table_update.py @@ -1,4 +1,10 @@ -from databricks.bundles.jobs import Job, NotebookTask, Task, TriggerSettings, TableUpdateTriggerConfiguration +from databricks.bundles.jobs import ( + Job, + NotebookTask, + Task, + TriggerSettings, + TableUpdateTriggerConfiguration, +) consume_table = Task( task_key="consume_table", @@ -9,10 +15,10 @@ name="table_update_example", trigger=TriggerSettings( table_update=TableUpdateTriggerConfiguration( - table_names=["main.analytics.daily_events"], - min_time_between_triggers_seconds=0, - wait_after_last_change_seconds=3600, + table_names=["main.analytics.daily_events"], + min_time_between_triggers_seconds=0, + 
wait_after_last_change_seconds=3600, ) ), tasks=[consume_table], -) \ No newline at end of file +) diff --git a/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py b/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py index 4cccdce6..9799c015 100644 --- a/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py +++ b/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py @@ -3,4 +3,4 @@ source_table = "main.analytics.daily_events" # Insert consumer logic here -df = spark.read.table(source_table) \ No newline at end of file +df = spark.read.table(source_table) diff --git a/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py b/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py index 529170e9..bdeef54e 100644 --- a/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py +++ b/knowledge_base/pydabs_job_with_for_each/resources/for_each_simple.py @@ -32,4 +32,4 @@ job = Job( name="for_each_task_example", tasks=[generate_items, process_item], -) \ No newline at end of file +) diff --git a/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.py b/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.py index af49e40a..32443485 100644 --- a/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.py +++ b/knowledge_base/pydabs_job_with_for_each/src/foreach/generate_items.py @@ -2,4 +2,4 @@ from databricks.sdk.runtime import dbutils items = [1, 2, 3] -dbutils.jobs.taskValues.set(key="items", value=items) \ No newline at end of file +dbutils.jobs.taskValues.set(key="items", value=items) diff --git a/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.py b/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.py index fe247ecf..4a9a9f20 100644 --- a/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.py +++ 
b/knowledge_base/pydabs_job_with_for_each/src/foreach/process_item.py @@ -3,4 +3,4 @@ # Current iteration value passed from the for-each task (base_parameters: item = {{input}}) current_item = dbutils.widgets.get("item") -print(f"Processing item: {current_item}") \ No newline at end of file +print(f"Processing item: {current_item}") diff --git a/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py b/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py index 61248604..5ded8a0f 100644 --- a/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py +++ b/knowledge_base/pydabs_job_with_task_values/resources/task_values_simple.py @@ -14,4 +14,4 @@ job = Job( name="xcom_to_task_values_example", tasks=[producer, consumer], -) \ No newline at end of file +) diff --git a/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py b/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py index 614f4f3e..96442b2d 100644 --- a/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py +++ b/knowledge_base/pydabs_job_with_task_values/src/xcom/consumer.py @@ -2,4 +2,4 @@ from databricks.sdk.runtime import dbutils val = dbutils.jobs.taskValues.get(taskKey="producer", key="answer", debugValue=None) -print(f"Got value: {val}") \ No newline at end of file +print(f"Got value: {val}") diff --git a/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py b/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py index 429bf99f..903d0778 100644 --- a/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py +++ b/knowledge_base/pydabs_job_with_task_values/src/xcom/producer.py @@ -3,4 +3,4 @@ value = 42 dbutils.jobs.taskValues.set(key="answer", value=value) -print(f"Produced value: {value}") \ No newline at end of file +print(f"Produced value: {value}") From c2bfa57ee026f4193397f554e244830689ed03ef Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Tue, 17 Feb 2026 15:27:46 +0100 Subject: 
[PATCH 29/29] final review --- knowledge_base/pydabs_job_backfill_data/databricks.yml | 2 +- .../src/{ => branch}/check_quality.py | 0 .../src/{ => branch}/fix_path.py | 0 .../src/{ => branch}/skip_path.py | 0 .../pydabs_job_table_update_trigger/src/assets/consume_table.py | 2 -- 5 files changed, 1 insertion(+), 3 deletions(-) rename knowledge_base/pydabs_job_conditional_execution/src/{ => branch}/check_quality.py (100%) rename knowledge_base/pydabs_job_conditional_execution/src/{ => branch}/fix_path.py (100%) rename knowledge_base/pydabs_job_conditional_execution/src/{ => branch}/skip_path.py (100%) diff --git a/knowledge_base/pydabs_job_backfill_data/databricks.yml b/knowledge_base/pydabs_job_backfill_data/databricks.yml index 55aa0938..f5317fff 100644 --- a/knowledge_base/pydabs_job_backfill_data/databricks.yml +++ b/knowledge_base/pydabs_job_backfill_data/databricks.yml @@ -1,7 +1,7 @@ # This is a Databricks asset bundle definition for pydabs_airflow. # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 
bundle: - name: pydabs_job_conditional_execution + name: pydabs_job_backfill_data python: venv_path: .venv diff --git a/knowledge_base/pydabs_job_conditional_execution/src/check_quality.py b/knowledge_base/pydabs_job_conditional_execution/src/branch/check_quality.py similarity index 100% rename from knowledge_base/pydabs_job_conditional_execution/src/check_quality.py rename to knowledge_base/pydabs_job_conditional_execution/src/branch/check_quality.py diff --git a/knowledge_base/pydabs_job_conditional_execution/src/fix_path.py b/knowledge_base/pydabs_job_conditional_execution/src/branch/fix_path.py similarity index 100% rename from knowledge_base/pydabs_job_conditional_execution/src/fix_path.py rename to knowledge_base/pydabs_job_conditional_execution/src/branch/fix_path.py diff --git a/knowledge_base/pydabs_job_conditional_execution/src/skip_path.py b/knowledge_base/pydabs_job_conditional_execution/src/branch/skip_path.py similarity index 100% rename from knowledge_base/pydabs_job_conditional_execution/src/skip_path.py rename to knowledge_base/pydabs_job_conditional_execution/src/branch/skip_path.py diff --git a/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py b/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py index 9799c015..69db2be7 100644 --- a/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py +++ b/knowledge_base/pydabs_job_table_update_trigger/src/assets/consume_table.py @@ -1,6 +1,4 @@ # Databricks notebook source -from pyspark.sql import functions as F - source_table = "main.analytics.daily_events" # Insert consumer logic here df = spark.read.table(source_table)