diff --git a/README.md b/README.md
index 46e37fa..a242030 100644
--- a/README.md
+++ b/README.md
@@ -45,6 +45,7 @@ This collection includes the following roles for managing OpenShift Virtualizati
* [aap_machine_credentials](roles/aap_machine_credentials/README.md) - Management of Machine Credentials.
* [aap_seed](roles/aap_seed/README.md) - Populates an Ansible Automation Platform instance.
* [bootstrap](roles/bootstrap/README.md) - Initialization of the Ansible for OpenShift Virtualization Migration environment.
+* [cluster_healthcheck](roles/cluster_healthcheck/README.md) - Cluster health validation for OpenShift Virtualization migration environments.
* [create_mf_aap_token](roles/create_mf_aap_token/README.md) - create_mf_aap_token
* [mtv_management](roles/mtv_management/README.md) - Management of the Migration Toolkit for Virtualization (MTV).
* [mtv_migrate](roles/mtv_migrate/README.md) - Migration of Virtual Machines from Source to Destination.
diff --git a/extensions/audit/event_query.yml b/extensions/audit/event_query.yml
new file mode 100644
index 0000000..a4eaa18
--- /dev/null
+++ b/extensions/audit/event_query.yml
@@ -0,0 +1,119 @@
+---
+# Event query for modules in infra.openshift_virtualization_migration.
+#
+# Input:  a module result. Candidate objects are read from `.vm`, then
+#         `.resources`, then the result itself; lists are expanded and
+#         entries that are not objects, or that carry neither `metadata`
+#         nor `name`, are skipped.
+# Output: one record per object with `name`, `canonical_facts`
+#         (name, id, node_type) and `facts` (infra_type, infra_bucket,
+#         device_type, namespace, status, migration_source, labels).
+#
+# NOTE: `query` is a folded scalar; keep jq `#` comments out of it.
+infra.openshift_virtualization_migration.*:
+  query: >-
+    (
+      {
+        "virtualmachine": "Virtual Machine",
+        "migration": "Migration",
+        "plan": "Migration Plan",
+        "provider": "Provider",
+        "networkmap": "Network Map",
+        "storagemap": "Storage Map",
+        "backup": "VM Backup",
+        "snapshot": "VM Snapshot",
+        "operator": "Operator"
+      } as $mapping |
+      (if type == "object" then (.vm // .resources // .) else . end) |
+      (if type == "array" then .[] else . end | select(type == "object")) as $data |
+      select($data.metadata != null or $data.name != null) |
+      (($data.kind // "") | tostring) as $kind |
+      ($data.status | if type == "object" then . else {} end) as $status |
+      (
+        if $data | has("kind") then
+          (
+            if $kind == "VirtualMachine" or $kind == "VirtualMachineInstance" then "virtualmachine"
+            elif $kind == "Migration" then "migration"
+            elif $kind == "Plan" then "plan"
+            elif $kind == "Provider" then "provider"
+            elif $kind == "NetworkMap" then "networkmap"
+            elif $kind == "StorageMap" then "storagemap"
+            elif $kind | test("Backup") then "backup"
+            elif $kind | test("Snapshot") then "snapshot"
+            else "unknown"
+            end
+          )
+        elif $data.metadata.labels then
+          (
+            if $data.metadata.labels | has("kubevirt.io/vm") then "virtualmachine"
+            elif $data.metadata.labels | has("migration.openshift.io/plan-name") then "migration"
+            else "unknown"
+            end
+          )
+        else "unknown"
+        end
+      ) as $node_type |
+      (
+        if $node_type == "virtualmachine" then ($status.printableStatus // "vm")
+        elif $node_type == "migration" then "migration"
+        elif $node_type == "plan" then "plan"
+        elif $node_type == "provider" then ($data.spec.type // "provider")
+        elif $node_type == "networkmap" then "network"
+        elif $node_type == "storagemap" then "storage"
+        elif $node_type == "backup" then "backup"
+        elif $node_type == "snapshot" then "snapshot"
+        else "unknown"
+        end
+      ) as $sub_node_type |
+      {
+        name: (
+          if $data.metadata then ($data.metadata.name // $data.metadata.uid // "UNKNOWN")
+          else ($data.name // "UNKNOWN")
+          end
+        ),
+        canonical_facts: {
+          name: (
+            if $data.metadata then ($data.metadata.name // "UNKNOWN")
+            else ($data.name // "UNKNOWN")
+            end
+          ),
+          id: (
+            if $data.metadata then ($data.metadata.uid // $data.metadata.name)
+            else ($data.id // $data.name)
+            end
+          ),
+          node_type: $node_type
+        },
+        facts: {
+          infra_type: "openshift_virtualization",
+          infra_bucket: ($mapping[$node_type] // "UNKNOWN"),
+          device_type: $sub_node_type,
+          namespace: (
+            if $data.metadata then ($data.metadata.namespace // "")
+            else ""
+            end
+          ),
+          status: (
+            $status.printableStatus
+            // $status.phase
+            // (
+              if $status.conditions then
+                ($status.conditions | map(select(.status == "True")) | .[0].type // "unknown")
+              else "unknown"
+              end
+            )
+          ),
+          migration_source: (
+            if $data.spec and ($data.spec.source | type) == "object" then ($data.spec.source.type // "")
+            else ""
+            end
+          ),
+          labels: (
+            if $data.metadata and $data.metadata.labels then $data.metadata.labels
+            else {}
+            end
+          )
+        }
+      }
+    )
+...
diff --git a/playbooks/cluster_healthcheck.yml b/playbooks/cluster_healthcheck.yml
new file mode 100644
index 0000000..dafcabb
--- /dev/null
+++ b/playbooks/cluster_healthcheck.yml
@@ -0,0 +1,10 @@
+---
+- name: Run cluster healthchecks  # runs on the control node (localhost, local connection)
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  tasks:
+    - name: Import cluster_healthcheck role  # name matches the module: import_role, not include_role
+      ansible.builtin.import_role:
+        name: infra.openshift_virtualization_migration.cluster_healthcheck
+...
diff --git a/roles/cluster_healthcheck/README.md b/roles/cluster_healthcheck/README.md
new file mode 100644
index 0000000..89701e3
--- /dev/null
+++ b/roles/cluster_healthcheck/README.md
@@ -0,0 +1,504 @@
+# cluster_healthcheck
+
+```
+Role belongs to infra/openshift_virtualization_migration
+Namespace - infra
+Collection - openshift_virtualization_migration
+```
+
+Description: Cluster health validation for OpenShift Virtualization migration environments.
+
+## Requirements
+
+- OpenShift cluster with `kubeconfig` configured
+- `kubernetes.core` collection installed
+- OpenShift Virtualization (CNV) operator installed
+- Migration Toolkit for Virtualization (MTV) operator installed
+
+## Role Variables
+
+### Defaults
+
+| Variable | Type | Default | Description |
+|----------|------|---------|-------------|
+| `cluster_healthcheck_checks` | list | See defaults/main.yml | List of health checks to run |
+| `cluster_healthcheck_post_migration_vms` | list | `[]` | VMs to check post-migration |
+| `cluster_healthcheck_generate_report` | bool | `true` | Run the report tasks (console summary and HTML report) |
+| `cluster_healthcheck_report_path` | str | `/tmp/cluster_healthcheck_report.html` | Report output path |
+| `cluster_healthcheck_mtv_namespace` | str | `openshift-mtv` | MTV operator namespace |
+| `cluster_healthcheck_kubevirt_namespace` | str | `openshift-cnv` | KubeVirt operator namespace |
+| `cluster_healthcheck_ssh_timeout` | int | `10` | SSH check timeout in seconds |
+| `cluster_healthcheck_debug` | bool | `false` | Enable verbose debug output |
+
+### Post-Migration VM Format
+
+```yaml
+cluster_healthcheck_post_migration_vms:
+ - name: my-vm
+ namespace: my-namespace
+ check_ssh: true # optional, default false
+```
+
+## Health Checks
+
+| Check | Description |
+|-------|-------------|
+| `ocp_node_health` | Node Ready status, resource pressure, kubevirt.io/schedulable label |
+| `kubevirt_health` | HyperConverged CR, virt-* pods, CDI operator |
+| `mtv_health` | ForkliftController, MTV pods, Providers, Plans |
+| `storage_health` | StorageClasses, CSI drivers, PV capacity, pending PVCs |
+| `network_health` | Multus, NADs, OVN/SDN health, migration network |
+
+## Example Playbook
+
+```yaml
+- name: Run cluster healthchecks
+ hosts: localhost
+ connection: local
+ gather_facts: false
+ roles:
+ - role: infra.openshift_virtualization_migration.cluster_healthcheck
+ vars:
+ cluster_healthcheck_post_migration_vms:
+ - name: rhel9-vm
+ namespace: migration-target
+```
+
+## cluster_healthcheck
+
+```
+Role belongs to infra/openshift_virtualization_migration
+Namespace - infra
+Collection - openshift_virtualization_migration
+Version - 1.25.0
+Repository - https://github.com/redhat-cop/openshift_virtualization_migration
+```
+
+Description: Cluster health validation for OpenShift Virtualization migration environments.
+
+### Defaults
+
+**These are static variables with lower priority**
+
+#### File: defaults/main.yml
+
+| Var | Type | Value |Choices |Required | Title |
+|--------------|--------------|-------------|-------------|-------------|-------------|
+| [`cluster_healthcheck_checks`](defaults/main.yml#L3) | list | `[]` | None | None | None |
+| [`cluster_healthcheck_checks.0`](defaults/main.yml#L4) | str | `ocp_node_health` | None | None | None |
+| [`cluster_healthcheck_checks.1`](defaults/main.yml#L5) | str | `kubevirt_health` | None | None | None |
+| [`cluster_healthcheck_checks.2`](defaults/main.yml#L6) | str | `mtv_health` | None | None | None |
+| [`cluster_healthcheck_checks.3`](defaults/main.yml#L7) | str | `storage_health` | None | None | None |
+| [`cluster_healthcheck_checks.4`](defaults/main.yml#L8) | str | `network_health` | None | None | None |
+| [`cluster_healthcheck_debug`](defaults/main.yml#L22) | bool | `False` | None | None | None |
+| [`cluster_healthcheck_generate_report`](defaults/main.yml#L12) | bool | `True` | None | None | None |
+| [`cluster_healthcheck_kubevirt_namespace`](defaults/main.yml#L18) | str | `openshift-cnv` | None | None | None |
+| [`cluster_healthcheck_mtv_namespace`](defaults/main.yml#L16) | str | `openshift-mtv` | None | None | None |
+| [`cluster_healthcheck_post_migration_vms`](defaults/main.yml#L10) | list | `[]` | None | None | None |
+| [`cluster_healthcheck_report_path`](defaults/main.yml#L14) | str | `/tmp/cluster_healthcheck_report.html` | None | None | None |
+| [`cluster_healthcheck_ssh_timeout`](defaults/main.yml#L20) | int | `10` | None | None | None |
+
+🖇️ Full descriptions for vars in defaults/main.yml
+
+`cluster_healthcheck_checks`: None
+
+`cluster_healthcheck_checks.0`: None
+
+`cluster_healthcheck_checks.1`: None
+
+`cluster_healthcheck_checks.2`: None
+
+`cluster_healthcheck_checks.3`: None
+
+`cluster_healthcheck_checks.4`: None
+
+`cluster_healthcheck_debug`: None
+
+`cluster_healthcheck_generate_report`: None
+
+`cluster_healthcheck_kubevirt_namespace`: None
+
+`cluster_healthcheck_mtv_namespace`: None
+
+`cluster_healthcheck_post_migration_vms`: None
+
+`cluster_healthcheck_report_path`: None
+
+`cluster_healthcheck_ssh_timeout`: None
+
+
+
+### Vars
+
+**These are variables with higher priority**
+
+#### File: vars/main.yml
+
+| Var | Type | Value |
+|--------------|--------------|-------------|
+| [__cluster_healthcheck_results](vars/main.yml#L3) | dict | `{}` |
+
+### Tasks
+
+#### File: tasks/main.yml
+
+| Name | Module | Has Conditions |
+| ---- | ------ | --------- |
+| Initialize healthcheck results | `ansible.builtin.set_fact` | False |
+| Include ocp_node_health tasks | `ansible.builtin.include_tasks` | True |
+| Include kubevirt_health tasks | `ansible.builtin.include_tasks` | True |
+| Include mtv_health tasks | `ansible.builtin.include_tasks` | True |
+| Include storage_health tasks | `ansible.builtin.include_tasks` | True |
+| Include network_health tasks | `ansible.builtin.include_tasks` | True |
+| Include post_migration_vm tasks | `ansible.builtin.include_tasks` | True |
+| Include report tasks | `ansible.builtin.include_tasks` | True |
+
+#### File: tasks/kubevirt_health.yml
+
+| Name | Module | Has Conditions |
+| ---- | ------ | --------- |
+| kubevirt_health ¦ Get HyperConverged CR status | `kubernetes.core.k8s_info` | False |
+| kubevirt_health ¦ Evaluate HyperConverged conditions | `ansible.builtin.set_fact` | False |
+| kubevirt_health ¦ Report HyperConverged status | `ansible.builtin.debug` | False |
+| kubevirt_health ¦ Check virt-operator pods | `kubernetes.core.k8s_info` | False |
+| kubevirt_health ¦ Check virt-controller pods | `kubernetes.core.k8s_info` | False |
+| kubevirt_health ¦ Check virt-handler pods | `kubernetes.core.k8s_info` | False |
+| kubevirt_health ¦ Check virt-api pods | `kubernetes.core.k8s_info` | False |
+| kubevirt_health ¦ Evaluate KubeVirt pod health | `ansible.builtin.set_fact` | False |
+| kubevirt_health ¦ Check CDI operator pods | `kubernetes.core.k8s_info` | False |
+| kubevirt_health ¦ Check CDI deployment pods | `kubernetes.core.k8s_info` | False |
+| kubevirt_health ¦ Check CDI apiserver pods | `kubernetes.core.k8s_info` | False |
+| kubevirt_health ¦ Check CDI uploadproxy pods | `kubernetes.core.k8s_info` | False |
+| kubevirt_health ¦ Evaluate CDI health | `ansible.builtin.set_fact` | False |
+| kubevirt_health ¦ Set kubevirt health result | `ansible.builtin.set_fact` | False |
+
+#### File: tasks/mtv_health.yml
+
+| Name | Module | Has Conditions |
+| ---- | ------ | --------- |
+| mtv_health ¦ Check ForkliftController CR | `kubernetes.core.k8s_info` | False |
+| mtv_health ¦ Evaluate ForkliftController health | `ansible.builtin.set_fact` | False |
+| mtv_health ¦ Report ForkliftController status | `ansible.builtin.debug` | False |
+| mtv_health ¦ Check MTV operator pods | `kubernetes.core.k8s_info` | False |
+| mtv_health ¦ Evaluate MTV operator pod status | `ansible.builtin.set_fact` | False |
+| mtv_health ¦ Check Provider CRs | `kubernetes.core.k8s_info` | False |
+| mtv_health ¦ Identify Ready providers | `ansible.builtin.set_fact` | False |
+| mtv_health ¦ Evaluate Provider readiness | `ansible.builtin.set_fact` | False |
+| mtv_health ¦ Check for failed migration Plans | `kubernetes.core.k8s_info` | False |
+| mtv_health ¦ Evaluate failed Plans | `ansible.builtin.set_fact` | False |
+| mtv_health ¦ Report failed Plans | `ansible.builtin.debug` | False |
+| mtv_health ¦ Set MTV health result | `ansible.builtin.set_fact` | False |
+
+#### File: tasks/network_health.yml
+
+| Name | Module | Has Conditions |
+| ---- | ------ | --------- |
+| network_health ¦ Check Multus pods | `kubernetes.core.k8s_info` | False |
+| network_health ¦ Evaluate Multus pod health | `ansible.builtin.set_fact` | False |
+| network_health ¦ List NetworkAttachmentDefinitions | `kubernetes.core.k8s_info` | False |
+| network_health ¦ Report NetworkAttachmentDefinitions | `ansible.builtin.debug` | True |
+| network_health ¦ Check OVN-Kubernetes pods | `kubernetes.core.k8s_info` | False |
+| network_health ¦ Check OpenShiftSDN pods as fallback | `kubernetes.core.k8s_info` | True |
+| network_health ¦ Evaluate SDN health | `ansible.builtin.set_fact` | False |
+| network_health ¦ Get HyperConverged CR for migration network config | `kubernetes.core.k8s_info` | False |
+| network_health ¦ Extract configured migration network | `ansible.builtin.set_fact` | True |
+| network_health ¦ Check migration network NAD | `kubernetes.core.k8s_info` | True |
+| network_health ¦ Set network health result | `ansible.builtin.set_fact` | False |
+
+#### File: tasks/ocp_node_health.yml
+
+| Name | Module | Has Conditions |
+| ---- | ------ | --------- |
+| ocp_node_health ¦ Get all cluster nodes | `kubernetes.core.k8s_info` | False |
+| ocp_node_health ¦ Evaluate node Ready status | `ansible.builtin.set_fact` | False |
+| ocp_node_health ¦ Report nodes not Ready | `ansible.builtin.debug` | False |
+| ocp_node_health ¦ Check for resource pressure conditions | `ansible.builtin.set_fact` | True |
+| ocp_node_health ¦ Report nodes with resource pressure | `ansible.builtin.debug` | False |
+| ocp_node_health ¦ Check allocatable vs capacity ratios | `ansible.builtin.set_fact` | False |
+| ocp_node_health ¦ Display capacity information | `ansible.builtin.debug` | True |
+| ocp_node_health ¦ Verify worker nodes have kubevirt.io/schedulable label | `ansible.builtin.set_fact` | False |
+| ocp_node_health ¦ Report workers missing kubevirt.io/schedulable label | `ansible.builtin.debug` | False |
+| ocp_node_health ¦ Set node health result | `ansible.builtin.set_fact` | False |
+
+#### File: tasks/post_migration_vm.yml
+
+| Name | Module | Has Conditions |
+| ---- | ------ | --------- |
+| post_migration_vm ¦ Check VirtualMachineInstance status | `kubernetes.core.k8s_info` | False |
+| post_migration_vm ¦ Evaluate VM status | `ansible.builtin.set_fact` | False |
+| post_migration_vm ¦ Report VM status | `ansible.builtin.debug` | False |
+| post_migration_vm ¦ Optional SSH connectivity check | `ansible.builtin.wait_for` | True |
+| post_migration_vm ¦ Set post-migration VM result | `ansible.builtin.set_fact` | False |
+
+#### File: tasks/report.yml
+
+| Name | Module | Has Conditions |
+| ---- | ------ | --------- |
+| report ¦ Display healthcheck summary | `ansible.builtin.debug` | False |
+| report ¦ Generate HTML healthcheck report | `ansible.builtin.template` | False |
+| report ¦ Report file location | `ansible.builtin.debug` | False |
+
+#### File: tasks/storage_health.yml
+
+| Name | Module | Has Conditions |
+| ---- | ------ | --------- |
+| storage_health ¦ Get StorageClass resources | `kubernetes.core.k8s_info` | False |
+| storage_health ¦ Check for default StorageClass | `ansible.builtin.set_fact` | False |
+| storage_health ¦ Report StorageClasses | `ansible.builtin.debug` | False |
+| storage_health ¦ Check CSI driver pods | `kubernetes.core.k8s_info` | False |
+| storage_health ¦ Report CSI drivers | `ansible.builtin.debug` | False |
+| storage_health ¦ Get PersistentVolumes | `kubernetes.core.k8s_info` | False |
+| storage_health ¦ Evaluate PV capacity | `ansible.builtin.set_fact` | False |
+| storage_health ¦ Check for PVCs stuck in Pending | `kubernetes.core.k8s_info` | False |
+| storage_health ¦ Report pending PVCs | `ansible.builtin.debug` | False |
+| storage_health ¦ Set storage health result | `ansible.builtin.set_fact` | False |
+
+## Task Flow Graphs
+
+### Graph for kubevirt_health.yml
+
+```mermaid
+flowchart TD
+Start
+classDef block stroke:#3498db,stroke-width:2px;
+classDef task stroke:#4b76bb,stroke-width:2px;
+classDef includeTasks stroke:#16a085,stroke-width:2px;
+classDef importTasks stroke:#34495e,stroke-width:2px;
+classDef includeRole stroke:#2980b9,stroke-width:2px;
+classDef importRole stroke:#699ba7,stroke-width:2px;
+classDef includeVars stroke:#8e44ad,stroke-width:2px;
+classDef rescue stroke:#665352,stroke-width:2px;
+
+ Start-->|Task| kubevirt_health___Get_HyperConverged_CR_status0[kubevirt health get hyperconverged cr status]:::task
+ kubevirt_health___Get_HyperConverged_CR_status0-->|Task| kubevirt_health___Evaluate_HyperConverged_conditions1[kubevirt health evaluate hyperconverged<br>conditions]:::task
+ kubevirt_health___Evaluate_HyperConverged_conditions1-->|Task| kubevirt_health___Report_HyperConverged_status2[kubevirt health report hyperconverged status]:::task
+ kubevirt_health___Report_HyperConverged_status2-->|Task| kubevirt_health___Check_virt_operator_pods3[kubevirt health check virt operator pods]:::task
+ kubevirt_health___Check_virt_operator_pods3-->|Task| kubevirt_health___Check_virt_controller_pods4[kubevirt health check virt controller pods]:::task
+ kubevirt_health___Check_virt_controller_pods4-->|Task| kubevirt_health___Check_virt_handler_pods5[kubevirt health check virt handler pods]:::task
+ kubevirt_health___Check_virt_handler_pods5-->|Task| kubevirt_health___Check_virt_api_pods6[kubevirt health check virt api pods]:::task
+ kubevirt_health___Check_virt_api_pods6-->|Task| kubevirt_health___Evaluate_KubeVirt_pod_health7[kubevirt health evaluate kubevirt pod health]:::task
+ kubevirt_health___Evaluate_KubeVirt_pod_health7-->|Task| kubevirt_health___Check_CDI_operator_pods8[kubevirt health check cdi operator pods]:::task
+ kubevirt_health___Check_CDI_operator_pods8-->|Task| kubevirt_health___Check_CDI_deployment_pods9[kubevirt health check cdi deployment pods]:::task
+ kubevirt_health___Check_CDI_deployment_pods9-->|Task| kubevirt_health___Check_CDI_apiserver_pods10[kubevirt health check cdi apiserver pods]:::task
+ kubevirt_health___Check_CDI_apiserver_pods10-->|Task| kubevirt_health___Check_CDI_uploadproxy_pods11[kubevirt health check cdi uploadproxy pods]:::task
+ kubevirt_health___Check_CDI_uploadproxy_pods11-->|Task| kubevirt_health___Evaluate_CDI_health12[kubevirt health evaluate cdi health]:::task
+ kubevirt_health___Evaluate_CDI_health12-->|Task| kubevirt_health___Set_kubevirt_health_result13[kubevirt health set kubevirt health result]:::task
+ kubevirt_health___Set_kubevirt_health_result13-->End
+```
+
+### Graph for main.yml
+
+```mermaid
+flowchart TD
+Start
+classDef block stroke:#3498db,stroke-width:2px;
+classDef task stroke:#4b76bb,stroke-width:2px;
+classDef includeTasks stroke:#16a085,stroke-width:2px;
+classDef importTasks stroke:#34495e,stroke-width:2px;
+classDef includeRole stroke:#2980b9,stroke-width:2px;
+classDef importRole stroke:#699ba7,stroke-width:2px;
+classDef includeVars stroke:#8e44ad,stroke-width:2px;
+classDef rescue stroke:#665352,stroke-width:2px;
+
+ Start-->|Task| Initialize_healthcheck_results0[initialize healthcheck results]:::task
+ Initialize_healthcheck_results0-->|Include task| Include_ocp_node_health_tasks_ocp_node_health_yml_1[include ocp node health tasks<br>When: **ocp node health in cluster healthcheck checks**<br>include_task: ocp node health yml]:::includeTasks
+ Include_ocp_node_health_tasks_ocp_node_health_yml_1-->|Include task| Include_kubevirt_health_tasks_kubevirt_health_yml_2[include kubevirt health tasks<br>When: **kubevirt health in cluster healthcheck checks**<br>include_task: kubevirt health yml]:::includeTasks
+ Include_kubevirt_health_tasks_kubevirt_health_yml_2-->|Include task| Include_mtv_health_tasks_mtv_health_yml_3[include mtv health tasks<br>When: **mtv health in cluster healthcheck checks**<br>include_task: mtv health yml]:::includeTasks
+ Include_mtv_health_tasks_mtv_health_yml_3-->|Include task| Include_storage_health_tasks_storage_health_yml_4[include storage health tasks<br>When: **storage health in cluster healthcheck checks**<br>include_task: storage health yml]:::includeTasks
+ Include_storage_health_tasks_storage_health_yml_4-->|Include task| Include_network_health_tasks_network_health_yml_5[include network health tasks<br>When: **network health in cluster healthcheck checks**<br>include_task: network health yml]:::includeTasks
+ Include_network_health_tasks_network_health_yml_5-->|Include task| Include_post_migration_vm_tasks_post_migration_vm_yml_6[include post migration vm tasks<br>When: **cluster healthcheck post migration vms length<br>0**<br>include_task: post migration vm yml]:::includeTasks
+ Include_post_migration_vm_tasks_post_migration_vm_yml_6-->|Include task| Include_report_tasks_report_yml_7[include report tasks<br>When: **cluster healthcheck generate report**<br>include_task: report yml]:::includeTasks
+ Include_report_tasks_report_yml_7-->End
+```
+
+### Graph for mtv_health.yml
+
+```mermaid
+flowchart TD
+Start
+classDef block stroke:#3498db,stroke-width:2px;
+classDef task stroke:#4b76bb,stroke-width:2px;
+classDef includeTasks stroke:#16a085,stroke-width:2px;
+classDef importTasks stroke:#34495e,stroke-width:2px;
+classDef includeRole stroke:#2980b9,stroke-width:2px;
+classDef importRole stroke:#699ba7,stroke-width:2px;
+classDef includeVars stroke:#8e44ad,stroke-width:2px;
+classDef rescue stroke:#665352,stroke-width:2px;
+
+ Start-->|Task| mtv_health___Check_ForkliftController_CR0[mtv health check forkliftcontroller cr]:::task
+ mtv_health___Check_ForkliftController_CR0-->|Task| mtv_health___Evaluate_ForkliftController_health1[mtv health evaluate forkliftcontroller health]:::task
+ mtv_health___Evaluate_ForkliftController_health1-->|Task| mtv_health___Report_ForkliftController_status2[mtv health report forkliftcontroller status]:::task
+ mtv_health___Report_ForkliftController_status2-->|Task| mtv_health___Check_MTV_operator_pods3[mtv health check mtv operator pods]:::task
+ mtv_health___Check_MTV_operator_pods3-->|Task| mtv_health___Evaluate_MTV_operator_pod_status4[mtv health evaluate mtv operator pod status]:::task
+ mtv_health___Evaluate_MTV_operator_pod_status4-->|Task| mtv_health___Check_Provider_CRs5[mtv health check provider crs]:::task
+ mtv_health___Check_Provider_CRs5-->|Task| mtv_health___Identify_Ready_providers6[mtv health identify ready providers]:::task
+ mtv_health___Identify_Ready_providers6-->|Task| mtv_health___Evaluate_Provider_readiness7[mtv health evaluate provider readiness]:::task
+ mtv_health___Evaluate_Provider_readiness7-->|Task| mtv_health___Check_for_failed_migration_Plans8[mtv health check for failed migration plans]:::task
+ mtv_health___Check_for_failed_migration_Plans8-->|Task| mtv_health___Evaluate_failed_Plans9[mtv health evaluate failed plans]:::task
+ mtv_health___Evaluate_failed_Plans9-->|Task| mtv_health___Report_failed_Plans10[mtv health report failed plans]:::task
+ mtv_health___Report_failed_Plans10-->|Task| mtv_health___Set_MTV_health_result11[mtv health set mtv health result]:::task
+ mtv_health___Set_MTV_health_result11-->End
+```
+
+### Graph for network_health.yml
+
+```mermaid
+flowchart TD
+Start
+classDef block stroke:#3498db,stroke-width:2px;
+classDef task stroke:#4b76bb,stroke-width:2px;
+classDef includeTasks stroke:#16a085,stroke-width:2px;
+classDef importTasks stroke:#34495e,stroke-width:2px;
+classDef includeRole stroke:#2980b9,stroke-width:2px;
+classDef importRole stroke:#699ba7,stroke-width:2px;
+classDef includeVars stroke:#8e44ad,stroke-width:2px;
+classDef rescue stroke:#665352,stroke-width:2px;
+
+ Start-->|Task| network_health___Check_Multus_pods0[network health check multus pods]:::task
+ network_health___Check_Multus_pods0-->|Task| network_health___Evaluate_Multus_pod_health1[network health evaluate multus pod health]:::task
+ network_health___Evaluate_Multus_pod_health1-->|Task| network_health___List_NetworkAttachmentDefinitions2[network health list networkattachmentdefinitions]:::task
+ network_health___List_NetworkAttachmentDefinitions2-->|Task| network_health___Report_NetworkAttachmentDefinitions3[network health report<br>networkattachmentdefinitions<br>When: **cluster healthcheck debug**]:::task
+ network_health___Report_NetworkAttachmentDefinitions3-->|Task| network_health___Check_OVN_Kubernetes_pods4[network health check ovn kubernetes pods]:::task
+ network_health___Check_OVN_Kubernetes_pods4-->|Task| network_health___Check_OpenShiftSDN_pods_as_fallback5[network health check openshiftsdn pods as<br>fallback<br>When: **cluster healthcheck ovn pods resources length<br>0**]:::task
+ network_health___Check_OpenShiftSDN_pods_as_fallback5-->|Task| network_health___Evaluate_SDN_health6[network health evaluate sdn health]:::task
+ network_health___Evaluate_SDN_health6-->|Task| network_health___Get_HyperConverged_CR_for_migration_network_config7[network health get hyperconverged cr for<br>migration network config]:::task
+ network_health___Get_HyperConverged_CR_for_migration_network_config7-->|Task| network_health___Extract_configured_migration_network8[network health extract configured migration<br>network<br>When: **cluster healthcheck hco network resources<br>length 0**]:::task
+ network_health___Extract_configured_migration_network8-->|Task| network_health___Check_migration_network_NAD9[network health check migration network nad<br>When: **cluster healthcheck migration network default<br>length 0**]:::task
+ network_health___Check_migration_network_NAD9-->|Task| network_health___Set_network_health_result10[network health set network health result]:::task
+ network_health___Set_network_health_result10-->End
+```
+
+### Graph for ocp_node_health.yml
+
+```mermaid
+flowchart TD
+Start
+classDef block stroke:#3498db,stroke-width:2px;
+classDef task stroke:#4b76bb,stroke-width:2px;
+classDef includeTasks stroke:#16a085,stroke-width:2px;
+classDef importTasks stroke:#34495e,stroke-width:2px;
+classDef includeRole stroke:#2980b9,stroke-width:2px;
+classDef importRole stroke:#699ba7,stroke-width:2px;
+classDef includeVars stroke:#8e44ad,stroke-width:2px;
+classDef rescue stroke:#665352,stroke-width:2px;
+
+ Start-->|Task| ocp_node_health___Get_all_cluster_nodes0[ocp node health get all cluster nodes]:::task
+ ocp_node_health___Get_all_cluster_nodes0-->|Task| ocp_node_health___Evaluate_node_Ready_status1[ocp node health evaluate node ready status]:::task
+ ocp_node_health___Evaluate_node_Ready_status1-->|Task| ocp_node_health___Report_nodes_not_Ready2[ocp node health report nodes not ready]:::task
+ ocp_node_health___Report_nodes_not_Ready2-->|Task| ocp_node_health___Check_for_resource_pressure_conditions3[ocp node health check for resource pressure<br>conditions<br>When: **item status conditions selectattr type in<br>memorypressure diskpressure pidpressure<br>selectattr status equalto true list<br>length 0**]:::task
+ ocp_node_health___Check_for_resource_pressure_conditions3-->|Task| ocp_node_health___Report_nodes_with_resource_pressure4[ocp node health report nodes with resource<br>pressure]:::task
+ ocp_node_health___Report_nodes_with_resource_pressure4-->|Task| ocp_node_health___Check_allocatable_vs_capacity_ratios5[ocp node health check allocatable vs capacity<br>ratios]:::task
+ ocp_node_health___Check_allocatable_vs_capacity_ratios5-->|Task| ocp_node_health___Display_capacity_information6[ocp node health display capacity information<br>When: **cluster healthcheck debug**]:::task
+ ocp_node_health___Display_capacity_information6-->|Task| ocp_node_health___Verify_worker_nodes_have_kubevirt_io_schedulable_label7[ocp node health verify worker nodes have<br>kubevirt io schedulable label]:::task
+ ocp_node_health___Verify_worker_nodes_have_kubevirt_io_schedulable_label7-->|Task| ocp_node_health___Report_workers_missing_kubevirt_io_schedulable_label8[ocp node health report workers missing kubevirt<br>io schedulable label]:::task
+ ocp_node_health___Report_workers_missing_kubevirt_io_schedulable_label8-->|Task| ocp_node_health___Set_node_health_result9[ocp node health set node health result]:::task
+ ocp_node_health___Set_node_health_result9-->End
+```
+
+### Graph for post_migration_vm.yml
+
+```mermaid
+flowchart TD
+Start
+classDef block stroke:#3498db,stroke-width:2px;
+classDef task stroke:#4b76bb,stroke-width:2px;
+classDef includeTasks stroke:#16a085,stroke-width:2px;
+classDef importTasks stroke:#34495e,stroke-width:2px;
+classDef includeRole stroke:#2980b9,stroke-width:2px;
+classDef importRole stroke:#699ba7,stroke-width:2px;
+classDef includeVars stroke:#8e44ad,stroke-width:2px;
+classDef rescue stroke:#665352,stroke-width:2px;
+
+ Start-->|Task| post_migration_vm___Check_VirtualMachineInstance_status0[post migration vm check virtualmachineinstance<br>status]:::task
+ post_migration_vm___Check_VirtualMachineInstance_status0-->|Task| post_migration_vm___Evaluate_VM_status1[post migration vm evaluate vm status]:::task
+ post_migration_vm___Evaluate_VM_status1-->|Task| post_migration_vm___Report_VM_status2[post migration vm report vm status]:::task
+ post_migration_vm___Report_VM_status2-->|Task| post_migration_vm___Optional_SSH_connectivity_check3[post migration vm optional ssh connectivity<br>check<br>When: **cluster healthcheck vmi item cluster<br>healthcheck vm check ssh default false and<br>cluster healthcheck vmi item resources length<br>0 and cluster healthcheck vmi item resources 0<br>status interfaces default length 0**]:::task
+ post_migration_vm___Optional_SSH_connectivity_check3-->|Task| post_migration_vm___Set_post_migration_VM_result4[post migration vm set post migration vm result]:::task
+ post_migration_vm___Set_post_migration_VM_result4-->End
+```
+
+### Graph for report.yml
+
+```mermaid
+flowchart TD
+Start
+classDef block stroke:#3498db,stroke-width:2px;
+classDef task stroke:#4b76bb,stroke-width:2px;
+classDef includeTasks stroke:#16a085,stroke-width:2px;
+classDef importTasks stroke:#34495e,stroke-width:2px;
+classDef includeRole stroke:#2980b9,stroke-width:2px;
+classDef importRole stroke:#699ba7,stroke-width:2px;
+classDef includeVars stroke:#8e44ad,stroke-width:2px;
+classDef rescue stroke:#665352,stroke-width:2px;
+
+ Start-->|Task| report___Display_healthcheck_summary0[report display healthcheck summary]:::task
+ report___Display_healthcheck_summary0-->|Task| report___Generate_HTML_healthcheck_report1[report generate html healthcheck report]:::task
+ report___Generate_HTML_healthcheck_report1-->|Task| report___Report_file_location2[report report file location]:::task
+ report___Report_file_location2-->End
+```
+
+### Graph for storage_health.yml
+
+```mermaid
+flowchart TD
+Start
+classDef block stroke:#3498db,stroke-width:2px;
+classDef task stroke:#4b76bb,stroke-width:2px;
+classDef includeTasks stroke:#16a085,stroke-width:2px;
+classDef importTasks stroke:#34495e,stroke-width:2px;
+classDef includeRole stroke:#2980b9,stroke-width:2px;
+classDef importRole stroke:#699ba7,stroke-width:2px;
+classDef includeVars stroke:#8e44ad,stroke-width:2px;
+classDef rescue stroke:#665352,stroke-width:2px;
+
+ Start-->|Task| storage_health___Get_StorageClass_resources0[storage health get storageclass resources]:::task
+ storage_health___Get_StorageClass_resources0-->|Task| storage_health___Check_for_default_StorageClass1[storage health check for default storageclass]:::task
+ storage_health___Check_for_default_StorageClass1-->|Task| storage_health___Report_StorageClasses2[storage health report storageclasses]:::task
+ storage_health___Report_StorageClasses2-->|Task| storage_health___Check_CSI_driver_pods3[storage health check csi driver pods]:::task
+ storage_health___Check_CSI_driver_pods3-->|Task| storage_health___Report_CSI_drivers4[storage health report csi drivers]:::task
+ storage_health___Report_CSI_drivers4-->|Task| storage_health___Get_PersistentVolumes5[storage health get persistentvolumes]:::task
+ storage_health___Get_PersistentVolumes5-->|Task| storage_health___Evaluate_PV_capacity6[storage health evaluate pv capacity]:::task
+ storage_health___Evaluate_PV_capacity6-->|Task| storage_health___Check_for_PVCs_stuck_in_Pending7[storage health check for pvcs stuck in pending]:::task
+ storage_health___Check_for_PVCs_stuck_in_Pending7-->|Task| storage_health___Report_pending_PVCs8[storage health report pending pvcs]:::task
+ storage_health___Report_pending_PVCs8-->|Task| storage_health___Set_storage_health_result9[storage health set storage health result]:::task
+ storage_health___Set_storage_health_result9-->End
+```
+
+## Playbook
+
+```yml
+---
+- name: Test cluster_healthcheck role
+ hosts: localhost
+ connection: local
+ gather_facts: false
+ roles:
+ - role: cluster_healthcheck
+...
+
+```
+
+## Playbook graph
+
+```mermaid
+flowchart TD
+ hosts[localhost]-->|Role| cluster_healthcheck[cluster healthcheck]
+```
+
+## Author Information
+
+OpenShift Virtualization Migration Contributors
+
+## License
+
+GPL-3.0-only
+
+## Minimum Ansible Version
+
+2.15.0
+
+## Platforms
+
+No platforms specified.
+
+
\ No newline at end of file
diff --git a/roles/cluster_healthcheck/defaults/main.yml b/roles/cluster_healthcheck/defaults/main.yml
new file mode 100644
index 0000000..f8b58e9
--- /dev/null
+++ b/roles/cluster_healthcheck/defaults/main.yml
@@ -0,0 +1,23 @@
+---
+# defaults file for cluster_healthcheck; each entry below enables the tasks file of the same name
+cluster_healthcheck_checks:
+ - ocp_node_health
+ - kubevirt_health
+ - mtv_health
+ - storage_health
+ - network_health
+# VMs verified by post_migration_vm.yml: dicts with name, namespace and an optional check_ssh flag
+cluster_healthcheck_post_migration_vms: []
+# When true, report.yml is included (summary message plus rendered HTML report)
+cluster_healthcheck_generate_report: true
+# Destination path of the rendered HTML report
+cluster_healthcheck_report_path: "/tmp/cluster_healthcheck_report.html"
+# Namespace queried for the ForkliftController, Provider and Plan CRs and the forklift pods
+cluster_healthcheck_mtv_namespace: "openshift-mtv"
+# Namespace queried for the HyperConverged CR and the KubeVirt and CDI pods
+cluster_healthcheck_kubevirt_namespace: "openshift-cnv"
+# Timeout handed to wait_for when probing port 22 of a VM that sets check_ssh
+cluster_healthcheck_ssh_timeout: 10
+# When true, the NetworkAttachmentDefinition and per-node capacity debug messages are printed
+cluster_healthcheck_debug: false
+...
diff --git a/roles/cluster_healthcheck/meta/main.yml b/roles/cluster_healthcheck/meta/main.yml
new file mode 100644
index 0000000..7f4bf14
--- /dev/null
+++ b/roles/cluster_healthcheck/meta/main.yml
@@ -0,0 +1,10 @@
+---
+galaxy_info:
+  author: OpenShift Virtualization Migration Contributors  # same author string as the role README
+  description: Cluster health validation for OpenShift Virtualization migration environments.
+  company: Red Hat
+  license: GPL-3.0-only
+  min_ansible_version: "2.15.0"  # quoted so the version is always read as a string
+  galaxy_tags: []
+dependencies: []
+...
diff --git a/roles/cluster_healthcheck/tasks/kubevirt_health.yml b/roles/cluster_healthcheck/tasks/kubevirt_health.yml
new file mode 100644
index 0000000..0e916df
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/kubevirt_health.yml
@@ -0,0 +1,215 @@
+---
+- name: kubevirt_health | Get HyperConverged CR status
+ kubernetes.core.k8s_info:
+ api_version: hco.kubevirt.io/v1beta1
+ kind: HyperConverged
+ namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+ register: __cluster_healthcheck_hco
+
+- name: kubevirt_health | Evaluate HyperConverged conditions
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_hco_available: >-
+ {{ __cluster_healthcheck_hco.resources | length > 0 and
+ __cluster_healthcheck_hco.resources[0].status.conditions | default([])
+ | selectattr('type', 'equalto', 'Available')
+ | map(attribute='status')
+ | first | default('False') == 'True' }}
+ __cluster_healthcheck_hco_degraded: >-
+ {{ __cluster_healthcheck_hco.resources | length > 0 and
+ __cluster_healthcheck_hco.resources[0].status.conditions | default([])
+ | selectattr('type', 'equalto', 'Degraded')
+ | map(attribute='status')
+ | first | default('False') == 'True' }}
+
+- name: kubevirt_health | Report HyperConverged status
+ ansible.builtin.debug:
+ msg: >-
+ HyperConverged CR -
+ Available: {{ __cluster_healthcheck_hco_available }},
+ Degraded: {{ __cluster_healthcheck_hco_degraded }}
+
+- name: kubevirt_health | Check virt-operator pods
+ kubernetes.core.k8s_info:
+ api_version: v1
+ kind: Pod
+ namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+ label_selectors:
+ - "kubevirt.io=virt-operator"
+ register: __cluster_healthcheck_virt_operator_pods
+
+- name: kubevirt_health | Check virt-controller pods
+ kubernetes.core.k8s_info:
+ api_version: v1
+ kind: Pod
+ namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+ label_selectors:
+ - "kubevirt.io=virt-controller"
+ register: __cluster_healthcheck_virt_controller_pods
+
+- name: kubevirt_health | Check virt-handler pods
+ kubernetes.core.k8s_info:
+ api_version: v1
+ kind: Pod
+ namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+ label_selectors:
+ - "kubevirt.io=virt-handler"
+ register: __cluster_healthcheck_virt_handler_pods
+
+- name: kubevirt_health | Check virt-api pods
+ kubernetes.core.k8s_info:
+ api_version: v1
+ kind: Pod
+ namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+ label_selectors:
+ - "kubevirt.io=virt-api"
+ register: __cluster_healthcheck_virt_api_pods
+
+- name: kubevirt_health | Evaluate KubeVirt pod health
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_kubevirt_pods:
+ virt_operator:
+ running: "{{ __cluster_healthcheck_virt_operator_pods.resources
+ | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+ total: "{{ __cluster_healthcheck_virt_operator_pods.resources | length }}"
+ virt_controller:
+ running: "{{ __cluster_healthcheck_virt_controller_pods.resources
+ | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+ total: "{{ __cluster_healthcheck_virt_controller_pods.resources | length }}"
+ virt_handler:
+ running: "{{ __cluster_healthcheck_virt_handler_pods.resources
+ | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+ total: "{{ __cluster_healthcheck_virt_handler_pods.resources | length }}"
+ virt_api:
+ running: "{{ __cluster_healthcheck_virt_api_pods.resources
+ | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+ total: "{{ __cluster_healthcheck_virt_api_pods.resources | length }}"
+
+- name: kubevirt_health | Check CDI operator pods
+ kubernetes.core.k8s_info:
+ api_version: v1
+ kind: Pod
+ namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+ label_selectors:
+ - "cdi.kubevirt.io=cdi-operator"
+ register: __cluster_healthcheck_cdi_operator_pods
+
+- name: kubevirt_health | Check CDI deployment pods
+ kubernetes.core.k8s_info:
+ api_version: v1
+ kind: Pod
+ namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+ label_selectors:
+ - "cdi.kubevirt.io=cdi-deployment"
+ register: __cluster_healthcheck_cdi_deployment_pods
+
+- name: kubevirt_health | Check CDI apiserver pods
+ kubernetes.core.k8s_info:
+ api_version: v1
+ kind: Pod
+ namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+ label_selectors:
+ - "cdi.kubevirt.io=cdi-apiserver"
+ register: __cluster_healthcheck_cdi_apiserver_pods
+
+- name: kubevirt_health | Check CDI uploadproxy pods
+ kubernetes.core.k8s_info:
+ api_version: v1
+ kind: Pod
+ namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+ label_selectors:
+ - "cdi.kubevirt.io=cdi-uploadproxy"
+ register: __cluster_healthcheck_cdi_uploadproxy_pods
+
+- name: kubevirt_health | Evaluate CDI health
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_cdi_pods:
+ cdi_operator:
+ running: "{{ __cluster_healthcheck_cdi_operator_pods.resources
+ | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+ total: "{{ __cluster_healthcheck_cdi_operator_pods.resources | length }}"
+ cdi_deployment:
+ running: "{{ __cluster_healthcheck_cdi_deployment_pods.resources
+ | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+ total: "{{ __cluster_healthcheck_cdi_deployment_pods.resources | length }}"
+ cdi_apiserver:
+ running: "{{ __cluster_healthcheck_cdi_apiserver_pods.resources
+ | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+ total: "{{ __cluster_healthcheck_cdi_apiserver_pods.resources | length }}"
+ cdi_uploadproxy:
+ running: "{{ __cluster_healthcheck_cdi_uploadproxy_pods.resources
+ | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+ total: "{{ __cluster_healthcheck_cdi_uploadproxy_pods.resources | length }}"
+
+# Overall result for the KubeVirt/CDI section. The section fails when the
+# HyperConverged CR is not Available, is Degraded, or when any tracked
+# component (virt-operator, virt-controller, virt-handler, virt-api and the four
+# CDI components) has no pod in phase Running. The per-component entries under
+# 'details' use the same "at least one Running pod" rule, so the summary status
+# cannot read 'pass' while one of its own detail rows reads 'fail'.
+# Pod counts are stored as templated values, hence the explicit int conversion.
+- name: kubevirt_health | Set kubevirt health result
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_results: >-
+      {{ __cluster_healthcheck_results | combine({
+        'kubevirt_health': {
+          'status': ('fail' if (not __cluster_healthcheck_hco_available or
+                                __cluster_healthcheck_hco_degraded or
+                                (__cluster_healthcheck_kubevirt_pods | ansible.builtin.dict2items +
+                                 __cluster_healthcheck_cdi_pods | ansible.builtin.dict2items)
+                                | map(attribute='value.running') | map('int') | min == 0)
+                     else 'pass'),
+          'details': [
+            { 'check': 'HyperConverged Available',
+              'status': ('pass' if __cluster_healthcheck_hco_available else 'fail'),
+              'message': ('HyperConverged CR is Available'
+                          if __cluster_healthcheck_hco_available
+                          else 'HyperConverged CR is NOT Available') },
+            { 'check': 'HyperConverged Not Degraded',
+              'status': ('fail' if __cluster_healthcheck_hco_degraded else 'pass'),
+              'message': ('HyperConverged CR is Degraded'
+                          if __cluster_healthcheck_hco_degraded
+                          else 'HyperConverged CR is not Degraded') },
+            { 'check': 'virt-operator',
+              'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_operator.running | int > 0
+                         else 'fail'),
+              'message': (__cluster_healthcheck_kubevirt_pods.virt_operator.running | string + '/' +
+                          __cluster_healthcheck_kubevirt_pods.virt_operator.total | string + ' pods Running') },
+            { 'check': 'virt-controller',
+              'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_controller.running | int > 0
+                         else 'fail'),
+              'message': (__cluster_healthcheck_kubevirt_pods.virt_controller.running | string + '/' +
+                          __cluster_healthcheck_kubevirt_pods.virt_controller.total | string + ' pods Running') },
+            { 'check': 'virt-handler',
+              'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_handler.running | int > 0
+                         else 'fail'),
+              'message': (__cluster_healthcheck_kubevirt_pods.virt_handler.running | string + '/' +
+                          __cluster_healthcheck_kubevirt_pods.virt_handler.total | string + ' pods Running') },
+            { 'check': 'virt-api',
+              'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_api.running | int > 0
+                         else 'fail'),
+              'message': (__cluster_healthcheck_kubevirt_pods.virt_api.running | string + '/' +
+                          __cluster_healthcheck_kubevirt_pods.virt_api.total | string + ' pods Running') },
+            { 'check': 'CDI Operator',
+              'status': ('pass' if __cluster_healthcheck_cdi_pods.cdi_operator.running | int > 0
+                         else 'fail'),
+              'message': (__cluster_healthcheck_cdi_pods.cdi_operator.running | string + '/' +
+                          __cluster_healthcheck_cdi_pods.cdi_operator.total | string + ' pods Running') },
+            { 'check': 'CDI Deployment',
+              'status': ('pass' if __cluster_healthcheck_cdi_pods.cdi_deployment.running | int > 0
+                         else 'fail'),
+              'message': (__cluster_healthcheck_cdi_pods.cdi_deployment.running | string + '/' +
+                          __cluster_healthcheck_cdi_pods.cdi_deployment.total | string + ' pods Running') },
+            { 'check': 'CDI API Server',
+              'status': ('pass' if __cluster_healthcheck_cdi_pods.cdi_apiserver.running | int > 0
+                         else 'fail'),
+              'message': (__cluster_healthcheck_cdi_pods.cdi_apiserver.running | string + '/' +
+                          __cluster_healthcheck_cdi_pods.cdi_apiserver.total | string + ' pods Running') },
+            { 'check': 'CDI Upload Proxy',
+              'status': ('pass' if __cluster_healthcheck_cdi_pods.cdi_uploadproxy.running | int > 0
+                         else 'fail'),
+              'message': (__cluster_healthcheck_cdi_pods.cdi_uploadproxy.running | string + '/' +
+                          __cluster_healthcheck_cdi_pods.cdi_uploadproxy.total | string + ' pods Running') }
+          ]
+        }
+      }) }}
+...
diff --git a/roles/cluster_healthcheck/tasks/main.yml b/roles/cluster_healthcheck/tasks/main.yml
new file mode 100644
index 0000000..3bb1475
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/main.yml
@@ -0,0 +1,40 @@
+---
+# tasks file for cluster_healthcheck
+- name: Initialize healthcheck results
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_results:
+ ocp_node_health: { status: "skipped", details: [] }
+ kubevirt_health: { status: "skipped", details: [] }
+ mtv_health: { status: "skipped", details: [] }
+ storage_health: { status: "skipped", details: [] }
+ network_health: { status: "skipped", details: [] }
+ post_migration_vm: { status: "skipped", details: [] }
+
+- name: Include ocp_node_health tasks
+ ansible.builtin.include_tasks: ocp_node_health.yml
+ when: "'ocp_node_health' in cluster_healthcheck_checks"
+
+- name: Include kubevirt_health tasks
+ ansible.builtin.include_tasks: kubevirt_health.yml
+ when: "'kubevirt_health' in cluster_healthcheck_checks"
+
+- name: Include mtv_health tasks
+ ansible.builtin.include_tasks: mtv_health.yml
+ when: "'mtv_health' in cluster_healthcheck_checks"
+
+- name: Include storage_health tasks
+ ansible.builtin.include_tasks: storage_health.yml
+ when: "'storage_health' in cluster_healthcheck_checks"
+
+- name: Include network_health tasks
+ ansible.builtin.include_tasks: network_health.yml
+ when: "'network_health' in cluster_healthcheck_checks"
+
+- name: Include post_migration_vm tasks
+ ansible.builtin.include_tasks: post_migration_vm.yml
+ when: "cluster_healthcheck_post_migration_vms | length > 0"
+
+- name: Include report tasks
+  ansible.builtin.include_tasks: report.yml
+  when: cluster_healthcheck_generate_report | bool  # a value passed as the string "false" must stay false
+...
diff --git a/roles/cluster_healthcheck/tasks/mtv_health.yml b/roles/cluster_healthcheck/tasks/mtv_health.yml
new file mode 100644
index 0000000..38645de
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/mtv_health.yml
@@ -0,0 +1,120 @@
+---
+- name: mtv_health | Check ForkliftController CR
+ kubernetes.core.k8s_info:
+ api_version: forklift.konveyor.io/v1beta1
+ kind: ForkliftController
+ namespace: "{{ cluster_healthcheck_mtv_namespace }}"
+ register: __cluster_healthcheck_forklift_controller
+
+- name: mtv_health | Evaluate ForkliftController health
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_forklift_healthy: >-
+ {{ __cluster_healthcheck_forklift_controller.resources | length > 0 and
+ __cluster_healthcheck_forklift_controller.resources[0].status.conditions | default([])
+ | selectattr('type', 'equalto', 'Successful')
+ | map(attribute='status')
+ | first | default('False') == 'True' }}
+
+- name: mtv_health | Report ForkliftController status
+ ansible.builtin.debug:
+ msg: >-
+ ForkliftController -
+ Found: {{ __cluster_healthcheck_forklift_controller.resources | length > 0 }},
+ Healthy: {{ __cluster_healthcheck_forklift_healthy }}
+
+- name: mtv_health | Check MTV operator pods
+ kubernetes.core.k8s_info:
+ api_version: v1
+ kind: Pod
+ namespace: "{{ cluster_healthcheck_mtv_namespace }}"
+ label_selectors:
+ - "app=forklift"
+ register: __cluster_healthcheck_mtv_pods
+
+- name: mtv_health | Evaluate MTV operator pod status
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_mtv_pods_running: >-
+ {{ __cluster_healthcheck_mtv_pods.resources
+ | selectattr('status.phase', 'equalto', 'Running')
+ | list | length }}
+ __cluster_healthcheck_mtv_pods_total: >-
+ {{ __cluster_healthcheck_mtv_pods.resources | length }}
+
+- name: mtv_health | Check Provider CRs
+ kubernetes.core.k8s_info:
+ api_version: forklift.konveyor.io/v1beta1
+ kind: Provider
+ namespace: "{{ cluster_healthcheck_mtv_namespace }}"
+ register: __cluster_healthcheck_providers
+
+- name: mtv_health | Identify Ready providers
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_ready_provider_names: >-
+ {{ __cluster_healthcheck_providers.resources
+ | community.general.json_query("[?status.conditions[?type=='Ready' && status=='True']].metadata.name") }}
+
+- name: mtv_health | Evaluate Provider readiness
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_providers_not_ready: >-
+ {{ __cluster_healthcheck_providers.resources
+ | map(attribute='metadata.name')
+ | reject('in', __cluster_healthcheck_ready_provider_names)
+ | list }}
+
+- name: mtv_health | Check for failed migration Plans
+ kubernetes.core.k8s_info:
+ api_version: forklift.konveyor.io/v1beta1
+ kind: Plan
+ namespace: "{{ cluster_healthcheck_mtv_namespace }}"
+ register: __cluster_healthcheck_plans
+
+- name: mtv_health | Evaluate failed Plans
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_failed_plans: >-
+ {{ __cluster_healthcheck_plans.resources
+ | community.general.json_query("[?status.conditions[?type=='Failed' && status=='True']].metadata.name") }}
+
+- name: mtv_health | Report failed Plans
+ ansible.builtin.debug:
+ msg: "Plan {{ item }} is in Failed state"
+ loop: "{{ __cluster_healthcheck_failed_plans }}"
+
+- name: mtv_health | Set MTV health result
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_results: >-
+ {{ __cluster_healthcheck_results | combine({
+ 'mtv_health': {
+ 'status': ('fail' if (__cluster_healthcheck_forklift_controller.resources | length == 0 or
+ not __cluster_healthcheck_forklift_healthy or
+ __cluster_healthcheck_mtv_pods_running | int == 0)
+ else ('warning' if (__cluster_healthcheck_providers_not_ready | length > 0 or
+ __cluster_healthcheck_failed_plans | length > 0)
+ else 'pass')),
+ 'details': [
+ { 'check': 'ForkliftController',
+ 'status': ('pass' if (__cluster_healthcheck_forklift_controller.resources | length > 0 and
+ __cluster_healthcheck_forklift_healthy) else 'fail'),
+ 'message': ('ForkliftController is healthy'
+ if __cluster_healthcheck_forklift_healthy
+ else 'ForkliftController is NOT healthy or missing') },
+ { 'check': 'MTV Operator Pods',
+ 'status': ('pass' if __cluster_healthcheck_mtv_pods_running | int > 0 else 'fail'),
+ 'message': (__cluster_healthcheck_mtv_pods_running | string + '/' +
+ __cluster_healthcheck_mtv_pods_total | string + ' pods Running') },
+ { 'check': 'Providers Ready',
+ 'status': ('warning' if __cluster_healthcheck_providers_not_ready | length > 0 else 'pass'),
+ 'message': ((__cluster_healthcheck_providers_not_ready | length | string) +
+ ' provider(s) not Ready: ' +
+ __cluster_healthcheck_providers_not_ready | join(', '))
+ if __cluster_healthcheck_providers_not_ready | length > 0
+ else 'All providers Ready' },
+ { 'check': 'Failed Plans',
+ 'status': ('warning' if __cluster_healthcheck_failed_plans | length > 0 else 'pass'),
+ 'message': ((__cluster_healthcheck_failed_plans | length | string) +
+ ' plan(s) in Failed state')
+ if __cluster_healthcheck_failed_plans | length > 0
+ else 'No failed plans' }
+ ]
+ }
+ }) }}
+...
diff --git a/roles/cluster_healthcheck/tasks/network_health.yml b/roles/cluster_healthcheck/tasks/network_health.yml
new file mode 100644
index 0000000..ca2aead
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/network_health.yml
@@ -0,0 +1,122 @@
+---
+- name: network_health | Check Multus pods
+ kubernetes.core.k8s_info:
+ api_version: v1
+ kind: Pod
+ namespace: openshift-multus
+ label_selectors:
+ - "app=multus"
+ register: __cluster_healthcheck_multus_pods
+
+- name: network_health | Evaluate Multus pod health
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_multus_running: >-
+ {{ __cluster_healthcheck_multus_pods.resources
+ | selectattr('status.phase', 'equalto', 'Running')
+ | list | length }}
+ __cluster_healthcheck_multus_total: >-
+ {{ __cluster_healthcheck_multus_pods.resources | length }}
+
+- name: network_health | List NetworkAttachmentDefinitions
+ kubernetes.core.k8s_info:
+ api_version: k8s.cni.cncf.io/v1
+ kind: NetworkAttachmentDefinition
+ register: __cluster_healthcheck_nad
+
+- name: network_health | Report NetworkAttachmentDefinitions
+  ansible.builtin.debug:
+    msg: >-
+      NetworkAttachmentDefinition: {{ item.metadata.namespace }}/{{ item.metadata.name }}
+  loop: "{{ __cluster_healthcheck_nad.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+  when: cluster_healthcheck_debug | bool  # a value passed as the string "false" must stay false
+
+- name: network_health | Check OVN-Kubernetes pods
+ kubernetes.core.k8s_info:
+ api_version: v1
+ kind: Pod
+ namespace: openshift-ovn-kubernetes
+ label_selectors:
+ - "app=ovnkube-node"
+ register: __cluster_healthcheck_ovn_pods
+
+- name: network_health | Check OpenShiftSDN pods as fallback
+ kubernetes.core.k8s_info:
+ api_version: v1
+ kind: Pod
+ namespace: openshift-sdn
+ label_selectors:
+ - "app=sdn"
+ register: __cluster_healthcheck_sdn_pods
+ when: __cluster_healthcheck_ovn_pods.resources | length == 0
+
+- name: network_health | Evaluate SDN health
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_sdn_type: >-
+ {{ 'ovn-kubernetes' if __cluster_healthcheck_ovn_pods.resources | length > 0
+ else ('openshift-sdn' if (__cluster_healthcheck_sdn_pods.resources | default([]) | length > 0)
+ else 'unknown') }}
+ __cluster_healthcheck_sdn_running: >-
+ {{ (__cluster_healthcheck_ovn_pods.resources
+ | selectattr('status.phase', 'equalto', 'Running')
+ | list | length) if __cluster_healthcheck_ovn_pods.resources | length > 0
+ else (__cluster_healthcheck_sdn_pods.resources | default([])
+ | selectattr('status.phase', 'equalto', 'Running')
+ | list | length) }}
+
+- name: network_health | Get HyperConverged CR for migration network config
+ kubernetes.core.k8s_info:
+ api_version: hco.kubevirt.io/v1beta1
+ kind: HyperConverged
+ namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+ register: __cluster_healthcheck_hco_network
+
+- name: network_health | Extract configured migration network
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_migration_network: >-
+      {{ __cluster_healthcheck_hco_network.resources[0].spec.liveMigrationConfig.network | default('') }}
+  when: __cluster_healthcheck_hco_network.resources | length > 0
+# The NAD is fetched by the configured name, so an unrelated NAD in the namespace cannot satisfy the check.
+- name: network_health | Check migration network NAD
+  kubernetes.core.k8s_info:
+    api_version: k8s.cni.cncf.io/v1
+    kind: NetworkAttachmentDefinition
+    name: "{{ __cluster_healthcheck_migration_network }}"
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+  register: __cluster_healthcheck_migration_nad
+  when: __cluster_healthcheck_migration_network | default('') | length > 0
+
+# The section status follows its own detail rows: 'fail' when no SDN was detected, 'warning' when no
+# Multus pod is Running or when a configured migration network has no NAD, otherwise 'pass'.
+- name: network_health | Set network health result
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_results: >-
+      {{ __cluster_healthcheck_results | combine({
+        'network_health': {
+          'status': ('fail' if __cluster_healthcheck_sdn_type == 'unknown'
+                     else ('warning' if (__cluster_healthcheck_multus_running | int == 0 or
+                         (__cluster_healthcheck_migration_network | default('') | length > 0 and
+                          __cluster_healthcheck_migration_nad.resources | default([]) | length == 0)) else 'pass')),
+          'details': [
+            { 'check': 'Multus Pods',
+              'status': ('pass' if __cluster_healthcheck_multus_running | int > 0 else 'warning'),
+              'message': (__cluster_healthcheck_multus_running | string + '/' +
+                          __cluster_healthcheck_multus_total | string + ' Multus pods Running') },
+            { 'check': 'SDN Type',
+              'status': ('pass' if __cluster_healthcheck_sdn_type != 'unknown' else 'fail'),
+              'message': ('SDN: ' + __cluster_healthcheck_sdn_type) },
+            { 'check': 'NetworkAttachmentDefinitions', 'status': 'pass',
+              'message': (__cluster_healthcheck_nad.resources | length | string + ' NAD(s) found across cluster') },
+            { 'check': 'Migration Network',
+              'status': ('pass' if (__cluster_healthcheck_migration_network | default('') | length == 0 or
+                  __cluster_healthcheck_migration_nad.resources | default([]) | length > 0) else 'warning'),
+              'message': ('No dedicated migration network configured in HyperConverged CR'
+                          if __cluster_healthcheck_migration_network | default('') | length == 0
+                          else ((__cluster_healthcheck_migration_nad.resources | default([]) | length | string) +
+                                ' NAD(s) in ' + cluster_healthcheck_kubevirt_namespace +
+                                ' for migration network ' + __cluster_healthcheck_migration_network)) }
+          ]
+        }
+      }) }}
+...
diff --git a/roles/cluster_healthcheck/tasks/ocp_node_health.yml b/roles/cluster_healthcheck/tasks/ocp_node_health.yml
new file mode 100644
index 0000000..0bf99b7
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/ocp_node_health.yml
@@ -0,0 +1,122 @@
+---
+- name: ocp_node_health | Get all cluster nodes
+ kubernetes.core.k8s_info:
+ api_version: v1
+ kind: Node
+ register: __cluster_healthcheck_nodes
+
+# Lists every node that does not carry a condition of type Ready with status
+# "True". The filter is evaluated per node object, so a node name can never be
+# paired with another node's conditions, and a node that reports no conditions
+# (or no Ready condition) is listed as not Ready.
+- name: ocp_node_health | Evaluate node Ready status
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_nodes_not_ready: >-
+      {{ __cluster_healthcheck_nodes.resources
+         | community.general.json_query(__cluster_healthcheck_not_ready_query) }}
+  vars:
+    # JMESPath: an empty or missing match list is falsy, so "!" selects the node.
+    __cluster_healthcheck_not_ready_query: >-
+      [?!(status.conditions[?type=='Ready' && status=='True'])].metadata.name
+
+- name: ocp_node_health | Report nodes not Ready
+ ansible.builtin.debug:
+ msg: "Node {{ item }} is NOT Ready"
+ loop: "{{ __cluster_healthcheck_nodes_not_ready }}"
+
+# Collects, for every node with an active pressure condition, its name and the
+# list of active pressure types (MemoryPressure, DiskPressure, PIDPressure with
+# status "True").
+# The list is produced by one expression rather than by a loop that appends to
+# the previous value of the fact: facts set with set_fact persist for the host,
+# so the appending form kept entries from an earlier run of the role.
+- name: ocp_node_health | Check for resource pressure conditions
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_pressure_nodes: >-
+      {{ __cluster_healthcheck_nodes.resources
+         | community.general.json_query(
+             '[?' ~ __cluster_healthcheck_pressure_filter ~ '].{name: metadata.name, pressures: '
+             ~ __cluster_healthcheck_pressure_filter ~ '.type}') }}
+  vars:
+    # JMESPath selection of the active pressure conditions of one node.
+    __cluster_healthcheck_pressure_filter: >-
+      status.conditions[?status=='True' &&
+      (type=='MemoryPressure' || type=='DiskPressure' || type=='PIDPressure')]
+
+- name: ocp_node_health | Report nodes with resource pressure
+ ansible.builtin.debug:
+ msg: "Node {{ item.name }} has pressure conditions: {{ item.pressures | join(', ') }}"
+ loop: "{{ __cluster_healthcheck_pressure_nodes | default([]) }}"
+ loop_control:
+ label: "{{ item.name }}"
+
+# One expression per run instead of a loop that appends to the previous value of
+# the fact, so repeated runs of the role do not accumulate duplicate node rows.
+- name: ocp_node_health | Check allocatable vs capacity ratios
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_capacity_info: >-
+      {{ __cluster_healthcheck_nodes.resources
+         | community.general.json_query(__cluster_healthcheck_capacity_query) }}
+  vars:
+    __cluster_healthcheck_capacity_query: >-
+      [].{name: metadata.name,
+      cpu_allocatable: status.allocatable.cpu || '0', cpu_capacity: status.capacity.cpu || '0',
+      memory_allocatable: status.allocatable.memory || '0Ki', memory_capacity: status.capacity.memory || '0Ki'}
+
+- name: ocp_node_health | Display capacity information
+  ansible.builtin.debug:
+    msg: >-
+      Node {{ item.name }} -
+      CPU: {{ item.cpu_allocatable }}/{{ item.cpu_capacity }},
+      Memory: {{ item.memory_allocatable }}/{{ item.memory_capacity }}
+  loop: "{{ __cluster_healthcheck_capacity_info | default([]) }}"
+  loop_control:
+    label: "{{ item.name }}"
+  when: cluster_healthcheck_debug | bool  # a value passed as the string "false" must stay false
+
+- name: ocp_node_health | Verify worker nodes have kubevirt.io/schedulable label
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_workers_not_schedulable: >-
+ {{ __cluster_healthcheck_nodes.resources
+ | selectattr('metadata.labels', 'defined')
+ | selectattr('metadata.labels.node-role.kubernetes.io/worker', 'defined')
+ | rejectattr('metadata.labels', 'ansible.builtin.contains', 'kubevirt.io/schedulable')
+ | map(attribute='metadata.name')
+ | list }}
+
+- name: ocp_node_health | Report workers missing kubevirt.io/schedulable label
+ ansible.builtin.debug:
+ msg: "Worker node {{ item }} is missing the kubevirt.io/schedulable label"
+ loop: "{{ __cluster_healthcheck_workers_not_schedulable }}"
+
+- name: ocp_node_health | Set node health result
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_results: >-
+ {{ __cluster_healthcheck_results | combine({
+ 'ocp_node_health': {
+ 'status': ('fail' if (__cluster_healthcheck_nodes_not_ready | length > 0 or
+ __cluster_healthcheck_pressure_nodes | default([]) | length > 0)
+ else ('warning' if __cluster_healthcheck_workers_not_schedulable | length > 0
+ else 'pass')),
+ 'details': [
+ { 'check': 'Nodes Ready',
+ 'status': ('fail' if __cluster_healthcheck_nodes_not_ready | length > 0 else 'pass'),
+ 'message': ((__cluster_healthcheck_nodes_not_ready | length | string) + ' node(s) not Ready')
+ if __cluster_healthcheck_nodes_not_ready | length > 0
+ else 'All nodes Ready' },
+ { 'check': 'Resource Pressure',
+ 'status': ('fail' if __cluster_healthcheck_pressure_nodes | default([]) | length > 0 else 'pass'),
+ 'message': ((__cluster_healthcheck_pressure_nodes | default([]) | length | string) +
+ ' node(s) with resource pressure')
+ if __cluster_healthcheck_pressure_nodes | default([]) | length > 0
+ else 'No resource pressure detected' },
+ { 'check': 'KubeVirt Schedulable',
+ 'status': ('warning' if __cluster_healthcheck_workers_not_schedulable | length > 0 else 'pass'),
+ 'message': ((__cluster_healthcheck_workers_not_schedulable | length | string) +
+ ' worker(s) missing kubevirt.io/schedulable label')
+ if __cluster_healthcheck_workers_not_schedulable | length > 0
+ else 'All workers have kubevirt.io/schedulable label' }
+ ]
+ }
+ }) }}
+...
diff --git a/roles/cluster_healthcheck/tasks/post_migration_vm.yml b/roles/cluster_healthcheck/tasks/post_migration_vm.yml
new file mode 100644
index 0000000..61728d8
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/post_migration_vm.yml
@@ -0,0 +1,81 @@
+---
+- name: post_migration_vm | Check VirtualMachineInstance status
+ kubernetes.core.k8s_info:
+ api_version: kubevirt.io/v1
+ kind: VirtualMachineInstance
+ name: "{{ __cluster_healthcheck_vm.name }}"
+ namespace: "{{ __cluster_healthcheck_vm.namespace }}"
+ register: __cluster_healthcheck_vmi
+ loop: "{{ cluster_healthcheck_post_migration_vms }}"
+ loop_control:
+ loop_var: __cluster_healthcheck_vm
+ label: "{{ __cluster_healthcheck_vm.namespace }}/{{ __cluster_healthcheck_vm.name }}"
+
+# ansible_loop.first (extended loop info) restarts the list, dropping entries left by an earlier run of the role.
+- name: post_migration_vm | Evaluate VM status
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_vm_results: >-
+      {{ ([] if ansible_loop.first else __cluster_healthcheck_vm_results) + [{
+        'name': __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.namespace + '/' +
+                __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.name,
+        'running': (__cluster_healthcheck_vmi_item.resources | length > 0 and
+                    __cluster_healthcheck_vmi_item.resources[0].status.phase | default('') == 'Running'),
+        'guest_agent': (__cluster_healthcheck_vmi_item.resources | length > 0 and
+                        __cluster_healthcheck_vmi_item.resources[0].status.guestOSInfo | default({}) | length > 0),
+        'interfaces': (__cluster_healthcheck_vmi_item.resources | length > 0 and
+                       __cluster_healthcheck_vmi_item.resources[0].status.interfaces | default([])
+                       | selectattr('ipAddress', 'defined') | list | length > 0)
+      }] }}
+  loop: "{{ __cluster_healthcheck_vmi.results }}"
+  loop_control:
+    loop_var: __cluster_healthcheck_vmi_item
+    extended: true
+    label: >-
+      {{ __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.namespace }}/{{
+      __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.name }}
+
+- name: post_migration_vm | Report VM status
+ ansible.builtin.debug:
+ msg: >-
+ VM {{ item.name }} -
+ Running: {{ item.running }},
+ Guest Agent: {{ item.guest_agent }},
+ Network: {{ item.interfaces }}
+ loop: "{{ __cluster_healthcheck_vm_results | default([]) }}"
+ loop_control:
+ label: "{{ item.name }}"
+
+- name: post_migration_vm | Optional SSH connectivity check
+ ansible.builtin.wait_for:
+ host: >-
+ {{ __cluster_healthcheck_vmi_item.resources[0].status.interfaces[0].ipAddress }}
+ port: 22
+ timeout: "{{ cluster_healthcheck_ssh_timeout }}"
+ state: started
+ loop: "{{ __cluster_healthcheck_vmi.results }}"
+ loop_control:
+ loop_var: __cluster_healthcheck_vmi_item
+ label: >-
+ {{ __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.namespace }}/{{
+ __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.name }}
+ when:
+ - __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.check_ssh | default(false)
+ - __cluster_healthcheck_vmi_item.resources | length > 0
+ - __cluster_healthcheck_vmi_item.resources[0].status.interfaces | default([]) | length > 0
+ ignore_errors: true # noqa: ignore-errors
+ register: __cluster_healthcheck_ssh_results
+
+- name: post_migration_vm | Set post-migration VM result
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_results: >-
+ {{ __cluster_healthcheck_results | combine({
+ 'post_migration_vm': {
+ 'status': ('fail' if (__cluster_healthcheck_vm_results | default([])
+ | selectattr('running', 'false') | list | length > 0)
+ else ('warning' if (__cluster_healthcheck_vm_results | default([])
+ | selectattr('guest_agent', 'false') | list | length > 0)
+ else 'pass')),
+ 'details': __cluster_healthcheck_vm_results | default([])
+ }
+ }) }}
+...
diff --git a/roles/cluster_healthcheck/tasks/report.yml b/roles/cluster_healthcheck/tasks/report.yml
new file mode 100644
index 0000000..3b04aa7
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/report.yml
@@ -0,0 +1,22 @@
+---
+- name: report | Display healthcheck summary
+ ansible.builtin.debug:
+ msg: >-
+ Healthcheck Summary -
+ Nodes: {{ __cluster_healthcheck_results.ocp_node_health.status }},
+ KubeVirt: {{ __cluster_healthcheck_results.kubevirt_health.status }},
+ MTV: {{ __cluster_healthcheck_results.mtv_health.status }},
+ Storage: {{ __cluster_healthcheck_results.storage_health.status }},
+ Network: {{ __cluster_healthcheck_results.network_health.status }},
+ Post-Migration VMs: {{ __cluster_healthcheck_results.post_migration_vm.status }}
+
+- name: report | Generate HTML healthcheck report
+ ansible.builtin.template:
+ src: cluster_healthcheck_report.html.j2
+ dest: "{{ cluster_healthcheck_report_path }}"
+ mode: "0644"
+
+- name: report | Report file location
+ ansible.builtin.debug:
+ msg: "Healthcheck report written to {{ cluster_healthcheck_report_path }}"
+...
diff --git a/roles/cluster_healthcheck/tasks/storage_health.yml b/roles/cluster_healthcheck/tasks/storage_health.yml
new file mode 100644
index 0000000..abc1882
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/storage_health.yml
@@ -0,0 +1,109 @@
+---
+- name: storage_health | Get StorageClass resources
+  register: __cluster_healthcheck_storage_classes  # later tasks read its .resources list
+  kubernetes.core.k8s_info:
+    kind: StorageClass
+    api_version: storage.k8s.io/v1
+
+- name: storage_health | Check for default StorageClass
+  ansible.builtin.set_fact:  # a class is default only when the annotation VALUE is 'true'
+    __cluster_healthcheck_default_sc: >-
+      {%- set ns = namespace(found=[]) -%}
+      {%- for sc in __cluster_healthcheck_storage_classes.resources
+          if (sc.metadata.annotations | default({}, true)).get('storageclass.kubernetes.io/is-default-class', 'false') == 'true' -%}
+      {%-   set ns.found = ns.found + [sc] -%}
+      {%- endfor -%}{{ ns.found }}
+
+- name: storage_health | Report StorageClasses
+  loop: "{{ __cluster_healthcheck_storage_classes.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+  ansible.builtin.debug:  # 'Default' is Yes only when the default-class annotation equals 'true'
+    msg: >-
+      StorageClass: {{ item.metadata.name }},
+      Provisioner: {{ item.provisioner }},
+      Default: {{ ((item.metadata.annotations | default({})).get(
+      'storageclass.kubernetes.io/is-default-class', 'false') == 'true')
+      | ternary('Yes', 'No') }}
+
+- name: storage_health | Check CSI driver pods
+  register: __cluster_healthcheck_csi_drivers
+  kubernetes.core.k8s_info:  # NOTE: this lists CSIDriver API objects, not the driver pods
+    kind: CSIDriver
+    api_version: storage.k8s.io/v1
+
+- name: storage_health | Report CSI drivers
+  loop: "{{ __cluster_healthcheck_csi_drivers.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+  ansible.builtin.debug:  # prints one line per CSIDriver object returned above
+    msg: "CSI Driver: {{ item.metadata.name }}"
+
+- name: storage_health | Get PersistentVolumes
+  register: __cluster_healthcheck_pvs  # .resources feeds the PV capacity counters
+  kubernetes.core.k8s_info:
+    kind: PersistentVolume
+    api_version: v1
+
+- name: storage_health | Evaluate PV capacity
+  ansible.builtin.set_fact:  # counts all PVs and those whose status.phase is 'Available'
+    __cluster_healthcheck_pv_total: "{{ __cluster_healthcheck_pvs.resources | count }}"
+    __cluster_healthcheck_pv_available: >-
+      {{ __cluster_healthcheck_pvs.resources
+         | selectattr('status.phase', 'eq', 'Available')
+         | list
+         | count }}
+
+- name: storage_health | Get PersistentVolumeClaims
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: PersistentVolumeClaim
+  register: __cluster_healthcheck_all_pvcs
+
+# PersistentVolumeClaims cannot be filtered server-side by status.phase (the API only
+# accepts metadata.name / metadata.namespace field selectors for this kind), so the
+# Pending claims are selected here instead. The fact keeps the `.resources` shape of a
+# registered k8s_info result because the following tasks read that attribute.
+- name: storage_health | Check for PVCs stuck in Pending
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_pending_pvcs:
+      resources: >-
+        {{ __cluster_healthcheck_all_pvcs.resources
+        | selectattr('status.phase', 'defined')
+        | selectattr('status.phase', 'equalto', 'Pending')
+        | list }}
+
+- name: storage_health | Report pending PVCs
+  loop: "{{ __cluster_healthcheck_pending_pvcs.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+  ansible.builtin.debug:  # one message per pending claim, shown as namespace/name
+    msg: >-
+      PVC {{ item.metadata.namespace }}/{{ item.metadata.name }} is stuck in Pending state
+
+- name: storage_health | Set storage health result
+  ansible.builtin.set_fact:  # overall status is the worst of the per-check statuses below
+    __cluster_healthcheck_results: >-
+      {{ __cluster_healthcheck_results | combine({
+        'storage_health': {
+          'status': ('fail' if __cluster_healthcheck_storage_classes.resources | length == 0
+                     else ('warning' if (__cluster_healthcheck_default_sc | length == 0 or
+                                         __cluster_healthcheck_csi_drivers.resources | length == 0 or
+                                         __cluster_healthcheck_pending_pvcs.resources | length > 0)
+                           else 'pass')),
+          'details': [
+            { 'check': 'StorageClasses Exist',
+              'status': ('pass' if __cluster_healthcheck_storage_classes.resources | length > 0 else 'fail'),
+              'message': (__cluster_healthcheck_storage_classes.resources | length | string +
+                          ' StorageClass(es) found') },
+            { 'check': 'Default StorageClass',
+              'status': ('pass' if __cluster_healthcheck_default_sc | length > 0 else 'warning'),
+              'message': ('Default StorageClass configured'
+                          if __cluster_healthcheck_default_sc | length > 0
+                          else 'No default StorageClass set') },
+            { 'check': 'CSI Drivers',
+              'status': ('pass' if __cluster_healthcheck_csi_drivers.resources | length > 0 else 'warning'),
+              'message': (__cluster_healthcheck_csi_drivers.resources | length | string +
+                          ' CSI driver(s) found') },
+            { 'check': 'PV Capacity',
+              'status': 'pass',
+              'message': (__cluster_healthcheck_pv_available | string + '/' +
+                          __cluster_healthcheck_pv_total | string + ' PV(s) Available') },
+            { 'check': 'Pending PVCs',
+              'status': ('warning' if __cluster_healthcheck_pending_pvcs.resources | length > 0 else 'pass'),
+              'message': ((__cluster_healthcheck_pending_pvcs.resources | length | string + ' PVC(s) stuck in Pending')
+                          if __cluster_healthcheck_pending_pvcs.resources | length > 0
+                          else 'No PVCs stuck in Pending') }
+          ]
+        }
+      }) }}
+...
diff --git a/roles/cluster_healthcheck/templates/cluster_healthcheck_report.html.j2 b/roles/cluster_healthcheck/templates/cluster_healthcheck_report.html.j2
new file mode 100644
index 0000000..c3ed2ac
--- /dev/null
+++ b/roles/cluster_healthcheck/templates/cluster_healthcheck_report.html.j2
@@ -0,0 +1,81 @@
+
+
+
+
+ Cluster Healthcheck Report
+
+
+
+ Cluster Healthcheck Report
+ Generated: {{ ansible_date_time.iso8601 | default(lookup('pipe', 'date -u +%Y-%m-%dT%H:%M:%SZ')) }}
+
+ Summary
+
+
+ | Category |
+ Status |
+
+{% for category, result in __cluster_healthcheck_results.items() %}{# one summary row per result category: prettified key and upper-cased status #}
+
+ | {{ category | replace('_', ' ') | title }} |
+ {{ result.status | upper }} |
+
+{% endfor %}
+
+
+{% for category, result in __cluster_healthcheck_results.items() %}{# one detail table per category #}
+{% if result.details | length > 0 %}{# categories with an empty details list get no table #}
+ {{ category | replace('_', ' ') | title }}
+
+
+ | Check |
+ Status |
+ Details |
+
+{% for detail in result.details %}
+{% if detail.check is defined %}{# check-style entry: check / status / message #}
+
+ | {{ detail.check }} |
+ {{ detail.status | upper }} |
+ {{ detail.message }} |
+
+{% elif detail.name is defined %}{# VM-style entry: name / running / guest_agent / interfaces; PASS depends on running only #}
+
+ | {{ detail.name }} |
+ {{ 'PASS' if detail.running else 'FAIL' }} |
+ Running: {{ detail.running }}, Guest Agent: {{ detail.guest_agent }}, Network: {{ detail.interfaces }} |
+
+{% endif %}{# entries with neither 'check' nor 'name' render nothing #}
+{% endfor %}
+
+{% endif %}
+{% endfor %}
+
+ Recommendations
+
+{% for category, result in __cluster_healthcheck_results.items() %}{# one recommendation per category whose status is fail or warning #}
+{% if result.status == 'fail' %}
+ - {{ category | replace('_', ' ') | title }}: Critical issues detected. Investigate and resolve before proceeding with migrations.
+{% elif result.status == 'warning' %}
+ - {{ category | replace('_', ' ') | title }}: Non-critical issues found. Review and address when possible.
+{% endif %}
+{% endfor %}
+{% if __cluster_healthcheck_results.values() | map(attribute='status') | select('in', ['fail', 'warning']) | list | length == 0 %}{# fallback shown only when no category is fail or warning #}
+ - All checks passed. The cluster is ready for migration workloads.
+{% endif %}
+
+
+
diff --git a/roles/cluster_healthcheck/tests/inventory b/roles/cluster_healthcheck/tests/inventory
new file mode 100644
index 0000000..2302eda
--- /dev/null
+++ b/roles/cluster_healthcheck/tests/inventory
@@ -0,0 +1 @@
+localhost ansible_connection=local
diff --git a/roles/cluster_healthcheck/tests/test.yml b/roles/cluster_healthcheck/tests/test.yml
new file mode 100644
index 0000000..0db8ed0
--- /dev/null
+++ b/roles/cluster_healthcheck/tests/test.yml
@@ -0,0 +1,8 @@
+---
+- name: Test cluster_healthcheck role
+  hosts: localhost
+  gather_facts: false  # the play applies the role without collecting facts first
+  connection: local
+  roles:
+    - cluster_healthcheck
+...
diff --git a/roles/cluster_healthcheck/vars/main.yml b/roles/cluster_healthcheck/vars/main.yml
new file mode 100644
index 0000000..8955cfb
--- /dev/null
+++ b/roles/cluster_healthcheck/vars/main.yml
@@ -0,0 +1,4 @@
+---
+# Starting value of the results accumulator; check task files merge their own category entry into it with combine().
+__cluster_healthcheck_results: {}
+...
diff --git a/roles/network_mgmt/README.md b/roles/network_mgmt/README.md
index 3c1d8a7..666ea6e 100644
--- a/roles/network_mgmt/README.md
+++ b/roles/network_mgmt/README.md
@@ -71,24 +71,25 @@ Description: Management of network related components.
| Var | Type | Value |Choices |Required | Title |
|--------------|--------------|-------------|-------------|-------------|-------------|
-| [`network_mgmt_manual_bond_name`](defaults/main.yml#L90) | str | `` | None | True | Bond Name in Manual Mode |
-| [`network_mgmt_manual_bridge_name`](defaults/main.yml#L95) | str | `vm-bridge` | None | True | Bridge Name in Manual Mode |
-| [`network_mgmt_manual_localnet_name`](defaults/main.yml#L100) | str | `` | None | True | Local Network Name in Manual Mode |
-| [`network_mgmt_manual_nad_list`](defaults/main.yml#L105) | list | `[]` | None | True | NAD List in Manual Mode |
-| [`network_mgmt_nad_auto_bridge_name`](defaults/main.yml#L78) | str | `` | None | None | None |
-| [`network_mgmt_nad_name_prefix`](defaults/main.yml#L85) | str | `net-` | None | True | NAD Name Prefix |
-| [`network_mgmt_nad_namespace`](defaults/main.yml#L73) | str | `default` | None | True | NAD Namespace |
-| [`network_mgmt_nncp_max_unavailable`](defaults/main.yml#L54) | int | `3` | None | True | NNCP Max Unavailability |
-| [`network_mgmt_nncp_name_prefix`](defaults/main.yml#L68) | str | `vs-` | None | True | NNCP Name Prefix |
-| [`network_mgmt_nncp_nodeselector`](defaults/main.yml#L62) | dict | `{}` | None | True | NNCP NodeSelector |
-| [`network_mgmt_nncp_nodeselector.node-role.kubernetes.io/worker`](defaults/main.yml#L63) | str | `` | None | None | None |
-| [`network_mgmt_openshift_network_bond_mode`](defaults/main.yml#L40) | str | `802.3ad` | None | True | OpenShift Network Bond Mode |
+| [`network_mgmt_manual_bond_name`](defaults/main.yml#L95) | str | `` | None | True | Bond Name in Manual Mode |
+| [`network_mgmt_manual_bridge_name`](defaults/main.yml#L100) | str | `vm-bridge` | None | True | Bridge Name in Manual Mode |
+| [`network_mgmt_manual_localnet_name`](defaults/main.yml#L105) | str | `` | None | True | Local Network Name in Manual Mode |
+| [`network_mgmt_manual_nad_list`](defaults/main.yml#L110) | list | `[]` | None | True | NAD List in Manual Mode |
+| [`network_mgmt_nad_auto_bridge_name`](defaults/main.yml#L83) | str | `` | None | None | None |
+| [`network_mgmt_nad_name_prefix`](defaults/main.yml#L90) | str | `net-` | None | True | NAD Name Prefix |
+| [`network_mgmt_nad_namespace`](defaults/main.yml#L78) | str | `default` | None | True | NAD Namespace |
+| [`network_mgmt_nncp_max_unavailable`](defaults/main.yml#L59) | int | `3` | None | True | NNCP Max Unavailability |
+| [`network_mgmt_nncp_name_prefix`](defaults/main.yml#L73) | str | `vs-` | None | True | NNCP Name Prefix |
+| [`network_mgmt_nncp_nodeselector`](defaults/main.yml#L67) | dict | `{}` | None | True | NNCP NodeSelector |
+| [`network_mgmt_nncp_nodeselector.node-role.kubernetes.io/worker`](defaults/main.yml#L68) | str | `` | None | None | None |
+| [`network_mgmt_openshift_network_bond_mode`](defaults/main.yml#L45) | str | `802.3ad` | None | True | OpenShift Network Bond Mode |
| [`network_mgmt_openshift_network_bridge_mode`](defaults/main.yml#L26) | str | `linux-bridge` | None | True | OpenShift Network Bridge Mode |
-| [`network_mgmt_openshift_network_supported_bond_modes`](defaults/main.yml#L46) | list | `[]` | None | True | Supported Bond Modes |
-| [`network_mgmt_openshift_network_supported_bond_modes.0`](defaults/main.yml#L47) | str | `802.3ad` | None | None | None |
-| [`network_mgmt_openshift_network_supported_bond_modes.1`](defaults/main.yml#L48) | str | `active-backup` | None | None | None |
-| [`network_mgmt_openshift_network_supported_bond_modes.2`](defaults/main.yml#L49) | str | `balance-xor` | None | None | None |
+| [`network_mgmt_openshift_network_supported_bond_modes`](defaults/main.yml#L51) | list | `[]` | None | True | Supported Bond Modes |
+| [`network_mgmt_openshift_network_supported_bond_modes.0`](defaults/main.yml#L52) | str | `802.3ad` | None | None | None |
+| [`network_mgmt_openshift_network_supported_bond_modes.1`](defaults/main.yml#L53) | str | `active-backup` | None | None | None |
+| [`network_mgmt_openshift_network_supported_bond_modes.2`](defaults/main.yml#L54) | str | `balance-xor` | None | None | None |
| [`network_mgmt_openshift_node_network_ports`](defaults/main.yml#L5) | list | `[]` | None | True | OpenShift Node Network Ports |
+| [`network_mgmt_ovn_topology`](defaults/main.yml#L36) | str | `layer2` | None | False | OVN Topology Type |
| [`network_mgmt_port_is_existing_bond`](defaults/main.yml#L10) | bool | `False` | None | True | Define Bond |
| [`network_mgmt_use_default_ovn_bridge`](defaults/main.yml#L31) | bool | `False` | None | True | OVN Bridge |
| [`network_mgmt_vcenter_datacenter`](defaults/main.yml#L21) | str | `` | None | True | vCenter Data Center |
@@ -132,6 +133,8 @@ Description: Management of network related components.
`network_mgmt_openshift_node_network_ports`: List of Node Network Ports
+`network_mgmt_ovn_topology`: OVN topology type for ovn-k8s-cni-overlay (only used with ovn-layer2 mode)
+
`network_mgmt_port_is_existing_bond`: Boolean value to check if a bond is defined
`network_mgmt_use_default_ovn_bridge`: Boolean value defines usage of OVN bridge
@@ -201,7 +204,8 @@ Description: Management of network related components.
| manual ¦ Validate network_mgmt_manual_nad_list | `ansible.builtin.assert` | False |
| manual ¦ Validate supported bonding mode if also creating bond | `ansible.builtin.assert` | True |
| manual ¦ Validate ovs-bridge mode | `ansible.builtin.assert` | True |
-| manual ¦ Validate linux-bridge | `ansible.builtin.assert` | False |
+| manual ¦ Validate linux-bridge | `ansible.builtin.assert` | True |
+| manual ¦ Validate ovn-layer2 NAD entries | `ansible.builtin.assert` | True |
| manual ¦ Apply NodeNetworkConfigurationPolicy | `redhat.openshift.k8s` | True |
| manual ¦ Validate access port | `ansible.builtin.assert` | True |
| manual ¦ Validate trunk ports | `ansible.builtin.assert` | True |
@@ -225,8 +229,8 @@ classDef rescue stroke:#665352,stroke-width:2px;
Start-->|Include task| automatic___Include_tasks_from_gather_networks_yml_gather_networks_yml_0[automatic include tasks from gather networks yml
include_task: gather networks yml]:::includeTasks
automatic___Include_tasks_from_gather_networks_yml_gather_networks_yml_0-->|Task| automatic___Set_the_switches_and_portgroups_to_migrate1[automatic set the switches and portgroups to
migrate]:::task
- automatic___Set_the_switches_and_portgroups_to_migrate1-->|Include task| automatic___Include_tasks_from_automatic_nncp_yml_automatic_nncp_yml_2[automatic include tasks from automatic nncp yml
When: **network mgmt openshift node network ports
default is iterable and network mgmt
openshift node network ports default is
not string and network mgmt openshift node network
ports default length 0 and network mgmt
vcenter dvswitch default true trim
length 0 and network mgmt vcenter datacenter
default true trim length 0**
include_task: automatic nncp yml]:::includeTasks
- automatic___Include_tasks_from_automatic_nncp_yml_automatic_nncp_yml_2-->|Include task| automatic___Include_tasks_from_automatic_nad_yml_automatic_nad_yml_3[automatic include tasks from automatic nad yml
When: **network mgmt vcenter dvswitch default true
trim length 0 and network mgmt vcenter
datacenter default true trim length 0
and network mgmt openshift node network ports
default is iterable and network mgmt
openshift node network ports default is
not string and network mgmt openshift node
network ports default length 0 or
network mgmt nad auto bridge name is defined and
network mgmt nad auto bridge name length 0**
include_task: automatic nad yml]:::includeTasks
+ automatic___Set_the_switches_and_portgroups_to_migrate1-->|Include task| automatic___Include_tasks_from_automatic_nncp_yml_automatic_nncp_yml_2[automatic include tasks from automatic nncp yml
When: **network mgmt openshift node network ports
default is iterable and network mgmt
openshift node network ports default is
not string and network mgmt openshift node network
ports default length 0 and network mgmt
vcenter dvswitch default true trim
length 0 and network mgmt vcenter datacenter
default true trim length 0 and network
mgmt openshift network bridge mode ovn layer2**
include_task: automatic nncp yml]:::includeTasks
+ automatic___Include_tasks_from_automatic_nncp_yml_automatic_nncp_yml_2-->|Include task| automatic___Include_tasks_from_automatic_nad_yml_automatic_nad_yml_3[automatic include tasks from automatic nad yml
When: **network mgmt vcenter dvswitch default true
trim length 0 and network mgmt vcenter
datacenter default true trim length 0
and network mgmt openshift node network ports
default is iterable and network mgmt
openshift node network ports default is
not string and network mgmt openshift node
network ports default length 0 or
network mgmt nad auto bridge name is defined and
network mgmt nad auto bridge name length 0 or
network mgmt openshift network bridge mode ovn
layer2**
include_task: automatic nad yml]:::includeTasks
automatic___Include_tasks_from_automatic_nad_yml_automatic_nad_yml_3-->End
```
@@ -360,12 +364,13 @@ classDef rescue stroke:#665352,stroke-width:2px;
Start-->|Task| manual___Validate_network_mgmt_manual_nad_list0[manual validate network mgmt manual nad list]:::task
manual___Validate_network_mgmt_manual_nad_list0-->|Task| manual___Validate_supported_bonding_mode_if_also_creating_bond1[manual validate supported bonding mode if also
creating bond
When: **not network mgmt override openshift supported
bond mode default false and network mgmt
openshift node network ports default
length 0**]:::task
manual___Validate_supported_bonding_mode_if_also_creating_bond1-->|Task| manual___Validate_ovs_bridge_mode2[manual validate ovs bridge mode
When: **network mgmt openshift network bridge mode ovs
bridge**]:::task
- manual___Validate_ovs_bridge_mode2-->|Task| manual___Validate_linux_bridge3[manual validate linux bridge]:::task
- manual___Validate_linux_bridge3-->|Task| manual___Apply_NodeNetworkConfigurationPolicy4[manual apply nodenetworkconfigurationpolicy
When: **network mgmt manual bridge name default
length 0 and network mgmt manual bond name
default length 0 and network mgmt
openshift network bridge mode linux bridge**]:::task
- manual___Apply_NodeNetworkConfigurationPolicy4-->|Task| manual___Validate_access_port5[manual validate access port
When: **trunk not in nad or not nad trunk**]:::task
- manual___Validate_access_port5-->|Task| manual___Validate_trunk_ports6[manual validate trunk ports
When: **trunk in nad and nad trunk**]:::task
- manual___Validate_trunk_ports6-->|Task| manual___Apply_NetworkAttachmentDefinitions7[manual apply networkattachmentdefinitions]:::task
- manual___Apply_NetworkAttachmentDefinitions7-->End
+ manual___Validate_ovs_bridge_mode2-->|Task| manual___Validate_linux_bridge3[manual validate linux bridge
When: **network mgmt openshift network bridge mode
linux bridge**]:::task
+ manual___Validate_linux_bridge3-->|Task| manual___Validate_ovn_layer2_NAD_entries4[manual validate ovn layer2 nad entries
When: **network mgmt openshift network bridge mode ovn
layer2**]:::task
+ manual___Validate_ovn_layer2_NAD_entries4-->|Task| manual___Apply_NodeNetworkConfigurationPolicy5[manual apply nodenetworkconfigurationpolicy
When: **network mgmt manual bridge name default
length 0 and network mgmt manual bond name
default length 0 and network mgmt
openshift network bridge mode linux bridge**]:::task
+ manual___Apply_NodeNetworkConfigurationPolicy5-->|Task| manual___Validate_access_port6[manual validate access port
When: **network mgmt openshift network bridge mode ovn
layer2 and trunk not in nad or not nad trunk
**]:::task
+ manual___Validate_access_port6-->|Task| manual___Validate_trunk_ports7[manual validate trunk ports
When: **network mgmt openshift network bridge mode ovn
layer2 and trunk in nad and nad trunk**]:::task
+ manual___Validate_trunk_ports7-->|Task| manual___Apply_NetworkAttachmentDefinitions8[manual apply networkattachmentdefinitions]:::task
+ manual___Apply_NetworkAttachmentDefinitions8-->End
```
## Playbook