diff --git a/cmd/do-agent/aggregation.go b/cmd/do-agent/aggregation.go index 9548dcd..838207f 100644 --- a/cmd/do-agent/aggregation.go +++ b/cmd/do-agent/aggregation.go @@ -129,6 +129,18 @@ var gpuAggregationSpec = map[string][]string{ "dcgm_fi_dev_power_violation": nvidiaAggregatedLabels, "dcgm_fi_dev_thermal_violation": nvidiaAggregatedLabels, "dcgm_fi_dev_ecc_dbe_agg_total": nvidiaAggregatedLabels, + "dcgm_fi_prof_sm_active": nvidiaAggregatedLabels, + "dcgm_fi_prof_pipe_fp16_active": nvidiaAggregatedLabels, + "dcgm_fi_prof_pipe_fp32_active": nvidiaAggregatedLabels, + "dcgm_fi_prof_pipe_fp64_active": nvidiaAggregatedLabels, + "dcgm_fi_dev_retired_sbe": nvidiaAggregatedLabels, + "dcgm_fi_dev_retired_dbe": nvidiaAggregatedLabels, + "dcgm_fi_dev_xid_errors": nvidiaAggregatedLabels, + "dcgm_fi_dev_ecc_sbe_vol_total": nvidiaAggregatedLabels, + "dcgm_fi_dev_ecc_dbe_vol_total": nvidiaAggregatedLabels, + "dcgm_fi_dev_ecc_sbe_agg_total": nvidiaAggregatedLabels, + "dcgm_fi_dev_pcie_replay_counter": nvidiaAggregatedLabels, + "dcgm_fi_dev_sm_clock": nvidiaAggregatedLabels, // GPU Utilization metrics "amd_gpu_prof_gui_util_percent": amdAggregatedLabels, @@ -177,26 +189,32 @@ var gpuAggregationSpec = map[string][]string{ // PCIe bandwidth "amd_pcie_bandwidth": amdAggregatedLabels, - "amd_gpu_ecc_uncorrect_total": amdAggregatedLabels, - "amd_pcie_replay_count": amdAggregatedLabels, - "amd_pcie_recovery_count": amdAggregatedLabels, - "amd_pcie_replay_rollover_count": amdAggregatedLabels, - "amd_pcie_max_speed": amdAggregatedLabels, - "amd_pcie_speed": amdAggregatedLabels, - "amd_gpu_prof_cpf_cpf_stat_stall": amdAggregatedLabels, - "amd_gpu_clock": amdAggregatedLabels, - "amd_gpu_violation_proc_hot_residency_accumulated": amdAggregatedLabels, - "amd_gpu_violation_soc_thermal_residency_accumulated": amdAggregatedLabels, - "amd_gpu_violation_ppt_residency_accumulated": amdAggregatedLabels, - "amd_gpu_violation_hbm_thermal_residency_accumulated": amdAggregatedLabels, - "amd_gpu_violation_vr_thermal_tracking_accumulated": amdAggregatedLabels, - "amd_gpu_violation_current_accumulated_counter": amdAggregatedLabels, - "amd_gpu_junction_temperature": amdAggregatedLabels, - "amd_gpu_power_usage": amdAggregatedLabels, - "amd_gpu_package_power": amdAggregatedLabels, - "amd_gpu_memory_temperature": amdAggregatedLabels, - "amd_gpu_gfx_activity": amdAggregatedLabels, - "amd_gpu_prof_sm_active": amdAggregatedLabels, + "amd_gpu_ecc_uncorrect_total": amdAggregatedLabels, + "amd_gpu_ecc_correct_total": amdAggregatedLabels, + "amd_pcie_replay_count": amdAggregatedLabels, + "amd_pcie_recovery_count": amdAggregatedLabels, + "amd_pcie_replay_rollover_count": amdAggregatedLabels, + "amd_pcie_max_speed": amdAggregatedLabels, + "amd_pcie_speed": amdAggregatedLabels, + "amd_pcie_nac_received_count": amdAggregatedLabels, + "amd_pcie_nack_sent_count": amdAggregatedLabels, + "amd_gpu_prof_cpf_cpf_stat_stall": amdAggregatedLabels, + "amd_gpu_clock": amdAggregatedLabels, + "amd_gpu_violation_proc_hot_residency_accumulated": amdAggregatedLabels, + "amd_gpu_violation_processor_hot_residency_accumulated": amdAggregatedLabels, + "amd_gpu_violation_soc_thermal_residency_accumulated": amdAggregatedLabels, + "amd_gpu_violation_ppt_residency_accumulated": amdAggregatedLabels, + "amd_gpu_violation_hbm_thermal_residency_accumulated": amdAggregatedLabels, + "amd_gpu_violation_vr_thermal_tracking_accumulated": amdAggregatedLabels, + "amd_gpu_violation_socket_thermal_residency_accumulated": amdAggregatedLabels, + "amd_gpu_violation_vr_thermal_residency_accumulated": amdAggregatedLabels, + "amd_gpu_violation_current_accumulated_counter": amdAggregatedLabels, + "amd_gpu_junction_temperature": amdAggregatedLabels, + "amd_gpu_power_usage": amdAggregatedLabels, + "amd_gpu_package_power": amdAggregatedLabels, + "amd_gpu_memory_temperature": amdAggregatedLabels, + "amd_gpu_gfx_activity": amdAggregatedLabels, + "amd_gpu_prof_sm_active": amdAggregatedLabels, } // DI metrics: drop high-cardinality labels we don't want to keep. diff --git a/cmd/do-agent/whitelist.go b/cmd/do-agent/whitelist.go index 0be6eee..8b7fd4c 100644 --- a/cmd/do-agent/whitelist.go +++ b/cmd/do-agent/whitelist.go @@ -180,6 +180,18 @@ var gpuWhitelist = map[string]bool{ "DCGM_FI_DEV_POWER_VIOLATION": true, "DCGM_FI_DEV_THERMAL_VIOLATION": true, "DCGM_FI_DEV_ECC_DBE_AGG_TOTAL": true, + "DCGM_FI_PROF_SM_ACTIVE": true, + "DCGM_FI_PROF_PIPE_FP16_ACTIVE": true, + "DCGM_FI_PROF_PIPE_FP32_ACTIVE": true, + "DCGM_FI_PROF_PIPE_FP64_ACTIVE": true, + "DCGM_FI_DEV_RETIRED_SBE": true, + "DCGM_FI_DEV_RETIRED_DBE": true, + "DCGM_FI_DEV_XID_ERRORS": true, + "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": true, + "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": true, + "DCGM_FI_DEV_ECC_SBE_AGG_TOTAL": true, + "DCGM_FI_DEV_PCIE_REPLAY_COUNTER": true, + "DCGM_FI_DEV_SM_CLOCK": true, // GPU Utilization "amd_gpu_prof_gui_util_percent": true, @@ -228,26 +240,32 @@ var gpuWhitelist = map[string]bool{ // PCIe bandwidth "amd_pcie_bandwidth": true, - "amd_gpu_ecc_uncorrect_total": true, - "amd_pcie_replay_count": true, - "amd_pcie_recovery_count": true, - "amd_pcie_replay_rollover_count": true, - "amd_pcie_max_speed": true, - "amd_pcie_speed": true, - "amd_gpu_prof_cpf_cpf_stat_stall": true, - "amd_gpu_clock": true, - "amd_gpu_violation_proc_hot_residency_accumulated": true, - "amd_gpu_violation_soc_thermal_residency_accumulated": true, - "amd_gpu_violation_current_accumulated_counter": true, - "amd_gpu_violation_ppt_residency_accumulated": true, - "amd_gpu_violation_hbm_thermal_residency_accumulated": true, - "amd_gpu_violation_vr_thermal_tracking_accumulated": true, - "amd_gpu_junction_temperature": true, - "amd_gpu_power_usage": true, - "amd_gpu_package_power": true, - "amd_gpu_memory_temperature": true, - "amd_gpu_gfx_activity": true, - "amd_gpu_prof_sm_active": true, + "amd_gpu_ecc_uncorrect_total": true, + "amd_gpu_ecc_correct_total": true, + "amd_pcie_replay_count": true, + "amd_pcie_recovery_count": true, + "amd_pcie_replay_rollover_count": true, + "amd_pcie_max_speed": true, + "amd_pcie_speed": true, + "amd_pcie_nac_received_count": true, + "amd_pcie_nack_sent_count": true, + "amd_gpu_prof_cpf_cpf_stat_stall": true, + "amd_gpu_clock": true, + "amd_gpu_violation_proc_hot_residency_accumulated": true, + "amd_gpu_violation_processor_hot_residency_accumulated": true, + "amd_gpu_violation_soc_thermal_residency_accumulated": true, + "amd_gpu_violation_current_accumulated_counter": true, + "amd_gpu_violation_ppt_residency_accumulated": true, + "amd_gpu_violation_hbm_thermal_residency_accumulated": true, + "amd_gpu_violation_vr_thermal_tracking_accumulated": true, + "amd_gpu_violation_socket_thermal_residency_accumulated": true, + "amd_gpu_violation_vr_thermal_residency_accumulated": true, + "amd_gpu_junction_temperature": true, + "amd_gpu_power_usage": true, + "amd_gpu_package_power": true, + "amd_gpu_memory_temperature": true, + "amd_gpu_gfx_activity": true, + "amd_gpu_prof_sm_active": true, } var diWhitelist = map[string]bool{ // DI GPU metrics