From 254bf10d397ed46005f9f89d5b49d879cd6ddc68 Mon Sep 17 00:00:00 2001 From: Hiroshi Hatake Date: Thu, 16 Apr 2026 12:20:25 +0900 Subject: [PATCH 1/5] in_gpu_metrics: Handle NVIDIA GPU metrics via NVML Signed-off-by: Hiroshi Hatake --- plugins/in_gpu_metrics/CMakeLists.txt | 1 + plugins/in_gpu_metrics/amd_gpu.c | 2 +- plugins/in_gpu_metrics/gpu_metrics.c | 21 +- plugins/in_gpu_metrics/gpu_metrics.h | 7 + plugins/in_gpu_metrics/nvml_gpu.c | 264 ++++++++++++++++++++++++++ plugins/in_gpu_metrics/nvml_gpu.h | 30 +++ 6 files changed, 323 insertions(+), 2 deletions(-) create mode 100644 plugins/in_gpu_metrics/nvml_gpu.c create mode 100644 plugins/in_gpu_metrics/nvml_gpu.h diff --git a/plugins/in_gpu_metrics/CMakeLists.txt b/plugins/in_gpu_metrics/CMakeLists.txt index 73f460d38b0..5174d6ce68f 100644 --- a/plugins/in_gpu_metrics/CMakeLists.txt +++ b/plugins/in_gpu_metrics/CMakeLists.txt @@ -1,6 +1,7 @@ set(src gpu_metrics.c amd_gpu.c + nvml_gpu.c gpu_common.c ) diff --git a/plugins/in_gpu_metrics/amd_gpu.c b/plugins/in_gpu_metrics/amd_gpu.c index f02216f2964..b89854bcf73 100644 --- a/plugins/in_gpu_metrics/amd_gpu.c +++ b/plugins/in_gpu_metrics/amd_gpu.c @@ -275,6 +275,7 @@ int amd_gpu_detect_cards(struct in_gpu_metrics *ctx) return -1; } card->id = id; + card->backend_type = GPU_BACKEND_AMD; card->hwmon_path = NULL; card->hwmon_path = find_hwmon_path(id); if (!card->hwmon_path) { @@ -557,4 +558,3 @@ int amd_gpu_collect_metrics(struct in_gpu_metrics *ctx, struct gpu_card *card) flb_sds_destroy(card_id); return 0; } - diff --git a/plugins/in_gpu_metrics/gpu_metrics.c b/plugins/in_gpu_metrics/gpu_metrics.c index a0ac867ffae..74eaaff6af4 100644 --- a/plugins/in_gpu_metrics/gpu_metrics.c +++ b/plugins/in_gpu_metrics/gpu_metrics.c @@ -31,6 +31,7 @@ #include "gpu_metrics.h" #include "amd_gpu.h" +#include "nvml_gpu.h" static int in_gpu_collect(struct flb_input_instance *ins, struct flb_config *config, void *in_context) @@ -41,7 +42,12 @@ static int in_gpu_collect(struct 
flb_input_instance *ins, cfl_list_foreach(head, &ctx->cards) { card = cfl_list_entry(head, struct gpu_card, _head); - amd_gpu_collect_metrics(ctx, card); + if (card->backend_type == GPU_BACKEND_AMD) { + amd_gpu_collect_metrics(ctx, card); + } + else if (card->backend_type == GPU_BACKEND_NVML) { + nvml_gpu_collect_metrics(ctx, card); + } } flb_input_metrics_append(ctx->ins, NULL, 0, ctx->cmt); @@ -61,6 +67,8 @@ static int in_gpu_init(struct flb_input_instance *ins, } ctx->ins = ins; ctx->cards_detected = 0; + ctx->nvml_initialized = FLB_FALSE; + ctx->nvml_lib_handle = NULL; cfl_list_init(&ctx->cards); ret = flb_input_config_map_set(ins, (void *) ctx); @@ -117,6 +125,11 @@ static int in_gpu_init(struct flb_input_instance *ins, (char *[]) {"card", "vendor"}); amd_gpu_detect_cards(ctx); + if (nvml_gpu_initialize(ctx) == 0) { + if (nvml_gpu_detect_cards(ctx) != 0) { + flb_plg_debug(ctx->ins, "NVML card detection encountered errors"); + } + } flb_input_set_context(ins, ctx); ret = flb_input_set_collector_time(ins, in_gpu_collect, @@ -162,6 +175,7 @@ static int in_gpu_exit(void *data, struct flb_config *config) cmt_destroy(ctx->cmt); } + nvml_gpu_shutdown(ctx); flb_free(ctx); return 0; } @@ -189,6 +203,11 @@ static struct flb_config_map config_map[] = { 0, FLB_TRUE, offsetof(struct in_gpu_metrics, enable_temperature), "Enable collection of GPU temperature metrics (gpu_temperature_celsius)." }, + { + FLB_CONFIG_MAP_BOOL, "enable_nvml", "true", + 0, FLB_TRUE, offsetof(struct in_gpu_metrics, enable_nvml), + "Enable NVIDIA NVML collection when libnvidia-ml is available." 
+ }, { FLB_CONFIG_MAP_STR, "path_sysfs", "/sys", 0, FLB_TRUE, offsetof(struct in_gpu_metrics, path_sysfs), diff --git a/plugins/in_gpu_metrics/gpu_metrics.h b/plugins/in_gpu_metrics/gpu_metrics.h index 270adeb3b8b..05469dec705 100644 --- a/plugins/in_gpu_metrics/gpu_metrics.h +++ b/plugins/in_gpu_metrics/gpu_metrics.h @@ -25,6 +25,7 @@ struct gpu_card { int id; + int backend_type; flb_sds_t hwmon_path; struct cfl_list _head; }; @@ -36,8 +37,10 @@ struct in_gpu_metrics { int scrape_interval; int enable_power; int enable_temperature; + int enable_nvml; int coll_fd; int cards_detected; + int nvml_initialized; struct cfl_list cards; @@ -53,7 +56,11 @@ struct in_gpu_metrics { /* plugin instance */ struct flb_input_instance *ins; + void *nvml_lib_handle; }; +#define GPU_BACKEND_AMD 1 +#define GPU_BACKEND_NVML 2 + #endif diff --git a/plugins/in_gpu_metrics/nvml_gpu.c b/plugins/in_gpu_metrics/nvml_gpu.c new file mode 100644 index 00000000000..fee8233c31d --- /dev/null +++ b/plugins/in_gpu_metrics/nvml_gpu.c @@ -0,0 +1,264 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2026 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "nvml_gpu.h" + +typedef int nvmlReturn_t; +typedef void *nvmlDevice_t; + +struct nvmlMemory_t { + uint64_t total; + uint64_t free; + uint64_t used; +}; + +struct nvmlUtilization_t { + unsigned int gpu; + unsigned int memory; +}; + +typedef nvmlReturn_t (*nvmlInit_v2_t)(void); +typedef nvmlReturn_t (*nvmlShutdown_t)(void); +typedef nvmlReturn_t (*nvmlDeviceGetCount_v2_t)(unsigned int *device_count); +typedef nvmlReturn_t (*nvmlDeviceGetHandleByIndex_v2_t)(unsigned int index, nvmlDevice_t *device); +typedef nvmlReturn_t (*nvmlDeviceGetMemoryInfo_t)(nvmlDevice_t device, struct nvmlMemory_t *memory); +typedef nvmlReturn_t (*nvmlDeviceGetUtilizationRates_t)(nvmlDevice_t device, struct nvmlUtilization_t *util); +typedef nvmlReturn_t (*nvmlDeviceGetTemperature_t)(nvmlDevice_t device, + unsigned int sensor_type, + unsigned int *temp); +typedef nvmlReturn_t (*nvmlDeviceGetPowerUsage_t)(nvmlDevice_t device, unsigned int *power); +typedef nvmlReturn_t (*nvmlDeviceGetFanSpeed_t)(nvmlDevice_t device, unsigned int *speed); +typedef const char *(*nvmlErrorString_t)(nvmlReturn_t result); + +#define NVML_SUCCESS 0 +#define NVML_TEMPERATURE_GPU 0 + +static nvmlInit_v2_t f_nvml_init_v2; +static nvmlShutdown_t f_nvml_shutdown; +static nvmlDeviceGetCount_v2_t f_nvml_device_get_count_v2; +static nvmlDeviceGetHandleByIndex_v2_t f_nvml_device_get_handle_by_index_v2; +static nvmlDeviceGetMemoryInfo_t f_nvml_device_get_memory_info; +static nvmlDeviceGetUtilizationRates_t f_nvml_device_get_utilization_rates; +static nvmlDeviceGetTemperature_t f_nvml_device_get_temperature; +static nvmlDeviceGetPowerUsage_t f_nvml_device_get_power_usage; +static nvmlDeviceGetFanSpeed_t f_nvml_device_get_fan_speed; +static nvmlErrorString_t f_nvml_error_string; + +static const char *nvml_result_to_string(nvmlReturn_t result) +{ + if (f_nvml_error_string != NULL) { + return f_nvml_error_string(result); + } + return 
"unknown"; +} + +static int load_nvml_symbol(struct in_gpu_metrics *ctx, const char *name, void **target) +{ + *target = dlsym(ctx->nvml_lib_handle, name); + if (*target == NULL) { + flb_plg_warn(ctx->ins, "NVML symbol '%s' is missing", name); + return -1; + } + + return 0; +} + +int nvml_gpu_initialize(struct in_gpu_metrics *ctx) +{ + nvmlReturn_t result; + + if (ctx->enable_nvml == FLB_FALSE) { + return 0; + } + + ctx->nvml_lib_handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY); + if (ctx->nvml_lib_handle == NULL) { + ctx->nvml_lib_handle = dlopen("libnvidia-ml.so", RTLD_LAZY); + } + if (ctx->nvml_lib_handle == NULL) { + flb_plg_info(ctx->ins, + "NVML shared library not found; NVIDIA GPU metrics are disabled"); + return 0; + } + + if (load_nvml_symbol(ctx, "nvmlInit_v2", (void **) &f_nvml_init_v2) != 0 || + load_nvml_symbol(ctx, "nvmlShutdown", (void **) &f_nvml_shutdown) != 0 || + load_nvml_symbol(ctx, "nvmlDeviceGetCount_v2", (void **) &f_nvml_device_get_count_v2) != 0 || + load_nvml_symbol(ctx, "nvmlDeviceGetHandleByIndex_v2", + (void **) &f_nvml_device_get_handle_by_index_v2) != 0 || + load_nvml_symbol(ctx, "nvmlDeviceGetMemoryInfo", (void **) &f_nvml_device_get_memory_info) != 0 || + load_nvml_symbol(ctx, "nvmlDeviceGetUtilizationRates", + (void **) &f_nvml_device_get_utilization_rates) != 0 || + load_nvml_symbol(ctx, "nvmlDeviceGetTemperature", + (void **) &f_nvml_device_get_temperature) != 0 || + load_nvml_symbol(ctx, "nvmlDeviceGetPowerUsage", (void **) &f_nvml_device_get_power_usage) != 0) { + dlclose(ctx->nvml_lib_handle); + ctx->nvml_lib_handle = NULL; + return -1; + } + + f_nvml_error_string = dlsym(ctx->nvml_lib_handle, "nvmlErrorString"); + f_nvml_device_get_fan_speed = dlsym(ctx->nvml_lib_handle, "nvmlDeviceGetFanSpeed"); + + result = f_nvml_init_v2(); + if (result != NVML_SUCCESS) { + flb_plg_warn(ctx->ins, "NVML init failed: %s", nvml_result_to_string(result)); + dlclose(ctx->nvml_lib_handle); + ctx->nvml_lib_handle = NULL; + return -1; + } + + 
ctx->nvml_initialized = FLB_TRUE; + flb_plg_info(ctx->ins, "NVML backend enabled"); + return 0; +} + +int nvml_gpu_detect_cards(struct in_gpu_metrics *ctx) +{ + unsigned int index; + unsigned int count; + struct gpu_card *card; + nvmlReturn_t result; + + if (ctx->nvml_initialized == FLB_FALSE) { + return 0; + } + + result = f_nvml_device_get_count_v2(&count); + if (result != NVML_SUCCESS) { + flb_plg_warn(ctx->ins, "NVML device count failed: %s", nvml_result_to_string(result)); + return -1; + } + + for (index = 0; index < count; index++) { + card = flb_calloc(1, sizeof(struct gpu_card)); + if (card == NULL) { + flb_errno(); + return -1; + } + + card->id = (int) index; + card->backend_type = GPU_BACKEND_NVML; + cfl_list_add(&card->_head, &ctx->cards); + } + + if (count > 0) { + flb_plg_info(ctx->ins, "detected %u NVIDIA GPU(s) via NVML", count); + } + + return 0; +} + +int nvml_gpu_collect_metrics(struct in_gpu_metrics *ctx, struct gpu_card *card) +{ + nvmlDevice_t device; + struct nvmlMemory_t memory; + struct nvmlUtilization_t util; + unsigned int temp; + unsigned int power_mw; + unsigned int fan_percent; + nvmlReturn_t result; + uint64_t ts; + flb_sds_t card_id; + + if (ctx->nvml_initialized == FLB_FALSE) { + return -1; + } + + result = f_nvml_device_get_handle_by_index_v2((unsigned int) card->id, &device); + if (result != NVML_SUCCESS) { + flb_plg_debug(ctx->ins, "NVML handle lookup failed for card%d: %s", + card->id, nvml_result_to_string(result)); + return -1; + } + + card_id = flb_sds_create_size(16); + if (card_id == NULL) { + flb_errno(); + return -1; + } + card_id = flb_sds_printf(&card_id, "%d", card->id); + if (card_id == NULL) { + return -1; + } + + ts = cfl_time_now(); + + result = f_nvml_device_get_utilization_rates(device, &util); + if (result == NVML_SUCCESS) { + cmt_gauge_set(ctx->g_utilization, ts, (double) util.gpu, 2, + (char *[]) {card_id, "nvidia"}); + } + + result = f_nvml_device_get_memory_info(device, &memory); + if (result == NVML_SUCCESS) 
{ + cmt_gauge_set(ctx->g_mem_used, ts, (double) memory.used, 2, + (char *[]) {card_id, "nvidia"}); + cmt_gauge_set(ctx->g_mem_total, ts, (double) memory.total, 2, + (char *[]) {card_id, "nvidia"}); + } + + if (ctx->enable_temperature) { + result = f_nvml_device_get_temperature(device, NVML_TEMPERATURE_GPU, &temp); + if (result == NVML_SUCCESS) { + cmt_gauge_set(ctx->g_temp, ts, (double) temp, 2, + (char *[]) {card_id, "nvidia"}); + } + } + + if (ctx->enable_power) { + result = f_nvml_device_get_power_usage(device, &power_mw); + if (result == NVML_SUCCESS) { + cmt_gauge_set(ctx->g_power, ts, (double) power_mw / 1000.0, 2, + (char *[]) {card_id, "nvidia"}); + } + } + + if (f_nvml_device_get_fan_speed != NULL) { + result = f_nvml_device_get_fan_speed(device, &fan_percent); + if (result == NVML_SUCCESS) { + cmt_gauge_set(ctx->g_fan_pwm, ts, (double) fan_percent, 2, + (char *[]) {card_id, "nvidia"}); + } + } + + flb_sds_destroy(card_id); + return 0; +} + +void nvml_gpu_shutdown(struct in_gpu_metrics *ctx) +{ + if (ctx->nvml_initialized == FLB_TRUE) { + f_nvml_shutdown(); + ctx->nvml_initialized = FLB_FALSE; + } + if (ctx->nvml_lib_handle != NULL) { + dlclose(ctx->nvml_lib_handle); + ctx->nvml_lib_handle = NULL; + } +} diff --git a/plugins/in_gpu_metrics/nvml_gpu.h b/plugins/in_gpu_metrics/nvml_gpu.h new file mode 100644 index 00000000000..aae64867498 --- /dev/null +++ b/plugins/in_gpu_metrics/nvml_gpu.h @@ -0,0 +1,30 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2026 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FLB_GPU_METRICS_NVML_H +#define FLB_GPU_METRICS_NVML_H + +#include "gpu_metrics.h" + +int nvml_gpu_initialize(struct in_gpu_metrics *ctx); +int nvml_gpu_detect_cards(struct in_gpu_metrics *ctx); +int nvml_gpu_collect_metrics(struct in_gpu_metrics *ctx, struct gpu_card *card); +void nvml_gpu_shutdown(struct in_gpu_metrics *ctx); + +#endif From 7d7ed0410728a2005c4547d26c38808e7902c065 Mon Sep 17 00:00:00 2001 From: Hiroshi Hatake Date: Thu, 16 Apr 2026 12:38:28 +0900 Subject: [PATCH 2/5] in_gpu_metrics: Unify include/exclude pattern across AMD/NVIDIA GPUs Signed-off-by: Hiroshi Hatake --- plugins/in_gpu_metrics/amd_gpu.c | 57 +--------------------------- plugins/in_gpu_metrics/gpu_common.c | 58 +++++++++++++++++++++++++++++ plugins/in_gpu_metrics/gpu_common.h | 3 ++ plugins/in_gpu_metrics/nvml_gpu.c | 4 ++ 4 files changed, 66 insertions(+), 56 deletions(-) diff --git a/plugins/in_gpu_metrics/amd_gpu.c b/plugins/in_gpu_metrics/amd_gpu.c index b89854bcf73..cb6bde7c91f 100644 --- a/plugins/in_gpu_metrics/amd_gpu.c +++ b/plugins/in_gpu_metrics/amd_gpu.c @@ -47,61 +47,6 @@ static flb_sds_t build_path(int card_id, const char *file) return path; } -static int match_card_pattern(const char *pattern, int card_id) -{ - char *dup; - char *token; - char *saveptr; - int start; - int end; - - if (!pattern || pattern[0] == '\0' || strcmp(pattern, "*") == 0) { - return FLB_TRUE; - } - - dup = flb_strdup(pattern); - if (!dup) { - return FLB_FALSE; - } - - token = strtok_r(dup, ",", &saveptr); - while (token) { - if (sscanf(token, "%d-%d", &start, &end) 
== 2) { - if (card_id >= start && card_id <= end) { - flb_free(dup); - return FLB_TRUE; - } - } - else { - if (card_id == atoi(token)) { - flb_free(dup); - return FLB_TRUE; - } - } - token = strtok_r(NULL, ",", &saveptr); - } - flb_free(dup); - return FLB_FALSE; -} - -static int should_include_card(struct in_gpu_metrics *ctx, int card_id) -{ - flb_plg_info(ctx->ins, "should_include_card: card%d, exclude='%s', include='%s'", - card_id, ctx->cards_exclude ? ctx->cards_exclude : "NULL", - ctx->cards_include ? ctx->cards_include : "NULL"); - - if (ctx->cards_exclude && ctx->cards_exclude[0] != '\0' && match_card_pattern(ctx->cards_exclude, card_id)) { - flb_plg_info(ctx->ins, "Card%d excluded by exclude pattern", card_id); - return FLB_FALSE; - } - if (ctx->cards_include && ctx->cards_include[0] != '\0' && !match_card_pattern(ctx->cards_include, card_id)) { - flb_plg_info(ctx->ins, "Card%d excluded by include pattern", card_id); - return FLB_FALSE; - } - flb_plg_info(ctx->ins, "Card%d should be included", card_id); - return FLB_TRUE; -} - static void free_cards(struct in_gpu_metrics *ctx) { struct cfl_list *tmp; @@ -260,7 +205,7 @@ int amd_gpu_detect_cards(struct in_gpu_metrics *ctx) } flb_plg_info(ctx->ins, "Checking if card%d should be included", id); - if (!should_include_card(ctx, id)) { + if (!gpu_should_include_card(ctx, id)) { flb_plg_info(ctx->ins, "Card%d excluded by filter", id); continue; } diff --git a/plugins/in_gpu_metrics/gpu_common.c b/plugins/in_gpu_metrics/gpu_common.c index fea0bdcbb64..897c497c897 100644 --- a/plugins/in_gpu_metrics/gpu_common.c +++ b/plugins/in_gpu_metrics/gpu_common.c @@ -22,7 +22,11 @@ #include #include +#include +#include + #include "gpu_common.h" +#include "gpu_metrics.h" int gpu_read_uint64(const char *path, uint64_t *value) { @@ -71,3 +75,57 @@ int gpu_read_line(const char *path, char *buf, size_t size) fclose(fp); return 0; } + +static int match_card_pattern(const char *pattern, int card_id) +{ + char *dup; + char *token; + 
char *saveptr; + int start; + int end; + + if (!pattern || pattern[0] == '\0' || strcmp(pattern, "*") == 0) { + return FLB_TRUE; + } + + dup = flb_strdup(pattern); + if (!dup) { + return FLB_FALSE; + } + + token = strtok_r(dup, ",", &saveptr); + while (token) { + if (sscanf(token, "%d-%d", &start, &end) == 2) { + if (card_id >= start && card_id <= end) { + flb_free(dup); + return FLB_TRUE; + } + } + else { + if (card_id == atoi(token)) { + flb_free(dup); + return FLB_TRUE; + } + } + token = strtok_r(NULL, ",", &saveptr); + } + flb_free(dup); + return FLB_FALSE; +} + +int gpu_should_include_card(struct in_gpu_metrics *ctx, int card_id) +{ + if (ctx->cards_exclude && ctx->cards_exclude[0] != '\0' && + match_card_pattern(ctx->cards_exclude, card_id)) { + flb_plg_info(ctx->ins, "card%d excluded by exclude pattern", card_id); + return FLB_FALSE; + } + + if (ctx->cards_include && ctx->cards_include[0] != '\0' && + !match_card_pattern(ctx->cards_include, card_id)) { + flb_plg_info(ctx->ins, "card%d excluded by include pattern", card_id); + return FLB_FALSE; + } + + return FLB_TRUE; +} diff --git a/plugins/in_gpu_metrics/gpu_common.h b/plugins/in_gpu_metrics/gpu_common.h index 07cc40e9695..83aa78117ed 100644 --- a/plugins/in_gpu_metrics/gpu_common.h +++ b/plugins/in_gpu_metrics/gpu_common.h @@ -23,8 +23,11 @@ #include #include +struct in_gpu_metrics; + int gpu_read_uint64(const char *path, uint64_t *value); int gpu_read_double(const char *path, double scale, double *value); int gpu_read_line(const char *path, char *buf, size_t size); +int gpu_should_include_card(struct in_gpu_metrics *ctx, int card_id); #endif diff --git a/plugins/in_gpu_metrics/nvml_gpu.c b/plugins/in_gpu_metrics/nvml_gpu.c index fee8233c31d..846e490c8e6 100644 --- a/plugins/in_gpu_metrics/nvml_gpu.c +++ b/plugins/in_gpu_metrics/nvml_gpu.c @@ -156,6 +156,10 @@ int nvml_gpu_detect_cards(struct in_gpu_metrics *ctx) } for (index = 0; index < count; index++) { + if (!gpu_should_include_card(ctx, (int) index)) 
{ + continue; + } + card = flb_calloc(1, sizeof(struct gpu_card)); if (card == NULL) { flb_errno(); From da5954989de45ef803a7947cf5bf28ed3362f651 Mon Sep 17 00:00:00 2001 From: Hiroshi Hatake Date: Thu, 16 Apr 2026 12:51:17 +0900 Subject: [PATCH 3/5] in_gpu_metrics: Handle strtol properly Signed-off-by: Hiroshi Hatake --- plugins/in_gpu_metrics/gpu_common.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/plugins/in_gpu_metrics/gpu_common.c b/plugins/in_gpu_metrics/gpu_common.c index 897c497c897..7f3718463b3 100644 --- a/plugins/in_gpu_metrics/gpu_common.c +++ b/plugins/in_gpu_metrics/gpu_common.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include @@ -102,7 +104,17 @@ static int match_card_pattern(const char *pattern, int card_id) } } else { - if (card_id == atoi(token)) { + char *endptr; + long parsed_id; + + errno = 0; + parsed_id = strtol(token, &endptr, 10); + if (errno == 0 && + endptr != token && + *endptr == '\0' && + parsed_id >= INT_MIN && + parsed_id <= INT_MAX && + card_id == (int) parsed_id) { flb_free(dup); return FLB_TRUE; } From 017a3b95734532019c1794e9f2368d43925c0998 Mon Sep 17 00:00:00 2001 From: Hiroshi Hatake Date: Fri, 17 Apr 2026 14:03:23 +0900 Subject: [PATCH 4/5] in_gpu_metrics: Implement dcgm-exporter compatible per process metrics Signed-off-by: Hiroshi Hatake --- plugins/in_gpu_metrics/amd_gpu.c | 4 + plugins/in_gpu_metrics/gpu_metrics.c | 16 + plugins/in_gpu_metrics/gpu_metrics.h | 6 + plugins/in_gpu_metrics/nvml_gpu.c | 568 +++++++++++++++++++++++++-- 4 files changed, 558 insertions(+), 36 deletions(-) diff --git a/plugins/in_gpu_metrics/amd_gpu.c b/plugins/in_gpu_metrics/amd_gpu.c index cb6bde7c91f..91f17b2cb8f 100644 --- a/plugins/in_gpu_metrics/amd_gpu.c +++ b/plugins/in_gpu_metrics/amd_gpu.c @@ -222,6 +222,10 @@ int amd_gpu_detect_cards(struct in_gpu_metrics *ctx) card->id = id; card->backend_type = GPU_BACKEND_AMD; card->hwmon_path = NULL; + card->uuid = NULL; + 
card->parent_uuid = NULL; + card->gpu_instance_id = -1; + card->compute_instance_id = -1; card->hwmon_path = find_hwmon_path(id); if (!card->hwmon_path) { flb_plg_debug(ctx->ins, "no hwmon path for card%d", id); diff --git a/plugins/in_gpu_metrics/gpu_metrics.c b/plugins/in_gpu_metrics/gpu_metrics.c index 74eaaff6af4..4d107b326c6 100644 --- a/plugins/in_gpu_metrics/gpu_metrics.c +++ b/plugins/in_gpu_metrics/gpu_metrics.c @@ -124,6 +124,15 @@ static int in_gpu_init(struct flb_input_instance *ins, "GPU fan PWM percentage", 2, (char *[]) {"card", "vendor"}); + ctx->g_process_memory = cmt_gauge_create(ctx->cmt, "gpu", "", "process_memory_used_bytes", + "Per-process GPU memory in bytes", 3, + (char *[]) {"card", "vendor", "pid"}); + + ctx->g_mig_info = cmt_gauge_create(ctx->cmt, "gpu", "", "mig_device_info", + "MIG device metadata (always 1)", 5, + (char *[]) {"card", "vendor", "parent_uuid", + "gpu_instance_id", "compute_instance_id"}); + amd_gpu_detect_cards(ctx); if (nvml_gpu_initialize(ctx) == 0) { if (nvml_gpu_detect_cards(ctx) != 0) { @@ -167,6 +176,13 @@ static int in_gpu_exit(void *data, struct flb_config *config) if (card->hwmon_path) { flb_sds_destroy(card->hwmon_path); } + if (card->uuid) { + flb_sds_destroy(card->uuid); + } + if (card->parent_uuid) { + flb_sds_destroy(card->parent_uuid); + } + cfl_list_del(&card->_head); flb_free(card); } diff --git a/plugins/in_gpu_metrics/gpu_metrics.h b/plugins/in_gpu_metrics/gpu_metrics.h index 05469dec705..1df86a6abe9 100644 --- a/plugins/in_gpu_metrics/gpu_metrics.h +++ b/plugins/in_gpu_metrics/gpu_metrics.h @@ -27,6 +27,10 @@ struct gpu_card { int id; int backend_type; flb_sds_t hwmon_path; + flb_sds_t uuid; + flb_sds_t parent_uuid; + int gpu_instance_id; + int compute_instance_id; struct cfl_list _head; }; @@ -53,6 +57,8 @@ struct in_gpu_metrics { struct cmt_gauge *g_temp; struct cmt_gauge *g_fan_speed; struct cmt_gauge *g_fan_pwm; + struct cmt_gauge *g_process_memory; + struct cmt_gauge *g_mig_info; /* plugin 
instance */ struct flb_input_instance *ins; diff --git a/plugins/in_gpu_metrics/nvml_gpu.c b/plugins/in_gpu_metrics/nvml_gpu.c index 846e490c8e6..3a7abc8b949 100644 --- a/plugins/in_gpu_metrics/nvml_gpu.c +++ b/plugins/in_gpu_metrics/nvml_gpu.c @@ -19,6 +19,9 @@ #include #include +#include +#include +#include #include #include @@ -42,31 +45,118 @@ struct nvmlUtilization_t { unsigned int memory; }; +struct nvmlProcessInfo_v1_t { + unsigned int pid; + uint64_t usedGpuMemory; +}; + +struct nvmlProcessInfo_v2_t { + unsigned int pid; + uint64_t usedGpuMemory; + unsigned int gpuInstanceId; + unsigned int computeInstanceId; +}; + +struct nvmlProcessInfo_v3_t { + unsigned int pid; + uint64_t usedGpuMemory; + unsigned int gpuInstanceId; + unsigned int computeInstanceId; + uint64_t usedGpuCcProtectedMemory; +}; + typedef nvmlReturn_t (*nvmlInit_v2_t)(void); typedef nvmlReturn_t (*nvmlShutdown_t)(void); typedef nvmlReturn_t (*nvmlDeviceGetCount_v2_t)(unsigned int *device_count); -typedef nvmlReturn_t (*nvmlDeviceGetHandleByIndex_v2_t)(unsigned int index, nvmlDevice_t *device); -typedef nvmlReturn_t (*nvmlDeviceGetMemoryInfo_t)(nvmlDevice_t device, struct nvmlMemory_t *memory); -typedef nvmlReturn_t (*nvmlDeviceGetUtilizationRates_t)(nvmlDevice_t device, struct nvmlUtilization_t *util); +typedef nvmlReturn_t (*nvmlDeviceGetHandleByIndex_v2_t)(unsigned int index, + nvmlDevice_t *device); +typedef nvmlReturn_t (*nvmlDeviceGetHandleByUUID_t)(const char *uuid, + nvmlDevice_t *device); +typedef nvmlReturn_t (*nvmlDeviceGetMemoryInfo_t)(nvmlDevice_t device, + struct nvmlMemory_t *memory); +typedef nvmlReturn_t (*nvmlDeviceGetUtilizationRates_t)(nvmlDevice_t device, + struct nvmlUtilization_t *util); typedef nvmlReturn_t (*nvmlDeviceGetTemperature_t)(nvmlDevice_t device, unsigned int sensor_type, unsigned int *temp); -typedef nvmlReturn_t (*nvmlDeviceGetPowerUsage_t)(nvmlDevice_t device, unsigned int *power); -typedef nvmlReturn_t (*nvmlDeviceGetFanSpeed_t)(nvmlDevice_t device, 
unsigned int *speed); +typedef nvmlReturn_t (*nvmlDeviceGetPowerUsage_t)(nvmlDevice_t device, + unsigned int *power); +typedef nvmlReturn_t (*nvmlDeviceGetFanSpeed_t)(nvmlDevice_t device, + unsigned int *speed); +typedef nvmlReturn_t (*nvmlDeviceGetClockInfo_t)(nvmlDevice_t device, + unsigned int clock_type, + unsigned int *clock); +typedef nvmlReturn_t (*nvmlDeviceGetUUID_t)(nvmlDevice_t device, + char *uuid, + unsigned int length); +typedef nvmlReturn_t (*nvmlDeviceGetMigMode_t)(nvmlDevice_t device, + unsigned int *current_mode, + unsigned int *pending_mode); +typedef nvmlReturn_t (*nvmlDeviceGetMaxMigDeviceCount_t)(nvmlDevice_t device, + unsigned int *count); +typedef nvmlReturn_t (*nvmlDeviceGetMigDeviceHandleByIndex_t)(nvmlDevice_t device, + unsigned int index, + nvmlDevice_t *mig_device); +typedef nvmlReturn_t (*nvmlDeviceGetGpuInstanceId_t)(nvmlDevice_t device, + unsigned int *id); +typedef nvmlReturn_t (*nvmlDeviceGetComputeInstanceId_t)(nvmlDevice_t device, + unsigned int *id); +typedef nvmlReturn_t (*nvmlDeviceGetDeviceHandleFromMigDeviceHandle_t)(nvmlDevice_t device, + nvmlDevice_t *parent); +typedef nvmlReturn_t (*nvmlDeviceGetComputeRunningProcesses_t)(nvmlDevice_t device, + unsigned int *info_count, + struct nvmlProcessInfo_v1_t *infos); +typedef nvmlReturn_t (*nvmlDeviceGetGraphicsRunningProcesses_t)(nvmlDevice_t device, + unsigned int *info_count, + struct nvmlProcessInfo_v1_t *infos); +typedef nvmlReturn_t (*nvmlDeviceGetComputeRunningProcesses_v2_t)(nvmlDevice_t device, + unsigned int *info_count, + struct nvmlProcessInfo_v2_t *infos); +typedef nvmlReturn_t (*nvmlDeviceGetGraphicsRunningProcesses_v2_t)(nvmlDevice_t device, + unsigned int *info_count, + struct nvmlProcessInfo_v2_t *infos); +typedef nvmlReturn_t (*nvmlDeviceGetComputeRunningProcesses_v3_t)(nvmlDevice_t device, + unsigned int *info_count, + struct nvmlProcessInfo_v3_t *infos); +typedef nvmlReturn_t (*nvmlDeviceGetGraphicsRunningProcesses_v3_t)(nvmlDevice_t device, + unsigned int 
*info_count, + struct nvmlProcessInfo_v3_t *infos); typedef const char *(*nvmlErrorString_t)(nvmlReturn_t result); #define NVML_SUCCESS 0 #define NVML_TEMPERATURE_GPU 0 +#define NVML_CLOCK_GRAPHICS 0 +#define NVML_CLOCK_SM 1 +#define NVML_CLOCK_MEM 2 +#define NVML_FEATURE_ENABLED 1 +#define NVML_UUID_BUFFER_SIZE 96 +#define NVML_MAX_PROCESS_SAMPLES 128 +#define NVML_MAX_VALID_PID 4194304 static nvmlInit_v2_t f_nvml_init_v2; static nvmlShutdown_t f_nvml_shutdown; static nvmlDeviceGetCount_v2_t f_nvml_device_get_count_v2; static nvmlDeviceGetHandleByIndex_v2_t f_nvml_device_get_handle_by_index_v2; +static nvmlDeviceGetHandleByUUID_t f_nvml_device_get_handle_by_uuid; static nvmlDeviceGetMemoryInfo_t f_nvml_device_get_memory_info; static nvmlDeviceGetUtilizationRates_t f_nvml_device_get_utilization_rates; static nvmlDeviceGetTemperature_t f_nvml_device_get_temperature; static nvmlDeviceGetPowerUsage_t f_nvml_device_get_power_usage; static nvmlDeviceGetFanSpeed_t f_nvml_device_get_fan_speed; +static nvmlDeviceGetClockInfo_t f_nvml_device_get_clock_info; +static nvmlDeviceGetUUID_t f_nvml_device_get_uuid; +static nvmlDeviceGetMigMode_t f_nvml_device_get_mig_mode; +static nvmlDeviceGetMaxMigDeviceCount_t f_nvml_device_get_max_mig_device_count; +static nvmlDeviceGetMigDeviceHandleByIndex_t f_nvml_device_get_mig_device_handle_by_index; +static nvmlDeviceGetGpuInstanceId_t f_nvml_device_get_gpu_instance_id; +static nvmlDeviceGetComputeInstanceId_t f_nvml_device_get_compute_instance_id; +static nvmlDeviceGetDeviceHandleFromMigDeviceHandle_t f_nvml_device_get_parent_from_mig; +static nvmlDeviceGetComputeRunningProcesses_t f_nvml_device_get_compute_running_processes; +static nvmlDeviceGetGraphicsRunningProcesses_t f_nvml_device_get_graphics_running_processes; +static nvmlDeviceGetComputeRunningProcesses_v2_t f_nvml_device_get_compute_running_processes_v2; +static nvmlDeviceGetGraphicsRunningProcesses_v2_t f_nvml_device_get_graphics_running_processes_v2; +static 
nvmlDeviceGetComputeRunningProcesses_v3_t f_nvml_device_get_compute_running_processes_v3; +static nvmlDeviceGetGraphicsRunningProcesses_v3_t f_nvml_device_get_graphics_running_processes_v3; static nvmlErrorString_t f_nvml_error_string; static const char *nvml_result_to_string(nvmlReturn_t result) @@ -74,6 +164,7 @@ static const char *nvml_result_to_string(nvmlReturn_t result) if (f_nvml_error_string != NULL) { return f_nvml_error_string(result); } + return "unknown"; } @@ -88,6 +179,338 @@ static int load_nvml_symbol(struct in_gpu_metrics *ctx, const char *name, void * return 0; } +static void load_optional_nvml_symbol(struct in_gpu_metrics *ctx, const char *name, void **target) +{ + *target = dlsym(ctx->nvml_lib_handle, name); + if (*target == NULL) { + flb_plg_debug(ctx->ins, "optional NVML symbol '%s' is unavailable", name); + } +} + +static int nvml_read_device_uuid(nvmlDevice_t device, char *buf, size_t size) +{ + nvmlReturn_t result; + + if (f_nvml_device_get_uuid == NULL) { + return -1; + } + + result = f_nvml_device_get_uuid(device, buf, (unsigned int) size); + if (result != NVML_SUCCESS) { + return -1; + } + + return 0; +} + +static int nvml_register_card(struct in_gpu_metrics *ctx, + int card_id, + int gpu_instance_id, + int compute_instance_id, + const char *uuid, + const char *parent_uuid) +{ + struct gpu_card *card; + + card = flb_calloc(1, sizeof(struct gpu_card)); + if (card == NULL) { + flb_errno(); + return -1; + } + + card->id = card_id; + card->backend_type = GPU_BACKEND_NVML; + card->gpu_instance_id = gpu_instance_id; + card->compute_instance_id = compute_instance_id; + + if (uuid != NULL) { + card->uuid = flb_sds_create(uuid); + if (card->uuid == NULL) { + flb_free(card); + return -1; + } + } + + if (parent_uuid != NULL) { + card->parent_uuid = flb_sds_create(parent_uuid); + if (card->parent_uuid == NULL) { + if (card->uuid != NULL) { + flb_sds_destroy(card->uuid); + } + flb_free(card); + return -1; + } + } + + cfl_list_add(&card->_head, 
&ctx->cards); + return 0; +} + +static int nvml_detect_mig_devices(struct in_gpu_metrics *ctx, + int parent_card_id, + nvmlDevice_t parent_device, + const char *parent_uuid) +{ + nvmlDevice_t mig_device; + nvmlDevice_t mig_parent; + unsigned int current_mode; + unsigned int pending_mode; + unsigned int mig_count; + unsigned int mig_index; + unsigned int gi; + unsigned int ci; + char mig_uuid[NVML_UUID_BUFFER_SIZE]; + char resolved_parent_uuid[NVML_UUID_BUFFER_SIZE]; + const char *final_parent_uuid; + nvmlReturn_t result; + + if (f_nvml_device_get_mig_mode == NULL || + f_nvml_device_get_max_mig_device_count == NULL || + f_nvml_device_get_mig_device_handle_by_index == NULL || + f_nvml_device_get_gpu_instance_id == NULL || + f_nvml_device_get_compute_instance_id == NULL) { + return 0; + } + + result = f_nvml_device_get_mig_mode(parent_device, ¤t_mode, &pending_mode); + if (result != NVML_SUCCESS || current_mode != NVML_FEATURE_ENABLED) { + return 0; + } + + result = f_nvml_device_get_max_mig_device_count(parent_device, &mig_count); + if (result != NVML_SUCCESS) { + return -1; + } + + for (mig_index = 0; mig_index < mig_count; mig_index++) { + result = f_nvml_device_get_mig_device_handle_by_index(parent_device, mig_index, &mig_device); + if (result != NVML_SUCCESS) { + continue; + } + + if (nvml_read_device_uuid(mig_device, mig_uuid, sizeof(mig_uuid)) != 0) { + continue; + } + + result = f_nvml_device_get_gpu_instance_id(mig_device, &gi); + if (result != NVML_SUCCESS) { + continue; + } + + result = f_nvml_device_get_compute_instance_id(mig_device, &ci); + if (result != NVML_SUCCESS) { + continue; + } + + final_parent_uuid = parent_uuid; + if (final_parent_uuid == NULL && f_nvml_device_get_parent_from_mig != NULL) { + result = f_nvml_device_get_parent_from_mig(mig_device, &mig_parent); + if (result == NVML_SUCCESS && + nvml_read_device_uuid(mig_parent, resolved_parent_uuid, sizeof(resolved_parent_uuid)) == 0) { + final_parent_uuid = resolved_parent_uuid; + } + } + + if 
(nvml_register_card(ctx, + parent_card_id, + (int) gi, + (int) ci, + mig_uuid, + final_parent_uuid) != 0) { + return -1; + } + } + + return 0; +} + +static int nvml_get_device_handle(struct gpu_card *card, nvmlDevice_t *device) +{ + nvmlReturn_t result; + + if (card->uuid != NULL && f_nvml_device_get_handle_by_uuid != NULL) { + result = f_nvml_device_get_handle_by_uuid(card->uuid, device); + if (result == NVML_SUCCESS) { + return 0; + } + } + + result = f_nvml_device_get_handle_by_index_v2((unsigned int) card->id, device); + if (result != NVML_SUCCESS) { + return -1; + } + + return 0; +} + +static int nvml_process_entry_is_valid(unsigned int pid, uint64_t used_gpu_memory) +{ + if (pid == 0 || pid == UINT_MAX) { + return FLB_FALSE; + } + + /* + * Linux PID upper bound is 2^22. This avoids emitting invalid IDs that + * may appear when some NVML process entries are partially populated. + */ + if (pid > NVML_MAX_VALID_PID) { + return FLB_FALSE; + } + + if (used_gpu_memory == UINT64_MAX) { + return FLB_FALSE; + } + + return FLB_TRUE; +} + +static void nvml_emit_process_memory_samples_v1(struct in_gpu_metrics *ctx, + const char *card_label, + nvmlDevice_t device, + uint64_t ts, + nvmlDeviceGetComputeRunningProcesses_t api) +{ + struct nvmlProcessInfo_v1_t infos[NVML_MAX_PROCESS_SAMPLES]; + unsigned int info_count; + unsigned int i; + char pid_buf[32]; + nvmlReturn_t result; + + if (api == NULL || ctx->g_process_memory == NULL) { + return; + } + + info_count = NVML_MAX_PROCESS_SAMPLES; + result = api(device, &info_count, infos); + if (result != NVML_SUCCESS) { + return; + } + + for (i = 0; i < info_count; i++) { + if (nvml_process_entry_is_valid(infos[i].pid, + infos[i].usedGpuMemory) == FLB_FALSE) { + continue; + } + snprintf(pid_buf, sizeof(pid_buf), "%u", infos[i].pid); + cmt_gauge_set(ctx->g_process_memory, ts, (double) infos[i].usedGpuMemory, 3, + (char *[]) { (char *) card_label, "nvidia", pid_buf}); + } +} + +static void nvml_emit_process_memory_samples_v2(struct 
in_gpu_metrics *ctx, + const char *card_label, + nvmlDevice_t device, + uint64_t ts, + nvmlDeviceGetComputeRunningProcesses_v2_t api) +{ + struct nvmlProcessInfo_v2_t infos[NVML_MAX_PROCESS_SAMPLES]; + unsigned int info_count; + unsigned int i; + char pid_buf[32]; + nvmlReturn_t result; + + if (api == NULL || ctx->g_process_memory == NULL) { + return; + } + + info_count = NVML_MAX_PROCESS_SAMPLES; + result = api(device, &info_count, infos); + if (result != NVML_SUCCESS) { + return; + } + + for (i = 0; i < info_count; i++) { + if (nvml_process_entry_is_valid(infos[i].pid, + infos[i].usedGpuMemory) == FLB_FALSE) { + continue; + } + snprintf(pid_buf, sizeof(pid_buf), "%u", infos[i].pid); + cmt_gauge_set(ctx->g_process_memory, ts, (double) infos[i].usedGpuMemory, 3, + (char *[]) { (char *) card_label, "nvidia", pid_buf}); + } +} + +static void nvml_emit_process_memory_samples_v3(struct in_gpu_metrics *ctx, + const char *card_label, + nvmlDevice_t device, + uint64_t ts, + nvmlDeviceGetComputeRunningProcesses_v3_t api) +{ + struct nvmlProcessInfo_v3_t infos[NVML_MAX_PROCESS_SAMPLES]; + unsigned int info_count; + unsigned int i; + char pid_buf[32]; + nvmlReturn_t result; + + if (api == NULL || ctx->g_process_memory == NULL) { + return; + } + + info_count = NVML_MAX_PROCESS_SAMPLES; + result = api(device, &info_count, infos); + if (result != NVML_SUCCESS) { + return; + } + + for (i = 0; i < info_count; i++) { + if (nvml_process_entry_is_valid(infos[i].pid, + infos[i].usedGpuMemory) == FLB_FALSE) { + continue; + } + snprintf(pid_buf, sizeof(pid_buf), "%u", infos[i].pid); + cmt_gauge_set(ctx->g_process_memory, ts, (double) infos[i].usedGpuMemory, 3, + (char *[]) { (char *) card_label, "nvidia", pid_buf}); + } +} + +static void nvml_collect_process_memory(struct in_gpu_metrics *ctx, + struct gpu_card *card, + nvmlDevice_t device, + uint64_t ts, + const char *card_label) +{ + if (f_nvml_device_get_compute_running_processes_v3 != NULL) { + 
nvml_emit_process_memory_samples_v3(ctx, card_label, device, ts, + f_nvml_device_get_compute_running_processes_v3); + } + if (f_nvml_device_get_compute_running_processes_v2 != NULL) { + nvml_emit_process_memory_samples_v2(ctx, card_label, device, ts, + f_nvml_device_get_compute_running_processes_v2); + } + if (f_nvml_device_get_compute_running_processes != NULL) { + nvml_emit_process_memory_samples_v1(ctx, card_label, device, ts, + f_nvml_device_get_compute_running_processes); + } + + if (f_nvml_device_get_graphics_running_processes_v3 != NULL) { + nvml_emit_process_memory_samples_v3(ctx, card_label, device, ts, + f_nvml_device_get_graphics_running_processes_v3); + } + if (f_nvml_device_get_graphics_running_processes_v2 != NULL) { + nvml_emit_process_memory_samples_v2(ctx, card_label, device, ts, + f_nvml_device_get_graphics_running_processes_v2); + } + if (f_nvml_device_get_graphics_running_processes != NULL) { + nvml_emit_process_memory_samples_v1(ctx, card_label, device, ts, + f_nvml_device_get_graphics_running_processes); + } + + if (ctx->g_mig_info != NULL && + card->gpu_instance_id >= 0 && + card->compute_instance_id >= 0 && + card->parent_uuid != NULL) { + char gi_buf[16]; + char ci_buf[16]; + + snprintf(gi_buf, sizeof(gi_buf), "%d", card->gpu_instance_id); + snprintf(ci_buf, sizeof(ci_buf), "%d", card->compute_instance_id); + + cmt_gauge_set(ctx->g_mig_info, ts, 1.0, 5, + (char *[]) { (char *) card_label, "nvidia", card->parent_uuid, + gi_buf, ci_buf}); + } +} + int nvml_gpu_initialize(struct in_gpu_metrics *ctx) { nvmlReturn_t result; @@ -123,7 +546,32 @@ int nvml_gpu_initialize(struct in_gpu_metrics *ctx) } f_nvml_error_string = dlsym(ctx->nvml_lib_handle, "nvmlErrorString"); - f_nvml_device_get_fan_speed = dlsym(ctx->nvml_lib_handle, "nvmlDeviceGetFanSpeed"); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetFanSpeed", (void **) &f_nvml_device_get_fan_speed); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetClockInfo", (void **) 
&f_nvml_device_get_clock_info); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetUUID", (void **) &f_nvml_device_get_uuid); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetHandleByUUID", (void **) &f_nvml_device_get_handle_by_uuid); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetMigMode", (void **) &f_nvml_device_get_mig_mode); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetMaxMigDeviceCount", + (void **) &f_nvml_device_get_max_mig_device_count); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetMigDeviceHandleByIndex", + (void **) &f_nvml_device_get_mig_device_handle_by_index); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetGpuInstanceId", (void **) &f_nvml_device_get_gpu_instance_id); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetComputeInstanceId", + (void **) &f_nvml_device_get_compute_instance_id); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetDeviceHandleFromMigDeviceHandle", + (void **) &f_nvml_device_get_parent_from_mig); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetComputeRunningProcesses_v3", + (void **) &f_nvml_device_get_compute_running_processes_v3); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetGraphicsRunningProcesses_v3", + (void **) &f_nvml_device_get_graphics_running_processes_v3); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetComputeRunningProcesses_v2", + (void **) &f_nvml_device_get_compute_running_processes_v2); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetGraphicsRunningProcesses_v2", + (void **) &f_nvml_device_get_graphics_running_processes_v2); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetComputeRunningProcesses", + (void **) &f_nvml_device_get_compute_running_processes); + load_optional_nvml_symbol(ctx, "nvmlDeviceGetGraphicsRunningProcesses", + (void **) &f_nvml_device_get_graphics_running_processes); result = f_nvml_init_v2(); if (result != NVML_SUCCESS) { @@ -142,8 +590,10 @@ int nvml_gpu_detect_cards(struct in_gpu_metrics *ctx) { unsigned int index; unsigned int count; - struct gpu_card *card; + nvmlDevice_t device; nvmlReturn_t 
result; + char uuid[NVML_UUID_BUFFER_SIZE]; + int detected; if (ctx->nvml_initialized == FLB_FALSE) { return 0; @@ -155,24 +605,38 @@ int nvml_gpu_detect_cards(struct in_gpu_metrics *ctx) return -1; } + detected = 0; + for (index = 0; index < count; index++) { if (!gpu_should_include_card(ctx, (int) index)) { continue; } - card = flb_calloc(1, sizeof(struct gpu_card)); - if (card == NULL) { - flb_errno(); - return -1; + result = f_nvml_device_get_handle_by_index_v2(index, &device); + if (result != NVML_SUCCESS) { + continue; + } + + if (nvml_read_device_uuid(device, uuid, sizeof(uuid)) == 0) { + if (nvml_register_card(ctx, (int) index, -1, -1, uuid, NULL) != 0) { + return -1; + } } + else { + if (nvml_register_card(ctx, (int) index, -1, -1, NULL, NULL) != 0) { + return -1; + } + } + detected++; - card->id = (int) index; - card->backend_type = GPU_BACKEND_NVML; - cfl_list_add(&card->_head, &ctx->cards); + if (nvml_detect_mig_devices(ctx, (int) index, device, + nvml_read_device_uuid(device, uuid, sizeof(uuid)) == 0 ? 
uuid : NULL) != 0) { + flb_plg_warn(ctx->ins, "failed to detect MIG devices for card%d", (int) index); + } } - if (count > 0) { - flb_plg_info(ctx->ins, "detected %u NVIDIA GPU(s) via NVML", count); + if (detected > 0) { + flb_plg_info(ctx->ins, "detected %d NVIDIA GPU card(s) via NVML", detected); } return 0; @@ -186,29 +650,36 @@ int nvml_gpu_collect_metrics(struct in_gpu_metrics *ctx, struct gpu_card *card) unsigned int temp; unsigned int power_mw; unsigned int fan_percent; + unsigned int sm_clock_mhz; + unsigned int mem_clock_mhz; + unsigned int graphics_clock_mhz; nvmlReturn_t result; uint64_t ts; - flb_sds_t card_id; + flb_sds_t fallback_card_id; + const char *card_label; if (ctx->nvml_initialized == FLB_FALSE) { return -1; } - result = f_nvml_device_get_handle_by_index_v2((unsigned int) card->id, &device); - if (result != NVML_SUCCESS) { - flb_plg_debug(ctx->ins, "NVML handle lookup failed for card%d: %s", - card->id, nvml_result_to_string(result)); + if (nvml_get_device_handle(card, &device) != 0) { return -1; } - card_id = flb_sds_create_size(16); - if (card_id == NULL) { - flb_errno(); - return -1; - } - card_id = flb_sds_printf(&card_id, "%d", card->id); - if (card_id == NULL) { - return -1; + fallback_card_id = NULL; + card_label = card->uuid; + + if (card_label == NULL) { + fallback_card_id = flb_sds_create_size(16); + if (fallback_card_id == NULL) { + flb_errno(); + return -1; + } + fallback_card_id = flb_sds_printf(&fallback_card_id, "%d", card->id); + if (fallback_card_id == NULL) { + return -1; + } + card_label = fallback_card_id; } ts = cfl_time_now(); @@ -216,22 +687,42 @@ int nvml_gpu_collect_metrics(struct in_gpu_metrics *ctx, struct gpu_card *card) result = f_nvml_device_get_utilization_rates(device, &util); if (result == NVML_SUCCESS) { cmt_gauge_set(ctx->g_utilization, ts, (double) util.gpu, 2, - (char *[]) {card_id, "nvidia"}); + (char *[]) {(char *) card_label, "nvidia"}); } result = f_nvml_device_get_memory_info(device, &memory); if 
(result == NVML_SUCCESS) { cmt_gauge_set(ctx->g_mem_used, ts, (double) memory.used, 2, - (char *[]) {card_id, "nvidia"}); + (char *[]) {(char *) card_label, "nvidia"}); cmt_gauge_set(ctx->g_mem_total, ts, (double) memory.total, 2, - (char *[]) {card_id, "nvidia"}); + (char *[]) {(char *) card_label, "nvidia"}); + } + + if (f_nvml_device_get_clock_info != NULL) { + result = f_nvml_device_get_clock_info(device, NVML_CLOCK_SM, &sm_clock_mhz); + if (result == NVML_SUCCESS) { + cmt_gauge_set(ctx->g_clock, ts, (double) sm_clock_mhz, 3, + (char *[]) {(char *) card_label, "nvidia", "sm"}); + } + + result = f_nvml_device_get_clock_info(device, NVML_CLOCK_MEM, &mem_clock_mhz); + if (result == NVML_SUCCESS) { + cmt_gauge_set(ctx->g_clock, ts, (double) mem_clock_mhz, 3, + (char *[]) {(char *) card_label, "nvidia", "memory"}); + } + + result = f_nvml_device_get_clock_info(device, NVML_CLOCK_GRAPHICS, &graphics_clock_mhz); + if (result == NVML_SUCCESS) { + cmt_gauge_set(ctx->g_clock, ts, (double) graphics_clock_mhz, 3, + (char *[]) {(char *) card_label, "nvidia", "graphics"}); + } } if (ctx->enable_temperature) { result = f_nvml_device_get_temperature(device, NVML_TEMPERATURE_GPU, &temp); if (result == NVML_SUCCESS) { cmt_gauge_set(ctx->g_temp, ts, (double) temp, 2, - (char *[]) {card_id, "nvidia"}); + (char *[]) {(char *) card_label, "nvidia"}); } } @@ -239,7 +730,7 @@ int nvml_gpu_collect_metrics(struct in_gpu_metrics *ctx, struct gpu_card *card) result = f_nvml_device_get_power_usage(device, &power_mw); if (result == NVML_SUCCESS) { cmt_gauge_set(ctx->g_power, ts, (double) power_mw / 1000.0, 2, - (char *[]) {card_id, "nvidia"}); + (char *[]) {(char *) card_label, "nvidia"}); } } @@ -247,11 +738,16 @@ int nvml_gpu_collect_metrics(struct in_gpu_metrics *ctx, struct gpu_card *card) result = f_nvml_device_get_fan_speed(device, &fan_percent); if (result == NVML_SUCCESS) { cmt_gauge_set(ctx->g_fan_pwm, ts, (double) fan_percent, 2, - (char *[]) {card_id, "nvidia"}); + (char *[]) 
{(char *) card_label, "nvidia"}); } } - flb_sds_destroy(card_id); + nvml_collect_process_memory(ctx, card, device, ts, card_label); + + if (fallback_card_id != NULL) { + flb_sds_destroy(fallback_card_id); + } + return 0; } From fadb51fbd4c292272e77cfc2c8c30c233573894f Mon Sep 17 00:00:00 2001 From: Hiroshi Hatake Date: Fri, 17 Apr 2026 17:00:35 +0900 Subject: [PATCH 5/5] in_gpu_metrics: Address minor issues Signed-off-by: Hiroshi Hatake --- plugins/in_gpu_metrics/nvml_gpu.c | 47 +++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/plugins/in_gpu_metrics/nvml_gpu.c b/plugins/in_gpu_metrics/nvml_gpu.c index 3a7abc8b949..7507b0445dc 100644 --- a/plugins/in_gpu_metrics/nvml_gpu.c +++ b/plugins/in_gpu_metrics/nvml_gpu.c @@ -159,6 +159,35 @@ static nvmlDeviceGetComputeRunningProcesses_v3_t f_nvml_device_get_compute_runni static nvmlDeviceGetGraphicsRunningProcesses_v3_t f_nvml_device_get_graphics_running_processes_v3; static nvmlErrorString_t f_nvml_error_string; +static void nvml_reset_api_symbols() +{ + f_nvml_init_v2 = NULL; + f_nvml_shutdown = NULL; + f_nvml_device_get_count_v2 = NULL; + f_nvml_device_get_handle_by_index_v2 = NULL; + f_nvml_device_get_handle_by_uuid = NULL; + f_nvml_device_get_memory_info = NULL; + f_nvml_device_get_utilization_rates = NULL; + f_nvml_device_get_temperature = NULL; + f_nvml_device_get_power_usage = NULL; + f_nvml_device_get_fan_speed = NULL; + f_nvml_device_get_clock_info = NULL; + f_nvml_device_get_uuid = NULL; + f_nvml_device_get_mig_mode = NULL; + f_nvml_device_get_max_mig_device_count = NULL; + f_nvml_device_get_mig_device_handle_by_index = NULL; + f_nvml_device_get_gpu_instance_id = NULL; + f_nvml_device_get_compute_instance_id = NULL; + f_nvml_device_get_parent_from_mig = NULL; + f_nvml_device_get_compute_running_processes = NULL; + f_nvml_device_get_graphics_running_processes = NULL; + f_nvml_device_get_compute_running_processes_v2 = NULL; + 
f_nvml_device_get_graphics_running_processes_v2 = NULL; + f_nvml_device_get_compute_running_processes_v3 = NULL; + f_nvml_device_get_graphics_running_processes_v3 = NULL; + f_nvml_error_string = NULL; +} + static const char *nvml_result_to_string(nvmlReturn_t result) { if (f_nvml_error_string != NULL) { @@ -515,6 +544,8 @@ int nvml_gpu_initialize(struct in_gpu_metrics *ctx) { nvmlReturn_t result; + nvml_reset_api_symbols(); + if (ctx->enable_nvml == FLB_FALSE) { return 0; } @@ -542,6 +573,7 @@ int nvml_gpu_initialize(struct in_gpu_metrics *ctx) load_nvml_symbol(ctx, "nvmlDeviceGetPowerUsage", (void **) &f_nvml_device_get_power_usage) != 0) { dlclose(ctx->nvml_lib_handle); ctx->nvml_lib_handle = NULL; + nvml_reset_api_symbols(); return -1; } @@ -578,6 +610,7 @@ int nvml_gpu_initialize(struct in_gpu_metrics *ctx) flb_plg_warn(ctx->ins, "NVML init failed: %s", nvml_result_to_string(result)); dlclose(ctx->nvml_lib_handle); ctx->nvml_lib_handle = NULL; + nvml_reset_api_symbols(); return -1; } @@ -592,6 +625,7 @@ int nvml_gpu_detect_cards(struct in_gpu_metrics *ctx) unsigned int count; nvmlDevice_t device; nvmlReturn_t result; + int uuid_ok; char uuid[NVML_UUID_BUFFER_SIZE]; int detected; @@ -617,7 +651,9 @@ int nvml_gpu_detect_cards(struct in_gpu_metrics *ctx) continue; } - if (nvml_read_device_uuid(device, uuid, sizeof(uuid)) == 0) { + uuid_ok = (nvml_read_device_uuid(device, uuid, sizeof(uuid)) == 0); + + if (uuid_ok) { if (nvml_register_card(ctx, (int) index, -1, -1, uuid, NULL) != 0) { return -1; } @@ -630,7 +666,7 @@ int nvml_gpu_detect_cards(struct in_gpu_metrics *ctx) detected++; if (nvml_detect_mig_devices(ctx, (int) index, device, - nvml_read_device_uuid(device, uuid, sizeof(uuid)) == 0 ? uuid : NULL) != 0) { + uuid_ok ? 
uuid : NULL) != 0) { flb_plg_warn(ctx->ins, "failed to detect MIG devices for card%d", (int) index); } } @@ -754,11 +790,16 @@ int nvml_gpu_collect_metrics(struct in_gpu_metrics *ctx, struct gpu_card *card) void nvml_gpu_shutdown(struct in_gpu_metrics *ctx) { if (ctx->nvml_initialized == FLB_TRUE) { - f_nvml_shutdown(); + if (f_nvml_shutdown != NULL) { + f_nvml_shutdown(); + } ctx->nvml_initialized = FLB_FALSE; } + if (ctx->nvml_lib_handle != NULL) { dlclose(ctx->nvml_lib_handle); ctx->nvml_lib_handle = NULL; } + + nvml_reset_api_symbols(); }