Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions plugins/in_gpu_metrics/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Source files listed for the in_gpu_metrics plugin: plugin entry point,
# the AMD and NVML backends, and helpers shared by both backends.
# NOTE(review): the command that consumes `src` is outside this view.
set(src
gpu_metrics.c
amd_gpu.c
nvml_gpu.c
gpu_common.c
)

Expand Down
63 changes: 6 additions & 57 deletions plugins/in_gpu_metrics/amd_gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,61 +47,6 @@ static flb_sds_t build_path(int card_id, const char *file)
return path;
}

/*
 * Check whether 'card_id' is selected by 'pattern'.
 *
 * 'pattern' is a comma separated list whose items are either a single
 * decimal id ("3") or an inclusive range ("0-2"). A NULL pattern, an empty
 * pattern and "*" select every card.
 *
 * Returns FLB_TRUE when the card is selected, FLB_FALSE otherwise (including
 * when the working copy of the pattern cannot be allocated).
 */
static int match_card_pattern(const char *pattern, int card_id)
{
    char *dup;
    char *token;
    char *saveptr;
    int start;
    int end;

    if (!pattern || pattern[0] == '\0' || strcmp(pattern, "*") == 0) {
        return FLB_TRUE;
    }

    /* strtok_r() writes into its input, so tokenize a private copy */
    dup = flb_strdup(pattern);
    if (!dup) {
        return FLB_FALSE;
    }

    token = strtok_r(dup, ",", &saveptr);
    while (token) {
        if (sscanf(token, "%d-%d", &start, &end) == 2) {
            if (card_id >= start && card_id <= end) {
                flb_free(dup);
                return FLB_TRUE;
            }
        }
        else {
            char *endptr;
            long parsed;

            /*
             * atoi() cannot report failures: a non numeric token such as
             * "abc" would be read as 0 and wrongly select card 0. strtol()
             * lets us require that at least one digit was consumed and that
             * nothing follows the number. An out-of-range token is clamped
             * by strtol() to LONG_MIN/LONG_MAX and therefore cannot equal a
             * realistic card id.
             */
            parsed = strtol(token, &endptr, 10);
            if (endptr != token && *endptr == '\0' &&
                parsed == (long) card_id) {
                flb_free(dup);
                return FLB_TRUE;
            }
        }
        token = strtok_r(NULL, ",", &saveptr);
    }

    flb_free(dup);
    return FLB_FALSE;
}

/*
 * Apply the configured exclude/include card filters to 'card_id'.
 *
 * The exclude pattern is evaluated first; the include pattern is only
 * consulted for cards that were not excluded. Empty or unset patterns are
 * ignored. Every decision is reported at info level.
 *
 * Returns FLB_TRUE when the card must be monitored, FLB_FALSE otherwise.
 */
static int should_include_card(struct in_gpu_metrics *ctx, int card_id)
{
    int exclude_set;
    int include_set;

    flb_plg_info(ctx->ins, "should_include_card: card%d, exclude='%s', include='%s'",
                 card_id,
                 ctx->cards_exclude ? ctx->cards_exclude : "NULL",
                 ctx->cards_include ? ctx->cards_include : "NULL");

    /* an empty pattern would match everything, so treat it as "not set" */
    exclude_set = (ctx->cards_exclude && ctx->cards_exclude[0] != '\0');
    if (exclude_set) {
        if (match_card_pattern(ctx->cards_exclude, card_id)) {
            flb_plg_info(ctx->ins, "Card%d excluded by exclude pattern", card_id);
            return FLB_FALSE;
        }
    }

    include_set = (ctx->cards_include && ctx->cards_include[0] != '\0');
    if (include_set) {
        if (!match_card_pattern(ctx->cards_include, card_id)) {
            flb_plg_info(ctx->ins, "Card%d excluded by include pattern", card_id);
            return FLB_FALSE;
        }
    }

    flb_plg_info(ctx->ins, "Card%d should be included", card_id);
    return FLB_TRUE;
}

static void free_cards(struct in_gpu_metrics *ctx)
{
struct cfl_list *tmp;
Expand Down Expand Up @@ -260,7 +205,7 @@ int amd_gpu_detect_cards(struct in_gpu_metrics *ctx)
}

flb_plg_info(ctx->ins, "Checking if card%d should be included", id);
if (!should_include_card(ctx, id)) {
if (!gpu_should_include_card(ctx, id)) {
flb_plg_info(ctx->ins, "Card%d excluded by filter", id);
continue;
}
Expand All @@ -275,7 +220,12 @@ int amd_gpu_detect_cards(struct in_gpu_metrics *ctx)
return -1;
}
card->id = id;
card->backend_type = GPU_BACKEND_AMD;
card->hwmon_path = NULL;
card->uuid = NULL;
card->parent_uuid = NULL;
card->gpu_instance_id = -1;
card->compute_instance_id = -1;
card->hwmon_path = find_hwmon_path(id);
if (!card->hwmon_path) {
flb_plg_debug(ctx->ins, "no hwmon path for card%d", id);
Expand Down Expand Up @@ -557,4 +507,3 @@ int amd_gpu_collect_metrics(struct in_gpu_metrics *ctx, struct gpu_card *card)
flb_sds_destroy(card_id);
return 0;
}

70 changes: 70 additions & 0 deletions plugins/in_gpu_metrics/gpu_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,14 @@
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <errno.h>
#include <limits.h>

#include <fluent-bit/flb_input_plugin.h>
#include <fluent-bit/flb_mem.h>

#include "gpu_common.h"
#include "gpu_metrics.h"

int gpu_read_uint64(const char *path, uint64_t *value)
{
Expand Down Expand Up @@ -71,3 +77,67 @@ int gpu_read_line(const char *path, char *buf, size_t size)
fclose(fp);
return 0;
}

/*
 * Parse one decimal card id starting at 'text'.
 *
 * On success the value is stored in '*id', '*rest' points at the first
 * character after the number and any blanks (spaces/tabs) following it, and
 * 0 is returned. Returns -1 when no digits were consumed or when the value
 * does not fit in an int.
 */
static int parse_card_id(const char *text, const char **rest, int *id)
{
    char *end;
    long value;

    errno = 0;
    value = strtol(text, &end, 10);
    if (errno != 0 || end == text || value < INT_MIN || value > INT_MAX) {
        return -1;
    }

    /* tolerate "1 ,2" or "0-3 ": blanks after a number are not an error */
    while (*end == ' ' || *end == '\t') {
        end++;
    }

    *id = (int) value;
    *rest = end;
    return 0;
}

/*
 * Check a single pattern item against 'card_id'. An item is either one id
 * ("3") or an inclusive range ("0-2"). Malformed items never match.
 */
static int token_matches_card(const char *token, int card_id)
{
    const char *rest;
    int first;
    int last;

    if (parse_card_id(token, &rest, &first) != 0) {
        return FLB_FALSE;
    }

    /* nothing after the number: plain id */
    if (*rest == '\0') {
        return (card_id == first) ? FLB_TRUE : FLB_FALSE;
    }

    /* otherwise only "<first>-<last>" with nothing trailing is accepted */
    if (*rest != '-') {
        return FLB_FALSE;
    }
    if (parse_card_id(rest + 1, &rest, &last) != 0 || *rest != '\0') {
        return FLB_FALSE;
    }

    return (card_id >= first && card_id <= last) ? FLB_TRUE : FLB_FALSE;
}

/*
 * Check whether 'card_id' is selected by 'pattern'.
 *
 * 'pattern' is a comma separated list of ids and inclusive ranges, e.g.
 * "0,2-4,7". A NULL pattern, an empty pattern and "*" select every card.
 *
 * Both ids and range bounds are parsed with strtol() so that out-of-range
 * numbers and trailing garbage ("1-3x") are rejected instead of being
 * silently accepted or triggering the undefined behavior of sscanf("%d").
 *
 * Returns FLB_TRUE when the card is selected, FLB_FALSE otherwise (including
 * when the working copy of the pattern cannot be allocated).
 */
static int match_card_pattern(const char *pattern, int card_id)
{
    char *dup;
    char *token;
    char *saveptr;
    int matched = FLB_FALSE;

    if (!pattern || pattern[0] == '\0' || strcmp(pattern, "*") == 0) {
        return FLB_TRUE;
    }

    /* strtok_r() writes into its input, so tokenize a private copy */
    dup = flb_strdup(pattern);
    if (!dup) {
        return FLB_FALSE;
    }

    for (token = strtok_r(dup, ",", &saveptr);
         token != NULL;
         token = strtok_r(NULL, ",", &saveptr)) {
        if (token_matches_card(token, card_id)) {
            matched = FLB_TRUE;
            break;
        }
    }

    flb_free(dup);
    return matched;
}

/*
 * Decide whether the card identified by 'card_id' passes the configured
 * filters of the plugin context.
 *
 * The exclude pattern takes precedence: a card matching a non-empty
 * 'cards_exclude' is rejected. Otherwise, when 'cards_include' is non-empty,
 * the card must match it. Rejections are logged at info level.
 *
 * Returns FLB_TRUE when the card must be monitored, FLB_FALSE otherwise.
 */
int gpu_should_include_card(struct in_gpu_metrics *ctx, int card_id)
{
    int exclude_set;
    int include_set;

    /* an empty pattern would match everything, so treat it as "not set" */
    exclude_set = (ctx->cards_exclude && ctx->cards_exclude[0] != '\0');
    if (exclude_set) {
        if (match_card_pattern(ctx->cards_exclude, card_id)) {
            flb_plg_info(ctx->ins, "card%d excluded by exclude pattern", card_id);
            return FLB_FALSE;
        }
    }

    include_set = (ctx->cards_include && ctx->cards_include[0] != '\0');
    if (include_set) {
        if (!match_card_pattern(ctx->cards_include, card_id)) {
            flb_plg_info(ctx->ins, "card%d excluded by include pattern", card_id);
            return FLB_FALSE;
        }
    }

    return FLB_TRUE;
}
3 changes: 3 additions & 0 deletions plugins/in_gpu_metrics/gpu_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,11 @@
#include <inttypes.h>
#include <stddef.h>

/*
 * Forward declaration: this header only passes the plugin context by
 * pointer, so the full definition is not required here.
 */
struct in_gpu_metrics;

int gpu_read_uint64(const char *path, uint64_t *value);
int gpu_read_double(const char *path, double scale, double *value);
int gpu_read_line(const char *path, char *buf, size_t size);
/*
 * Apply the context's cards_exclude / cards_include patterns to 'card_id'.
 * Returns FLB_FALSE when the card matches a non-empty exclude pattern or
 * fails to match a non-empty include pattern, FLB_TRUE otherwise.
 */
int gpu_should_include_card(struct in_gpu_metrics *ctx, int card_id);

#endif
37 changes: 36 additions & 1 deletion plugins/in_gpu_metrics/gpu_metrics.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

#include "gpu_metrics.h"
#include "amd_gpu.h"
#include "nvml_gpu.h"

static int in_gpu_collect(struct flb_input_instance *ins,
struct flb_config *config, void *in_context)
Expand All @@ -41,7 +42,12 @@ static int in_gpu_collect(struct flb_input_instance *ins,

cfl_list_foreach(head, &ctx->cards) {
card = cfl_list_entry(head, struct gpu_card, _head);
amd_gpu_collect_metrics(ctx, card);
if (card->backend_type == GPU_BACKEND_AMD) {
amd_gpu_collect_metrics(ctx, card);
}
else if (card->backend_type == GPU_BACKEND_NVML) {
nvml_gpu_collect_metrics(ctx, card);
}
}

flb_input_metrics_append(ctx->ins, NULL, 0, ctx->cmt);
Expand All @@ -61,6 +67,8 @@ static int in_gpu_init(struct flb_input_instance *ins,
}
ctx->ins = ins;
ctx->cards_detected = 0;
ctx->nvml_initialized = FLB_FALSE;
ctx->nvml_lib_handle = NULL;
cfl_list_init(&ctx->cards);

ret = flb_input_config_map_set(ins, (void *) ctx);
Expand Down Expand Up @@ -116,7 +124,21 @@ static int in_gpu_init(struct flb_input_instance *ins,
"GPU fan PWM percentage", 2,
(char *[]) {"card", "vendor"});

ctx->g_process_memory = cmt_gauge_create(ctx->cmt, "gpu", "", "process_memory_used_bytes",
"Per-process GPU memory in bytes", 3,
(char *[]) {"card", "vendor", "pid"});

ctx->g_mig_info = cmt_gauge_create(ctx->cmt, "gpu", "", "mig_device_info",
"MIG device metadata (always 1)", 5,
(char *[]) {"card", "vendor", "parent_uuid",
"gpu_instance_id", "compute_instance_id"});

amd_gpu_detect_cards(ctx);
if (nvml_gpu_initialize(ctx) == 0) {
if (nvml_gpu_detect_cards(ctx) != 0) {
flb_plg_debug(ctx->ins, "NVML card detection encountered errors");
}
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
flb_input_set_context(ins, ctx);

ret = flb_input_set_collector_time(ins, in_gpu_collect,
Expand Down Expand Up @@ -154,6 +176,13 @@ static int in_gpu_exit(void *data, struct flb_config *config)
if (card->hwmon_path) {
flb_sds_destroy(card->hwmon_path);
}
if (card->uuid) {
flb_sds_destroy(card->uuid);
}
if (card->parent_uuid) {
flb_sds_destroy(card->parent_uuid);
}

cfl_list_del(&card->_head);
flb_free(card);
}
Expand All @@ -162,6 +191,7 @@ static int in_gpu_exit(void *data, struct flb_config *config)
cmt_destroy(ctx->cmt);
}

nvml_gpu_shutdown(ctx);
flb_free(ctx);
return 0;
}
Expand Down Expand Up @@ -189,6 +219,11 @@ static struct flb_config_map config_map[] = {
0, FLB_TRUE, offsetof(struct in_gpu_metrics, enable_temperature),
"Enable collection of GPU temperature metrics (gpu_temperature_celsius)."
},
{
FLB_CONFIG_MAP_BOOL, "enable_nvml", "true",
0, FLB_TRUE, offsetof(struct in_gpu_metrics, enable_nvml),
"Enable NVIDIA NVML collection when libnvidia-ml is available."
},
{
FLB_CONFIG_MAP_STR, "path_sysfs", "/sys",
0, FLB_TRUE, offsetof(struct in_gpu_metrics, path_sysfs),
Expand Down
13 changes: 13 additions & 0 deletions plugins/in_gpu_metrics/gpu_metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,12 @@

/*
 * One detected GPU device. Instances are linked into the 'cards' list of
 * struct in_gpu_metrics and released when the plugin exits.
 */
struct gpu_card {
/* numeric card index (reported as "card<id>" in log messages) */
int id;
/* GPU_BACKEND_AMD or GPU_BACKEND_NVML; selects the collector to call */
int backend_type;
/* AMD: result of find_hwmon_path(id), may be NULL; destroyed on exit */
flb_sds_t hwmon_path;
/*
 * NULL for AMD cards; destroyed on exit when set.
 * NOTE(review): presumably filled in by the NVML backend - confirm.
 */
flb_sds_t uuid;
flb_sds_t parent_uuid;
/*
 * -1 for AMD cards.
 * NOTE(review): presumably NVML MIG instance identifiers (the
 * mig_device_info gauge has labels with the same names) - confirm.
 */
int gpu_instance_id;
int compute_instance_id;
/* list node linking this card into in_gpu_metrics.cards */
struct cfl_list _head;
};

Expand All @@ -36,8 +41,10 @@ struct in_gpu_metrics {
int scrape_interval;
int enable_power;
int enable_temperature;
int enable_nvml;
int coll_fd;
int cards_detected;
int nvml_initialized;

struct cfl_list cards;

Expand All @@ -50,10 +57,16 @@ struct in_gpu_metrics {
struct cmt_gauge *g_temp;
struct cmt_gauge *g_fan_speed;
struct cmt_gauge *g_fan_pwm;
struct cmt_gauge *g_process_memory;
struct cmt_gauge *g_mig_info;

/* plugin instance */
struct flb_input_instance *ins;
void *nvml_lib_handle;

};

/*
 * Values stored in gpu_card.backend_type; the collect callback uses them to
 * choose between the AMD and the NVML metrics collectors.
 */
#define GPU_BACKEND_AMD 1
#define GPU_BACKEND_NVML 2

#endif
Loading
Loading