Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 46 additions & 1 deletion inputs/elasticsearch/collector/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ type Nodes struct {
}

// NewNodes defines Nodes Prometheus metrics
func NewNodes(client *http.Client, url *url.URL, all bool, node string, local bool, nodeStats []string) *Nodes {
func NewNodes(client *http.Client, url *url.URL, all bool, node string, local bool, nodeStats []string, clusterName string) *Nodes {
return &Nodes{
client: client,
url: url,
Expand All @@ -209,14 +209,26 @@ func NewNodes(client *http.Client, url *url.URL, all bool, node string, local bo
up: prometheus.NewGauge(prometheus.GaugeOpts{
Name: prometheus.BuildFQName(namespace, "node_stats", "up"),
Help: "Was the last scrape of the Elasticsearch nodes endpoint successful.",
ConstLabels: prometheus.Labels{
"cluster": clusterName,
"address": url.String(),
},
Comment on lines 209 to +215
Copy link

Copilot AI Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NewNodes sets address ConstLabels to url.String(). Since the URL passed in is configured with basic auth (EsUrl.User = url.UserPassword(...)), this will embed credentials (user/password or API key) into Prometheus labels/metrics output. Please ensure the address label is derived from a sanitized URL (e.g., clone the URL and clear User, or use Scheme+Host only) so secrets are never exported as labels.

Copilot uses AI. Check for mistakes.
}),
totalScrapes: prometheus.NewCounter(prometheus.CounterOpts{
Name: prometheus.BuildFQName(namespace, "node_stats", "total_scrapes"),
Help: "Current total Elasticsearch node scrapes.",
ConstLabels: prometheus.Labels{
"cluster": clusterName,
"address": url.String(),
},
}),
jsonParseFailures: prometheus.NewCounter(prometheus.CounterOpts{
Name: prometheus.BuildFQName(namespace, "node_stats", "json_parse_failures"),
Help: "Number of errors while parsing JSON.",
ConstLabels: prometheus.Labels{
"cluster": clusterName,
"address": url.String(),
},
}),

transportMetrics: []*nodeMetric{
Expand Down Expand Up @@ -2275,6 +2287,39 @@ func GetNodeID(client *http.Client, user, password, s string) (string, error) {
return "", nil
}

func GetClusterName(client *http.Client, user, password, s string) (string, error) {
u, err := url.Parse(s)
if err != nil {
return "", fmt.Errorf("failed to parse URL %s: %s", s, err)
}
if user != "" && password != "" {
u.User = url.UserPassword(user, password)
}

var cir ClusterInfoResponse
res, err := client.Get(u.String())
if err != nil {
return "", fmt.Errorf("failed to get cluster info from %s: %s", u.String(), err)
}
Comment on lines +2290 to +2303
Copy link

Copilot AI Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GetClusterName is newly introduced but doesn’t have unit test coverage. Since similar helpers (e.g., fetchAndDecodeNodeStats) are tested in nodes_test.go, please add tests for GetClusterName covering: successful parsing of cluster_name, non-200 responses, and basic auth handling.

Copilot uses AI. Check for mistakes.
defer func() {
err = res.Body.Close()
if err != nil {
log.Println("failed to close response body, err: ", err)
}
}()
if res.StatusCode != http.StatusOK {
return "", fmt.Errorf("HTTP Request failed with code %d", res.StatusCode)
}
bts, err := io.ReadAll(res.Body)
if err != nil {
return "", err
}
if err := json.Unmarshal(bts, &cir); err != nil {
return "", err
}
return cir.ClusterName, nil
}
Comment on lines +2290 to +2321
Copy link

Copilot AI Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GetClusterName is new behavior and currently has no unit test coverage. Since this package already has HTTP fixture-based tests (e.g., nodes_test.go), consider adding a small test that validates cluster name extraction and non-200 handling.

Copilot uses AI. Check for mistakes.

func GetCatMaster(client *http.Client, user, password, s string) (string, error) {
u, err := url.Parse(s)
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion inputs/elasticsearch/collector/nodes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func TestNodesStats(t *testing.T) {
}
u.User = url.UserPassword("elastic", "changeme")
nodeStats := make([]string, 0)
c := NewNodes(http.DefaultClient, u, true, "_local", false, nodeStats)
c := NewNodes(http.DefaultClient, u, true, "_local", false, nodeStats, "test")
nsr, err := c.fetchAndDecodeNodeStats()
if err != nil {
t.Fatalf("Failed to fetch or decode node stats: %s", err)
Expand Down
18 changes: 14 additions & 4 deletions inputs/elasticsearch/elasticsearch.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,9 @@ type (
}

serverInfo struct {
nodeID string
masterID string
nodeID string
masterID string
clusterName string
}

IndicesInfo struct {
Expand Down Expand Up @@ -204,7 +205,16 @@ func (ins *Instance) Gather(slist *types.SampleList) {
return
}

slist.PushSample("elasticsearch", "up", 1, map[string]string{"address": s})
if info.clusterName, err = collector.GetClusterName(ins.Client, ins.UserName, ins.Password, s); err != nil {
slist.PushSample("elasticsearch", "up", 0, map[string]string{"address": s})
log.Println("E! failed to get cluster name:", err)
return
}
Comment on lines +208 to +212
Copy link

Copilot AI Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GetClusterName adds an extra HTTP request per server per Gather, but the existing GetNodeID call already unmarshals a nodeStatsResponse that includes cluster_name (see nodeStatsResponse.ClusterName). To reduce overhead, consider returning the cluster name from the same request (e.g., a new helper that returns both nodeID and clusterName) and avoid this additional GET / call.

Copilot uses AI. Check for mistakes.

slist.PushSample("elasticsearch", "up", 1, map[string]string{
"address": s,
"cluster": info.clusterName,
})
Comment on lines +208 to +217
Copy link

Copilot AI Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The elasticsearch/up metric now uses different label sets depending on success vs failure: on failure you emit only {address: s}, but on success you emit {address: s, cluster: ...}. This creates separate time series for the same target and can leave stale series behind. Consider always including the cluster label (set to ""/"unknown" when unavailable) or keeping labels consistent across both paths.

Copilot uses AI. Check for mistakes.
ins.serverInfoMutex.Lock()
ins.serverInfo[s] = info
ins.serverInfoMutex.Unlock()
Expand Down Expand Up @@ -272,7 +282,7 @@ func (ins *Instance) Gather(slist *types.SampleList) {
}

// Always gather node stats
if err := inputs.Collect(collector.NewNodes(ins.Client, EsUrl, ins.AllNodes, ins.Node, ins.Local, ins.NodeStats), slist); err != nil {
if err := inputs.Collect(collector.NewNodes(ins.Client, EsUrl, ins.AllNodes, ins.Node, ins.Local, ins.NodeStats, ins.serverInfo[ins.Node].clusterName), slist); err != nil {
Copy link

Copilot AI Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ins.serverInfo is keyed by server URL (s), but here the lookup uses ins.Node (often a selector like "_local"), so clusterName will usually be empty and the cluster label on node stats metrics will be wrong. Use the current server key (s) when retrieving clusterName (and consider locking/using a local copy if needed).

Suggested change
if err := inputs.Collect(collector.NewNodes(ins.Client, EsUrl, ins.AllNodes, ins.Node, ins.Local, ins.NodeStats, ins.serverInfo[ins.Node].clusterName), slist); err != nil {
clusterName := ""
if info, ok := ins.serverInfo[s]; ok {
clusterName = info.clusterName
}
if err := inputs.Collect(collector.NewNodes(ins.Client, EsUrl, ins.AllNodes, ins.Node, ins.Local, ins.NodeStats, clusterName), slist); err != nil {

Copilot uses AI. Check for mistakes.
log.Println("E! failed to collect nodes metrics:", err)
Comment on lines +285 to 286
Copy link

Copilot AI Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NewNodes(..., ins.serverInfo[ins.Node].clusterName) looks up serverInfo using ins.Node (e.g. "_local"), but serverInfo is populated with keys of the server URL string (s). This will typically pass an empty cluster name. Also, ins.serverInfo is only refreshed inside the earlier if ins.ClusterStats || len(ins.IndicesInclude) > 0 block; when that block is skipped, this read can use stale data from a previous run. Please derive the cluster name for the current server (s / EsUrl) with consistent key normalization and ensure it’s populated (or explicitly empty) on every Gather run before passing into NewNodes.

Copilot uses AI. Check for mistakes.
}

Expand Down
Loading