Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions .github/workflows/jvm-metrics-kind-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
permissions:
contents: read
name: JVM Metrics (nodemon) Kind Test

on:
pull_request:
workflow_dispatch:

jobs:
jvm-metrics-kind:
runs-on: ubuntu-xl
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Go
uses: actions/setup-go@v5
with:
go-version-file: "go.mod"
cache: true

- name: Create kind cluster
uses: helm/kind-action@v1
with:
version: v0.27.0
node_image: kindest/node:v1.32.3
cluster_name: kind-jvm-metrics
wait: 120s

- name: Install metrics-server
run: |
helm repo add metrics-server https://kubernetes-sigs.github.io/metrics-server/
helm repo update
helm upgrade --install --set args={--kubelet-insecure-tls} metrics-server metrics-server/metrics-server --namespace kube-system
kubectl wait --for=condition=available --timeout=300s deployment/metrics-server -n kube-system

- name: Build nodemon image
run: |
docker build -f Dockerfile.nodemon -t zxporter-nodemon:jvm-ci .

- name: Load nodemon image into kind
run: |
kind load docker-image zxporter-nodemon:jvm-ci --name kind-jvm-metrics

- name: Deploy zxporter-nodemon (JVM metrics enabled)
run: |
kubectl create ns devzero-system || true
helm upgrade --install zxporter-nodemon ./helm-chart/zxporter-nodemon \
-n devzero-system \
--set provider=eks \
--set dcgmExporter.enabled=false \
--set jvmMetrics.enabled=true \
--set gpuMetricsExporter.image.repository=zxporter-nodemon \
--set gpuMetricsExporter.image.tag=jvm-ci \
--set gpuMetricsExporter.image.pullPolicy=IfNotPresent
kubectl -n devzero-system rollout status ds/zxporter-nodemon --timeout=180s

- name: Deploy JVM test workloads
run: |
kubectl apply -f test/fixtures/jvm-apps.yaml
kubectl -n jvm-spike rollout status deploy/java8-xmx-only --timeout=300s
kubectl -n jvm-spike rollout status deploy/java11-tool-options --timeout=300s
kubectl -n jvm-spike rollout status deploy/java17-maxram-only --timeout=300s
kubectl -n jvm-spike rollout status deploy/not-java --timeout=300s

- name: Validate nodemon JVM metrics output
run: |
POD_IP=$(kubectl -n devzero-system get pod -l app.kubernetes.io/name=zxporter-nodemon -o jsonpath='{.items[0].status.podIP}')
echo "nodemon podIP=$POD_IP"

# Assertions (python; no jq dependency). Fetch JSON inside the python container.
kubectl -n devzero-system run py-assert --image=python:3.12-slim -i --rm --restart=Never --command -- \
sh -lc "python - <<'PY'
import json
import sys
import urllib.request

# POD_IP is filled in by the outer shell before python starts, because this
# script is passed inside a double-quoted sh -lc argument.
pod_ip = '${POD_IP}'
url = f'http://{pod_ip}:6061/container/jvm-metrics?namespace=jvm-spike'

# Fetch the JVM metrics for the jvm-spike namespace and decode the JSON body.
with urllib.request.urlopen(url, timeout=30) as resp:
    body = resp.read()
data = json.loads(body.decode('utf-8'))

# Index the returned entries by their pod field for prefix lookups.
by_pod = {}
for entry in data:
    by_pod[entry.get('pod')] = entry

def req_prefix(prefix):
    # Return the metrics entry of the first pod whose name starts with prefix.
    # Exits the script with a message listing the known pods when none matches.
    not_found = object()
    hit = next(
        (entry for name, entry in by_pod.items() if name and name.startswith(prefix)),
        not_found,
    )
    if hit is not_found:
        raise SystemExit(f'missing pod prefix={prefix}; got pods={list(by_pod.keys())}')
    return hit

# NOTE: this script is embedded in a double-quoted sh -lc argument, so it must
# not contain double-quote characters: the outer shell strips them, which would
# turn a quoted dict key into a bare (undefined) python name. Values used in
# failure messages are therefore read into locals with single-quoted keys.

# Ensure not-java absent: the not-java deployment must not appear in the output.
if any((m.get('pod','') or '').startswith('not-java') for m in data):
    raise SystemExit('not-java unexpectedly present in JVM metrics')

# java8: cmdline -Xmx192m => 201326592
m8 = req_prefix('java8-xmx-only')
flags = m8.get('flags_extracted', {})
xmx = flags.get('xmx_bytes')
if xmx != 201326592:
    raise SystemExit(f'java8-xmx-only expected xmx_bytes=201326592 got {xmx}')

# java11: env JAVA_TOOL_OPTIONS provides xmx/xms/maxram; sources should mention JAVA_TOOL_OPTIONS
m11 = req_prefix('java11-tool-options')
flags = m11.get('flags_extracted', {})
if flags.get('xmx_bytes') != 167772160 or flags.get('xms_bytes') != 50331648 or flags.get('max_ram_percentage') != 65:
    raise SystemExit(f'java11-tool-options flags mismatch: {flags}')
src = m11.get('flag_sources', {})
xmx_src = str(src.get('xmx_bytes', ''))
if 'TOOL_OPTIONS' not in xmx_src:
    raise SystemExit(f'java11-tool-options expected xmx source to mention JAVA_TOOL_OPTIONS; got {src}')

# java17: maxram only should be 40
m17 = req_prefix('java17-maxram-only')
flags = m17.get('flags_extracted', {})
max_ram = flags.get('max_ram_percentage')
if max_ram != 40:
    raise SystemExit(f'java17-maxram-only expected max_ram_percentage=40 got {max_ram}')

print('OK')
PY"
22 changes: 20 additions & 2 deletions cmd/zxporter-nodemon/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"go.uber.org/zap"

"k8s.io/client-go/dynamic"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
)
Expand Down Expand Up @@ -63,6 +64,12 @@ func main() {
os.Exit(1)
}

k8sClient, err := kubernetes.NewForConfig(kubeConfig)
if err != nil {
logger.Error(err, "Failed to create kubernetes client")
os.Exit(1)
}

// Create components
httpClient := &http.Client{Timeout: 15 * time.Second}
scraper := nodemon.NewScraper(httpClient, logger)
Expand All @@ -79,9 +86,19 @@ func main() {
// Create exporter
exporter := nodemon.NewExporter(cfg, dynClient, scraper, mapper, logger)

// Create HTTP handler and server
// Create JVM collector
jvmCollector := nodemon.NewJVMCollector(cfg.NodeName, k8sClient, logger)

// Start JVM collector's pod informer
if err := jvmCollector.Start(); err != nil {
logger.Error(err, "Failed to start JVM collector")
os.Exit(1)
}

// Create HTTP handlers and server
containerMetricsHandler := nodemon.NewContainerMetricsHandler(exporter, logger)
mux := nodemon.NewServerMux(containerMetricsHandler)
jvmMetricsHandler := nodemon.NewJVMMetricsHandler(jvmCollector, logger)
mux := nodemon.NewServerMux(containerMetricsHandler, jvmMetricsHandler)

server := &http.Server{
Addr: fmt.Sprintf(":%d", cfg.HTTPListenPort),
Expand All @@ -107,6 +124,7 @@ func main() {
<-sigChan

logger.Info("Shutting down...")
jvmCollector.Stop()
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
defer shutdownCancel()
if err := server.Shutdown(shutdownCtx); err != nil {
Expand Down
51 changes: 51 additions & 0 deletions docs/jvm-metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# JVM metrics (zxporter-nodemon) — hsperfdata (no hooking)

This spike adds an **optional** endpoint to zxporter-nodemon:

- `GET /container/jvm-metrics`

It discovers Java processes on the node via `/proc` and reads HotSpot's `hsperfdata` file via:

- `/proc/<hostpid>/root/tmp/hsperfdata_*/<nsPid>`

This requires:
- `hostPID: true`
- running zxporter-nodemon as **UID 0** (root) to read those paths

No attach/JMX/javaagent/async-profiler is used.

## Enable via Helm (chart: zxporter-nodemon)

Set:

```yaml
jvmMetrics:
enabled: true
```

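When enabled, the DaemonSet will:
- set `spec.hostPID: true`
- replace the `zxporter-nodemon` container's `securityContext` (the `gpuMetricsExporter.securityContext` value is not applied) with `runAsUser: 0`, `runAsNonRoot: false`, `readOnlyRootFilesystem: true`, and an added `SYS_PTRACE` capability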

## Local validation (kind)

Assumes you have a kind cluster and a Java workload running.

If using the `kind-jvm-spike` cluster from this spike:

```bash
kubectl config use-context kind-jvm-spike
kubectl -n jvm-spike get pods
```

Then port-forward a zxporter-nodemon pod to local port 6061 (the namespace depends on your install) and query the endpoint:

```bash
curl -sS localhost:6061/container/jvm-metrics | jq '.[0]'
```

Expected fields include:
- `heap_used_bytes`, `heap_size_bytes`, `heap_max_size_bytes`
- `gc_time_seconds_total` (map)
- `safepoint_time_seconds_total`, `safepoint_sync_time_seconds_total`
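- `flags_extracted` (best-effort from `/proc/<pid>/cmdline` and the `JAVA_TOOL_OPTIONS` environment variable)
- `flag_sources` (where each extracted flag was found)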
13 changes: 13 additions & 0 deletions helm-chart/zxporter-nodemon/templates/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ spec:
labels:
{{- include "zxporter-nodemon.selectorLabels" . | nindent 8 }}
spec:
{{- if .Values.jvmMetrics.enabled }}
hostPID: true
{{- end }}
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
Expand Down Expand Up @@ -69,7 +72,16 @@ spec:
containers:
- name: zxporter-nodemon
securityContext:
{{- if .Values.jvmMetrics.enabled }}
runAsNonRoot: false
runAsUser: 0
readOnlyRootFilesystem: true
capabilities:
add:
- SYS_PTRACE
{{- else }}
{{- toYaml .Values.gpuMetricsExporter.securityContext | nindent 12 }}
{{- end }}
image: "{{ .Values.gpuMetricsExporter.image.repository }}:{{ .Values.gpuMetricsExporter.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.gpuMetricsExporter.image.pullPolicy }}
ports:
Expand All @@ -92,6 +104,7 @@ spec:
valueFrom:
fieldRef:
fieldPath: spec.nodeName

{{- if .Values.dcgmExporter.enabled }}
- name: "DCGM_HOST"
value: "localhost"
Expand Down
5 changes: 5 additions & 0 deletions helm-chart/zxporter-nodemon/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ gpuMetricsExporter:
rbac:
clusterWide: true

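# jvmMetrics -- Enable JVM metrics collection via hsperfdata.
# NOTE: requires hostPID + running nodemon as UID 0 to read /proc/<pid>/root/tmp/hsperfdata_*/<nsPid>.
# When enabled, the DaemonSet template does not apply gpuMetricsExporter.securityContext to the
# nodemon container; it sets runAsUser 0, readOnlyRootFilesystem and adds the SYS_PTRACE capability instead.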
jvmMetrics:
enabled: false

dcgmExporter:
enabled: true
image:
Expand Down
10 changes: 10 additions & 0 deletions internal/collector/container_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,16 @@ type ContainerMetricsSnapshot struct {
GpuLimitCount interface{} `json:"gpuLimitCount,omitempty"`
GpuTotalMemoryMb interface{} `json:"gpuTotalMemoryMb,omitempty"`
IndividualGPUMetrics string `json:"individualGPUMetrics,omitempty"` // JSON string

// JVM metrics (from zxporter-nodemon /container/jvm-metrics)
JvmJavaCommand string `json:"jvmJavaCommand,omitempty"`
JvmJavaVersion string `json:"jvmJavaVersion,omitempty"`
JvmHeapSizeBytes int64 `json:"jvmHeapSizeBytes,omitempty"`
JvmHeapUsedBytes int64 `json:"jvmHeapUsedBytes,omitempty"`
JvmHeapMaxSizeBytes int64 `json:"jvmHeapMaxSizeBytes,omitempty"`
JvmRawCmdline string `json:"jvmRawCmdline,omitempty"`
JvmFlagsExtractedJSON string `json:"jvmFlagsExtractedJson,omitempty"`
JvmFlagSourcesJSON string `json:"jvmFlagSourcesJson,omitempty"`
}

// BuildOOMSnapshot constructs a ContainerMetricsSnapshot for an OOM event.
Expand Down
Loading
Loading