-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathbps
More file actions
executable file
·76 lines (70 loc) · 2.26 KB
/
bps
File metadata and controls
executable file
·76 lines (70 loc) · 2.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/bin/bash
# Global setup.
#
# Environment read here:
#   DEBUG      non-zero enables shell tracing (set -x)
#   VERBOSE    integer flag, defaults to 0; frozen read-only once read
#   GPU_NODES  whitespace-separated node names; when unset or empty the
#              positional arguments are used as the node list instead
if (( DEBUG )); then
    set -x
fi
declare -i -r VERBOSE="${VERBOSE:-0}"
declare -a FIELDS=()
# Word splitting is intentional: GPU_NODES carries several names in one string.
declare -a nodes=( ${GPU_NODES:-$@} )
# Comparison operator spliced into the jq program built by cmd; exported.
declare -x GPUCNTCMP='>'
#######################################
# Query running pods with `oc` and summarise GPU requests per node with jq.
#
# Globals:
#   GPUCNTCMP (read) - comparison operator spliced into the jq program;
#                      falls back to '>' when unset or empty.
# Arguments:
#   $1 - optional extra option for `oc get pods`. A value of the form
#        --field-selector=EXPR is merged with the built-in
#        status.phase=Running selector into one comma-chained selector.
#        Any other non-empty value is word-split and passed through as-is.
# Outputs:
#   One line per node on stdout:
#     "<node>: BUSY <total> <ns/pod> ..."  when the GPU total is > 0
#     "<node>: FREE"                       otherwise
#######################################
function cmd()
{
    local extra=$1
    local selector='status.phase=Running'
    local -a passthru=()

    if [[ $extra == --field-selector=* ]]; then
        # --field-selector is a single-valued flag; passing it twice is not
        # additive, so fold the caller's expression into ours.
        selector+=",${extra#--field-selector=}"
    elif [[ -n $extra ]]; then
        # shellcheck disable=SC2206 -- splitting into separate options is intended
        passthru=( $extra )
    fi

    oc get pods --all-namespaces --field-selector="$selector" "${passthru[@]}" -o json | jq -r "
      # Build a flat list of {node, pod, gpus} entries, one per selected container
      [ .items[]
        | . as \$pod
        | select(\$pod.status.phase == \"Running\")
        | .spec.containers[]? as \$ctr
        | select( (\$ctr.resources.requests[\"nvidia.com/gpu\"]? | try tonumber catch 0) ${GPUCNTCMP:->} 0)
        | { node: \$pod.spec.nodeName,
            pod: \"\(\$pod.metadata.namespace)/\(\$pod.metadata.name)\",
            gpus: (\$ctr.resources.requests[\"nvidia.com/gpu\"] | try tonumber catch 0)
          }
      ]
      # Group those entries by node
      | group_by(.node)
      # For each node group, sum the gpus and collect the pods that requested any
      | map({
          node: .[0].node,
          total: map(.gpus) | add,
          pods: map(select(.gpus > 0) | .pod) | unique
        })
      # Emit one line per node: BUSY with total and pod list, or FREE
      | map(
          select(.node != null)
          | if .total > 0
            then \"\(.node): BUSY \(.total) \"+ (.pods | join(\" \"))
            else \"\(.node): FREE\"
            end
        )
      | .[]
    "
}
# Print usage and exit successfully when the first argument is -h.
# The heredoc delimiter is quoted so the example lines starting with '$'
# are emitted literally and never subject to shell expansion.
if [[ $1 == -h ]]; then
    cat <<'EOF'
bps [-h] [node-name [node-name ...]]
List any active GPU pods running on the cluster nodes (host computers).
It can take time to check all nodes of the cluster (not all nodes have GPUs).
If you know the node-name of the nodes that have GPUs that you want to
check you can pass them in as arguments, this considerably speeds up the operation.
By default if a node does not have a running pod that has requested a GPU nothing
will be displayed for that node. If you want to see information (eg. FREE)
for the nodes then set VERBOSE=1 eg.
$ VERBOSE=1 bps
or
$ VERBOSE=1 bps wrk-3
See repository README.md for more documentation and examples.
EOF
    exit 0
fi
# Verbose mode: relax the GPU-count comparison used by cmd's jq filter from
# '>' to '>=' so containers requesting zero GPUs are kept, which lets nodes
# whose GPU total is 0 be reported as FREE.
if (( VERBOSE )); then
    GPUCNTCMP='>='
fi
# Dispatch: when node names were supplied, run one query per node restricted
# by spec.nodeName; otherwise run a single unrestricted query.
if (( ${#nodes[@]} > 0 )); then
    # Quoted expansion keeps each array element as exactly one word.
    for node in "${nodes[@]}"; do
        cmd "--field-selector=spec.nodeName=$node"
    done
else
    # The jq program inside cmd was developed with the help of OpenAI o4-mini.
    # Be careful: it is wrapped in double quotes on purpose, so that shell
    # expansions can customize the jq program (e.g. GPUCNTCMP).
    cmd
fi