From 9479271c03032edf3e79949aaab2c02f69af554f Mon Sep 17 00:00:00 2001 From: Dusty Mabe Date: Wed, 18 Mar 2026 10:46:01 -0400 Subject: [PATCH 1/2] mantle/system/nproc: account for page cache in cgroup available memory The cgroup available memory calculation used memory.current (total cgroup usage) directly, which includes page cache (file-backed memory). Since inactive page cache is reclaimable by the kernel under memory pressure, it should not count as unavailable. This caused GetCurrentMemAvailableMiB() to significantly underestimate available memory, making QEMU instance scheduling overly conservative. Read the "inactive_file" field from /sys/fs/cgroup/memory.stat, which reports, in bytes, the amount of page cache that can be reclaimed easily, and subtract it from current usage before computing available memory. The effective formula becomes: available = limit - (current - inactive_file) This mirrors how /proc/meminfo computes MemAvailable by considering reclaimable caches. A new helper getCgroupMemoryStatField() is added for parsing individual fields from memory.stat, returning 0 gracefully if the file or field is absent. Written-by: --- mantle/system/nproc.go | 63 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/mantle/system/nproc.go b/mantle/system/nproc.go index a91f7d5224..ee5561ae7c 100644 --- a/mantle/system/nproc.go +++ b/mantle/system/nproc.go @@ -176,7 +176,15 @@ func getCgroupMemoryLimitMiB() (uint, error) { } // getCgroupMemoryAvailableMiB returns the available memory within the -// cgroup v2 in MiB (limit - current usage), or math.MaxUint if no limit. +// cgroup v2 in MiB, or math.MaxUint if no limit is set. It computes +// available memory as: limit - (current - inactive_file) where inactive_file +// is the size of file caches that are not actively used and can be evicted if needed. +// (current - inactive_file) is similar to the "workingSet" calculation over in [1]. +// More context on this is also in [2].
This is similar to how /proc/meminfo computes +// MemAvailable by considering reclaimable caches. +// +// [1] https://github.com/kubernetes/kubernetes/blob/ac10370ad2aebde82c2d268dd80d08df0ffc2532/test/e2e/node/node_problem_detector.go#L290-L344 +// [2] https://github.com/kata-containers/kata-containers/issues/10280 func getCgroupMemoryAvailableMiB() (uint, error) { maxBuf, err := os.ReadFile("/sys/fs/cgroup/memory.max") if os.IsNotExist(err) { @@ -200,8 +208,57 @@ func getCgroupMemoryAvailableMiB() (uint, error) { if err != nil { return 0, fmt.Errorf("invalid memory.current value: %w", err) } - if current >= limit { + + // Read inactive_file size from memory.stat to exclude reclaimable + // file-backed memory from the usage calculation. + inactiveFile, err := getCgroupMemoryStatField("inactive_file") + if err != nil { + return 0, err + } + + // Subtract the inactive_file size from the memory.current. This + // cache should always be less than the memory.current but add + // a check and do nothing just in case. + usage := current + if inactiveFile < usage { + usage -= inactiveFile + } + + // This also shouldn't happen, but in case the usage is larger + // than the limit let's just return that there's 0 available memory. + if usage >= limit { + return 0, nil + } + return uint((limit - usage) / (1024 * 1024)), nil +} + +// getCgroupMemoryStatField reads a specific field from +// /sys/fs/cgroup/memory.stat and returns its value in bytes. +// The file contains key-value pairs like "file 123456789". +// Returns 0 if the file does not exist or the field is not found. 
+func getCgroupMemoryStatField(field string) (uint64, error) { + f, err := os.Open("/sys/fs/cgroup/memory.stat") + if os.IsNotExist(err) { return 0, nil + } else if err != nil { + return 0, fmt.Errorf("reading memory.stat: %w", err) + } + defer f.Close() + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + parts := strings.Fields(scanner.Text()) + if len(parts) == 2 && parts[0] == field { + val, err := strconv.ParseUint(parts[1], 10, 64) + if err != nil { + return 0, fmt.Errorf("parsing memory.stat field %s: %w", field, err) + } + return val, nil + } + } + if err := scanner.Err(); err != nil { + return 0, fmt.Errorf("scanning memory.stat: %w", err) } - return uint((limit - current) / (1024 * 1024)), nil + // Field not found; return 0 so callers degrade gracefully. + return 0, nil } From afcec9ac3818e8ef600482856b24652aba501ade Mon Sep 17 00:00:00 2001 From: Dusty Mabe Date: Wed, 18 Mar 2026 11:13:21 -0400 Subject: [PATCH 2/2] mantle/kola/harness: switch strategy for warning about memory starvation Let's pass in a boolean and also warn on the first wait and then periodically after that. 
--- mantle/kola/harness.go | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/mantle/kola/harness.go b/mantle/kola/harness.go index 9cd47106a0..ca94c0c302 100644 --- a/mantle/kola/harness.go +++ b/mantle/kola/harness.go @@ -1747,7 +1747,7 @@ func makeNonExclusiveTest(bucket int, tests []*register.Test, flight platform.Fl return nonExclusiveWrapper } -func reserveMemoryCountForTest(t *register.Test, needed int, logger func(format string, args ...interface{})) bool { +func reserveMemoryCountForTest(t *register.Test, needed int, warnOnWait bool) bool { reservedMemoryCountMutex.Lock() defer reservedMemoryCountMutex.Unlock() avail, err := system.GetCurrentMemAvailableMiB() @@ -1762,10 +1762,14 @@ func reserveMemoryCountForTest(t *register.Test, needed int, logger func(format reservedMemoryCountMiB += needed reserved := reservedMemoryCountMiB t.ReservedMemoryCountMiB = needed - logger("Reserved %d MiB for %s (available: %d MiB, reserved total: %d MiB)", + plog.Debugf("Reserved %d MiB for %s (available: %d MiB, reserved total: %d MiB)", needed, t.Name, avail, reserved) return true } + logger := plog.Debugf + if warnOnWait { + logger = plog.Warningf + } logger("Waiting on memory to run %s: need %d MiB, effective available %d MiB (system: %d MiB, reserved: %d MiB)", t.Name, needed, effective, avail, reservedMemoryCountMiB) return false @@ -1785,15 +1789,15 @@ func waitForMemory(h *harness.H, flight platform.Flight, t *register.Test) { if flight.Platform() == "qemu" { needed := getNeededMemoryMiB(t) start := time.Now() - logger := plog.Debugf - for !reserveMemoryCountForTest(t, needed, logger) { - // After a period of time switch the logger so we get some + warnOnWait := true // warn on first wait + for !reserveMemoryCountForTest(t, needed, warnOnWait) { + // After a period of time switch to log a warning so we get some // info even if debug isn't turned on. 
if time.Since(start) > 5*time.Minute { - logger = plog.Warningf + warnOnWait = true start = time.Now() // reset counter } else { - logger = plog.Debugf + warnOnWait = false } // sleep between 0 and 20 seconds and try again time.Sleep(time.Duration(rand.Intn(20)) * time.Second)