From 846e7fddc6f6a01361b9956114a7277b28729423 Mon Sep 17 00:00:00 2001 From: Andrew Wormald Date: Tue, 10 Mar 2026 09:27:42 +0000 Subject: [PATCH 1/5] Fix error counter keying on dynamic error messages Co-Authored-By: Claude Opus 4.6 --- internal/errorcounter/errorcounter.go | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/internal/errorcounter/errorcounter.go b/internal/errorcounter/errorcounter.go index 45b1490..23caf52 100644 --- a/internal/errorcounter/errorcounter.go +++ b/internal/errorcounter/errorcounter.go @@ -20,27 +20,30 @@ func (c *Counter) Add(err error, labels ...string) int { c.mu.Lock() defer c.mu.Unlock() - errMsg := err.Error() - errMsg += strings.Join(labels, "-") - c.store[errMsg] += 1 - return c.store[errMsg] + key := makeKey(labels) + c.store[key] += 1 + return c.store[key] } func (c *Counter) Count(err error, labels ...string) int { c.mu.Lock() defer c.mu.Unlock() - errMsg := err.Error() - errMsg += strings.Join(labels, "-") - return c.store[errMsg] + key := makeKey(labels) + return c.store[key] } func (c *Counter) Clear(err error, labels ...string) { c.mu.Lock() defer c.mu.Unlock() - errMsg := err.Error() - errMsg += strings.Join(labels, "-") - c.store[errMsg] = 0 - return + key := makeKey(labels) + delete(c.store, key) +} + +// makeKey builds a stable key from labels only. The error message is excluded +// because it often contains dynamic data (timestamps, IDs) which would create +// unique keys and prevent the PauseAfterErrCount threshold from ever being reached. +func makeKey(labels []string) string { + return strings.Join(labels, "-") } From 08af657f2215b25be93b3f98b77d2dae5d252621 Mon Sep 17 00:00:00 2001 From: Andrew Wormald Date: Tue, 10 Mar 2026 10:04:45 +0000 Subject: [PATCH 2/5] Add tests for label-based error counter keying Co-Authored-By: Claude Opus 4.6 --- internal/errorcounter/errorcounter_test.go | 42 ++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/internal/errorcounter/errorcounter_test.go b/internal/errorcounter/errorcounter_test.go index 798f294..f758c88 100644 --- a/internal/errorcounter/errorcounter_test.go +++ b/internal/errorcounter/errorcounter_test.go @@ -59,3 +59,45 @@ func TestErrorCounter(t *testing.T) { }) } } + +func TestErrorCounter_DifferentErrorsSameLabels(t *testing.T) { + c := errorcounter.New() + labels := []string{"processName", "run-123"} + + // Different error messages with the same labels should share a counter. + c.Add(errors.New("connection refused at 10:00:01"), labels...) + c.Add(errors.New("connection refused at 10:00:02"), labels...) + count := c.Add(errors.New("timeout after 30s"), labels...) + + require.Equal(t, 3, count) + + // Count should work regardless of which error is passed. + require.Equal(t, 3, c.Count(errors.New("completely different error"), labels...)) +} + +func TestErrorCounter_ClearRemovesKey(t *testing.T) { + c := errorcounter.New() + labels := []string{"process", "run-1"} + + c.Add(errors.New("err"), labels...) + c.Add(errors.New("err"), labels...) + require.Equal(t, 2, c.Count(errors.New("err"), labels...)) + + c.Clear(errors.New("err"), labels...) + + // After clear, count should be 0 and next Add should return 1. + require.Equal(t, 0, c.Count(errors.New("err"), labels...)) + require.Equal(t, 1, c.Add(errors.New("err"), labels...)) +} + +func TestErrorCounter_DifferentLabelsSeparateCounters(t *testing.T) { + c := errorcounter.New() + err := errors.New("same error") + + c.Add(err, "process-a", "run-1") + c.Add(err, "process-a", "run-1") + c.Add(err, "process-b", "run-2") + + require.Equal(t, 2, c.Count(err, "process-a", "run-1")) + require.Equal(t, 1, c.Count(err, "process-b", "run-2")) +} From 33333bcf543de492a10a25aa298c1cd9bf5f39ea Mon Sep 17 00:00:00 2001 From: Andrew Wormald Date: Tue, 10 Mar 2026 16:45:54 +0000 Subject: [PATCH 3/5] Require at least one label in ErrorCounter API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous approach silently dropped the error message from the key but kept labels optional. This meant callers without labels would share a single global counter — a worse behaviour change. Now the interface requires at least one label (label string, extras ...string), making the contract explicit. All existing callers already pass two labels (processName, runID) so no call sites change. Co-Authored-By: Claude Opus 4.6 --- errors.go | 10 +- internal/errorcounter/errorcounter.go | 19 ++-- internal/errorcounter/errorcounter_test.go | 103 +++++++++------------ 3 files changed, 60 insertions(+), 72 deletions(-) diff --git a/errors.go b/errors.go index 66bdcca..95e95c0 100644 --- a/errors.go +++ b/errors.go @@ -10,9 +10,11 @@ var ( ErrInvalidTransition = errors.New("invalid transition") ) -// ErrorCounter defines an interface for counting occurrences of errors with optional labels. +// ErrorCounter defines an interface for counting occurrences of errors keyed by stable labels. +// At least one label is required — labels should identify the process and run (e.g. processName, runID). +// The error value is not used for keying because error messages often contain dynamic data. type ErrorCounter interface { - Add(err error, labels ...string) int - Count(err error, labels ...string) int - Clear(err error, labels ...string) + Add(err error, label string, extras ...string) int + Count(err error, label string, extras ...string) int + Clear(err error, label string, extras ...string) } diff --git a/internal/errorcounter/errorcounter.go b/internal/errorcounter/errorcounter.go index 23caf52..e293913 100644 --- a/internal/errorcounter/errorcounter.go +++ b/internal/errorcounter/errorcounter.go @@ -16,34 +16,37 @@ type Counter struct { store map[string]int } -func (c *Counter) Add(err error, labels ...string) int { +func (c *Counter) Add(err error, label string, extras ...string) int { c.mu.Lock() defer c.mu.Unlock() - key := makeKey(labels) + key := makeKey(label, extras) c.store[key] += 1 return c.store[key] } -func (c *Counter) Count(err error, labels ...string) int { +func (c *Counter) Count(err error, label string, extras ...string) int { c.mu.Lock() defer c.mu.Unlock() - key := makeKey(labels) + key := makeKey(label, extras) return c.store[key] } -func (c *Counter) Clear(err error, labels ...string) { +func (c *Counter) Clear(err error, label string, extras ...string) { c.mu.Lock() defer c.mu.Unlock() - key := makeKey(labels) + key := makeKey(label, extras) delete(c.store, key) } // makeKey builds a stable key from labels only. The error message is excluded // because it often contains dynamic data (timestamps, IDs) which would create // unique keys and prevent the PauseAfterErrCount threshold from ever being reached. -func makeKey(labels []string) string { - return strings.Join(labels, "-") +func makeKey(label string, extras []string) string { + if len(extras) == 0 { + return label + } + return label + "-" + strings.Join(extras, "-") } diff --git a/internal/errorcounter/errorcounter_test.go b/internal/errorcounter/errorcounter_test.go index f758c88..4f78d3f 100644 --- a/internal/errorcounter/errorcounter_test.go +++ b/internal/errorcounter/errorcounter_test.go @@ -10,84 +10,67 @@ import ( ) func TestErrorCounter(t *testing.T) { - testCases := []struct { - name string - inputErr error - labels []string - iterationCount int - expectedCount int - }{ - { - name: "Add 3 and get 3", - inputErr: errors.New("test error"), - labels: []string{"label 1", "label 2"}, - iterationCount: 3, - expectedCount: 3, - }, - { - name: "Add 1 and get 1 - no labels", - inputErr: errors.New("test error"), - labels: []string{}, - iterationCount: 3, - expectedCount: 3, - }, - { - name: "Add 0 and get 0", - inputErr: errors.New("test error"), - labels: []string{"label 1"}, - iterationCount: 0, - expectedCount: 0, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - c := errorcounter.New() - - var currentCount int - for i := 0; i < tc.iterationCount; i++ { - currentCount = c.Add(tc.inputErr, tc.labels...) - } - require.Equal(t, tc.expectedCount, currentCount) - - count := c.Count(tc.inputErr, tc.labels...) - require.Equal(t, tc.expectedCount, count) - - c.Clear(tc.inputErr, tc.labels...) - count = c.Count(tc.inputErr, tc.labels...) - require.Equal(t, 0, count) - }) - } + t.Run("Add 3 and get 3", func(t *testing.T) { + c := errorcounter.New() + err := errors.New("test error") + + c.Add(err, "label 1", "label 2") + c.Add(err, "label 1", "label 2") + count := c.Add(err, "label 1", "label 2") + require.Equal(t, 3, count) + + require.Equal(t, 3, c.Count(err, "label 1", "label 2")) + + c.Clear(err, "label 1", "label 2") + require.Equal(t, 0, c.Count(err, "label 1", "label 2")) + }) + + t.Run("Single label", func(t *testing.T) { + c := errorcounter.New() + err := errors.New("test error") + + c.Add(err, "only-label") + count := c.Add(err, "only-label") + require.Equal(t, 2, count) + + require.Equal(t, 2, c.Count(err, "only-label")) + + c.Clear(err, "only-label") + require.Equal(t, 0, c.Count(err, "only-label")) + }) + + t.Run("Add 0 and get 0", func(t *testing.T) { + c := errorcounter.New() + require.Equal(t, 0, c.Count(errors.New("test error"), "label 1")) + }) } func TestErrorCounter_DifferentErrorsSameLabels(t *testing.T) { c := errorcounter.New() - labels := []string{"processName", "run-123"} // Different error messages with the same labels should share a counter. - c.Add(errors.New("connection refused at 10:00:01"), labels...) - c.Add(errors.New("connection refused at 10:00:02"), labels...) - count := c.Add(errors.New("timeout after 30s"), labels...) + c.Add(errors.New("connection refused at 10:00:01"), "processName", "run-123") + c.Add(errors.New("connection refused at 10:00:02"), "processName", "run-123") + count := c.Add(errors.New("timeout after 30s"), "processName", "run-123") require.Equal(t, 3, count) // Count should work regardless of which error is passed. - require.Equal(t, 3, c.Count(errors.New("completely different error"), labels...)) + require.Equal(t, 3, c.Count(errors.New("completely different error"), "processName", "run-123")) } func TestErrorCounter_ClearRemovesKey(t *testing.T) { c := errorcounter.New() - labels := []string{"process", "run-1"} - c.Add(errors.New("err"), labels...) - c.Add(errors.New("err"), labels...) - require.Equal(t, 2, c.Count(errors.New("err"), labels...)) + c.Add(errors.New("err"), "process", "run-1") + c.Add(errors.New("err"), "process", "run-1") + require.Equal(t, 2, c.Count(errors.New("err"), "process", "run-1")) - c.Clear(errors.New("err"), labels...) + c.Clear(errors.New("err"), "process", "run-1") // After clear, count should be 0 and next Add should return 1. - require.Equal(t, 0, c.Count(errors.New("err"), labels...)) - require.Equal(t, 1, c.Add(errors.New("err"), labels...)) + require.Equal(t, 0, c.Count(errors.New("err"), "process", "run-1")) + require.Equal(t, 1, c.Add(errors.New("err"), "process", "run-1")) } func TestErrorCounter_DifferentLabelsSeparateCounters(t *testing.T) { From 01c6cd010e93d0fac23d51eb16ca948fa71278fa Mon Sep 17 00:00:00 2001 From: Andrew Wormald Date: Sun, 29 Mar 2026 13:32:06 +0100 Subject: [PATCH 4/5] Remove unused err parameter from ErrorCounter interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The error value was accepted but ignored for keying — drop it entirely to make the label-only keying explicit in the API. Co-Authored-By: Claude Opus 4.6 --- errors.go | 9 ++- internal/errorcounter/errorcounter.go | 9 +-- internal/errorcounter/errorcounter_test.go | 64 ++++++++-------------- pause.go | 4 +- pause_internal_test.go | 4 +- step_internal_test.go | 2 +- timeout_internal_test.go | 4 +- 7 files changed, 37 insertions(+), 59 deletions(-) diff --git a/errors.go b/errors.go index 95e95c0..2504503 100644 --- a/errors.go +++ b/errors.go @@ -10,11 +10,10 @@ var ( ErrInvalidTransition = errors.New("invalid transition") ) -// ErrorCounter defines an interface for counting occurrences of errors keyed by stable labels. +// ErrorCounter defines an interface for counting errors keyed by stable labels. // At least one label is required — labels should identify the process and run (e.g. processName, runID). -// The error value is not used for keying because error messages often contain dynamic data. type ErrorCounter interface { - Add(err error, label string, extras ...string) int - Count(err error, label string, extras ...string) int - Clear(err error, label string, extras ...string) + Add(label string, extras ...string) int + Count(label string, extras ...string) int + Clear(label string, extras ...string) } diff --git a/internal/errorcounter/errorcounter.go b/internal/errorcounter/errorcounter.go index e293913..d92012d 100644 --- a/internal/errorcounter/errorcounter.go +++ b/internal/errorcounter/errorcounter.go @@ -16,7 +16,7 @@ type Counter struct { store map[string]int } -func (c *Counter) Add(err error, label string, extras ...string) int { +func (c *Counter) Add(label string, extras ...string) int { c.mu.Lock() defer c.mu.Unlock() @@ -25,7 +25,7 @@ func (c *Counter) Add(err error, label string, extras ...string) int { return c.store[key] } -func (c *Counter) Count(err error, label string, extras ...string) int { +func (c *Counter) Count(label string, extras ...string) int { c.mu.Lock() defer c.mu.Unlock() @@ -33,7 +33,7 @@ func (c *Counter) Count(err error, label string, extras ...string) int { return c.store[key] } -func (c *Counter) Clear(err error, label string, extras ...string) { +func (c *Counter) Clear(label string, extras ...string) { c.mu.Lock() defer c.mu.Unlock() @@ -41,9 +41,6 @@ func (c *Counter) Clear(err error, label string, extras ...string) { delete(c.store, key) } -// makeKey builds a stable key from labels only. The error message is excluded -// because it often contains dynamic data (timestamps, IDs) which would create -// unique keys and prevent the PauseAfterErrCount threshold from ever being reached. func makeKey(label string, extras []string) string { if len(extras) == 0 { return label diff --git a/internal/errorcounter/errorcounter_test.go b/internal/errorcounter/errorcounter_test.go index 4f78d3f..6951f4d 100644 --- a/internal/errorcounter/errorcounter_test.go +++ b/internal/errorcounter/errorcounter_test.go @@ -1,7 +1,6 @@ package errorcounter_test import ( - "errors" "testing" "github.com/stretchr/testify/require" @@ -12,75 +11,58 @@ import ( func TestErrorCounter(t *testing.T) { t.Run("Add 3 and get 3", func(t *testing.T) { c := errorcounter.New() - err := errors.New("test error") - c.Add(err, "label 1", "label 2") - c.Add(err, "label 1", "label 2") - count := c.Add(err, "label 1", "label 2") + c.Add("label 1", "label 2") + c.Add("label 1", "label 2") + count := c.Add("label 1", "label 2") require.Equal(t, 3, count) - require.Equal(t, 3, c.Count(err, "label 1", "label 2")) + require.Equal(t, 3, c.Count("label 1", "label 2")) - c.Clear(err, "label 1", "label 2") - require.Equal(t, 0, c.Count(err, "label 1", "label 2")) + c.Clear("label 1", "label 2") + require.Equal(t, 0, c.Count("label 1", "label 2")) }) t.Run("Single label", func(t *testing.T) { c := errorcounter.New() - err := errors.New("test error") - c.Add(err, "only-label") - count := c.Add(err, "only-label") + c.Add("only-label") + count := c.Add("only-label") require.Equal(t, 2, count) - require.Equal(t, 2, c.Count(err, "only-label")) + require.Equal(t, 2, c.Count("only-label")) - c.Clear(err, "only-label") - require.Equal(t, 0, c.Count(err, "only-label")) + c.Clear("only-label") + require.Equal(t, 0, c.Count("only-label")) }) t.Run("Add 0 and get 0", func(t *testing.T) { c := errorcounter.New() - require.Equal(t, 0, c.Count(errors.New("test error"), "label 1")) + require.Equal(t, 0, c.Count("label 1")) }) } -func TestErrorCounter_DifferentErrorsSameLabels(t *testing.T) { - c := errorcounter.New() - - // Different error messages with the same labels should share a counter. - c.Add(errors.New("connection refused at 10:00:01"), "processName", "run-123") - c.Add(errors.New("connection refused at 10:00:02"), "processName", "run-123") - count := c.Add(errors.New("timeout after 30s"), "processName", "run-123") - - require.Equal(t, 3, count) - - // Count should work regardless of which error is passed. - require.Equal(t, 3, c.Count(errors.New("completely different error"), "processName", "run-123")) -} - func TestErrorCounter_ClearRemovesKey(t *testing.T) { c := errorcounter.New() - c.Add(errors.New("err"), "process", "run-1") - c.Add(errors.New("err"), "process", "run-1") - require.Equal(t, 2, c.Count(errors.New("err"), "process", "run-1")) + c.Add("process", "run-1") + c.Add("process", "run-1") + require.Equal(t, 2, c.Count("process", "run-1")) - c.Clear(errors.New("err"), "process", "run-1") + c.Clear("process", "run-1") // After clear, count should be 0 and next Add should return 1. - require.Equal(t, 0, c.Count(errors.New("err"), "process", "run-1")) - require.Equal(t, 1, c.Add(errors.New("err"), "process", "run-1")) + require.Equal(t, 0, c.Count("process", "run-1")) + require.Equal(t, 1, c.Add("process", "run-1")) } func TestErrorCounter_DifferentLabelsSeparateCounters(t *testing.T) { c := errorcounter.New() - err := errors.New("same error") - c.Add(err, "process-a", "run-1") - c.Add(err, "process-a", "run-1") - c.Add(err, "process-b", "run-2") + c.Add("process-a", "run-1") + c.Add("process-a", "run-1") + c.Add("process-b", "run-2") - require.Equal(t, 2, c.Count(err, "process-a", "run-1")) - require.Equal(t, 1, c.Count(err, "process-b", "run-2")) + require.Equal(t, 2, c.Count("process-a", "run-1")) + require.Equal(t, 1, c.Count("process-b", "run-2")) } diff --git a/pause.go b/pause.go index 93d5464..fb4d9d2 100644 --- a/pause.go +++ b/pause.go @@ -24,7 +24,7 @@ func maybePause[Type any, Status StatusType]( return false, nil } - count := counter.Add(originalErr, processName, run.RunID) + count := counter.Add(processName, run.RunID) if count < pauseAfterErrCount { return false, nil } @@ -41,7 +41,7 @@ func maybePause[Type any, Status StatusType]( }) // Run paused - now clear the error counter. - counter.Clear(originalErr, processName, run.RunID) + counter.Clear(processName, run.RunID) return true, nil } diff --git a/pause_internal_test.go b/pause_internal_test.go index 6a0c6b5..f5d0f5e 100644 --- a/pause_internal_test.go +++ b/pause_internal_test.go @@ -69,9 +69,9 @@ func Test_maybeAutoPause(t *testing.T) { RunID: "run-id", }, "test", WithPauseFn(tc.pauseFn)) - counter.Clear(testErr, processName, r.RunID) + counter.Clear(processName, r.RunID) for range tc.errCount { - counter.Add(testErr, processName, r.RunID) + counter.Add(processName, r.RunID) } paused, err := maybePause( diff --git a/step_internal_test.go b/step_internal_test.go index bd299a7..37f1855 100644 --- a/step_internal_test.go +++ b/step_internal_test.go @@ -208,7 +208,7 @@ func Test_stepConsumer(t *testing.T) { }) t.Run("Pause record after exceeding allowed error count", func(t *testing.T) { - counter.Clear(testErr, processName, current.RunID) + counter.Clear(processName, current.RunID) calls := map[string]int{ "consumerFunc": 0, diff --git a/timeout_internal_test.go b/timeout_internal_test.go index c44a196..fefc33d 100644 --- a/timeout_internal_test.go +++ b/timeout_internal_test.go @@ -178,9 +178,9 @@ func TestProcessTimeout(t *testing.T) { } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - counter.Clear(testErr, processName, tc.record.RunID) + counter.Clear(processName, tc.record.RunID) for range tc.currentErrCount { - counter.Add(testErr, processName, tc.record.RunID) + counter.Add(processName, tc.record.RunID) } calls := map[string]int{} From cfce9c0eb341dc6bc5f7580e33ddf516573c7747 Mon Sep 17 00:00:00 2001 From: Andrew Wormald Date: Wed, 13 May 2026 19:59:33 +0100 Subject: [PATCH 5/5] Use null byte separator in makeKey to prevent key collisions Labels containing hyphens (e.g. "a-b","c" vs "a","b-c") could map to the same key. Use \x00 as the separator since it cannot appear in human-readable label strings. Co-Authored-By: Claude Opus 4.6 (1M context) --- internal/errorcounter/errorcounter.go | 2 +- internal/errorcounter/errorcounter_test.go | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/internal/errorcounter/errorcounter.go b/internal/errorcounter/errorcounter.go index d92012d..c94893d 100644 --- a/internal/errorcounter/errorcounter.go +++ b/internal/errorcounter/errorcounter.go @@ -45,5 +45,5 @@ func makeKey(label string, extras []string) string { if len(extras) == 0 { return label } - return label + "-" + strings.Join(extras, "-") + return label + "\x00" + strings.Join(extras, "\x00") } diff --git a/internal/errorcounter/errorcounter_test.go b/internal/errorcounter/errorcounter_test.go index 6951f4d..6fc1f61 100644 --- a/internal/errorcounter/errorcounter_test.go +++ b/internal/errorcounter/errorcounter_test.go @@ -66,3 +66,18 @@ func TestErrorCounter_DifferentLabelsSeparateCounters(t *testing.T) { require.Equal(t, 2, c.Count("process-a", "run-1")) require.Equal(t, 1, c.Count("process-b", "run-2")) } + +func TestErrorCounter_NoKeyCollision(t *testing.T) { + c := errorcounter.New() + + // ("a-b", "c") and ("a", "b-c") must map to distinct keys. + c.Add("a-b", "c") + c.Add("a", "b-c") + + require.Equal(t, 1, c.Count("a-b", "c")) + require.Equal(t, 1, c.Count("a", "b-c")) + + c.Clear("a-b", "c") + require.Equal(t, 0, c.Count("a-b", "c")) + require.Equal(t, 1, c.Count("a", "b-c")) +}