-
Notifications
You must be signed in to change notification settings - Fork 5
Detect Lambda OOM crashes and return codes.ResourceExhausted #859
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,6 +4,8 @@ import ( | |
| "testing" | ||
|
|
||
| "github.com/stretchr/testify/require" | ||
| "google.golang.org/grpc/codes" | ||
| "google.golang.org/grpc/status" | ||
| ) | ||
|
|
||
| func TestExtractMeaningfulLogLines(t *testing.T) { | ||
|
|
@@ -39,6 +41,11 @@ func TestExtractMeaningfulLogLines(t *testing.T) { | |
| `account_inactive`, | ||
| output: "lambda-run: failed to get connector: authenticating during initialization\naccount_inactive", | ||
| }, | ||
| { | ||
| name: "Runtime.ExitError preserved in output", | ||
| raw: "START RequestId: abc-123 Version: $LATEST\nRuntime.ExitError\nEND RequestId: abc-123\n", | ||
| output: "Runtime.ExitError", | ||
| }, | ||
| } | ||
|
|
||
| for _, c := range cases { | ||
|
|
@@ -48,3 +55,149 @@ func TestExtractMeaningfulLogLines(t *testing.T) { | |
| }) | ||
| } | ||
| } | ||
|
|
||
| func TestIsLambdaOOM(t *testing.T) { | ||
| cases := []struct { | ||
| name string | ||
| rawLog string | ||
| want bool | ||
| }{ | ||
| { | ||
| name: "empty log", | ||
| rawLog: "", | ||
| want: false, | ||
| }, | ||
| { | ||
| name: "normal execution", | ||
| rawLog: "START RequestId: abc-123\nEND RequestId: abc-123\nREPORT RequestId: abc-123 Duration: 100 ms Memory Size: 512 MB Max Memory Used: 128 MB\n", | ||
| want: false, | ||
| }, | ||
| { | ||
| name: "OOM via signal killed", | ||
| rawLog: "START RequestId: abc-123\nRequestId: abc-123 Error: Runtime exited with error: signal: killed\n" + | ||
| "Runtime.ExitError\nEND RequestId: abc-123\n" + | ||
| "REPORT RequestId: abc-123 Duration: 5000 ms Memory Size: 512 MB Max Memory Used: 512 MB\n", | ||
| want: true, | ||
| }, | ||
| { | ||
| name: "OOM via memory match without signal killed", | ||
| rawLog: "START RequestId: abc-123\nEND RequestId: abc-123\nREPORT RequestId: abc-123 Duration: 5000 ms Memory Size: 256 MB Max Memory Used: 256 MB\n", | ||
| want: true, | ||
| }, | ||
| { | ||
| name: "timeout not detected as OOM", | ||
| rawLog: "START RequestId: abc-123\nEND RequestId: abc-123\nREPORT RequestId: abc-123 Duration: 300000 ms Memory Size: 512 MB Max Memory Used: 200 MB\n", | ||
| want: false, | ||
| }, | ||
| { | ||
| name: "signal killed without Runtime.ExitError not detected", | ||
| rawLog: "some log line with signal: killed but no exit error marker\n", | ||
| want: false, | ||
| }, | ||
| { | ||
| name: "Runtime.ExitError without signal killed not detected", | ||
| rawLog: "Runtime.ExitError\n", | ||
| want: false, | ||
| }, | ||
| { | ||
| name: "memory fields on separate lines", | ||
| rawLog: "Memory Size: 1024 MB\nMax Memory Used: 1024 MB\n", | ||
| want: true, | ||
| }, | ||
| } | ||
|
|
||
| for _, c := range cases { | ||
| t.Run(c.name, func(t *testing.T) { | ||
| got := isLambdaOOM(c.rawLog) | ||
| require.Equal(t, c.want, got) | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| func TestClassifyLambdaError(t *testing.T) { | ||
| cases := []struct { | ||
| name string | ||
| functionError string | ||
| statusCode int32 | ||
| payload []byte | ||
| rawLog string | ||
| wantCode codes.Code | ||
| wantSubstring string | ||
| wantIsGRPC bool | ||
| }{ | ||
| { | ||
| name: "timeout via payload", | ||
| functionError: "Unhandled", | ||
| statusCode: 200, | ||
| payload: []byte(`{"errorMessage":"Task timed out after 300.00 seconds"}`), | ||
| rawLog: "START RequestId: abc-123\nEND RequestId: abc-123\n", | ||
| wantCode: codes.DeadlineExceeded, | ||
| wantSubstring: "function timed out", | ||
| wantIsGRPC: true, | ||
| }, | ||
| { | ||
| name: "timeout via context deadline exceeded in logs", | ||
| functionError: "Unhandled", | ||
| statusCode: 200, | ||
| payload: []byte(`{}`), | ||
| rawLog: `{"level":"error","error":"context deadline exceeded","msg":"sync failed"}`, | ||
| wantCode: codes.DeadlineExceeded, | ||
| wantSubstring: "function timed out", | ||
| wantIsGRPC: true, | ||
| }, | ||
|
Comment on lines
+136
to
+147
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🟠 Bug: This test case will fail at runtime due to two compounding issues:
The function falls through to the generic error path returning |
||
| { | ||
| name: "OOM via signal killed", | ||
| functionError: "Unhandled", | ||
| statusCode: 200, | ||
| payload: []byte(`{}`), | ||
| rawLog: "START RequestId: abc-123\nRequestId: abc-123 Error: Runtime exited with error: signal: killed\n" + | ||
| "Runtime.ExitError\nEND RequestId: abc-123\n" + | ||
| "REPORT RequestId: abc-123 Duration: 5000 ms Memory Size: 512 MB Max Memory Used: 512 MB\n", | ||
| wantCode: codes.ResourceExhausted, | ||
| wantSubstring: "function ran out of memory", | ||
| wantIsGRPC: true, | ||
| }, | ||
| { | ||
| name: "OOM via memory limit reached", | ||
| functionError: "Unhandled", | ||
| statusCode: 200, | ||
| payload: []byte(`{}`), | ||
| rawLog: "START RequestId: abc-123\nEND RequestId: abc-123\nREPORT RequestId: abc-123 Duration: 5000 ms Memory Size: 256 MB Max Memory Used: 256 MB\n", | ||
| wantCode: codes.ResourceExhausted, | ||
| wantSubstring: "function ran out of memory", | ||
| wantIsGRPC: true, | ||
| }, | ||
| { | ||
| name: "generic error with filtered logs", | ||
| functionError: "Unhandled", | ||
| statusCode: 200, | ||
| payload: []byte(`{}`), | ||
| rawLog: "START RequestId: abc-123\nlambda-run: failed to get connector: auth error\nEND RequestId: abc-123\n", | ||
| wantSubstring: "lambda-run: failed to get connector: auth error", | ||
| wantIsGRPC: false, | ||
| }, | ||
| { | ||
| name: "generic error without meaningful logs", | ||
| functionError: "Unhandled", | ||
| statusCode: 200, | ||
| payload: []byte(`{}`), | ||
| rawLog: "START RequestId: abc-123\nEND RequestId: abc-123\nREPORT RequestId: abc-123 Duration: 100 ms Memory Size: 512 MB Max Memory Used: 128 MB\n", | ||
| wantSubstring: "lambda_transport: function returned error: Unhandled; status code: 200", | ||
| wantIsGRPC: false, | ||
| }, | ||
| } | ||
|
|
||
| for _, c := range cases { | ||
| t.Run(c.name, func(t *testing.T) { | ||
| err := classifyLambdaError(c.functionError, c.statusCode, c.payload, c.rawLog) | ||
| require.Error(t, err) | ||
| require.Contains(t, err.Error(), c.wantSubstring) | ||
|
|
||
| if c.wantIsGRPC { | ||
| st, ok := status.FromError(err) | ||
| require.True(t, ok, "expected gRPC status error") | ||
| require.Equal(t, c.wantCode, st.Code()) | ||
| } | ||
| }) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🟡 Suggestion: This pre-existing check searches `filteredLogs` for escaped-quote patterns (`\"error\":\"context deadline exceeded\"`), but `extractMeaningfulLogLines` already strips all JSON lines starting with `{` (line 189). If in production this pattern only appears in JSON-formatted log lines, this check would be dead code. Consider checking `rawLog` instead of `filteredLogs`, and matching the unescaped form `"error":"context deadline exceeded"`.