From 44a51792ade211fc056d39949047419d03b6f95c Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Wed, 23 Nov 2022 14:50:31 +0100 Subject: [PATCH] manager/state/raft/storage: TestReadRepairWAL: fix for etcd v3.5.5 etcd server v3.5.5 includes a fix to prevent the reader from trying to read more data than available in the file, introduced in https://github.com/etcd-io/etcd/commit/621cd7b9e5aa2ccf634b555e4ebe0037b8975066 > restrict the max size of each WAL entry to the remaining size of the file > > Currently the max size of each WAL entry is hard coded as 10MB. If users > set a value > 10MB for the flag --max-request-bytes, then etcd may run > into a situation that it successfully processes a big request, but fails > to decode it when replaying the WAL file on startup. > > On the other hand, we can't just remove the limitation, because if a > WAL entry is somehow corrupted, and its recByte is a huge value, then > etcd may run out of memory. So the solution is to restrict the max size > of each WAL entry as a dynamic value, which is the remaining size of > the WAL file. This patch updates the test to have sufficient data available in the corrupted file to perform recovery. Signed-off-by: Sebastiaan van Stijn --- manager/state/raft/storage/walwrap.go | 6 +++++- manager/state/raft/storage/walwrap_test.go | 24 ++++++++++++++++------ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/manager/state/raft/storage/walwrap.go b/manager/state/raft/storage/walwrap.go index 2fd0a91804..48252059eb 100644 --- a/manager/state/raft/storage/walwrap.go +++ b/manager/state/raft/storage/walwrap.go @@ -174,7 +174,11 @@ func ReadRepairWAL( return nil, WALData{}, errors.Wrap(err, "failed to decrypt WAL") } // we can only repair ErrUnexpectedEOF and we never repair twice. - if repaired || err != io.ErrUnexpectedEOF { + if repaired || !errors.Is(err, io.ErrUnexpectedEOF) { + // TODO(thaJeztah): should ReadRepairWAL be updated to handle cases where + // some (last) of the files cannot be recovered? ("best effort" recovery?) + // Or should an informative error be produced to help the user (which could + // mean: remove the last file?). See TestReadRepairWAL for more details. return nil, WALData{}, errors.Wrap(err, "irreparable WAL error") } if !wal.Repair(nil, walDir) { diff --git a/manager/state/raft/storage/walwrap_test.go b/manager/state/raft/storage/walwrap_test.go index 6105315bee..9996c5b234 100644 --- a/manager/state/raft/storage/walwrap_test.go +++ b/manager/state/raft/storage/walwrap_test.go @@ -231,12 +231,24 @@ func TestReadRepairWAL(t *testing.T) { require.NoError(t, err) require.Len(t, files, 1) - fName := filepath.Join(tempdir, files[0].Name()) - fileContents, err := os.ReadFile(fName) - require.NoError(t, err) - info, err := files[0].Info() - require.NoError(t, err) - require.NoError(t, os.WriteFile(fName, fileContents[:200], info.Mode())) + // Corrupt the file by truncating it to replicate a broken state (partial + // write), but for this test, make sure that there's enough bytes left to + // read a record (24 bytes). The record size (recBytes) is calculated here; + // + // - https://github.com/etcd-io/etcd/blob/621cd7b9e5aa2ccf634b555e4ebe0037b8975066/server/wal/decoder.go#L83 + // - https://github.com/etcd-io/etcd/blob/621cd7b9e5aa2ccf634b555e4ebe0037b8975066/server/wal/decoder.go#L122-L131 + // + // Using a shorter length will make the test fail: + // + // wal: max entry size limit exceeded, recBytes: 24, fileSize(200) - offset(184) - padBytes(0) = entryLimit(16) + // + // So the file should be >= 208 bytes. + // + // For further details, see: + // + // - https://github.com/etcd-io/etcd/commit/621cd7b9e5aa2ccf634b555e4ebe0037b8975066 / https://github.com/etcd-io/etcd/pull/14127 (backport of https://github.com/etcd-io/etcd/pull/14122) + // - https://github.com/etcd-io/etcd/issues/14114 + require.NoError(t, os.Truncate(filepath.Join(tempdir, files[0].Name()), 300)) ogWAL, err := OriginalWAL.Open(tempdir, snapshot) require.NoError(t, err)