-
Notifications
You must be signed in to change notification settings - Fork 243
Expand file tree
/
Copy pathstats-ring.c
More file actions
402 lines (368 loc) · 12.9 KB
/
Copy pathstats-ring.c
File metadata and controls
402 lines (368 loc) · 12.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
/*
* Per-child stats ring buffer + parent-side aggregate.
*
* Children produce stats deltas into their own ring (write-only-by-owner);
* the parent drains every ring once per main_loop iteration and applies
* the deltas to a parent-private struct stats_aggregate that lives in
* MAP_PRIVATE memory invisible to the kernel. The kernel can no longer
* scribble those counters via a wild syscall arg pointer because the
* authoritative copy is not at any kernel-visible address.
*
* The mirror page (struct stats_published) carries the small subset of
* the aggregate that children also need to read -- currently just
* fleet_op_count for the strategy rotation clock and the syscalls_todo
* termination check. Republished once per drain.
*/
#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include "arch.h" /* page_size, PAGE_MASK */
#include "child.h"
#include "pids.h"
#include "shm.h"
#include "spsc-ring.h"
#include "stats_ring.h"
#include "trinity.h"
#include "utils.h"
/*
* Aggregate counters that the drain path in this file (apply_slot() and
* stats_ring_drain()) accumulates ring deltas into.
*/
struct stats_aggregate parent_stats;
/*
* Mirror of the subset of parent_stats that is republished after each
* drain (currently fleet_op_count). NULL until stats_published_init()
* has run; the publish and freeze helpers below tolerate that.
*/
struct stats_published *shm_published;
/*
 * Reset a ring to the empty state: wipe every slot, then (re)initialise
 * the SPSC bookkeeping embedded in ring->base.
 *
 * ring must be non-NULL; unlike the enqueue/drain entry points, no
 * NULL guard is applied here.
 */
void stats_ring_init(struct stats_ring *ring)
{
	/* Slots first, so the ring bookkeeping is set up over zeroed storage. */
	memset(&ring->slots[0], 0, sizeof ring->slots);

	spsc_ring_init(&ring->base);
}
bool stats_ring_enqueue(struct stats_ring *ring, enum stats_field field,
uint16_t aux, uint32_t delta)
{
struct stats_ring_slot slot = {
.field_id = (uint16_t)field,
.aux = aux,
.delta = delta,
._reserved = 0,
};
if (ring == NULL)
return false;
return spsc_ring_try_enqueue(&ring->base, ring->slots, STATS_RING_SIZE,
sizeof(ring->slots[0]), &slot);
}
bool stats_ring_enqueue_call_complete(struct stats_ring *ring,
uint16_t category,
enum stats_result_class result)
{
struct stats_ring_slot slot = {
.field_id = (uint16_t)STATS_FIELD_CALL_COMPLETE,
.aux = category,
.delta = 1,
._reserved = (uint64_t)(uint8_t)result,
};
if (ring == NULL)
return false;
return spsc_ring_try_enqueue(&ring->base, ring->slots, STATS_RING_SIZE,
sizeof(ring->slots[0]), &slot);
}
/*
* Apply a single ring slot to parent_stats. Passed to spsc_ring_drain()
* as the per-slot callback by stats_ring_drain(); the ctx argument is
* unused.
*
* Validates the field_id / aux combination before touching any array
* index -- children produce hostile fuzzed workload and a wild
* value-result syscall buffer that scribbled a slot can leave any field
* at any value. Concretely:
*   - an unrecognised field_id is dropped without side effects;
*   - aux is only used as an index after a bounds check against the
*     relevant array size (MAX_NR_SYSCALL or NR_SYSCAT);
*   - delta is NOT range-checked: whatever 32-bit value the slot holds
*     is added to the selected counter.
*/
static void apply_slot(const void *p, void *ctx __unused__)
{
const struct stats_ring_slot *s = p;
enum stats_field field = (enum stats_field)s->field_id;
uint16_t aux = s->aux;
unsigned long delta = s->delta;
switch (field) {
/* Scalar counters: aux is ignored, delta is added as-is. */
case STATS_FIELD_OP_COUNT:
parent_stats.op_count += delta;
break;
case STATS_FIELD_FAULT_INJECTED:
parent_stats.fault_injected += delta;
break;
case STATS_FIELD_FAULT_CONSUMED:
parent_stats.fault_consumed += delta;
break;
case STATS_FIELD_SHARED_BUFFER_REDIRECTED:
parent_stats.shared_buffer_redirected += delta;
break;
case STATS_FIELD_LIBC_HEAP_REDIRECTED:
parent_stats.libc_heap_redirected += delta;
break;
case STATS_FIELD_LIBC_HEAP_EMBEDDED_REDIRECTED:
parent_stats.libc_heap_embedded_redirected += delta;
break;
case STATS_FIELD_ASB_RELOCATE_READABLE_SKIP:
parent_stats.asb_relocate_readable_skip += delta;
break;
case STATS_FIELD_ASB_RELOCATE_COPY_FAULT:
parent_stats.asb_relocate_copy_fault += delta;
break;
case STATS_FIELD_HEAP_POINTER_OUTSIDE_CACHE:
parent_stats.heap_pointer_outside_cache += delta;
break;
case STATS_FIELD_HEAP_BRK_STALE_WINDOW_HIT:
parent_stats.heap_brk_stale_window_hit += delta;
break;
case STATS_FIELD_RANGE_OVERLAPS_SHARED_REJECTS:
parent_stats.range_overlaps_shared_rejects += delta;
break;
case STATS_FIELD_GET_WRITABLE_SCRIBBLED_SHM_RANGE:
parent_stats.get_writable_address_scribbled_shm_range += delta;
break;
case STATS_FIELD_GET_WRITABLE_SCRIBBLED_MPROTECT_MMAP:
parent_stats.get_writable_address_scribbled_mprotect_mmap += delta;
break;
case STATS_FIELD_GET_WRITABLE_SCRIBBLED_MPROTECT_SHM:
parent_stats.get_writable_address_scribbled_mprotect_shm += delta;
break;
case STATS_FIELD_GET_WRITABLE_SCRIBBLED_POSTMP_MMAP:
parent_stats.get_writable_address_scribbled_postmp_mmap += delta;
break;
case STATS_FIELD_GET_WRITABLE_SCRIBBLED_POSTMP_SHM:
parent_stats.get_writable_address_scribbled_postmp_shm += delta;
break;
case STATS_FIELD_GET_WRITABLE_ENOMEM_EXHAUSTED:
parent_stats.get_writable_address_enomem_exhausted += delta;
break;
case STATS_FIELD_CHILDREN_RECYCLED_ON_STORM:
parent_stats.children_recycled_on_storm += delta;
break;
case STATS_FIELD_WATCHDOG_FD_EVICT:
parent_stats.watchdog_fd_evict += delta;
break;
case STATS_FIELD_UNSHARE_NEWNET_THROTTLED:
parent_stats.unshare_newnet_throttled += delta;
break;
/* Per-syscall arrays: aux is the index; out-of-range aux drops the slot. */
case STATS_FIELD_RANGE_REJECTS_PER_SYSCALL_64:
if (aux < MAX_NR_SYSCALL)
parent_stats.range_overlaps_shared_rejects_per_syscall_64[aux] += delta;
break;
case STATS_FIELD_RANGE_REJECTS_PER_SYSCALL_32:
if (aux < MAX_NR_SYSCALL)
parent_stats.range_overlaps_shared_rejects_per_syscall_32[aux] += delta;
break;
case STATS_FIELD_POST_HANDLER_CORRUPT_PTR:
parent_stats.post_handler_corrupt_ptr += delta;
break;
case STATS_FIELD_DEFERRED_FREE_REJECT:
parent_stats.deferred_free_reject += delta;
break;
case STATS_FIELD_DEFERRED_FREE_REJECT_PATHNAME:
parent_stats.deferred_free_reject_pathname += delta;
break;
case STATS_FIELD_DEFERRED_FREE_REJECT_IOVEC:
parent_stats.deferred_free_reject_iovec += delta;
break;
case STATS_FIELD_DEFERRED_FREE_REJECT_SOCKADDR:
parent_stats.deferred_free_reject_sockaddr += delta;
break;
case STATS_FIELD_DEFERRED_FREE_REJECT_OTHER:
parent_stats.deferred_free_reject_other += delta;
break;
case STATS_FIELD_SNAPSHOT_NON_HEAP_REJECT:
parent_stats.snapshot_non_heap_reject += delta;
break;
case STATS_FIELD_RING_EVICTION_CORRUPT:
parent_stats.ring_eviction_corrupt += delta;
break;
case STATS_FIELD_DEFERRED_FREE_CORRUPT_PTR:
parent_stats.deferred_free_corrupt_ptr += delta;
break;
case STATS_FIELD_ARG_SHADOW_STOMP:
parent_stats.arg_shadow_stomp += delta;
break;
case STATS_FIELD_TOTAL_CALLS:
parent_stats.total_calls += delta;
break;
case STATS_FIELD_REMOTE_CALLS:
parent_stats.remote_calls += delta;
break;
case STATS_FIELD_TOTAL_PCS:
parent_stats.total_pcs += delta;
break;
case STATS_FIELD_WARM_KNOWN_HITS:
parent_stats.total_warm_known_hits += delta;
break;
case STATS_FIELD_CMP_HINTS_TRY_GET_ATTEMPTS:
parent_stats.cmp_hints_try_get_attempts += delta;
break;
case STATS_FIELD_CMP_HINTS_TRY_GET_RETURNED:
parent_stats.cmp_hints_try_get_returned += delta;
break;
case STATS_FIELD_PER_SYSCALL_CMP_ATTEMPTS:
if (aux < MAX_NR_SYSCALL)
parent_stats.per_syscall_cmp_attempts[aux] += delta;
break;
case STATS_FIELD_PER_SYSCALL_CMP_RETURNED:
if (aux < MAX_NR_SYSCALL)
parent_stats.per_syscall_cmp_returned[aux] += delta;
break;
case STATS_FIELD_CALL_COMPLETE: {
/* One slot, three logical bumps. op_count is unconditional
* (the SPSC slot wouldn't have made it past spsc_ring_drain
* without head/tail ordering, so its arrival IS the proof
* that a child dispatched a syscall). category is gated on
* aux < NR_SYSCAT; a scribbled aux loses just the category
* bump for this slot. successes/failures is gated on a
* known result_class; any other byte value in _reserved is
* treated as INCOMPLETE so a scribbled slot cannot fabricate
* a success/failure attribution. */
/* Only the low byte of _reserved is examined; this mirrors the
* packing done by stats_ring_enqueue_call_complete(). */
uint8_t result = (uint8_t)s->_reserved;
parent_stats.op_count += delta;
if (aux < NR_SYSCAT)
parent_stats.syscall_category_count[aux] += delta;
if (result == STATS_RESULT_SUCCESS)
parent_stats.successes += delta;
else if (result == STATS_RESULT_FAILURE)
parent_stats.failures += delta;
break;
}
case STATS_FIELD_NR:
default:
/* Out-of-range field_id: silent drop. A scribbled slot can
* carry any value; the surrounding ring overflow counter
* already conveys "we lost samples". */
break;
}
}
/*
 * Drain one ring into parent_stats.
 *
 * Every slot handed back by spsc_ring_drain() is routed through
 * apply_slot(), and the overflow figure that spsc_ring_drain() writes
 * through its out-parameter is accumulated into
 * parent_stats.ring_overflow_total.
 *
 * Returns the slot count reported by spsc_ring_drain(), or 0 when ring
 * is NULL (in which case parent_stats is left untouched).
 */
unsigned int stats_ring_drain(struct stats_ring *ring)
{
	uint64_t dropped = 0;
	uint32_t applied = 0;

	if (ring != NULL) {
		applied = spsc_ring_drain(&ring->base, ring->slots,
					  STATS_RING_SIZE,
					  sizeof(ring->slots[0]),
					  apply_slot, NULL, &dropped);
		parent_stats.ring_overflow_total += dropped;
	}

	return applied;
}
/*
* Republish the mirror page from parent_stats. Caller must have already
* thawed the global-obj freeze (so the parent can write through to the
* mprotected page) and will refreeze afterwards.
*
* Mirror integrity is verified separately by shm_is_corrupt(): between
* this publish and the next iteration's read-back, nothing should write
* to the mirror, so a mismatch there flags a scribble.
*/
static void stats_publish_locked(void)
{
if (shm_published == NULL)
return;
__atomic_store_n(&shm_published->fleet_op_count, parent_stats.op_count,
__ATOMIC_RELAXED);
}
/*
 * Cheap plausibility test applied to a ring pointer before it is
 * dereferenced.  Rejects anything below 64 KiB and anything whose bits
 * 47..63 are neither all clear nor all set (i.e. not a canonical 48-bit
 * x86-64 address).
 *
 * The address is widened to 64 bits before the shift: shifting a 32-bit
 * uintptr_t right by 47 is undefined behaviour, so on 32-bit builds the
 * upper-bits test must be computed in a 64-bit type (where it is
 * trivially satisfied and only the low-address test can fail).
 */
static bool stats_ring_ptr_plausible(const struct stats_ring *ring)
{
	const uint64_t addr = (uint64_t)(uintptr_t)ring;
	const uint64_t top = addr >> 47;

	if (addr < 0x10000)
		return false;

	return top == 0 || top == 0x1ffff;
}

/*
 * Drain every child's stats ring into parent_stats, then republish the
 * mirror page via stats_publish_locked().
 *
 * Children with no childdata or no ring are skipped.  Two defensive
 * checks guard each remaining ring pointer before it is dereferenced:
 *
 *  1. Plausibility.  A D-state zombie waking after its slot was
 *     recycled can write a wild pointer into child->stats_ring;
 *     fd_event_drain_all() caught 0x9c000000890000 in the wild (bits
 *     48..55 hold 0x9c, so the value is non-canonical on x86-64).  Such
 *     a pointer, or any obviously low address, is logged, counted in
 *     shm->stats.stats_ring_corrupted, and the child is skipped for
 *     this pass rather than risking a SIGSEGV in the parent.
 *
 *  2. Canary.  The live pointer is compared with the known-good value
 *     recorded in expected_stats_rings[].  A mismatch means the field
 *     was overwritten after init; it is logged and counted in
 *     shm->stats.stats_ring_overwritten, and the expected pointer is
 *     drained instead so fuzzing can continue -- provided the expected
 *     pointer itself passes the plausibility test.
 *
 * Does nothing if the children array has not been set up.
 */
void stats_ring_drain_all(void)
{
	unsigned int i;

	if (children == NULL)
		return;

	for_each_child(i) {
		struct childdata *child;
		struct stats_ring *ring;
		struct stats_ring *expected;

		child = __atomic_load_n(&children[i], __ATOMIC_ACQUIRE);
		if (child == NULL)
			continue;

		ring = __atomic_load_n(&child->stats_ring, __ATOMIC_ACQUIRE);
		if (ring == NULL)
			continue;

		if (!stats_ring_ptr_plausible(ring)) {
			output(0, "stats_ring: child[%u] ring pointer %p is non-canonical, skipping\n",
			       i, (void *)ring);
			__atomic_add_fetch(&shm->stats.stats_ring_corrupted, 1,
					   __ATOMIC_RELAXED);
			continue;
		}

		expected = expected_stats_rings[i];
		if (ring != expected) {
			output(0, "stats_ring: child[%u] ring pointer %p overwritten (expected %p)\n",
			       i, (void *)ring, (void *)expected);
			__atomic_add_fetch(&shm->stats.stats_ring_overwritten, 1,
					   __ATOMIC_RELAXED);

			if (!stats_ring_ptr_plausible(expected)) {
				output(0, "stats_ring: child[%u] expected ring %p also non-canonical, skipping\n",
				       i, (void *)expected);
				continue;
			}

			/* Trust the value captured at init over the live field. */
			ring = expected;
		}

		(void) stats_ring_drain(ring);
	}

	stats_publish_locked();
}
void stats_published_init(void)
{
shm_published = alloc_shared(sizeof(struct stats_published));
memset(shm_published, 0, sizeof(*shm_published));
}
/*
* Per-child mprotect freeze of the shm_published mirror page. The
* mirror is parent-write / child-read: children read fleet_op_count
* off it on the cold path (maybe_rotate_strategy()'s rotation clock
* in random-syscall.c and the syscalls_todo termination check in
* child_process()), and the parent's stats_publish_locked() inside
* stats_ring_drain_all() is the sole writer. The mirror-integrity
* sample in shm_is_corrupt() (main.c) already documents the
* PROT_READ contract -- "republish-time we wrote ... and then
* mprotected the page PROT_READ" -- but the matching mprotect()
* call was missing, leaving the contract as comment only. A wild
* kernel store through a fuzzed syscall arg pointer could scribble
* fleet_op_count between publishes, perturbing the rotation clock
* and syscalls_todo progress; the integrity check would only flag
* the damage post-hoc.
*
* Called from the per-child post-fork init hook so the freeze
* applies in child address space. mprotect is per-process, so the
* parent's mapping stays PROT_READ|PROT_WRITE and the drain's
* publish keeps writing through; only children see the read-only
* view.
*
* Best-effort on failure: log via the canonical helper and continue.
* mprotect can ENOMEM if the kernel runs out of VMA slots splitting
* the mapping that backs the mirror (same failure mode as the
* freeze_sibling_childdata sweep) and turning a transient kernel
* limit into a fleet-wide crash would be
* worse than leaving the mirror RW for the lifetime of the affected
* child.
*/
void stats_published_freeze(void)
{
size_t bytes;
if (shm_published == NULL)
return;
bytes = sizeof(struct stats_published);
bytes = (bytes + page_size - 1) & PAGE_MASK;
if (mprotect(shm_published, bytes, PROT_READ) != 0)
log_mprotect_failure(shm_published, bytes, PROT_READ,
__builtin_return_address(0), errno);
}