diff --git a/collectivus-plugin-kernel-types.d.ts b/collectivus-plugin-kernel-types.d.ts index 25a27c3..10e92ee 100644 --- a/collectivus-plugin-kernel-types.d.ts +++ b/collectivus-plugin-kernel-types.d.ts @@ -1109,6 +1109,36 @@ export interface QueryScope { limit: number } +/** + * Opaque, versioned continuation token marking a sink's incremental-read + * watermark: the highest `_hyp_ingest_seq` a `(sink instance, partition)` has + * durably exported. `seq` is an int64 encoded as a decimal string to dodge + * bigint/JSON precision hazards. Opaque + versioned so the underlying watermark + * mechanism can change without invalidating persisted watermarks. See LLP 0040 §2. + */ +export interface SinkContinuation { + v: 1 + seq: string +} + +/** Options for the back-compatible incremental extension to `readRows`. */ +export interface ReadRowsOptions { + /** + * Yield only rows newer than this watermark (`_hyp_ingest_seq > since.seq`). + * Absent ⇒ full scan (today's behaviour). + */ + since?: SinkContinuation + /** + * Disposition of pre-upgrade null-seq "legacy" rows when `since` is set. + * `true` (default) treats them as new (one-time backlog export); `false` + * treats them as already-exported (skip). A sink passes `false` once it has a + * durable watermark, so the legacy backlog re-exports exactly once instead of + * on every tick (LLP 0040 §6 risk #1). No new null-seq row can appear + * post-upgrade, so excluding them after the first export never skips live data. + */ + includeLegacy?: boolean +} + /** * Intrinsic storage service exposed by core to plugins that materialize * rows into the local Iceberg-backed cache. 
Plugins do not configure @@ -1132,7 +1162,20 @@ export interface QueryStorageService { discoverCachePartitions(scope?: Partial): Promise tableExists(tablePath: string): boolean tableUrl(tablePath: string): string - readRows(tablePath: string, columns?: string[]): AsyncIterable> + readRows(tablePath: string, columns?: string[], opts?: ReadRowsOptions): AsyncIterable> + /** + * Cursor-aware sibling of `readRows` for sinks that must advance a + * per-(sink instance, partition) watermark. Pairs each internal-stripped row + * with the `after` continuation to persist ONCE that row is durably exported. + * The internal `_hyp_ingest_seq` never reaches the row payload — it is read to + * derive `after`, then stripped. `after` is a monotonic high-water mark, so a + * null-seq legacy row carries the prior watermark forward unchanged. See + * LLP 0040 §2. + */ + readRowsSince( + tablePath: string, + opts: { since?: SinkContinuation; columns?: string[]; includeLegacy?: boolean }, + ): AsyncIterable<{ row: Record; after: SinkContinuation }> } export interface CachePartitionMeta { diff --git a/hypaware-core/plugins-workspace/central/index.js b/hypaware-core/plugins-workspace/central/index.js index 5df8f2d..1ebea1e 100644 --- a/hypaware-core/plugins-workspace/central/index.js +++ b/hypaware-core/plugins-workspace/central/index.js @@ -2,6 +2,8 @@ import path from 'node:path' +import { createInstanceWatermarkStore } from '../../../src/core/sinks/incremental.js' + import { validateCentralConfig } from './src/config.js' import { createConfigPullLoop } from './src/config_client.js' import { IdentityClient } from './src/identity_client.js' @@ -60,11 +62,21 @@ export async function activate(ctx) { hyp_identity_source: source, }) + // Per-(sink instance, partition) incremental-read watermarks. 
The plugin + // `stateDir` is per-PLUGIN, so two `@hypaware/central` instances would + // share — and clobber — one watermark file and skip each other's rows; + // `createInstanceWatermarkStore` namespaces by the instance name, matching + // local-fs/s3. Each forward instance then reads only rows added since its + // own last successful export. + // @ref LLP 0040#watermark-contract [implements] — one watermark per (sink instance, partition), scoped by instance name + const watermarks = createInstanceWatermarkStore({ paths: sinkCtx.paths, instanceName: sinkCtx.name }) + const sink = createForwardSink({ config, identityClient, query, storage, + watermarks, log: sinkCtx.log, }) diff --git a/hypaware-core/plugins-workspace/central/src/sink.js b/hypaware-core/plugins-workspace/central/src/sink.js index f55ed97..674835f 100644 --- a/hypaware-core/plugins-workspace/central/src/sink.js +++ b/hypaware-core/plugins-workspace/central/src/sink.js @@ -5,7 +5,8 @@ import { createHash } from 'node:crypto' import { RETRY_BACKOFF_SECONDS, parseRetryAfter, abortableSleep } from './backoff.js' /** - * @import { ExportBatch, ExportOptions, ExportResult, PluginLogger, QueryPartition, QueryRegistry, QueryStorageService, Sink } from '../../../../collectivus-plugin-kernel-types.js' + * @import { ExportBatch, ExportOptions, ExportResult, PluginLogger, QueryPartition, QueryRegistry, QueryStorageService, Sink, SinkContinuation } from '../../../../collectivus-plugin-kernel-types.js' + * @import { SinkWatermarkKey, SinkWatermarkStore } from '../../../../src/core/sinks/types.js' * @import { IdentityClient } from './identity_client.js' * @import { CentralSinkConfig } from './types.js' */ @@ -47,6 +48,7 @@ const MAX_CHUNK_BYTES = 4 * 1024 * 1024 * identityClient: IdentityClient, * query: QueryRegistry, * storage: QueryStorageService, + * watermarks: SinkWatermarkStore, * log: PluginLogger, * fetchFn?: typeof fetch, * sleepFn?: (ms: number, signal?: AbortSignal) => Promise, @@ -54,7 +56,7 @@ const 
MAX_CHUNK_BYTES = 4 * 1024 * 1024 * @returns {Sink} */ export function createForwardSink(args) { - const { config, identityClient, query, storage, log } = args + const { config, identityClient, query, storage, watermarks, log } = args const fetchFn = args.fetchFn ?? fetch // Injectable so tests drive backpressure pacing without real waits. const sleepFn = args.sleepFn ?? abortableSleep @@ -90,7 +92,7 @@ export function createForwardSink(args) { const signal = signalForPartition(query, partition) try { bytesWritten += await forwardPartition({ - partition, signal, config, identityClient, storage, fetchFn, log, + partition, signal, config, identityClient, storage, watermarks, fetchFn, log, abortSignal: abortController.signal, sleepFn, }) partitionsExported += 1 @@ -158,16 +160,27 @@ function signalForPartition(query, partition) { /** * Stream one partition's rows to `/v1/ingest/{signal}` in bounded - * chunks, never materializing the whole table. Each chunk POSTs with an - * `X-Hyp-Batch-Id` derived from the signal, the partition identity, the - * chunk's position, and its bytes (see {@link batchIdForChunk}): stable - * across retries of that exact chunk, yet distinct for any other chunk, - * so two byte-identical chunks never collide. When the driver re-hands a - * partition after a transport failure, re-streaming reproduces the same - * chunk boundaries, so the unchanged prefix chunks hash to the same ids - * and the server's idempotency ledger (server LLP 0001) acks them `202` - * without re-storing. A partial-then-retried partition thus converges to - * exactly-once instead of duplicating every already-delivered row. + * chunks, never materializing the whole table. Only rows added since the + * last durable export are read: the `(sink instance, partition)` + * watermark is loaded up front and handed to `readRowsSince({ since })`, + * so a tick with no new rows reads zero rows and sends zero chunks. 
Each + * chunk POSTs with an `X-Hyp-Batch-Id` derived from the signal, the + * partition identity, the chunk's position, and its bytes (see + * {@link batchIdForChunk}): stable across retries of that exact chunk, + * yet distinct for any other chunk — so two byte-identical chunks never + * collide. When the driver re-hands a partition after a transport + * failure, re-streaming from the same watermark reproduces the same chunk + * boundaries, so the unchanged prefix chunks hash to the same ids and the + * server's idempotency ledger (server LLP 0001) acks them `202` without + * re-storing. The watermark advances ONCE, after the whole partition's chunks + * are acked (ship first, advance second), to the partition's high-water `after` + * — never mid-partition. A partial partition (an early chunk acked, a later one + * failed) therefore never checkpoints, so a crash/failure re-reads the whole + * partition next tick and the server ledger dedupes the already-acked prefix. + * Mid-partition advance is unsafe because the scan is NOT seq-ordered (LLP 0040 + * §4 risk #3): `after` is a running max, so a chunk that physically precedes a + * lower-seq chunk would advance the watermark past rows still un-acked in a + * later chunk, silently skipping them forever on a between-chunk failure. 
* * @param {{ * partition: QueryPartition, @@ -175,6 +188,7 @@ function signalForPartition(query, partition) { * config: CentralSinkConfig, * identityClient: IdentityClient, * storage: QueryStorageService, + * watermarks: SinkWatermarkStore, * fetchFn: typeof fetch, * log: PluginLogger, * abortSignal: AbortSignal, @@ -182,7 +196,7 @@ function signalForPartition(query, partition) { * }} args * @returns {Promise} bytes successfully POSTed for this partition */ -async function forwardPartition({ partition, signal, config, identityClient, storage, fetchFn, log, abortSignal, sleepFn }) { +async function forwardPartition({ partition, signal, config, identityClient, storage, watermarks, fetchFn, log, abortSignal, sleepFn }) { if (!KNOWN_SIGNALS.has(signal)) { throw new Error(`central.forward: unknown signal '${signal}' (expected logs|traces|metrics|proxy)`) } @@ -193,18 +207,62 @@ async function forwardPartition({ partition, signal, config, identityClient, sto const tablePath = partition.tablePath await flushPartition(storage, tablePath, 'sink_export') + // @ref LLP 0040#watermark-contract [implements] — load the per-(sink instance, partition) watermark so this tick reads only rows added since the last durable export; a missing/unreadable watermark reads from the start (at-least-once + server dedup), never a silent skip. + /** @type {SinkContinuation | undefined} */ + let since + /** @type {SinkWatermarkKey | undefined} */ + let watermarkKey + let exportedRowCount = 0 + try { + watermarkKey = watermarks.keyFor(storage.cacheRoot, tablePath) + const record = await watermarks.read(watermarkKey) + since = record?.continuation + exportedRowCount = record?.exportedRowCount ?? 0 + } catch (err) { + // An underivable key or unreadable watermark must not wedge the sink: + // fall back to a full scan (the server ledger dedupes the redelivery) + // and skip watermark writes for this partition this tick. 
+ watermarkKey = undefined + since = undefined + exportedRowCount = 0 + log.warn('central.forward.watermark_read_failed', { + hyp_dataset: partition.dataset, + message: err instanceof Error ? err.message : String(err), + }) + } + let bytesWritten = 0 let chunkIndex = 0 + // Rows acked across THIS partition's chunks. Accumulated as each chunk POSTs + // so the single end-of-partition watermark write carries an accurate count. + let shippedRowCount = 0 /** @type {string[]} */ let lines = [] let pendingBytes = 0 + // `after` token of the most recently buffered row; after the loop it is the + // partition's high-water `after`, the watermark to persist once every chunk + // is acked. + /** @type {SinkContinuation | undefined} */ + let lastAfter + // The seq this chunk starts AFTER — the `since` watermark for the first + // chunk, then the previous chunk's last `after` seq. The idempotency key is + // derived from THIS rather than the per-tick `chunkIndex`. The watermark is + // written only once, after every chunk of the partition is acked, so a + // failed tick re-reads from the same `since` and each re-streamed chunk + // reproduces the same `[startSeq, body]`; the server ledger then dedupes the + // redelivery. NOTE: `after` is a running high-water mark, so a chunk that + // does not raise it (e.g. legacy null-seq rows) leaves the next chunk with + // the same start seq; such chunks are told apart only by their bytes. + let chunkStartSeq = since?.seq ?? '0' const flushChunk = async () => { if (lines.length === 0) return const body = lines.join('\n') + '\n' - const batchId = batchIdForChunk(signal, tablePath, chunkIndex, body) + // @ref LLP 0040#applying-it-to-both-sinks [implements] — stable per-chunk batch id keyed by the chunk's start seq, so a re-streamed partition reproduces the same id and the server ledger dedupes.
+ const batchId = batchIdForChunk(signal, tablePath, chunkStartSeq, body) const bytes = Buffer.byteLength(body, 'utf8') const rows = lines.length + const after = lastAfter try { await postNdjson({ centralUrl: config.url, signal, body, batchId, identityClient, fetchFn, log, abortSignal, sleepFn, @@ -231,13 +289,24 @@ async function forwardPartition({ partition, signal, config, identityClient, sto }) bytesWritten += bytes chunkIndex += 1 + shippedRowCount += rows + // The next chunk starts after this chunk's last row, so its batch id keys + // off this chunk's `after` — keeping ids stable whether a tick streams the + // whole partition or a respool replays only the un-acked suffix. + if (after) chunkStartSeq = after.seq lines = [] pendingBytes = 0 } - for await (const row of storage.readRows(tablePath)) { + // @ref LLP 0040#storage-api-extension [implements] — pre-upgrade null-seq rows + // are "new" only on a sink with no durable watermark (export the backlog once); + // once a watermark exists they are already shipped, so exclude them and the + // legacy backlog never re-exports every tick (LLP 0040 §6 risk #1). + const includeLegacy = since === undefined + for await (const { row, after } of storage.readRowsSince(tablePath, { since, includeLegacy })) { const line = JSON.stringify(serializeRow(row)) lines.push(line) + lastAfter = after // Count UTF-8 bytes (not UTF-16 code units) so the budget bounds the // actual wire size for multibyte payloads, e.g. CJK `content_text`. pendingBytes += Buffer.byteLength(line, 'utf8') + 1 @@ -246,28 +315,52 @@ async function forwardPartition({ partition, signal, config, identityClient, sto } } await flushChunk() + + // @ref LLP 0040#watermark-contract [implements] — ship first, advance second, + // but advance ONLY at end-of-partition (like the blob sink). 
Every chunk is + // acked by the time we reach here (a failed POST throws out of flushChunk + // before this), so persisting the partition's high-water `after` can never + // checkpoint past an un-acked row. A between-chunk failure leaves the + // watermark untouched: the next tick re-reads the whole partition and the + // server ledger dedupes the already-acked prefix. Advancing per chunk to the + // running-max `after` would skip lower-seq rows in a later un-acked chunk + // whenever the scan is not seq-ordered (LLP 0040 §4 risk #3). + if (watermarkKey && lastAfter && shippedRowCount > 0) { + await watermarks.write(watermarkKey, { + continuation: lastAfter, + exportedRowCount: exportedRowCount + shippedRowCount, + }) + } return bytesWritten } /** * Deterministic idempotency key for one chunk. Hashes the signal, the - * partition identity (`tablePath`), the chunk's ordinal position, and - * its exact bytes. Re-streaming a partition reproduces the same chunk - * boundaries and order, so a re-sent chunk hashes to the same id (the - * server dedupes it); two byte-identical chunks at different positions or - * in different partitions get distinct ids and are both stored. + * partition identity (`tablePath`), the seq this chunk starts AFTER, and its + * exact bytes. + * + * Keying on `chunkStartSeq` (the seq the chunk resumes after) rather than a + * per-tick ordinal ties the id to the rows the chunk follows. The watermark is + * written only after every chunk of a partition is acked, so a retry after a + * failed or lost-ack POST re-reads from the same watermark and reproduces the + * same `[startSeq, body]` — and so the same id — letting the server ledger + * dedupe a chunk that committed but whose ack was lost. (The id does not + * depend on how many chunks preceded this one in the tick.)
Two byte-identical chunks at + * different positions normally get distinct ids because their start seqs differ + * (a start seq can repeat when a chunk does not raise the running-max `after`); + * chunks in different partitions differ on `tablePath`. * * @param {string} signal * @param {string} tablePath - * @param {number} chunkIndex + * @param {string} chunkStartSeq decimal `_hyp_ingest_seq` the chunk starts after * @param {string} body * @returns {string} */ -function batchIdForChunk(signal, tablePath, chunkIndex, body) { +function batchIdForChunk(signal, tablePath, chunkStartSeq, body) { return createHash('sha256') .update(signal).update('\0') .update(tablePath).update('\0') - .update(String(chunkIndex)).update('\0') + .update(chunkStartSeq).update('\0') .update(body) .digest('hex').slice(0, 32) } diff --git a/hypaware-core/plugins-workspace/local-fs/src/index.js b/hypaware-core/plugins-workspace/local-fs/src/index.js index 0561da8..3acf93e 100644 --- a/hypaware-core/plugins-workspace/local-fs/src/index.js +++ b/hypaware-core/plugins-workspace/local-fs/src/index.js @@ -4,12 +4,20 @@ import { Buffer } from 'node:buffer' import fs from 'node:fs/promises' import path from 'node:path' -import { encodePartition, clusterColumnsForDataset } from 'hypaware/core/sinks' +import { + encodePartition, + clusterColumnsForDataset, + createInstanceWatermarkStore, + openIncrementalRows, + watermarkKeyFor, + withSeqRangeFilename, +} from 'hypaware/core/sinks' import { createLocalFsBlobStore, resolveExportsBaseDir } from './blob-store.js' /** - * @import { ExportBatch, ExportOptions, ExportResult, PluginActivationContext, QueryPartition, QueryRegistry, QueryStorageService, Sink, SinkCreateContext, SinkEncodedBlob, SinkEncoder } from '../../../../collectivus-plugin-kernel-types.js' + * @import { ExportBatch, ExportOptions, ExportResult, PluginActivationContext, QueryPartition, QueryRegistry, QueryStorageService, Sink, SinkCreateContext, SinkEncoder } from '../../../../collectivus-plugin-kernel-types.js' + * @import {
SinkWatermarkStore } from '../../../../src/core/sinks/types.js' */ const PLUGIN_NAME = '@hypaware/local-fs' @@ -30,9 +38,9 @@ const PLUGIN_VERSION = '1.0.0' * * The sink closes over the activation context so its `exportBatch` can * (a) look up dataset schemas through `ctx.query.getDataset` and - * (b) stream cache rows through `ctx.storage.readRows`: both inputs - * are then handed to the paired encoder via the kernel's - * `sink.encode_partition` helper. + * (b) stream the cache rows added since its watermark through + * `ctx.storage.readRowsSince` — both inputs are then handed to the paired + * encoder via the kernel's `sink.encode_partition` helper. * * @param {PluginActivationContext} ctx * @ref LLP 0014#bytes-flow-down-semantics-flow-up [implements]: provides hypaware.blob-store, never knows its bytes' format @@ -68,6 +76,7 @@ export async function activate(ctx) { sinkCtx, query: ctx.query, storage: ctx.storage, + watermarks: createInstanceWatermarkStore({ paths: sinkCtx.paths, instanceName: sinkCtx.name }), }) return sink }, @@ -81,10 +90,11 @@ export async function activate(ctx) { * sinkCtx: SinkCreateContext, * query: QueryRegistry, * storage: QueryStorageService, + * watermarks: SinkWatermarkStore, * }} args * @returns {Sink} */ -function buildSink({ baseDir, encoder, sinkCtx, query, storage }) { +function buildSink({ baseDir, encoder, sinkCtx, query, storage, watermarks }) { return { /** * @param {ExportBatch} batch @@ -102,24 +112,53 @@ function buildSink({ baseDir, encoder, sinkCtx, query, storage }) { if (partition.tablePath) { await flushPartition(storage, partition.tablePath, 'sink_export') } - const rows = openRows(storage, partition) + // Incremental read: only rows added since this (instance, partition) + // last durably exported. The watermark is keyed by the stable logical + // partition path, so it survives retention prunes and compaction + // generation swaps. 
@ref LLP 0040#applying-it-to-both-sinks + const wmKey = watermarkKeyFor(watermarks, storage, partition) + const prev = wmKey ? await watermarks.read(wmKey) : null + const reader = await openIncrementalRows(storage, partition, prev?.continuation) + if (reader.empty) { + // No new rows since the watermark ⇒ write no blob (0 bytes). + sinkCtx.log.debug('local-fs.export_partition.skip_empty', { + hyp_plugin: PLUGIN_NAME, + hyp_dataset: partition.dataset, + hyp_sink_instance: sinkCtx.name, + since_seq: reader.sinceSeq, + }) + continue + } const blob = await encodePartition(encoder, partition, { log: sinkCtx.log, tempDir: sinkCtx.paths.tempDir, columns, - rows, + rows: reader.rows, clusterColumns: clusterColumnsForDataset(query, partition.dataset), sinkInstance: sinkCtx.name, plugin: PLUGIN_NAME, }) - const destPath = await writeBlob(baseDir, partition, blob) + // Embed [sinceSeq, lastSeq] so a crash-retry re-writes the same file + // (idempotent overwrite) — the blob sink's server-ledger stand-in. + const filename = withSeqRangeFilename(blob.filename, reader.sinceSeq, reader.lastAfter.seq) + const destPath = await writeBlob(baseDir, partition, filename, blob.bytes) + // Durable now: advance the watermark to this blob's last row. + if (wmKey) { + await watermarks.write(wmKey, { + continuation: reader.lastAfter, + exportedRowCount: (prev?.exportedRowCount ?? 0) + reader.rowCount, + }) + } bytesWritten += blob.bytesWritten ?? 0 exported += 1 sinkCtx.log.debug('local-fs.export_partition.ok', { hyp_plugin: PLUGIN_NAME, hyp_dataset: partition.dataset, hyp_sink_instance: sinkCtx.name, - hyp_sink_filename: blob.filename, + hyp_sink_filename: filename, + since_seq: reader.sinceSeq, + last_seq: reader.lastAfter.seq, + row_count: reader.rowCount, bytes_written: blob.bytesWritten ?? 0, dest_path: destPath, }) @@ -173,54 +212,32 @@ function lookupColumns(query, datasetName) { return dataset.schema.columns } -/** - * Open the partition's cache rows as an async iterable. 
When the - * partition lacks a `tablePath` (or the table doesn't exist yet on - * disk), yield nothing instead of throwing: the encoder will land an - * empty file at the expected path, which is the right behavior for a - * registered-but-empty partition. - * - * @param {QueryStorageService} storage - * @param {QueryPartition} partition - * @returns {AsyncIterable>} - */ -function openRows(storage, partition) { - if (!partition.tablePath) return emptyAsyncIterable() - if (!storage.tableExists(partition.tablePath)) return emptyAsyncIterable() - return storage.readRows(partition.tablePath) -} - -/** @returns {AsyncIterable>} */ -function emptyAsyncIterable() { - return { - async *[Symbol.asyncIterator]() {}, - } -} - /** * Persist the encoded bytes under - * `<baseDir>/<dataset>/<partition>/<filename>`. Streams + * `<baseDir>/<dataset>/<partition>/<filename>`. `filename` carries + * the encoder's name plus the embedded `[sinceSeq, lastSeq]` range. Streams * are concatenated in memory before the write because the smoke pipes * 50-row files; a future streaming refactor lives behind the same * interface. * * @param {string} baseDir * @param {QueryPartition} partition - * @param {SinkEncodedBlob} blob + * @param {string} filename + * @param {Uint8Array | AsyncIterable<Uint8Array>} blobBytes * @returns {Promise} */ -async function writeBlob(baseDir, partition, blob) { +async function writeBlob(baseDir, partition, filename, blobBytes) { const partitionDir = path.join(baseDir, partition.dataset, partitionSegment(partition)) await fs.mkdir(partitionDir, { recursive: true }) - const destPath = path.join(partitionDir, blob.filename) + const destPath = path.join(partitionDir, filename) /** @type {Uint8Array} */ let bytes - if (blob.bytes instanceof Uint8Array) { - bytes = blob.bytes + if (blobBytes instanceof Uint8Array) { + bytes = blobBytes } else { /** @type {Uint8Array[]} */ const chunks = [] - for await (const chunk of blob.bytes) chunks.push(chunk) + for await (const chunk of blobBytes) chunks.push(chunk) bytes = chunks.length === 1 ?
chunks[0] : Buffer.concat(chunks.map((c) => Buffer.from(c.buffer, c.byteOffset, c.byteLength))) } await fs.writeFile(destPath, bytes) diff --git a/hypaware-core/plugins-workspace/s3/src/index.js b/hypaware-core/plugins-workspace/s3/src/index.js index 6b59b67..e72567a 100644 --- a/hypaware-core/plugins-workspace/s3/src/index.js +++ b/hypaware-core/plugins-workspace/s3/src/index.js @@ -2,7 +2,14 @@ import { Buffer } from 'node:buffer' -import { encodePartition, clusterColumnsForDataset } from 'hypaware/core/sinks' +import { + encodePartition, + clusterColumnsForDataset, + createInstanceWatermarkStore, + openIncrementalRows, + watermarkKeyFor, + withSeqRangeFilename, +} from 'hypaware/core/sinks' import { createS3BlobStore, @@ -18,6 +25,7 @@ import { keyIsWithinPrefix, renderObjectKey } from './keys.js' /** * @import { BlobStore, ExportBatch, ExportOptions, ExportResult, PluginActivationContext, QueryPartition, QueryRegistry, QueryStorageService, Sink, SinkCreateContext, SinkEncoder } from '../../../../collectivus-plugin-kernel-types.js' + * @import { SinkWatermarkStore } from '../../../../src/core/sinks/types.js' * @import { S3BlobStoreClientFactory, S3ClientFactory, S3ClientHandle, S3ErrorKind, S3QuerySourceConfig, S3SinkConfig } from './types.js' */ @@ -109,6 +117,7 @@ export async function activate(ctx) { sinkCtx, query: ctx.query, storage: ctx.storage, + watermarks: createInstanceWatermarkStore({ paths: sinkCtx.paths, instanceName: sinkCtx.name }), }) }, }) @@ -240,10 +249,11 @@ function resolveClientFactory(sinkCtx) { * sinkCtx: SinkCreateContext, * query: QueryRegistry, * storage: QueryStorageService, + * watermarks: SinkWatermarkStore, * }} args * @returns {Sink} */ -function buildSink({ config, client, encoder, sinkCtx, query, storage }) { +function buildSink({ config, client, encoder, sinkCtx, query, storage, watermarks }) { return { /** * @param {ExportBatch} batch @@ -263,14 +273,30 @@ function buildSink({ config, client, encoder, sinkCtx, query, storage 
}) { if (partition.tablePath) { await flushPartition(storage, partition.tablePath, 'sink_export') } - const rows = openRows(storage, partition) + // Incremental read: only rows added since this (instance, partition) + // last durably PUT. The watermark is keyed by the stable logical + // partition path, so it survives retention prunes and compaction + // generation swaps. @ref LLP 0040#applying-it-to-both-sinks + const wmKey = watermarkKeyFor(watermarks, storage, partition) + const prev = wmKey ? await watermarks.read(wmKey) : null + const reader = await openIncrementalRows(storage, partition, prev?.continuation) + if (reader.empty) { + // No new rows since the watermark ⇒ PUT no object (0 bytes). + sinkCtx.log.debug('s3.put_object.skip_empty', { + hyp_plugin: PLUGIN_NAME, + hyp_sink_instance: sinkCtx.name, + hyp_dataset: partition.dataset, + since_seq: reader.sinceSeq, + }) + continue + } let blob try { blob = await encodePartition(encoder, partition, { log: sinkCtx.log, tempDir: sinkCtx.paths.tempDir, columns, - rows, + rows: reader.rows, clusterColumns: clusterColumnsForDataset(query, partition.dataset), sinkInstance: sinkCtx.name, plugin: PLUGIN_NAME, @@ -278,10 +304,13 @@ function buildSink({ config, client, encoder, sinkCtx, query, storage }) { } catch (err) { throw tagError(err, 'encoder_failed') } + // Embed [sinceSeq, lastSeq] so a crash-retry re-PUTs the same object + // key (idempotent overwrite) — the blob sink's server-ledger stand-in. 
+ const filename = withSeqRangeFilename(blob.filename, reader.sinceSeq, reader.lastAfter.seq) const objectKey = renderObjectKey({ prefix: config.prefix, partition, - filename: blob.filename, + filename, }) if (!keyIsWithinPrefix({ prefix: config.prefix, dataset: partition.dataset, key: objectKey })) { throw new Error(`${PLUGIN_NAME}: rendered key '${objectKey}' is outside prefix '${config.prefix}'`) @@ -299,6 +328,13 @@ function buildSink({ config, client, encoder, sinkCtx, query, storage }) { else if (encoder.format === 'jsonl') putInput.ContentType = 'application/jsonl' await client.putObject(putInput) + // Durable now: advance the watermark to this object's last row. + if (wmKey) { + await watermarks.write(wmKey, { + continuation: reader.lastAfter, + exportedRowCount: (prev?.exportedRowCount ?? 0) + reader.rowCount, + }) + } const partitionBytes = blob.bytesWritten ?? body.byteLength bytesWritten += partitionBytes exported += 1 @@ -311,7 +347,9 @@ function buildSink({ config, client, encoder, sinkCtx, query, storage }) { prefix: config.prefix, object_key: objectKey, bytes_written: partitionBytes, - row_count: blob.rowCount ?? 0, + row_count: reader.rowCount, + since_seq: reader.sinceSeq, + last_seq: reader.lastAfter.seq, status: 'ok', }) } catch (err) { @@ -426,24 +464,6 @@ async function flushPartition(storage, tablePath, reason) { } } -/** - * @param {QueryStorageService} storage - * @param {QueryPartition} partition - * @returns {AsyncIterable>} - */ -function openRows(storage, partition) { - if (!partition.tablePath) return emptyAsyncIterable() - if (!storage.tableExists(partition.tablePath)) return emptyAsyncIterable() - return storage.readRows(partition.tablePath) -} - -/** @returns {AsyncIterable>} */ -function emptyAsyncIterable() { - return { - async *[Symbol.asyncIterator]() {}, - } -} - /** * S3's `PutObject` API works best with a known-length payload. 
Drain * streaming encoders into a single `Uint8Array` here so we can supply diff --git a/hypaware-core/smoke/flows/blob_sink_parquet_local_fs.js b/hypaware-core/smoke/flows/blob_sink_parquet_local_fs.js index 55a1aa5..8579a60 100644 --- a/hypaware-core/smoke/flows/blob_sink_parquet_local_fs.js +++ b/hypaware-core/smoke/flows/blob_sink_parquet_local_fs.js @@ -206,12 +206,19 @@ export async function run({ harness, expect }) { ) // Path: <destinationDir>/<dataset>/<partition>/<filename>. - // The fixture's only partition is `{partition: 'all'}`, so both the - // directory and the file render as `partition=all` (different encoders - // can short-circuit to `all.parquet` for partition-less datasets via - // the empty-entries branch). + // The fixture's only partition is `{partition: 'all'}`, so the directory + // renders as `partition=all`. Incremental sink reads (LLP 0040) embed the + // exported `[sinceSeq, lastSeq]` ingest-seq range in the filename for + // idempotent re-PUT, so the blob is `partition=all.<sinceSeq>-<lastSeq>.parquet` + // (a first export starts at sinceSeq 0). const expectedDir = path.join(destinationDir, DATASET, 'partition=all') - const expectedFile = path.join(expectedDir, 'partition=all.parquet') + const blobNames = (await fs.readdir(expectedDir)).filter((n) => /^partition=all\.\d+-\d+\.parquet$/.test(n)) + expect.that( + `destination: exactly one ranged parquet blob under ${expectedDir}`, + blobNames, + (names) => Array.isArray(names) && names.length === 1 + ) + const expectedFile = path.join(expectedDir, blobNames[0] ??
'partition=all.0-0.parquet') const stat = await fs.stat(expectedFile) expect.that( `destination: ${expectedFile} is a non-empty regular file`, diff --git a/hypaware-core/smoke/flows/incremental_sink_compaction.js b/hypaware-core/smoke/flows/incremental_sink_compaction.js new file mode 100644 index 0000000..bd9e867 --- /dev/null +++ b/hypaware-core/smoke/flows/incremental_sink_compaction.js @@ -0,0 +1,306 @@ +// @ts-check + +import fs from 'node:fs/promises' +import path from 'node:path' +import { fileURLToPath } from 'node:url' + +import { parquetReadObjects } from 'hyparquet' + +import { + Attr, + installObservability, + runRoot, +} from '../../../src/core/observability/index.js' +import { createCommandRegistry } from '../../../src/core/registry/commands.js' +import { createSinkDriver } from '../../../src/core/sinks/driver.js' +import { createKernelRuntime } from '../../../src/core/runtime/activation.js' +import { activatePlugins } from '../../../src/core/runtime/loader.js' +import { loadManifests } from '../../../src/core/manifest.js' +import { maintainCache } from '../../../src/core/cache/maintenance.js' +import { readCursorSync } from '../../../src/core/cache/partition.js' + +/** + * @import { ActivePlugin, ColumnSpec } from '../../../collectivus-plugin-kernel-types.js' + * @import { Dirent } from 'node:fs' + */ + +const SMOKE_DIR = path.dirname(fileURLToPath(import.meta.url)) +const PLUGINS_WORKSPACE = path.resolve(SMOKE_DIR, '../../plugins-workspace') +const DATASET = 'proxy' +const SOURCE = 'claude' +const SINK_INSTANCE = 'archive' + +/** @type {ColumnSpec[]} */ +const COLUMNS = [ + { name: 'id', type: 'INT64', nullable: false }, + { name: 'client_name', type: 'STRING', nullable: false }, + { name: 'msg', type: 'STRING', nullable: false }, +] + +/** + * Acceptance smoke for incremental sink reads (LLP 0040, T6) through the REAL + * sink driver. 
Stands up `@hypaware/format-parquet` + `@hypaware/local-fs` plus + * a fixture `proxy` dataset, then drives the blob sink across the cache rewrite + * that makes incremental export hard — a compaction GENERATION SWAP — and proves: + * + * - tick 1 (3 rows): one parquet blob lands carrying exactly those rows; + * - tick 2 (no new rows): the sink writes NO new blob and reports ≈0 bytes; + * - a compaction rewrites the partition into a fresh `table-` dir; + * - tick 3 (2 new rows): exactly one new blob lands carrying ONLY the 2 new + * rows — the row-resident `_hyp_ingest_seq` rode the compaction verbatim and + * the logical-path watermark read straight through the generation swap; + * - across all ticks every row is exported exactly once (no skip, no dup). + * + * The forward-sink and retention-prune equivalents are covered by the + * deterministic acceptance suite in `test/core/sink-incremental-acceptance.test.js`. + * + * @param {{ harness: any, expect: any }} args + * @ref LLP 0040#exactly-once-argument [tests] — blob sink reads straight through a compaction generation swap; ≈0 on no-new-rows, ≈N on N-new + */ +export async function run({ harness, expect }) { + const obs = installObservability() + if (!obs.tracer.provider) { + throw new Error('incremental_sink_compaction: tracer provider not installed — expected HYP_DEV_TELEMETRY=1') + } + + const cacheRoot = path.join(harness.stateDir, 'cache') + const registry = createCommandRegistry() + const kernel = createKernelRuntime({ commandRegistry: registry, cacheRoot }) + + const destinationDir = path.join(harness.tmpDir, 'sink-out') + await fs.mkdir(destinationDir, { recursive: true }) + + const fixtureDir = path.join(harness.tmpDir, 'plugins', 'test-proxy') + await writeFixturePlugin(fixtureDir) + + const parquetDir = path.join(PLUGINS_WORKSPACE, 'format-parquet') + const localFsDir = path.join(PLUGINS_WORKSPACE, 'local-fs') + const tmpRoot = path.join(harness.tmpDir, 'plugin-temp') + await fs.mkdir(tmpRoot, { 
recursive: true }) + + await runRoot( + 'kernel.boot', + { + [Attr.COMPONENT]: 'kernel', + [Attr.OPERATION]: 'boot', + [Attr.SMOKE_NAME]: harness.smokeName, + [Attr.SMOKE_STEP]: 'plugin_activate', + [Attr.DEV_RUN_ID]: harness.devRunId, + status: 'ok', + }, + async () => { + const { loaded, failed } = await loadManifests([parquetDir, localFsDir, fixtureDir]) + if (failed.length > 0) { + throw new Error(`incremental_sink_compaction: manifest failures — ${failed.map((f) => `${f.manifestPath}: ${f.message}`).join('; ')}`) + } + const entries = loaded.map((l) => ({ manifest: l.manifest, rootDir: l.rootDir })) + const result = await activatePlugins({ plugins: entries, stateRoot: harness.stateDir, runId: harness.devRunId, runtime: kernel, tmpRoot }) + for (const r of result.results) { + if (!r.ok) throw new Error(`activate ${r.plugin.name} failed (${r.errorKind}): ${r.message}`) + } + } + ) + + const encoder = /** @type {any} */ (kernel.capabilities.require('@hypaware/local-fs', 'hypaware.encoder', '^1.0.0')) + const contribution = kernel.sinks.getContribution('@hypaware/local-fs', 'local-fs') + expect.that('sinks: local-fs contributed a local-fs sink', contribution, (v) => v !== undefined) + if (!contribution) return + + /** @type {ActivePlugin} */ + const destinationPlugin = { + name: '@hypaware/local-fs', + version: '1.0.0', + manifest: { schema_version: 1, name: '@hypaware/local-fs', version: '1.0.0', hypaware_api: '^1.0.0', runtime: 'node', entrypoint: './src/index.js' }, + rootDir: localFsDir, + } + await kernel.sinks.instantiate({ + kind: 'blob', + instanceName: SINK_INSTANCE, + destination: contribution, + writerPlugin: '@hypaware/format-parquet', + encoder, + config: { schedule: '* * * * *', dir: destinationDir }, + plugin: destinationPlugin, + paths: { + rootDir: localFsDir, + stateDir: path.join(harness.stateDir, 'plugins', '@hypaware/local-fs'), + cacheDir: path.join(harness.stateDir, 'cache', 'plugins', '@hypaware/local-fs'), + tempDir: path.join(tmpRoot, 
'local-fs'), + }, + log: makeNoopLogger(), + }) + + const driver = createSinkDriver({ sinkRegistry: kernel.sinks, queryRegistry: kernel.query, storage: kernel.storage, stateRoot: harness.stateDir }) + const now = new Date('2026-02-15T10:00:00Z') + const spoolPath = kernel.storage.cacheTablePath(DATASET, ['all']) + + // ---- tick 1: 3 rows -> one blob carrying exactly {0,1,2} ---- + await flushRows(kernel.storage, spoolPath, [0, 1, 2]) + const t1 = await driver.tick({ now, force: true }) + expect.that('tick 1: blob sink exported', t1.sinks[0]?.status, (v) => v === 'exported') + expect.that('tick 1: bytes written > 0', t1.sinks[0]?.bytesWritten, (v) => typeof v === 'number' && v > 0) + let blobs = await readBlobs(destinationDir) + expect.that('tick 1: exactly one blob written', blobs, (b) => b.length === 1) + expect.that('tick 1: blob carries rows {0,1,2}', blobs[0]?.ids, (ids) => sameSet(ids, [0, 1, 2])) + + // ---- tick 2: no new rows -> NO new blob, ≈0 bytes ---- + const t2 = await driver.tick({ now, force: true }) + expect.that('tick 2: status exported (no-op)', t2.sinks[0]?.status, (v) => v === 'exported') + expect.that('tick 2: ≈0 bytes on a no-new-rows tick', t2.sinks[0]?.bytesWritten, (v) => v === 0) + blobs = await readBlobs(destinationDir) + expect.that('tick 2: no second blob for a no-new-rows tick', blobs, (b) => b.length === 1) + + // ---- compaction generation swap ---- + const sourceDir = path.join(cacheRoot, 'datasets', DATASET, `source=${SOURCE}`) + await flushRows(kernel.storage, spoolPath, [3, 4]) + const before = readCursorSync(sourceDir).tableDir ?? 'table' + const maint = await maintainCache({ cacheRoot, force: true, compactOnly: true }) + expect.that('compaction: at least one partition compacted', maint.totalCompacted, (v) => typeof v === 'number' && v > 0) + const after = readCursorSync(sourceDir).tableDir ?? 
'table' + expect.that('compaction: generation directory swapped', [before, after], ([b, a]) => b !== a) + + // ---- tick 3: 2 new rows -> exactly one new blob carrying ONLY {3,4} ---- + const t3 = await driver.tick({ now, force: true }) + expect.that('tick 3: blob sink exported', t3.sinks[0]?.status, (v) => v === 'exported') + blobs = await readBlobs(destinationDir) + expect.that('tick 3: exactly two blobs total', blobs, (b) => b.length === 2) + const newest = blobs[blobs.length - 1] + expect.that('tick 3: new blob carries ONLY {3,4} (seq survived compaction)', newest?.ids, (ids) => sameSet(ids, [3, 4])) + + // ---- exactly-once across the whole run ---- + const allIds = blobs.flatMap((b) => b.ids).sort((a, b) => a - b) + expect.that('exactly-once: union of exported rows is {0,1,2,3,4}', allIds, (ids) => sameSet(ids, [0, 1, 2, 3, 4])) + expect.that('exactly-once: no row exported twice', allIds, (ids) => new Set(ids).size === ids.length) + + await obs.shutdown() + + // ---- telemetry: the export path and the encoder both ran ---- + const traces = await expect.traces() + const exportSpans = traces.filter((t) => t.name === 'sink.export_batch' && t.attributes?.hyp_sink_instance === SINK_INSTANCE) + expect.that('traces: sink.export_batch spans for the archive instance (one per tick)', exportSpans, (rows) => rows.length >= 3) + const encodeSpans = traces.filter((t) => t.name === 'encoder.encode_parquet') + expect.that('traces: encoder.encode_parquet ran for the non-empty ticks', encodeSpans, (rows) => rows.length >= 2) +} + +/** + * Append a batch of rows to the live spool then flush, so the rows pass the + * `decorateRow` chokepoint and get a monotonic `_hyp_ingest_seq` stamped. 
+ * + * @param {any} storage + * @param {string} spoolPath + * @param {number[]} ids + */ +async function flushRows(storage, spoolPath, ids) { + await storage.appendRows(spoolPath, COLUMNS, ids.map((id) => ({ id: BigInt(id), client_name: SOURCE, msg: `m${id}` }))) + await storage.flushTable(spoolPath, { reason: 'manual', force: true }) +} + +/** + * Read every parquet blob under the destination dir, decoding each to the set + * of `id`s it carries. Sorted by filename so the newest ranged blob is last. + * + * @param {string} destDir + * @returns {Promise>} + */ +async function readBlobs(destDir) { + /** @type {Array<{ name: string, ids: number[] }>} */ + const out = [] + /** @param {string} dir */ + async function walk(dir) { + /** @type {Dirent[]} */ + let entries + try { entries = await fs.readdir(dir, { withFileTypes: true }) } catch { return } + for (const e of entries) { + const full = path.join(dir, e.name) + if (e.isDirectory()) await walk(full) + else if (e.name.endsWith('.parquet')) { + const buf = await fs.readFile(full) + const ab = buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength) + const decoded = await parquetReadObjects({ file: ab }) + out.push({ name: e.name, ids: decoded.map((r) => Number(r.id)) }) + } + } + } + await walk(destDir) + return out.sort((a, b) => a.name.localeCompare(b.name)) +} + +/** + * @param {number[]} a + * @param {number[]} b + */ +function sameSet(a, b) { + if (!Array.isArray(a) || a.length !== b.length) return false + const sa = [...a].sort((x, y) => x - y) + const sb = [...b].sort((x, y) => x - y) + return sa.every((v, i) => v === sb[i]) +} + +/** @param {string} dir */ +async function writeFixturePlugin(dir) { + await fs.mkdir(dir, { recursive: true }) + const manifest = { + schema_version: 1, + name: '@hypaware/test-proxy', + version: '1.0.0', + hypaware_api: '^1.0.0', + runtime: 'node', + entrypoint: './index.js', + } + await fs.writeFile(path.join(dir, 'hypaware.plugin.json'), JSON.stringify(manifest, 
null, 2)) + await fs.writeFile(path.join(dir, 'index.js'), fixturePluginSource()) +} + +function fixturePluginSource() { + return `// auto-generated by incremental_sink_compaction smoke; fixture: @hypaware/test-proxy +import path from 'node:path' + +const DATASET = '${DATASET}' +const SOURCE = '${SOURCE}' +const COLUMNS = ${JSON.stringify(COLUMNS)} + +let activatedStorage = null + +const dataset = { + name: DATASET, + plugin: '@hypaware/test-proxy', + sourceSignal: 'proxy', + schema: { columns: COLUMNS }, + primaryTimestampColumn: undefined, + discoverPartitions(ctx) { + const cacheDir = ctx.cacheDir ?? activatedStorage?.cacheRoot ?? '' + if (!cacheDir) return [] + return [ + { + dataset: DATASET, + partition: { source: SOURCE }, + tablePath: path.join(cacheDir, 'datasets', DATASET, 'source=' + SOURCE), + }, + ] + }, + async createDataSource(partitions, ctx) { + const partition = partitions[0] + if (!partition || !partition.tablePath) return emptySource() + const source = await ctx.storage.dataSourceForTable(partition.tablePath) + return source ?? 
emptySource() + }, +} + +function emptySource() { + return { + columns: COLUMNS.map((c) => c.name), + numRows: 0, + scan() { return { appliedWhere: false, appliedLimitOffset: false, async *rows() {} } }, + } +} + +export async function activate(ctx) { + activatedStorage = ctx.storage + ctx.query.registerDataset(dataset) +} +` +} + +function makeNoopLogger() { + return { debug() {}, info() {}, warn() {}, error() {} } +} diff --git a/hypaware-core/smoke/flows/local_parquet_export.js b/hypaware-core/smoke/flows/local_parquet_export.js index fb8eaf9..cba8af8 100644 --- a/hypaware-core/smoke/flows/local_parquet_export.js +++ b/hypaware-core/smoke/flows/local_parquet_export.js @@ -177,8 +177,17 @@ export async function run({ harness, expect }) { ) // ----- Inspect the Parquet artifact written by `good` ----- + // Incremental sink reads (LLP 0040) embed the exported `[sinceSeq, lastSeq]` + // ingest-seq range in the filename, so the blob is + // `partition=all.-.parquet` (first export starts at 0). const goodPartitionDir = path.join(goodDir, 'logs', 'partition=all') - const goodFile = path.join(goodPartitionDir, 'partition=all.parquet') + const goodBlobs = (await fs.readdir(goodPartitionDir)).filter((n) => /^partition=all\.\d+-\d+\.parquet$/.test(n)) + expect.that( + `good sink: exactly one ranged parquet blob under ${goodPartitionDir}`, + goodBlobs, + (names) => Array.isArray(names) && names.length === 1, + ) + const goodFile = path.join(goodPartitionDir, goodBlobs[0] ?? 
'partition=all.0-0.parquet') const goodStat = await fs.stat(goodFile) expect.that( `good sink: ${goodFile} is a non-empty Parquet file`, diff --git a/llp/0040-incremental-sink-reads.design.md b/llp/0040-incremental-sink-reads.design.md new file mode 100644 index 0000000..9d1dc0a --- /dev/null +++ b/llp/0040-incremental-sink-reads.design.md @@ -0,0 +1,377 @@ +# LLP 0040: Incremental sink reads — design + +**Type:** design +**Status:** Active +**Systems:** Sinks, Cache +**Author:** neutral +**Date:** 2026-06-25 +**Generated-by:** neutral +**Related:** LLP 0039, LLP 0013, LLP 0014 + +> Technical design covering the request in [LLP 0039](./0039-incremental-sink-reads.spec.md): +> give the central **forward** sink and the core **blob** sink a per-sink +> watermark so each tick reads and ships only rows added since its last +> successful export, surviving both retention front-prunes and compaction +> generation swaps. + +@ref LLP 0039 — incremental sink reads requirement + +## Ground truth this design is built on + +Three facts from the code shape every decision below: + +1. **The driver re-hands the whole partition set every tick, with a fresh + batch id.** `createSinkDriver.runSink` (`src/core/sinks/driver.js`) calls + `discoverReadyPartitions` (every partition for the sink's datasets, scope + limit 1000) and mints `nextBatchId = instance--` per tick. So + nothing batch-id-keyed can be a *cross-tick* cursor — a cross-tick cursor + must be keyed by `(sink instance, partition)` and persist outside the batch. + +2. **The cache rewrites partitions out from under any positional reference.** + - *Retention* (`src/core/cache/retention.js`) position-deletes the oldest + rows of each source table via `icebergDelete`, producing a new delete + snapshot on the *same* linear lineage. Data files and their physical row + positions are untouched; the visible set shrinks **from the front**. 
+ - *Compaction* (`src/core/cache/maintenance.js` → `compactSourceTable`) + rewrites the whole partition into a **brand-new table directory** + (`table-/`, or `epoch=/` on the legacy layout) with a **fresh + Iceberg snapshot lineage**, then repoints `cursor.json.tableDir`. Iceberg + stores absolute `file://` URLs, so the directory can't be renamed and the + new lineage has **no ancestry link** to the old one. Row values are copied + verbatim; dedup only drops exact `_hyp_cache_row_id` matches. + + The **logical partition directory** (`/datasets//source=/`, + i.e. `partition.tablePath`) is **stable** across both — only the `tableDir` + inside it changes on compaction. + +3. **Every cache row already passes one kernel write chokepoint.** + `decorateRow` (`src/core/cache/streaming-reader.js`) stamps + `_hyp_cache_row_id` (a content SHA-256, preserved through compaction and used + as the compaction dedup key) and `_hyp_cache_batch_id` on every flushed row, + and both are listed in `INTERNAL_FIELDS` so they never leak to query output + or to `readRows` consumers. + +## 1. Watermark shape decision + +Two **hard** constraints from the spec, plus one cost constraint the acceptance +criteria add: + +- **(A)** survives a retention front-prune, +- **(B)** survives a compaction generation swap, +- **(C)** a tick reads ≈ *N* rows for *N* new rows — bounded reads, not just + bounded sends (acceptance: "reads/sends ≈N rows, independent of total + partition size"). + +### Candidate A — snapshot ancestry, as `format-iceberg` does it + +What it actually is today: the iceberg sink's marker +(`format-iceberg/src/state.js`, `markerSubsumedBySnapshot`) is keyed by +`(prefix, sink, dataset, batchId)` and records the **destination archive** +snapshot id; the ancestry walk proves "this batch already committed into the +archive" so a respool doesn't double-append. It is **destination-side, +batch-id-scoped retry idempotency** — not a source-read cursor. 
+ +Why it does not generalize to the forward/blob source-read problem: + +- It is batch-id-keyed, and batch ids are minted fresh per tick (fact 1), so it + can never match across ticks — exactly the cross-tick reuse we need. +- Even reframed as "remember the last *source* snapshot exported and + incrementally scan files appended since it": that survives retention (A — + delete-only, linear lineage) but **fails (B)**. Compaction starts a fresh + lineage, so the recorded snapshot id is absent from the new table's metadata; + `markerSubsumedBySnapshot` would correctly judge it stale and fall back to a + **full re-read after every compaction** — and compaction fires on routine + file-count thresholds. Rejected on (B). + +### Candidate B — monotonic per-row ingest sequence column (**recommended**) {#ingest-seq-column} + +Add a kernel-assigned, append-monotonic `int64` column `_hyp_ingest_seq`, +stamped at the same chokepoint as `_hyp_cache_row_id` (fact 3) and carried as an +ordinary **internal** (hidden) Iceberg column. The watermark for a +`(sink, partition)` is the highest seq it has durably exported; an incremental +read yields rows with `seq > watermark`. + +- **(A)** Retention deletes the lowest-seq rows, all already `< watermark`; a + strict `> watermark` filter never looks at them. No skip, no dup. **Pass.** +- **(B)** The seq is a **row-resident value**, so the compaction rewrite copies + it verbatim into the new generation. The watermark is keyed by the **logical + partition path** (stable; fact 2), not by snapshot id or physical position, so + it reads straight through the `tableDir` swap. **Pass.** +- **(C)** A strict `seq > watermark` predicate emits only the *N_new* rows. + **As built, the predicate is a yielded-row filter** over the partition's + current snapshot (see §2): the scan still visits every surviving row, so a tick + reads ≈ the *surviving-partition* size, not ≈*N_new*. 
Retention front-prunes + already-exported rows, so "surviving" is bounded and shrinks — reads do not grow + with total history — but the stronger O(*N_new*) bound is **not yet delivered**. + A numeric `min/max` column statistic *can in principle* let icebird skip whole + data files whose `max(seq) ≤ watermark` (seq correlates with append order, and + numeric stats dodge the string-stats truncation hazard); landing that + null-aware file/row-group pushdown is a **follow-up optimization** that layers + on without changing the read contract. **Partial (bounded by surviving + partition, pending icebird pushdown).** + +It is the **only** shape that meets (A) and (B) and is the right basis for (C): +row-resident (survives every cache rewrite), totally ordered (a strict `>` is +exactly-once), and stats-prunable *once icebird gains null-aware seq pushdown*. + +### Candidate C — content-addressed continuation (seen-set over `_hyp_cache_row_id`) + +Persist the set of exported row ids; skip any row already in it. + +- Correct across (A) and (B) — the row id is preserved through compaction. +- But **fails (C)**: to decide which ids are new you must scan the **whole** + partition every tick (O(*N*) read per tick → O(*N·K*) cumulative; only the + *send* shrinks), and the id set grows unboundedly (needs GC coupled to + retention). Rejected on cost. + +### Decision + +**Recommend Candidate B — a monotonic `_hyp_ingest_seq` watermark.** The +mechanism iceberg proves (snapshot ancestry) is destination-side, batch-scoped, +and does not survive a source compaction; the content-addressed set is correct +but cannot meet the bounded-read goal even in principle. 
A row-resident, +totally-ordered, stats-prunable sequence clears (A) and (B) outright and is the +only shape that can also reach the O(*N_new*) bound (C) once icebird gains +null-aware seq pushdown; until then it bounds reads by the surviving-partition +size (full scan emitting *N_new*), which is the same correctness with a weaker +cost guarantee. + +## 2. Storage API extension {#storage-api-extension} + +`QueryStorageService.readRows` today +(`collectivus-plugin-kernel-types.d.ts`, impl in `src/core/cache/storage.js`): + +```ts +readRows(tablePath: string, columns?: string[]): AsyncIterable> +``` + +Extend with an optional, **back-compatible** third argument and add a +cursor-aware sibling: + +```ts +interface SinkContinuation { v: 1; seq: string } // int64 as decimal string +interface ReadRowsOptions { since?: SinkContinuation } + +readRows(tablePath: string, columns?: string[], opts?: ReadRowsOptions): + AsyncIterable> + +// cursor-aware surface for sinks that must advance a watermark +readRowsSince(tablePath: string, opts: { since?: SinkContinuation, columns?: string[] }): + AsyncIterable<{ row: Record, after: SinkContinuation }> +``` + +- **Back-compat:** `opts` absent ⇒ identical to today (full scan). Every current + caller — `central/sink.js`, `local-fs`, `s3`, `format-iceberg`, + `ai-gateway` projector & dataset, `vector-search`, backfill, and the query + `dataSourceForTable` path — passes nothing and is byte-for-byte unchanged. +- **`since` semantics:** yields only rows with `_hyp_ingest_seq > since.seq`. + The token is **opaque and versioned** so the mechanism can change later + without invalidating persisted watermarks; `seq` is a decimal string to dodge + bigint/JSON hazards (the column is int64). +- **Why a sibling, not just a filter:** `_hyp_ingest_seq` is an `INTERNAL_FIELD` + stripped from output, so a sink reading `readRows` cannot learn the high-water + seq to persist. 
`readRowsSince` pairs each clean (internal-stripped) row with + the `after` token to store *once this row is durably shipped*. Internally both + share one scan; the kernel reads `_hyp_ingest_seq`, emits the token, then + strips it — so the seq never reaches the wire payload or query results. +- Implementation point: both route through `scanRowsFromTable` + (`src/core/cache/iceberg/store.js`), which already projects columns over the + latest snapshot; `since` becomes a predicate (ideally pushed to icebird as a + `seq > x` file/row-group skip, falling back to a yielded-row filter). + +## 3. Persisted watermark contract {#watermark-contract} + +- **One watermark per `(sink instance, partition)`.** Stored under the sink + plugin's `PluginPaths.stateDir` (the kernel already threads `ctx.paths` to + request sinks and to blob writer/destination ctx in + `src/core/sinks/materialize.js`): + + ```text + <stateDir>/watermarks/<sink-instance>/<partition-key>.json + { "v": 1, "continuation": { "v": 1, "seq": "<int64 as decimal string>" }, + "exportedRowCount": <n>, "updatedAt": "<ISO-8601 timestamp>" } + ``` + +- **`partition-key` is the LOGICAL partition identity**, derived from + `partition.tablePath` relative to `cacheRoot` (the `source=` segments), + **never** the physical `tableDir`. This is the hinge of constraint (B): the + logical path is stable, the `tableDir` is not. Segments are sanitized as in + `state.js`'s `sanitizeSegment`. +- **Local for both sinks.** The watermark tracks progress reading the *local* + cache, so it lives on local disk even when the blob destination is S3. (The + iceberg sink's destination-side marker is a separate concern and stays where + it is.) +- **Advances only after a successful, durable export, ONCE per partition:** + - forward sink — after **every** chunk of the partition has been acked + (`202`/`2xx`), advance to the partition's high-water `after` token (the last + row read). 
The advance is **end-of-partition, never per-chunk**: the scan is + not seq-ordered (§6 risk #3), so `after` is a running max — a chunk that + physically precedes a lower-seq chunk would, if checkpointed, advance the + watermark past rows still un-acked in a later chunk and skip them forever on a + between-chunk failure. A partial partition therefore never checkpoints; a + crash/failure re-reads the whole partition next tick and the server ledger + dedupes the already-acked prefix (stable per-chunk batch ids, §4). + - blob sink — after the encoded blob is durably PUT, advance to the `after` + token of the **last row in that blob**. +- **Crash-safety:** atomic write-rename (the `writeCursor` / `writeProgress` + idiom). **Invariant: ship/PUT first, advance watermark second.** A crash + between the two re-exports a bounded suffix next tick (at-least-once); §5 + shows dedup makes it exactly-once. + +## 4. Applying it to both sinks {#applying-it-to-both-sinks} + +**Forward sink** (`hypaware-core/plugins-workspace/central/src/sink.js`, +`forwardPartition`): load the continuation for `(instance, partition-key)`; +replace + +```js +for await (const row of storage.readRows(tablePath)) { … } +``` + +with + +```js +for await (const { row, after } of storage.readRowsSince(tablePath, { since })) { … } +``` + +keep the existing `MAX_CHUNK_ROWS` / `MAX_CHUNK_BYTES` chunking and the +backpressure/`Retry-After` loop (LLP 0014) untouched; once **every** chunk is +acked, persist the partition's high-water `after` as the new watermark +(end-of-partition, never per-chunk — see §3). The +`batchIdForChunk(signal, tablePath, chunkStartSeq, body)` derivation keys each +chunk by the **seq it starts after** (the prior chunk's `after`, or `since` for +the first chunk) plus its bytes — **not** a per-tick `chunkIndex` ordinal. 
Keying +on the start seq keeps an id stable across a watermark advance: a respool +re-reads from the (unchanged) watermark, reproduces the same `[startSeq, body]`, +and so the same id, so the server ledger (server LLP 0001) dedupes the redelivered +prefix. An ordinal would re-number the suffix from 0 after an advance and mint a +fresh id for an already-stored chunk, double-storing it. With nothing new, the +`since` filter yields zero rows → zero chunks → 0 bytes. + +**Core blob sink** (`src/core/sinks/materialize.js` → destination +`local-fs`/`s3` `index.js` → `src/core/sinks/encoder.js`): the destinations feed +`storage.readRows(partition.tablePath)` into `encodePartition`. Switch them to +load the continuation and read via `readRowsSince({ since })`, feeding the clean +row stream into the unchanged `encoder.encodePartition` contract; after the blob +is PUT, advance the watermark to the blob's last `after`. An empty new-row set +writes **no blob** (skip, 0 bytes). The output filename embeds the +`[sinceSeq, lastSeq]` range so a crash-retry re-PUTs the **same object key** +(idempotent overwrite) — the blob sink's stand-in for the server ledger. + +The `format-iceberg` sink is unchanged (it already has destination-side +idempotency); it may later adopt the same source watermark to bound its reads, +but that is out of scope. The **server idempotency ledger is retained** as the +in-flight retry net (spec requirement); it now backstops only a bounded suffix +instead of the whole partition. + +## 5. Exactly-once argument {#exactly-once-argument} + +- **No new rows.** `readRowsSince(since=watermark)` yields nothing → forward + sends 0 chunks; blob writes no file. Watermark unchanged. ≈0 bytes. + *(acceptance 1)* +- **N new rows.** Exactly the rows with `seq > watermark` are yielded and + shipped (≈*N* sent, independent of total history). 
As built the read **scans + the surviving partition** and filters to *N* (a yielded-row filter, §2/§1(C)); + file-level `max(seq) ≤ watermark` pruning to make the *read* ≈*N* is a pending + icebird-pushdown optimization. The watermark advances to `max(seq)`. + *(acceptance 2 — sends bounded now; reads bounded by surviving partition, + O(N_new) reads pending pushdown)* +- **Across a retention prune.** The prune deletes only rows with `seq` far below + the watermark (already exported). A `> watermark` read is blind to them — no + skip, no dup. *(acceptance 3a)* +- **Across a compaction generation swap.** The seq rides the row into the new + `tableDir`; the watermark is keyed by the stable logical partition path; the + read filters `> watermark` over the new generation and yields the same + survivors. Compaction dedup can only remove exact `_hyp_cache_row_id` + duplicates, which were never two distinct un-exported rows. No skip, no dup. + *(acceptance 3b)* +- **Mid-batch retry.** The watermark advances only after a durable export + completes (forward: every chunk acked, end-of-partition; blob: blob PUT), so a + crash leaves it at the last *fully* exported seq. The next tick re-reads from + there — the forward sink re-streams the whole partition and the server ledger + dedupes the already-acked chunks (stable `chunkStartSeq` batch ids); the blob + sink re-PUTs the same `[sinceSeq,lastSeq]` object key (idempotent overwrite). + *(acceptance 4)* + - **Known gap (not yet closed):** if a chunk/blob commits, its watermark write + is then lost (crash in the PUT→advance window), **and new rows append before + the retry**, the resumed export reads from the old watermark and the in-flight + unit grows past what committed: the forward sink's last partial chunk gets a + new body → new batch id → the server stores it alongside the committed one; + the blob sink's range grows → `S.<sinceSeq>-<lastSeq>` and `S.<sinceSeq>-<lastSeq'>` (same start, larger end) both + land. 
The dedup nets (server ledger / seq-range filename) assume the retry + reproduces the *identical* unit, which later arrivals break. Closing this + needs the in-flight upper bound persisted as a pre-commit intent so the retry + caps its read at the committed `lastSeq` (a read `until` bound + a pending + intent record). Tracked as risk #7 (§6). + +## 6. Risks / open questions for review + +1. **Pre-upgrade rows have a null `_hyp_ingest_seq`. (RESOLVED.)** A sink with + **no durable watermark** treats null-seq rows as "new" (one full backlog + export, `readRowsSince`/`readRows` default `includeLegacy: true`); once it has + a watermark it passes `includeLegacy: false`, so the backlog is re-exported + **exactly once** instead of on every tick (which also dodges duplicates after + a compaction reorders the body). This is safe because no NEW null-seq row can + appear post-upgrade — `decorateRow` stamps a real seq on every flushed row — + so a row that is null after the first export is always one already shipped. +2. **Seq allocator durability is the most delicate piece.** `decorateRow` runs + in the spool reader, which resumes from a byte offset + (`streamFlushFile` / `writeProgress`). The monotonic counter (e.g. `nextSeq` + reserved in blocks in `cursor.json`) must **never go backwards** across a + crash/resume — a new row stamped `≤ watermark` would be skipped forever. + Duplicate seqs across a crash boundary are tolerable (strict `>` plus row-id + dedup); regressions are not. The allocator that satisfies this is specified + in [§7](#seq-allocator). +3. **Interleaving weakens file-level pruning.** Multiple sources/late arrivals + in one partition can scatter seq ranges across files, so a tick reads more + than *N* (still correct, just less cheap). Acceptable; flagged. +4. 
**New internal int64 column is an additive cache-schema change** (LLP 0029 + path): must be nullable, must not perturb partition-spec stability, and must + be added to `INTERNAL_FIELDS` so it never leaks to query output or the + forward NDJSON payload. +5. **Retention-vs-export coupling** (the open question in + [LLP 0013](./0013-local-query-cache.decision.md#open-question)): a durable + per-sink watermark finally makes "evict only past the minimum exported + watermark" (`wait_for_sink_ack`) implementable. This design does **not** + change retention — a lagging sink can still have un-exported rows pruned + (data loss). Decide whether to wire ack-coupled eviction alongside this. +6. **Watermark vs. driver outbox. (TESTED.)** The driver's outbox respool and + the watermark are two retry mechanisms; they compose — the outbox replays the + partition, the watermark bounds the replay to the un-acked work — and the + end-to-end acceptance suite exercises it (`sink-incremental-acceptance`). +7. **Watermark-write-lost + new-arrivals duplication. (OPEN — escalated.)** The + narrow exactly-once gap detailed in §5: a unit commits, its watermark write is + lost in the commit→advance window, and new rows append before the retry, so + the resumed in-flight unit grows past what committed and the dedup net (server + ledger / seq-range filename) no longer recognizes it. The common in-flight + retry (no new arrivals) IS covered. Closing this needs a pre-commit intent: a + read `until` upper bound plus a persisted intent record so the retry caps its + read at the committed `lastSeq` and reproduces the identical body/key. This is + a design-level addition to the watermark contract (both sinks; the forward + sink's streaming chunking makes the upper bound awkward) and is deferred to a + follow-up rather than patched half-way under the exactly-once claim. + +## 7. Seq allocator (as built, T1) {#seq-allocator} + +Refines risk #2. 
The `_hyp_ingest_seq` counter is **cache-global**, persisted at +`<cacheRoot>/_hyp_ingest_seq.json` (`{ v, nextSeq, updatedAt }`, atomic +write-rename) — **not** in a per-partition `cursor.json`. Two reasons: + +- `decorateRow` runs **before** rows are grouped into `source=<…>` destination + partitions (fact 3 + the flush re-grouping in `appendChunk`), so at the stamp + point there is no destination partition cursor to write. +- Two distinct spool table paths — live capture (`datasets/`) and `backfill` + (`datasets//`) — flush into the **same** destination + partition. Only a single cache-wide counter guarantees every partition + observes a strictly-increasing seq subsequence; a per-partition counter would + interleave two independent sequences and could regress. + +Reservation is **block-wise** (`createIngestSeqAllocator`, default block 1024): +a whole block is durably reserved (persisted `nextSeq` advanced) **before** any +seq in it is stamped onto a row. A crash therefore abandons at most the unused +tail of the current block (a harmless gap in the sequence) and can never +re-issue a seq `≤` one already stamped. Seqs start at 1, so a null/`0` watermark +("exported nothing") is `< ` every real row's seq. In-process concurrency (two +flushes sharing the one allocator) is serialized through a promise-chain mutex; +cross-process concurrent flush of one cache is out of scope (the daemon owns the +cache, matching the existing single-writer write-rename idiom). 
diff --git a/llp/0042-incremental-sink-reads.plan.md b/llp/0042-incremental-sink-reads.plan.md new file mode 100644 index 0000000..e9da343 --- /dev/null +++ b/llp/0042-incremental-sink-reads.plan.md @@ -0,0 +1,104 @@ +# LLP 0042: Incremental sink reads — plan + +**Type:** plan +**Status:** Active +**Systems:** Sinks, Cache +**Author:** neutral +**Date:** 2026-06-25 +**Generated-by:** neutral +**Related:** LLP 0040 + +> Implementation plan refining [LLP 0040](./0040-incremental-sink-reads.design.md) +> (which answers the spec in [LLP 0039](./0039-incremental-sink-reads.spec.md)) +> into small, independently-mergeable tasks. The design's chosen shape — a +> row-resident monotonic `_hyp_ingest_seq` int64 watermark, a `since`/continuation +> extension to `QueryStorageService.readRows`, and a per-`(sink instance, partition)` +> watermark keyed by the **logical** partition path — decomposes cleanly along the +> producer → read-API → persistence → consumer seam. + +@ref LLP 0040 — incremental sink reads design + +## How the work splits + +The design has one hard ordering: **a value must be produced before it can be +read, and the read surface must exist before a sink can consume it.** Everything +else parallelizes. The seam is: + +1. **Producer (T1).** Stamp the monotonic `_hyp_ingest_seq` at the single kernel + write chokepoint (`decorateRow` in `src/core/cache/streaming-reader.js`, fact 3 + of the design) and add it as a hidden, nullable, additive int64 column. This is + self-contained and back-compatible: nothing reads the column yet, and + `INTERNAL_FIELDS` already strips it from every existing `readRows` consumer, so + it merges with zero behavioural change. The delicate part the design flags + (risk #2) lives entirely here: a crash/resume-safe allocator that **never goes + backwards**, reserving seq blocks durably in the cache-global `_hyp_ingest_seq.json` state file (`nextSeq`, + reserve-before-stamp; see LLP 0040 §7 — deliberately not a per-partition `cursor.json`) so a resumed flush never re-issues a seq `≤` one already + exported. + +2. 
**Read API (T2).** Extend the kernel storage contract + (`collectivus-plugin-kernel-types.d.ts` decl; `src/core/cache/storage.js` impl; + predicate pushed through `scanRowsFromTable` in `src/core/cache/iceberg/store.js`). + Adds a back-compatible `opts.since` to `readRows` plus the cursor-aware + `readRowsSince` sibling that pairs each internal-stripped row with its `after` + token. `opts` absent ⇒ byte-for-byte identical to today, so every current caller + (forward sink, local-fs/s3, format-iceberg, ai-gateway projector & dataset, + vector-search, backfill, query) is untouched until it opts in. This task also + owns the **null-seq migration contract** (design risk #1): a row whose seq is + null (pre-upgrade) is treated as **new** by default (`includeLegacy: true`) — emitted, not skipped — so the + upgrade is at worst a one-time full re-export, never silent data loss; a sink that already holds a durable watermark passes `includeLegacy: false`, so that backlog is not re-exported on every tick. + +3. **Persistence (T3).** A small per-`(sink instance, partition)` watermark store + under the sink plugin's `PluginPaths.stateDir` + (`/watermarks//.json`), keyed by the **stable + logical partition path** (relative to `cacheRoot`, sanitized as in + `state.js`'s `sanitizeSegment`) — never the physical `tableDir`. This keying is + the hinge of design constraint (B): it reads straight through a compaction + generation swap. Atomic write-rename, like `writeCursor`/`writeProgress`. + +4. **Consumers (T4, T5).** Two disjoint wirings, parallelizable against each + other once T2+T3 land: + - **Forward sink** (`hypaware-core/plugins-workspace/central/src/sink.js`, + `forwardPartition`): swap the full `readRows(tablePath)` loop for + `readRowsSince({ since })`; advance the watermark **once, at end-of-partition** + (after every chunk acks), to the partition's high-water `after` token — never + per-chunk, because the scan is not seq-ordered so a per-chunk advance to the + running-max `after` could skip lower-seq rows in a later un-acked chunk + (design §3/§4). 
The existing `MAX_CHUNK_ROWS`/`MAX_CHUNK_BYTES` chunking, the + `batchIdForChunk` derivation (keyed by chunk **start seq**, stable across a + respool), and the `Retry-After` backpressure loop (LLP 0014) are untouched; + a partial partition does not checkpoint, so a failure re-reads the whole + partition and the server ledger dedupes the already-acked prefix. + - **Core blob sink** (`src/core/sinks/materialize.js` → + `local-fs`/`s3` destination `index.js` → `src/core/sinks/encoder.js`): feed + `readRowsSince({ since })` into the unchanged `encodePartition` contract; an + empty new-row set writes **no blob**; embed the `[sinceSeq, lastSeq]` range in + the output filename so a crash-retry re-PUTs the **same object key** + (idempotent overwrite — the blob sink's stand-in for the server ledger); + advance the watermark after the durable PUT. + +5. **Proof (T6).** Exactly-once acceptance across the two cache rewrites that + make this hard — retention front-prune and compaction generation swap — for + **both** sinks, plus the watermark/outbox-respool composition (design risk #6). + +The `format-iceberg` sink is out of scope (it already has destination-side +idempotency); the server idempotency ledger is **retained** as the in-flight +retry net, now backstopping a bounded suffix rather than the whole partition. + +## Dependency rationale + +- **T2 → T1**: the `since` filter and `readRowsSince`'s `after` token can only be + written and tested once the seq column is produced and stamped. +- **T3 → T2**: the watermark file persists a `SinkContinuation`, the token type + the read API introduces. +- **T4, T5 → T2, T3**: each sink needs both the cursor-aware read surface (T2) + and the persisted watermark (T3); the two sinks touch disjoint files and merge + independently. +- **T6 → T4, T5**: the exactly-once proof exercises both wired sinks end to end. 
+ +## Tasks +- id: T1 branch: task/incremental-sink-reads/T1 deps: [] -- Stamp internal nullable int64 `_hyp_ingest_seq` at the `decorateRow` flush chokepoint via a crash-safe never-regressing allocator (cache-global `_hyp_ingest_seq.json` `nextSeq`, reserve-before-stamp); add to `INTERNAL_FIELDS`; additive nullable schema that rides compaction verbatim +- id: T2 branch: task/incremental-sink-reads/T2 deps: [T1] -- Extend storage read contract: back-compat `readRows(...,opts.since)` + cursor-aware `readRowsSince` emitting `{row, after}`; push `seq>since` predicate through `scanRowsFromTable`/icebird with yielded-row fallback; null-seq = new by default (`includeLegacy: true`, one-time migration; skipped only when the sink passes `includeLegacy: false` after it has a durable watermark); update kernel-types decl +- id: T3 branch: task/incremental-sink-reads/T3 deps: [T2] -- Persisted per-(sink instance, partition) watermark store under sink `stateDir/watermarks//.json`, keyed by the stable LOGICAL partition path (not `tableDir`); atomic write-rename +- id: T4 branch: task/incremental-sink-reads/T4 deps: [T2, T3] -- Wire central forward sink (`forwardPartition`) to `readRowsSince({ since })`; advance watermark ONCE at end-of-partition (every chunk acked) to the high-water `after`, never per-chunk (unordered scan would skip lower-seq rows in a later un-acked chunk); chunking/backpressure/`batchIdForChunk` (keyed by chunk start seq) unchanged; server ledger dedupes the re-read prefix on failure +- id: T5 branch: task/incremental-sink-reads/T5 deps: [T2, T3] -- Wire core blob sink (local-fs + s3 destinations) to `readRowsSince({ since })`; skip empty new-row set (no blob); embed `[sinceSeq,lastSeq]` in filename for idempotent re-PUT; advance watermark after durable PUT +- id: T6 branch: task/incremental-sink-reads/T6 deps: [T4, T5] -- Exactly-once acceptance tests/smoke across retention front-prune + compaction generation swap for both sinks; assert ≈0 bytes on no-new-rows and ≈N on N-new; cover watermark vs. 
driver-outbox respool composition diff --git a/src/core/cache/iceberg/store.js b/src/core/cache/iceberg/store.js index a4b3ecc..efd8090 100644 --- a/src/core/cache/iceberg/store.js +++ b/src/core/cache/iceberg/store.js @@ -30,6 +30,7 @@ import { partitionSpecForDeclaration, validatePartitionSpecStability, } from '../../iceberg/partition-spec.js' +import { INGEST_SEQ_COLUMN } from '../streaming-reader.js' /** * @import { ColumnSpec } from '../../../../collectivus-plugin-kernel-types.js' @@ -283,24 +284,92 @@ export async function readRowsFromTable(tablePath) { * time so callers (in particular `QueryStorageService.readRows`) never * materialize the full table in memory. * + * When `opts.since` is set (a bigint `_hyp_ingest_seq` watermark) only rows + * NEWER than the watermark are yielded. A row whose `_hyp_ingest_seq` is + * `null`/absent is a pre-column "legacy" row; its disposition is governed by + * `opts.includeLegacy`: + * + * - `includeLegacy` true (default) — legacy rows are treated as NEW (yielded). + * This is the safe migration default: a fresh sink with no durable watermark + * exports the pre-upgrade backlog once rather than silently skipping it + * (LLP 0040 risk #1, the data-loss hazard). + * - `includeLegacy` false — legacy rows are treated as ALREADY EXPORTED + * (skipped). A sink passes this once it HAS a durable watermark, so the + * pre-upgrade backlog is re-exported exactly once, never on every subsequent + * tick (LLP 0040 §6 risk #1). No new null-seq row can appear post-upgrade — + * the `decorateRow` chokepoint stamps a real seq on every flushed row — so + * excluding legacy rows after the first export is safe. + * + * A real seq is yielded iff strictly `> since`. The seq column is force-projected + * so the predicate can be evaluated even when the caller asked for a narrower + * set; `QueryStorageService` strips it from the row afterwards. 
+ * + * The predicate is applied as a yielded-row filter rather than pushed into + * icebird's `scan({ where })`. icebird couples file/row-group pruning with a + * per-row match that DROPS nulls (`null > since` is false in both hyparquet's + * matcher and JS), which would skip exactly the legacy null-seq rows the + * migration must preserve. The design (LLP 0040 §2) names this yielded-row + * filter as the fallback; a future null-aware icebird filter can layer the + * file-skip optimization on top without changing this contract. + * + * @ref LLP 0040#storage-api-extension [implements] — since-filtered incremental scan; null-seq new on first export, then excluded * @param {string} tablePath * @param {string[]} [columns] + * @param {{ since?: bigint, includeLegacy?: boolean }} [opts] * @returns {AsyncGenerator>} */ -export async function* scanRowsFromTable(tablePath, columns) { +export async function* scanRowsFromTable(tablePath, columns, opts) { if (!tableExists(tablePath)) return + const since = opts?.since + const filtering = since !== undefined + const includeLegacy = opts?.includeLegacy !== false const { resolver, lister } = await getLocalIO() const url = tableUrlForDir(tablePath) const { metadata } = await loadLatestFileCatalogMetadata({ tableUrl: url, resolver, lister }) if (metadata['current-snapshot-id'] === undefined || !metadata.snapshots?.length) return const source = await icebergDataSource({ tableUrl: url, metadata, resolver, lister }) - const projected = columns && columns.length > 0 ? columns : source.columns + // A table that has never been flushed under the seq-column schema carries no + // seq field: every row is implicitly null-seq, so the seq is read as `null` + // and the `includeLegacy` policy decides it. + const hasSeqColumn = source.columns.includes(INGEST_SEQ_COLUMN.name) + let projected = columns && columns.length > 0 ? 
columns : source.columns + if (filtering && hasSeqColumn && !projected.includes(INGEST_SEQ_COLUMN.name)) { + projected = [...projected, INGEST_SEQ_COLUMN.name] + } const scan = source.scan({ columns: projected }) for await (const row of scan.rows()) { - yield await resolveAsyncRow(row, projected) + const resolved = await resolveAsyncRow(row, projected) + if (filtering) { + const seq = hasSeqColumn ? seqValue(resolved[INGEST_SEQ_COLUMN.name]) : null + if (seq === null) { + // Legacy (pre-upgrade) row: new on a fresh sink, already-exported once a + // durable watermark exists. + if (!includeLegacy) continue + } else if (seq <= /** @type {bigint} */ (since)) { + continue + } + } + yield resolved } } +/** + * Decode a raw `_hyp_ingest_seq` cell to a bigint, or `null` when the row has + * no usable seq — a pre-column legacy row (null/absent), or an unparseable + * value. Returning `null` for an unparseable value is the safe direction: the + * caller treats `null` as a NEW row and never skips it (LLP 0040 risk #1). + * + * @param {unknown} raw + * @returns {bigint | null} + */ +export function seqValue(raw) { + if (raw === null || raw === undefined) return null + if (typeof raw === 'bigint') return raw + if (typeof raw === 'number' && Number.isInteger(raw)) return BigInt(raw) + if (typeof raw === 'string' && /^-?\d+$/.test(raw)) return BigInt(raw) + return null +} + /** * Build a squirreling-compatible `AsyncDataSource` over the latest * snapshot of the table. Returns `null` if the table does not exist diff --git a/src/core/cache/ingest-seq.js b/src/core/cache/ingest-seq.js new file mode 100644 index 0000000..a3811ee --- /dev/null +++ b/src/core/cache/ingest-seq.js @@ -0,0 +1,135 @@ +// @ts-check + +import fs from 'node:fs/promises' +import path from 'node:path' + +/** + * @import { IngestSeqAllocator } from '../../../src/core/cache/types.js' + */ + +/** + * Cache-global allocator state file. 
Lives at the cache root (a sibling of + * `datasets/`, never inside it) so it is invisible to `discoverCachePartitions` + * / `discoverSpoolTables`, which only ever walk the `datasets/` subtree. + */ +const SEQ_FILE = '_hyp_ingest_seq.json' + +/** + * Default reservation block size. Each block costs one durable + * write-rename; a crash abandons at most the unused tail of the current + * block (a harmless gap), so the block can be generous. + */ +export const DEFAULT_SEQ_BLOCK_SIZE = 1024 + +/** + * Read the persisted `nextSeq` watermark. The value is stored as a decimal + * string so the int64 range survives JSON without bigint/precision hazards. + * Returns `null` when the file is missing or unparseable — the caller then + * starts from 1 (a fresh cache), which never collides with a previously + * issued seq because a real cache always has a `nextSeq >= 1` on disk. + * + * @param {string} statePath + * @returns {Promise} + */ +async function readNextSeq(statePath) { + try { + const raw = await fs.readFile(statePath, 'utf8') + const parsed = JSON.parse(raw) + if (typeof parsed?.nextSeq === 'string' && /^\d+$/.test(parsed.nextSeq)) { + return BigInt(parsed.nextSeq) + } + return null + } catch { + return null + } +} + +/** + * Persist the `nextSeq` watermark with atomic write-rename — the same + * crash-safety idiom as `writeCursor` / `writeProgress`. + * + * @param {string} statePath + * @param {bigint} nextSeq + */ +async function writeNextSeq(statePath, nextSeq) { + const tmp = `${statePath}.tmp.${process.pid}.${Date.now()}` + const payload = { v: 1, nextSeq: nextSeq.toString(), updatedAt: new Date().toISOString() } + await fs.writeFile(tmp, JSON.stringify(payload, null, 2), 'utf8') + await fs.rename(tmp, statePath) +} + +/** + * Crash-safe, never-regressing monotonic int64 allocator backing the + * `_hyp_ingest_seq` column stamped at the `decorateRow` flush chokepoint. 
+ * + * **Reserve-before-stamp.** A block of `blockSize` seqs is reserved by + * durably advancing the persisted `nextSeq` (atomic write-rename) BEFORE any + * seq in the block is handed to a row. So a crash/resume can only re-enter at + * or above the persisted watermark: a resumed flush never re-issues a seq + * `<=` one already stamped (and possibly already exported). Duplicate content + * across a crash boundary is tolerable (strict `>` watermark + row-id dedup); + * a *regression* would let a never-exported row slip below an advanced + * watermark and be skipped forever — which this allocator makes impossible. + * + * **Cache-global, not per-partition.** The chokepoint runs in the spool reader + * before rows are grouped into `source=<...>` destination partitions, and two + * distinct spool table paths (e.g. live capture vs. `backfill`) can feed the + * SAME destination partition. A single cache-wide counter therefore guarantees + * that every partition only ever observes a strictly increasing subsequence of + * seqs — the property the sink watermark relies on — which a per-partition + * counter (interleaving two independent sequences into one partition) would + * break. Gaps between consecutively-appended rows in one partition are fine. + * + * Concurrency: in-process calls are serialized through a promise-chain mutex so + * two concurrent flushes (different `tablePath`s share one allocator) never + * double-reserve the same block. Cross-process concurrent flush of one cache is + * not a supported scenario — the daemon owns the cache, matching the existing + * single-writer write-rename idiom. 
+ * + * @ref LLP 0040#seq-allocator [implements] — cache-global never-regressing reserve-before-stamp allocator + * @param {{ cacheRoot: string, blockSize?: number }} opts + * @returns {IngestSeqAllocator} + */ +export function createIngestSeqAllocator({ cacheRoot, blockSize = DEFAULT_SEQ_BLOCK_SIZE }) { + if (!cacheRoot) throw new Error('createIngestSeqAllocator: cacheRoot is required') + if (!Number.isInteger(blockSize) || blockSize < 1) { + throw new Error('createIngestSeqAllocator: blockSize must be a positive integer') + } + const statePath = path.join(cacheRoot, SEQ_FILE) + const block = BigInt(blockSize) + + /** Next seq to hand out from the in-memory reservation. */ + let cursor = 0n + /** Exclusive upper bound of the in-memory reservation. */ + let blockEnd = 0n + let initialized = false + /** @type {Promise} */ + let mutex = Promise.resolve() + + async function reserveBlock() { + await fs.mkdir(cacheRoot, { recursive: true }) + const start = (await readNextSeq(statePath)) ?? 1n + const end = start + block + // Durable BEFORE any seq in [start, end) is stamped onto a row. + await writeNextSeq(statePath, end) + cursor = start + blockEnd = end + initialized = true + } + + /** @returns {Promise} */ + async function next() { + const run = mutex.then(async () => { + if (!initialized || cursor >= blockEnd) await reserveBlock() + const seq = cursor + cursor += 1n + return seq + }) + // Keep the chain alive (and serialized) even if a reservation rejects; the + // failed caller still sees the rejection, and the next call retries. 
+ mutex = run.then(() => undefined, () => undefined) + return run + } + + return { next } +} diff --git a/src/core/cache/spool.js b/src/core/cache/spool.js index f968cee..d2a4141 100644 --- a/src/core/cache/spool.js +++ b/src/core/cache/spool.js @@ -4,6 +4,7 @@ import fs from 'node:fs/promises' import fsSync from 'node:fs' import path from 'node:path' +import { createIngestSeqAllocator } from './ingest-seq.js' import { readProgress, removeProgress, streamFlushFile, writeProgress } from './streaming-reader.js' /** @@ -36,6 +37,10 @@ export function createCacheSpool(args) { const states = new Map() /** @type {Set} */ const knownTables = new Set() + // One cache-global allocator shared across every table's flush, so a seq is + // monotonic across the whole cache (and thus strictly increasing within each + // destination partition, which can receive rows from more than one spool path). + const seqAllocator = createIngestSeqAllocator({ cacheRoot: args.cacheRoot }) /** * @param {string} tablePath @@ -121,7 +126,7 @@ export function createCacheSpool(args) { const batchId = `flush-${Date.now()}-${process.pid}` let fileMalformed = 0 - for await (const batch of streamFlushFile({ filePath, batchId, startOffset, batchRowLimit: args.batchRowLimit, batchByteLimit: args.batchByteLimit })) { + for await (const batch of streamFlushFile({ filePath, batchId, startOffset, batchRowLimit: args.batchRowLimit, batchByteLimit: args.batchByteLimit, nextSeq: seqAllocator.next })) { const written = await args.appendChunk(tablePath, batch.chunk.columns, batch.chunk.rows) rowCount += batch.chunk.rows.length chunkCount += 1 diff --git a/src/core/cache/storage.js b/src/core/cache/storage.js index c03d6a1..0ff4edd 100644 --- a/src/core/cache/storage.js +++ b/src/core/cache/storage.js @@ -4,6 +4,7 @@ import { Attr, getLogger, getMeter, withSpan } from '../observability/index.js' import { dataSourceForTable, scanRowsFromTable, + seqValue, tableExists as icebergTableExists, tableUrl as 
icebergTableUrl, } from './iceberg/store.js' @@ -19,16 +20,34 @@ import { } from './partition.js' import { cacheTablePath, datasetForTablePath } from './paths.js' import { createCacheSpool, discoverSpoolTables, DEFAULT_SPOOL_BYTES_THRESHOLD } from './spool.js' -import { INTERNAL_FIELDS } from './streaming-reader.js' +import { INGEST_SEQ_COLUMN, INTERNAL_FIELDS } from './streaming-reader.js' import path from 'node:path' /** - * @import { ColumnSpec, QueryScope, QueryStorageService } from '../../../collectivus-plugin-kernel-types.js' + * @import { ColumnSpec, QueryScope, QueryStorageService, SinkContinuation } from '../../../collectivus-plugin-kernel-types.js' * @import { CachePartitioningDeclaration, ExtendedQueryStorageService } from '../../../src/core/cache/types.js' * @import { AsyncCells } from 'squirreling' */ +/** + * Decode a persisted `SinkContinuation` into its int64 `_hyp_ingest_seq` + * watermark. Absent ⇒ `0n` ("exported nothing"): the allocator starts seqs at + * 1, so `0` is strictly below every real row and a fresh sink reads the whole + * table. The token is opaque + versioned so the watermark mechanism can change + * later without invalidating persisted watermarks (LLP 0040 §2). + * + * @param {SinkContinuation | undefined} since + * @returns {bigint} + */ +function continuationToSeq(since) { + if (since === undefined || since === null) return 0n + if (since.v !== 1 || typeof since.seq !== 'string' || !/^\d+$/.test(since.seq)) { + throw new Error(`readRows: invalid SinkContinuation ${JSON.stringify(since)}`) + } + return BigInt(since.seq) +} + /** * Resolve a tablePath to the Iceberg table directory. 
* @@ -183,14 +202,47 @@ export function createQueryStorageService({ cacheRoot, getDeclaration, getSettle return icebergTableUrl(resolveIcebergDir(tablePath)) }, - async *readRows(tablePath, columns) { + // @ref LLP 0040#storage-api-extension [implements] — back-compatible + // `opts.since`: absent ⇒ byte-for-byte the pre-existing full scan, so every + // current caller is untouched until it opts in. When set, the scan yields + // only rows newer than the watermark (null-seq legacy rows always yielded). + async *readRows(tablePath, columns, opts) { + const since = opts?.since !== undefined ? continuationToSeq(opts.since) : undefined const projected = columns?.filter((c) => !INTERNAL_FIELDS.includes(c)) - for await (const row of scanRowsFromTable(resolveIcebergDir(tablePath), projected)) { + const scanOpts = since !== undefined ? { since, includeLegacy: opts?.includeLegacy } : undefined + for await (const row of scanRowsFromTable(resolveIcebergDir(tablePath), projected, scanOpts)) { for (const f of INTERNAL_FIELDS) delete row[f] yield row } }, + // @ref LLP 0040#storage-api-extension [implements] — cursor-aware sibling + // for sinks that advance a per-(sink, partition) watermark. `_hyp_ingest_seq` + // is an INTERNAL_FIELD stripped from the row, so a sink reading `readRows` + // can't learn the high-water seq; `readRowsSince` reads it to derive the + // `after` token, then strips it so the seq never reaches the wire payload. + // `includeLegacy` (default true) governs pre-upgrade null-seq rows: a sink + // with no durable watermark passes true (export the backlog once); once it + // has a watermark it passes false (the backlog is already shipped), so the + // one-time migration never re-exports on every tick (LLP 0040 §6 risk #1). 
+ async *readRowsSince(tablePath, opts = {}) { + const since = continuationToSeq(opts.since) + const projected = opts.columns?.filter((c) => !INTERNAL_FIELDS.includes(c)) + // Running high-water of REAL (non-null) seqs seen so far, seeded with the + // incoming watermark. `after` is this monotonic max, so a null-seq legacy + // row never advances the watermark and progress never regresses even when + // the scan visits seqs out of order (interleaved sources; LLP 0040 risk #3). + let high = since + for await (const row of scanRowsFromTable(resolveIcebergDir(tablePath), projected, { since, includeLegacy: opts.includeLegacy })) { + const seq = seqValue(row[INGEST_SEQ_COLUMN.name]) + if (seq !== null && seq > high) high = seq + for (const f of INTERNAL_FIELDS) delete row[f] + /** @type {SinkContinuation} */ + const after = { v: 1, seq: high.toString() } + yield { row, after } + } + }, + async dataSourceForTable(tablePath) { const source = await dataSourceForTable(resolveIcebergDir(tablePath)) if (!source) return null diff --git a/src/core/cache/streaming-reader.js b/src/core/cache/streaming-reader.js index ba75285..ebe8529 100644 --- a/src/core/cache/streaming-reader.js +++ b/src/core/cache/streaming-reader.js @@ -12,6 +12,21 @@ import fs from 'node:fs/promises' export const BATCH_BYTE_LIMIT = 128 * 1024 * 1024 export const BATCH_ROW_LIMIT = 100_000 +/** + * Internal append-monotonic ingest sequence column. Hidden, nullable, and + * additive: it is stamped on every flushed row at the `decorateRow` + * chokepoint, carried verbatim through compaction (it is a row-resident + * value, so a generation-swap rewrite copies it into the new table), and + * stripped from every query/`readRows` consumer via `INTERNAL_FIELDS`. + * + * Nullable so it rides existing tables as an additive schema change — rows + * written before the column existed read back as `null`. 
+ * + * @ref LLP 0040#ingest-seq-column [implements] — row-resident monotonic int64 watermark column + * @type {ColumnSpec} + */ +export const INGEST_SEQ_COLUMN = { name: '_hyp_ingest_seq', type: 'INT64', nullable: true } + /** * Read a rotated spool file as a stream, yielding batches of rows that * respect both a byte-size ceiling and a row-count ceiling. Partial @@ -21,8 +36,14 @@ export const BATCH_ROW_LIMIT = 100_000 * or is still being written). * * Each emitted row is decorated with: - * - `_hyp_cache_row_id`: SHA-256 of the serialized row (stable dedup key) - * - `_hyp_cache_batch_id`: caller-supplied batch identifier + * - `_hyp_cache_row_id` — SHA-256 of the serialized row (stable dedup key) + * - `_hyp_cache_batch_id` — caller-supplied batch identifier + * - `_hyp_ingest_seq` — monotonic int64 from `nextSeq` (null when absent) + * + * The decorated chunk's `columns` carry an extra nullable `_hyp_ingest_seq` + * `ColumnSpec` so the value lands in the Iceberg schema (additive, never + * required). The `_hyp_cache_row_id` hash is computed over the ORIGINAL row, + * before any decoration, so the seq does not perturb the dedup identity. * * Resume support: if `startOffset` > 0 the reader seeks past already- * flushed bytes and continues from there. 
After each yielded batch the @@ -35,6 +56,7 @@ export const BATCH_ROW_LIMIT = 100_000 * startOffset?: number, * batchByteLimit?: number, * batchRowLimit?: number, + * nextSeq?: () => Promise, * }} opts * @returns {AsyncGenerator<{ * chunk: FlushChunk, @@ -49,6 +71,7 @@ export async function* streamFlushFile(opts) { startOffset = 0, batchByteLimit = BATCH_BYTE_LIMIT, batchRowLimit = BATCH_ROW_LIMIT, + nextSeq, } = opts const stream = createReadStream(filePath, { @@ -72,7 +95,7 @@ export async function* streamFlushFile(opts) { */ function sealBatch() { if (!currentColumns || currentRows.length === 0) return null - const chunk = { columns: currentColumns, rows: currentRows } + const chunk = { columns: withIngestSeqColumn(currentColumns), rows: currentRows } currentColumns = null currentSignature = '' currentRows = [] @@ -127,7 +150,11 @@ export async function* streamFlushFile(opts) { for (let idx = 0; idx < envelope.rows.length; idx++) { const row = envelope.rows[idx] - const decorated = decorateRow(row, batchId) + // Reserve-before-stamp: each seq is durably reserved (allocator + // advances the persisted nextSeq one block ahead) before it reaches + // a row, so a resumed flush never re-issues a seq it already stamped. + const seq = nextSeq ? 
await nextSeq() : null + const decorated = decorateRow(row, batchId, seq) const rowBytes = Buffer.byteLength(JSON.stringify(row), 'utf8') currentRows.push(decorated) currentBatchBytes += rowBytes @@ -161,18 +188,33 @@ export async function* streamFlushFile(opts) { /** * @param {Record} row * @param {string} batchId + * @param {bigint | null} seq monotonic ingest sequence, or `null` when no + * allocator is wired (the seq is then absent and reads back as null) * @returns {Record} */ -function decorateRow(row, batchId) { +function decorateRow(row, batchId, seq) { const serialized = JSON.stringify(row, stableReplacer) const hash = createHash('sha256').update(serialized).digest('hex') return { ...row, _hyp_cache_row_id: hash, _hyp_cache_batch_id: batchId, + [INGEST_SEQ_COLUMN.name]: seq, } } +/** + * Append the nullable `_hyp_ingest_seq` column to a chunk's column list so the + * stamped value lands in the Iceberg schema. Idempotent — never double-adds. + * + * @param {readonly ColumnSpec[]} columns + * @returns {ColumnSpec[]} + */ +function withIngestSeqColumn(columns) { + if (columns.some((c) => c.name === INGEST_SEQ_COLUMN.name)) return [...columns] + return [...columns, INGEST_SEQ_COLUMN] +} + /** * Stable JSON key ordering for deterministic hashes. * @@ -243,6 +285,11 @@ function progressPath(spoolFilePath) { } /** - * Internal-field names that should be hidden from query output. + * Internal-field names that should be hidden from query output and from every + * `readRows` consumer (forward/blob sinks, query, projectors). `_hyp_ingest_seq` + * is included so the sink-read watermark column never leaks to the wire payload + * or query results — `readRowsSince` (T2) re-exposes it as an opaque token only. 
+ * + * @ref LLP 0040#storage-api-extension [constrained-by] — internal, stripped on read */ -export const INTERNAL_FIELDS = ['_hyp_cache_row_id', '_hyp_cache_batch_id'] +export const INTERNAL_FIELDS = ['_hyp_cache_row_id', '_hyp_cache_batch_id', INGEST_SEQ_COLUMN.name] diff --git a/src/core/cache/types.d.ts b/src/core/cache/types.d.ts index f80f26d..3bb7895 100644 --- a/src/core/cache/types.d.ts +++ b/src/core/cache/types.d.ts @@ -48,6 +48,16 @@ export interface ProgressState { updatedAt: string } +/** + * Crash-safe, never-regressing monotonic int64 allocator for the + * `_hyp_ingest_seq` column. `next()` reserves seq blocks durably + * (reserve-before-stamp) so a resumed flush never re-issues a seq `<=` one + * already stamped. See `createIngestSeqAllocator`. + */ +export interface IngestSeqAllocator { + next(): Promise +} + export interface SpoolAppendResult { bytesWritten: number pendingBytes: number diff --git a/src/core/sinks/incremental.js b/src/core/sinks/incremental.js new file mode 100644 index 0000000..71622f4 --- /dev/null +++ b/src/core/sinks/incremental.js @@ -0,0 +1,187 @@ +// @ts-check + +import path from 'node:path' + +import { createSinkWatermarkStore } from './watermarks.js' + +/** + * @import { + * PluginPaths, + * QueryPartition, + * QueryStorageService, + * SinkContinuation, + * } from '../../../collectivus-plugin-kernel-types.js' + * @import { IncrementalRowReader, SinkWatermarkKey, SinkWatermarkStore } from '../../../src/core/sinks/types.js' + */ + +/** + * Sub-directory under a destination plugin's `stateDir` that namespaces one + * sink instance's watermarks. See {@link createInstanceWatermarkStore}. + */ +const INSTANCE_DIR = 'sink-instances' + +/** + * Build a watermark store scoped to a single sink instance. + * + * `PluginPaths.stateDir` is per-**plugin** (`/plugins/`), not per + * sink **instance** — but the design's watermark contract is one watermark per + * `(sink instance, partition)`. 
/**
 * Build a watermark store scoped to a single sink instance.
 *
 * `paths.stateDir` belongs to the destination plugin as a whole, while the
 * watermark contract is one watermark per `(sink instance, partition)`. Each
 * instance therefore gets its own `<stateDir>/sink-instances/<instance>`
 * directory, so two instances of one plugin never share (and clobber) a
 * watermark file.
 *
 * @ref LLP 0040#watermark-contract [implements] — one watermark per (sink instance, partition)
 * @param {{ paths: PluginPaths, instanceName: string }} opts
 * @returns {SinkWatermarkStore}
 * @throws {Error} when `paths.stateDir` or `instanceName` is missing or empty
 */
export function createInstanceWatermarkStore({ paths, instanceName }) {
  if (!paths?.stateDir) {
    throw new Error('createInstanceWatermarkStore: paths.stateDir is required')
  }
  if (!instanceName) {
    throw new Error('createInstanceWatermarkStore: instanceName is required')
  }
  const stateDir = path.join(paths.stateDir, INSTANCE_DIR, sanitizeInstance(instanceName))
  return createSinkWatermarkStore({ stateDir })
}

/**
 * Restrict a user-chosen sink instance name to one safe directory segment so it
 * cannot escape — or alias — its own instance directory.
 *
 * Every character outside `[A-Za-z0-9._-]` becomes `_`. The names `.` and `..`
 * consist solely of allowed characters but are path-navigation segments:
 * `path.join(stateDir, 'sink-instances', '..')` collapses back to `stateDir`.
 * Their dots are therefore replaced too (`.` → `_`, `..` → `__`).
 *
 * @param {string} name
 * @returns {string} a non-empty segment that is never `.` or `..`
 */
function sanitizeInstance(name) {
  const cleaned = String(name).replace(/[^A-Za-z0-9._-]/g, '_')
  if (cleaned.length === 0) return '_instance'
  if (cleaned === '.' || cleaned === '..') return cleaned.replace(/\./g, '_')
  return cleaned
}

/**
 * An async iterable that completes immediately without yielding anything.
 *
 * @returns {AsyncIterable<Record<string, unknown>>}
 */
function emptyAsyncIterable() {
  return { async *[Symbol.asyncIterator]() {} }
}
/**
 * Derive the stable logical watermark key for a partition.
 *
 * Returns `null` when the partition has no `tablePath`, so the caller can
 * export without persisting a watermark. Any error thrown by
 * `watermarks.keyFor` for a `tablePath` that is set propagates to the caller.
 *
 * @param {SinkWatermarkStore} watermarks
 * @param {QueryStorageService} storage
 * @param {QueryPartition} partition
 * @returns {SinkWatermarkKey | null}
 */
export function watermarkKeyFor(watermarks, storage, partition) {
  if (!partition.tablePath) return null
  return watermarks.keyFor(storage.cacheRoot, partition.tablePath)
}

/**
 * Open a partition's **new** rows (those past the `since` watermark) as a
 * single-use, self-tracking row stream for a blob destination.
 *
 * The returned reader:
 *
 * - decides emptiness up front by **peeking** the first row, so the skip-empty
 *   decision never depends on the consumer draining the stream;
 * - exposes `rows` — the rows yielded by `storage.readRowsSince`, without their
 *   continuation — to feed into `encoder.encodePartition`;
 * - tracks `rowCount` and the `lastAfter` continuation as `rows` is consumed;
 *   both are final once the stream has been drained.
 *
 * `rows` is strictly single-use. Only the first iteration reads the underlying
 * scan; any later iteration yields nothing and leaves `rowCount` / `lastAfter`
 * untouched. Without this guard a second pass would re-yield the peeked first
 * row, double-count it, and move `lastAfter` back to that row's continuation.
 *
 * A partition with no `tablePath`, or whose table does not exist yet, is
 * reported `empty` rather than throwing.
 *
 * @ref LLP 0040#storage-api-extension [implements] — feed readRowsSince into the unchanged encoder; empty new-row set ⇒ no blob
 * @param {QueryStorageService} storage
 * @param {QueryPartition} partition
 * @param {SinkContinuation | undefined} since
 * @returns {Promise<IncrementalRowReader>}
 */
export async function openIncrementalRows(storage, partition, since) {
  const sinceSeq = since?.seq ?? '0'
  const state = {
    rowCount: 0,
    /** @type {SinkContinuation} */
    lastAfter: since ?? { v: 1, seq: sinceSeq },
  }

  // @ref LLP 0040#storage-api-extension [implements] — pre-upgrade null-seq rows
  // are "new" only on a sink with no durable watermark (export the backlog once);
  // once a watermark exists (`since` set) they are already shipped, so exclude
  // them and the legacy backlog never re-exports every tick (LLP 0040 §6 risk #1).
  const includeLegacy = since === undefined
  const tablePath = partition.tablePath
  const iterator =
    tablePath && storage.tableExists(tablePath)
      ? storage.readRowsSince(tablePath, { since, includeLegacy })[Symbol.asyncIterator]()
      : null

  // Peek one entry so `empty` is known before anything consumes `rows`.
  let first = null
  if (iterator) {
    first = await iterator.next()
    if (first.done) {
      // Release the underlying scan immediately — nothing new to export.
      await iterator.return?.()
    }
  }
  const empty = iterator === null || first === null || first.done === true

  // Flipped by the first iteration of `rows`; later iterations are no-ops.
  let consumed = false

  async function* rows() {
    if (empty || iterator === null || first === null) return
    if (consumed) return
    consumed = true
    try {
      let entry = first
      while (!entry.done) {
        state.rowCount += 1
        state.lastAfter = entry.value.after
        yield entry.value.row
        entry = await iterator.next()
      }
    } finally {
      // Release the scan if the consumer stopped early (e.g. encoder threw).
      await iterator.return?.()
    }
  }

  return {
    empty,
    sinceSeq,
    // The generator's own `empty` guard makes this yield nothing when empty.
    rows: { [Symbol.asyncIterator]: rows },
    get rowCount() {
      return state.rowCount
    },
    get lastAfter() {
      return state.lastAfter
    },
  }
}
/**
 * Embed an incremental export's `[sinceSeq, lastSeq]` range in a filename,
 * inserted before the final extension: `all.parquet` → `all.<since>-<last>.parquet`.
 *
 * The result is a pure function of its three arguments, so a retry with the
 * same watermark and the same rows reproduces the same name.
 *
 * Only a dot inside the final path segment counts as an extension separator.
 * A name with no extension, a dot-file (`.hidden`), or a name whose only dots
 * sit in a directory part (`a.b/all`) gets the range appended instead.
 *
 * @ref LLP 0040#applying-it-to-both-sinks [implements] — [sinceSeq,lastSeq] filename ⇒ idempotent re-PUT
 * @param {string} filename
 * @param {string} sinceSeq
 * @param {string} lastSeq
 * @returns {string}
 */
export function withSeqRangeFilename(filename, sinceSeq, lastSeq) {
  const range = `${sinceSeq}-${lastSeq}`
  // Index where the basename starts: 0 for a bare name, else just past the last `/`.
  const baseStart = filename.lastIndexOf('/') + 1
  const dot = filename.lastIndexOf('.')
  // `dot <= baseStart` covers: no dot, a leading-dot basename, or a dot that
  // belongs to a directory segment rather than the basename.
  if (dot <= baseStart) return `${filename}.${range}`
  return `${filename.slice(0, dot)}.${range}${filename.slice(dot)}`
}
* * Plugins import via: * * ```js - * import { encodePartition } from 'hypaware/core/sinks' + * import { encodePartition, openIncrementalRows } from 'hypaware/core/sinks' * ``` */ export { encodePartition, clusterColumnsForDataset } from './encoder.js' +export { + createInstanceWatermarkStore, + openIncrementalRows, + watermarkKeyFor, + withSeqRangeFilename, +} from './incremental.js' diff --git a/src/core/sinks/types.d.ts b/src/core/sinks/types.d.ts index 78040c7..ac0c225 100644 --- a/src/core/sinks/types.d.ts +++ b/src/core/sinks/types.d.ts @@ -2,10 +2,69 @@ import type { ExportResult, HypAwareV2Config, QueryRegistry, + SinkContinuation, } from '../../../collectivus-plugin-kernel-types.d.ts' import type { ExtendedQueryStorageService } from '../cache/types.d.ts' import type { ExtendedSinkHandle, ExtendedSinkRegistry } from '../registry/types.d.ts' +/** + * A single-use, self-tracking incremental row stream for a blob destination, + * returned by `openIncrementalRows`. `rows` is fed straight into the unchanged + * `encoder.encodePartition` contract; `rowCount` and `lastAfter` are final once + * the encoder has drained `rows`. + */ +export interface IncrementalRowReader { + /** True when there are no new rows since the watermark — the sink writes no blob. */ + empty: boolean + /** Incoming watermark seq (decimal string; `'0'` when none) — the range lower bound. */ + sinceSeq: string + /** Clean (internal-stripped) rows to feed the encoder. Single-use; do not re-iterate. */ + rows: AsyncIterable> + /** Rows yielded so far; final once `rows` is fully drained. */ + readonly rowCount: number + /** + * Monotonic high-water continuation; final once `rows` is drained. Advance the + * watermark to this only after the blob is durably PUT. 
+ */ + readonly lastAfter: SinkContinuation +} + +/** + * Stable logical identity of a partition for watermark storage: the partition's + * directory relative to `/datasets/`, split into the dataset and the + * (sanitized, `/`-joined) partition path — never the physical `tableDir`. + */ +export interface SinkWatermarkKey { + dataset: string + partitionKey: string +} + +/** + * On-disk per-`(sink instance, partition)` incremental-read watermark. + * `continuation` is the highest `_hyp_ingest_seq` durably exported. + */ +export interface SinkWatermarkRecord { + v: 1 + continuation: SinkContinuation + exportedRowCount: number + updatedAt: string +} + +/** + * Persisted watermark store scoped to one sink instance via its `stateDir`. + * Files live at `/watermarks//.json`; `write` + * is atomic write-rename. + */ +export interface SinkWatermarkStore { + keyFor(cacheRoot: string, tablePath: string): SinkWatermarkKey + filePath(key: SinkWatermarkKey): string + read(key: SinkWatermarkKey): Promise + write( + key: SinkWatermarkKey, + update: { continuation: SinkContinuation; exportedRowCount?: number }, + ): Promise +} + export interface DriverOptions { sinkRegistry: ExtendedSinkRegistry queryRegistry: QueryRegistry diff --git a/src/core/sinks/watermarks.js b/src/core/sinks/watermarks.js new file mode 100644 index 0000000..48f1a82 --- /dev/null +++ b/src/core/sinks/watermarks.js @@ -0,0 +1,215 @@ +// @ts-check + +import fs from 'node:fs/promises' +import path from 'node:path' + +import { datasetsRoot } from '../cache/paths.js' + +/** + * @import { SinkContinuation } from '../../../collectivus-plugin-kernel-types.js' + * @import { SinkWatermarkKey, SinkWatermarkRecord, SinkWatermarkStore } from '../../../src/core/sinks/types.js' + */ + +/** + * Sub-directory, under a sink plugin's `PluginPaths.stateDir`, that holds the + * per-`(sink instance, partition)` incremental-read watermarks. 
/**
 * Sub-directory, under the `stateDir` a store is created with, that holds the
 * watermark files. The `stateDir` is expected to be scoped to one sink instance
 * already, so only `<dataset>/<partitionKey>` discriminates within it.
 */
const WATERMARKS_DIR = 'watermarks'

/** Version number written into every persisted watermark record. */
const RECORD_VERSION = 1

/**
 * Restrict a path segment to a safe character class so a stray dataset or
 * source name cannot escape the watermarks prefix. `=` and `,` are kept on
 * purpose so `source=<name>` partition segments stay legible on disk.
 *
 * Every other character outside `[A-Za-z0-9._-]` becomes `_`. The segments `.`
 * and `..` are made of allowed characters only, yet navigate the directory
 * tree when joined into a path, so their dots are replaced as well
 * (`.` → `_`, `..` → `__`).
 *
 * @param {string} value
 * @param {string} field name used in error messages
 * @returns {string} a non-empty segment that is never `.` or `..`
 * @throws {Error} when `value` is not a non-empty string
 */
function sanitizeSegment(value, field) {
  if (typeof value !== 'string' || value.length === 0) {
    throw new Error(`sink-watermark: ${field} must be a non-empty string`)
  }
  const cleaned = value.replace(/[^A-Za-z0-9._=,-]/g, '_')
  if (cleaned.length === 0) {
    throw new Error(`sink-watermark: ${field} sanitized to empty string from '${value}'`)
  }
  if (cleaned === '.' || cleaned === '..') return cleaned.replace(/\./g, '_')
  return cleaned
}
/**
 * Derive the **stable logical** watermark key for a partition.
 *
 * The key is the partition's directory relative to the cache datasets root
 * (`datasetsRoot(cacheRoot)`), not any physical table directory inside it. The
 * first relative segment is the dataset; the remaining segments form the
 * partition path. Each segment is sanitized and the partition segments are
 * re-joined with `/`. A partition with no segments below the dataset falls
 * back to the `_partition` sentinel so it still maps to a single stable file.
 *
 * @ref LLP 0040#watermark-contract [implements] — key by stable logical partition path, never tableDir
 * @param {string} cacheRoot
 * @param {string} tablePath logical partition path (`partition.tablePath`)
 * @returns {SinkWatermarkKey}
 * @throws {Error} when an argument is missing, or `tablePath` is the datasets
 *   root itself or lies outside it
 */
export function deriveWatermarkKey(cacheRoot, tablePath) {
  if (!cacheRoot) throw new Error('deriveWatermarkKey: cacheRoot is required')
  if (!tablePath) throw new Error('deriveWatermarkKey: tablePath is required')
  const rel = path.relative(datasetsRoot(cacheRoot), tablePath)
  // Match a genuine parent traversal (`..` or `../…`) only: a bare
  // `startsWith('..')` would also reject a dataset directory whose name merely
  // begins with two dots.
  const escapesRoot =
    rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)
  if (!rel || escapesRoot) {
    throw new Error(
      `deriveWatermarkKey: tablePath '${tablePath}' is not under the cache datasets root`
    )
  }
  const [dataset, ...rest] = rel.split(path.sep).filter((s) => s.length > 0)
  if (!dataset) {
    throw new Error(`deriveWatermarkKey: tablePath '${tablePath}' has no dataset segment`)
  }
  const partitionSegments = rest.length > 0 ? rest : ['_partition']
  const partitionKey = partitionSegments
    .map((seg) => sanitizeSegment(seg, 'partition-segment'))
    .join('/')
  return { dataset: sanitizeSegment(dataset, 'dataset'), partitionKey }
}
/** A `seq` is a non-empty run of decimal digits. */
const DECIMAL_SEQ = /^\d+$/

/**
 * Single definition of a well-formed continuation token, `{ v: 1, seq: <digits> }`,
 * shared by the write-side validator and the read-side parser so the two can
 * never disagree about what is persistable.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isWellFormedContinuation(value) {
  const candidate = /** @type {any} */ (value)
  return (
    Boolean(candidate) &&
    candidate.v === 1 &&
    typeof candidate.seq === 'string' &&
    DECIMAL_SEQ.test(candidate.seq)
  )
}

/**
 * Validate a `SinkContinuation` before it is persisted, so a malformed token
 * never reaches disk.
 *
 * @param {SinkContinuation} continuation
 * @returns {SinkContinuation} a fresh `{ v, seq }` object; extra properties are dropped
 * @throws {Error} when the token is not `{ v: 1, seq: <decimal string> }`
 */
function validateContinuation(continuation) {
  if (!isWellFormedContinuation(continuation)) {
    throw new Error(
      `sink-watermark: invalid SinkContinuation ${JSON.stringify(continuation)}`
    )
  }
  return { v: 1, seq: continuation.seq }
}

/**
 * Parse a persisted record, returning `null` for anything unparseable or
 * lacking a well-formed `continuation`. A missing or non-finite
 * `exportedRowCount` reads as `0`; a non-string `updatedAt` reads as `''`.
 *
 * @param {string} raw file contents
 * @returns {SinkWatermarkRecord | null}
 */
function parseRecord(raw) {
  let parsed
  try {
    parsed = JSON.parse(raw)
  } catch {
    return null
  }
  const continuation = parsed?.continuation
  if (!isWellFormedContinuation(continuation)) return null

  const count = parsed.exportedRowCount
  return {
    v: RECORD_VERSION,
    continuation: { v: 1, seq: continuation.seq },
    exportedRowCount: typeof count === 'number' && Number.isFinite(count) ? count : 0,
    updatedAt: typeof parsed.updatedAt === 'string' ? parsed.updatedAt : '',
  }
}
/**
 * A persisted per-`(sink instance, partition)` watermark store.
 *
 * One store is scoped to one sink instance via its `stateDir`. Files live at:
 *
 *   `<stateDir>/watermarks/<dataset>/<partitionKey>.json`
 *   `{ v, continuation: { v, seq }, exportedRowCount, updatedAt }`
 *
 * `write` is an atomic write-then-rename, so a crash never leaves a torn
 * watermark at the destination path. If either step fails, the temp file is
 * removed (best effort) and the original error is rethrown.
 *
 * `filePath` refuses any key that would resolve outside the `watermarks`
 * directory. Keys produced by `keyFor` always pass; the check guards keys
 * assembled by hand.
 *
 * @ref LLP 0040#watermark-contract [implements] — persisted per-(sink, partition) watermark, atomic write-rename
 * @param {{ stateDir: string }} opts
 * @returns {SinkWatermarkStore}
 * @throws {Error} when `stateDir` is missing or empty
 */
export function createSinkWatermarkStore({ stateDir }) {
  if (!stateDir) throw new Error('createSinkWatermarkStore: stateDir is required')
  const root = path.join(stateDir, WATERMARKS_DIR)

  /**
   * @param {SinkWatermarkKey} key
   * @returns {string} absolute-or-relative path of the key's JSON file under `root`
   * @throws {Error} when the key resolves to `root` itself or outside it
   */
  function filePath(key) {
    const segments = key.partitionKey.split('/').filter((s) => s.length > 0)
    const file = `${path.join(root, key.dataset, ...segments)}.json`
    const rel = path.relative(root, file)
    if (!rel || rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)) {
      throw new Error(
        `sink-watermark: key ${JSON.stringify(key)} resolves outside the watermarks directory`
      )
    }
    return file
  }

  return {
    keyFor(cacheRoot, tablePath) {
      return deriveWatermarkKey(cacheRoot, tablePath)
    },

    filePath,

    async read(key) {
      // Resolved outside the try so an invalid key surfaces instead of being
      // mistaken for "no watermark yet".
      const src = filePath(key)
      let raw
      try {
        raw = await fs.readFile(src, 'utf8')
      } catch {
        // Any read failure is treated as "no durable watermark".
        return null
      }
      return parseRecord(raw)
    },

    async write(key, update) {
      const continuation = validateContinuation(update.continuation)
      /** @type {SinkWatermarkRecord} */
      const record = {
        v: RECORD_VERSION,
        continuation,
        exportedRowCount:
          typeof update.exportedRowCount === 'number' && Number.isFinite(update.exportedRowCount)
            ? update.exportedRowCount
            : 0,
        updatedAt: new Date().toISOString(),
      }
      const dest = filePath(key)
      await fs.mkdir(path.dirname(dest), { recursive: true })
      const tmp = `${dest}.tmp.${process.pid}.${Date.now()}`
      try {
        await fs.writeFile(tmp, JSON.stringify(record, null, 2), 'utf8')
        await fs.rename(tmp, dest)
      } catch (err) {
        // Do not leave a stray temp file behind; the original failure wins.
        await fs.rm(tmp, { force: true }).catch(() => {})
        throw err
      }
      return record
    },
  }
}
const statePath = path.join(dir, SEQ_FILE) + const alloc = createIngestSeqAllocator({ cacheRoot: dir, blockSize: 8 }) + + // After the very first allocation, a whole block must already be durable. + const first = await alloc.next() + assert.equal(first, 1n) + const persistedAfterFirst = JSON.parse(await fs.readFile(statePath, 'utf8')) + assert.equal(persistedAfterFirst.v, 1) + // Block of 8 reserved up front => nextSeq persisted at 9 while only seq 1 issued. + assert.equal(persistedAfterFirst.nextSeq, '9') + + // Drain the rest of the block; nextSeq stays at the reserved boundary. + for (let i = 0; i < 7; i++) await alloc.next() + const persistedAfterBlock = JSON.parse(await fs.readFile(statePath, 'utf8')) + assert.equal(persistedAfterBlock.nextSeq, '9') + + // Crossing the block boundary reserves the next block durably *before* use. + const ninth = await alloc.next() + assert.equal(ninth, 9n) + const persistedAfterCross = JSON.parse(await fs.readFile(statePath, 'utf8')) + assert.equal(persistedAfterCross.nextSeq, '17') + + await fs.rm(dir, { recursive: true, force: true }) +}) + +test('allocator never regresses across a restart and skips the abandoned block tail', async () => { + const dir = await makeTmpDir() + + const a = createIngestSeqAllocator({ cacheRoot: dir, blockSize: 100 }) + const s1 = await a.next() + const s2 = await a.next() + assert.deepEqual([s1, s2], [1n, 2n]) + + // Simulate a crash/restart: a brand-new allocator over the same cache root. + // The previous in-memory block (3..100) is abandoned; the new one must start + // at the persisted watermark (101) and never re-issue 3..100. 
+ const b = createIngestSeqAllocator({ cacheRoot: dir, blockSize: 100 }) + const s3 = await b.next() + assert.equal(s3, 101n) + assert.ok(s3 > s2) + + await fs.rm(dir, { recursive: true, force: true }) +}) + +test('concurrent next() calls never collide (single allocator, parallel flushes)', async () => { + const dir = await makeTmpDir() + const alloc = createIngestSeqAllocator({ cacheRoot: dir, blockSize: 3 }) + + const seqs = await Promise.all(Array.from({ length: 50 }, () => alloc.next())) + const sorted = [...seqs].map((s) => s.toString()) + const unique = new Set(sorted) + assert.equal(unique.size, 50, 'every issued seq is unique') + // The multiset is exactly 1..50 (no gaps inside a fully-drained run). + const asNums = seqs.map((s) => Number(s)).sort((x, y) => x - y) + assert.deepEqual(asNums, Array.from({ length: 50 }, (_, i) => i + 1)) + + await fs.rm(dir, { recursive: true, force: true }) +}) + +test('default block size is a positive integer and rejects bad input', async () => { + const dir = await makeTmpDir() + assert.ok(Number.isInteger(DEFAULT_SEQ_BLOCK_SIZE) && DEFAULT_SEQ_BLOCK_SIZE > 0) + assert.throws(() => createIngestSeqAllocator({ cacheRoot: dir, blockSize: 0 })) + assert.throws(() => createIngestSeqAllocator({ cacheRoot: '' })) + await fs.rm(dir, { recursive: true, force: true }) +}) + +test('streamFlushFile stamps a monotonic _hyp_ingest_seq and adds the column', async () => { + const dir = await makeTmpDir() + const filePath = path.join(dir, 'seq.jsonl') + + /** @type {ColumnSpec[]} */ + const cols = [ + { name: 'id', type: 'INT64', nullable: false }, + { name: 'msg', type: 'STRING', nullable: false }, + ] + const lines = [] + for (let i = 0; i < 5; i++) { + lines.push(JSON.stringify({ version: 1, columns: cols, rows: [{ id: i, msg: `r${i}` }] }) + '\n') + } + await fs.writeFile(filePath, lines.join('')) + + let n = 100n + const nextSeq = async () => n++ + + /** @type {bigint[]} */ + const stamped = [] + for await (const batch of 
streamFlushFile({ filePath, batchId: 'b1', nextSeq })) { + // Chunk columns carry the additive nullable seq column. + assert.ok(batch.chunk.columns.some((c) => c.name === '_hyp_ingest_seq' && c.type === 'INT64' && c.nullable === true)) + for (const row of batch.chunk.rows) { + assert.equal(typeof row._hyp_ingest_seq, 'bigint') + stamped.push(/** @type {bigint} */ (row._hyp_ingest_seq)) + } + } + + assert.deepEqual(stamped, [100n, 101n, 102n, 103n, 104n]) + assert.equal(INGEST_SEQ_COLUMN.name, '_hyp_ingest_seq') + assert.ok(INTERNAL_FIELDS.includes('_hyp_ingest_seq')) + + await fs.rm(dir, { recursive: true, force: true }) +}) + +test('streamFlushFile leaves seq null and still declares the column when no allocator is wired', async () => { + const dir = await makeTmpDir() + const filePath = path.join(dir, 'noalloc.jsonl') + /** @type {ColumnSpec[]} */ + const cols = [{ name: 'id', type: 'INT64', nullable: false }] + await fs.writeFile(filePath, JSON.stringify({ version: 1, columns: cols, rows: [{ id: 1 }] }) + '\n') + + for await (const batch of streamFlushFile({ filePath, batchId: 'b' })) { + assert.ok(batch.chunk.columns.some((c) => c.name === '_hyp_ingest_seq')) + for (const row of batch.chunk.rows) { + assert.equal(row._hyp_ingest_seq, null) + } + } + await fs.rm(dir, { recursive: true, force: true }) +}) + +test('seq survives a flush into Iceberg, increases per row, and is stripped from readRows', async () => { + const cacheRoot = await makeTmpDir() + const svc = createQueryStorageService({ cacheRoot }) + /** @type {ColumnSpec[]} */ + const cols = [ + { name: 'id', type: 'INT64', nullable: false }, + { name: 'msg', type: 'STRING', nullable: false }, + ] + const tablePath = svc.cacheTablePath('demo', ['all']) + await svc.appendRows(tablePath, cols, [ + { id: 1, msg: 'a' }, + { id: 2, msg: 'b' }, + { id: 3, msg: 'c' }, + ]) + await svc.flushTable(tablePath, { reason: 'manual' }) + + const parts = await discoverCachePartitions(cacheRoot) + assert.equal(parts.length, 
1) + const icebergDir = resolveIcebergDir(parts[0].path) + + /** @type {bigint[]} */ + const rawSeqs = [] + for await (const row of scanRowsFromTable(icebergDir)) { + assert.ok('_hyp_ingest_seq' in row, 'seq column persisted in the iceberg schema') + rawSeqs.push(/** @type {bigint} */ (row._hyp_ingest_seq)) + } + assert.equal(rawSeqs.length, 3) + // Strictly increasing, regardless of read order. + const sorted = [...rawSeqs].sort((x, y) => (x < y ? -1 : x > y ? 1 : 0)) + for (let i = 1; i < sorted.length; i++) assert.ok(sorted[i] > sorted[i - 1]) + + // Query/readRows consumers never see the internal seq column. + for await (const row of svc.readRows(tablePath)) { + assert.ok(!('_hyp_ingest_seq' in row)) + assert.ok(!('_hyp_cache_row_id' in row)) + } + + // The cache-global allocator state file lives at the cache root, outside + // datasets/, so partition discovery is unaffected. + const stat = await fs.stat(path.join(cacheRoot, SEQ_FILE)) + assert.ok(stat.isFile()) + + await fs.rm(cacheRoot, { recursive: true, force: true }) +}) diff --git a/test/core/sink-incremental-acceptance.test.js b/test/core/sink-incremental-acceptance.test.js new file mode 100644 index 0000000..ac93827 --- /dev/null +++ b/test/core/sink-incremental-acceptance.test.js @@ -0,0 +1,792 @@ +// @ts-check + +// Exactly-once acceptance proof for incremental sink reads (LLP 0040, T6). +// +// The T4/T5 unit suites (`sink-incremental`, `central-forward-chunking`) wire +// each sink against a STUBBED storage that hands back fixed rows and a hand-rolled +// `readRowsSince`. They prove the wiring; they cannot prove the design's load-bearing +// claim — that a row-resident `_hyp_ingest_seq` watermark survives the two cache +// rewrites that motivate the whole design (LLP 0039 "why this is a design"): +// +// - a retention FRONT-PRUNE (position-delete of the oldest rows), and +// - a compaction GENERATION SWAP (rewrite into a fresh `table-` dir). 
+// +// This suite drives the REAL kernel cache (`createQueryStorageService`), the REAL +// retention enforcer, the REAL `maintainCache` compaction, BOTH real sinks (the +// central `forward` request sink and the core `local-fs` blob sink), and the REAL +// sink driver's outbox respool — end to end — and asserts exactly-once across both +// rewrites, ≈0 bytes on a no-new-rows tick, ≈N on an N-new tick, and that the +// per-(sink, partition) watermark composes with the driver's outbox replay. +// +// @ref LLP 0040#exactly-once-argument [tests] — proves no skip / no dup across retention + compaction for both sinks +// @ref LLP 0039 [tests] — acceptance: ≈0 bytes on no-new-rows, ≈N on N-new, exactly-once across prune + compaction + +import test from 'node:test' +import assert from 'node:assert/strict' +import fs from 'node:fs/promises' +import path from 'node:path' +import os from 'node:os' + +import { createQueryStorageService } from '../../src/core/cache/storage.js' +import { appendRowsToTable } from '../../src/core/cache/iceberg/store.js' +import { INGEST_SEQ_COLUMN } from '../../src/core/cache/streaming-reader.js' +import { createRetentionEnforcer } from '../../src/core/cache/retention.js' +import { maintainCache } from '../../src/core/cache/maintenance.js' +import { readCursorSync, discoverCachePartitions } from '../../src/core/cache/partition.js' +import { createInstanceWatermarkStore } from '../../src/core/sinks/incremental.js' +import { createSinkRegistry } from '../../src/core/registry/sinks.js' +import { createQueryRegistry } from '../../src/core/registry/datasets.js' +import { createSinkDriver } from '../../src/core/sinks/driver.js' +import { createForwardSink } from '../../hypaware-core/plugins-workspace/central/src/sink.js' +import { activate as activateLocalFs } from '../../hypaware-core/plugins-workspace/local-fs/src/index.js' + +/** + * @import { ColumnSpec, QueryPartition, SinkEncoder } from '../../collectivus-plugin-kernel-types.d.ts' + * @import { 
Dirent } from 'node:fs' + */ + +const DATASET = 'proxy' +const SOURCE = 'claude' +const SIGNAL = 'proxy' // a KNOWN_SIGNALS member for the forward sink + +/** @type {ColumnSpec[]} */ +const COLS = [ + { name: 'id', type: 'INT64', nullable: false }, + { name: 'client_name', type: 'STRING', nullable: false }, + { name: 'timestamp', type: 'STRING', nullable: true }, + { name: 'msg', type: 'STRING', nullable: false }, +] + +/** @returns {Promise} */ +async function makeTmpDir() { + return fs.mkdtemp(path.join(os.tmpdir(), 'hyp-incr-accept-')) +} + +/** @param {number} daysAgo @returns {string} */ +function isoDaysAgo(daysAgo) { + return new Date(Date.now() - daysAgo * 24 * 60 * 60 * 1000).toISOString() +} + +/** + * @param {number[]} ids + * @param {string} timestamp + * @returns {Record[]} + */ +function rows(ids, timestamp) { + return ids.map((id) => ({ id, client_name: SOURCE, timestamp, msg: `m${id}` })) +} + +function noopLog() { + return { debug() {}, info() {}, warn() {}, error() {} } +} + +/** + * Append a batch to the live spool then flush it: this routes through the + * `decorateRow` chokepoint that stamps `_hyp_ingest_seq` (T1), so the committed + * rows carry real monotonic seqs — the precondition for incremental reads. + * + * @param {ReturnType} svc + * @param {string} spoolPath + * @param {Record[]} batch + */ +async function flushBatch(svc, spoolPath, batch) { + await svc.appendRows(spoolPath, COLS, batch) + await svc.flushTable(spoolPath, { reason: 'manual' }) +} + +/** + * Resolve the single committed logical partition the spool flush produced. + * `.path` is the stable `/datasets//source=` dir — the + * watermark key the design keys on, NOT the physical `tableDir`. 
+ * + * @param {string} cacheRoot + * @returns {Promise} + */ +async function logicalPartition(cacheRoot) { + const parts = await discoverCachePartitions(cacheRoot) + const part = parts.find((p) => p.dataset === DATASET) + assert.ok(part, 'expected a committed proxy partition after flush') + return { dataset: DATASET, partition: { source: SOURCE }, tablePath: part.path } +} + +/** + * Build a committed partition table DIRECTLY at the stable logical partition + * path (no spool, no cursor) so the seq values — including pre-upgrade nulls — + * are controlled exactly. This reproduces a migration-era cache: some rows + * pre-date the `_hyp_ingest_seq` column (null), some carry real seqs. `seq:null` + * stamps a legacy row; a bigint stamps a real one. + * + * @param {string} cacheRoot + * @param {{ id: number, seq: bigint | null }[]} spec + * @returns {Promise} + */ +async function buildLegacyPartition(cacheRoot, spec) { + const dir = path.join(cacheRoot, 'datasets', DATASET, `source=${SOURCE}`) + /** @type {ColumnSpec[]} */ + const cols = [{ name: 'id', type: 'INT64', nullable: false }, INGEST_SEQ_COLUMN] + await appendRowsToTable( + dir, + cols, + spec.map((s) => ({ id: s.id, [INGEST_SEQ_COLUMN.name]: s.seq })), + ) + return { dataset: DATASET, partition: { source: SOURCE }, tablePath: dir } +} + +// -------------------------------------------------------------------------- +// Forward sink test rig: REAL createForwardSink + REAL watermark store + REAL +// cache, with a recording fetch stub standing in for the central server. 
+// -------------------------------------------------------------------------- + +/** + * @param {{ storage: any, watermarks: any, responder?: (c: any) => number, query?: any }} args + */ +function makeForwardSink({ storage, watermarks, responder, query }) { + /** @type {Array<{ url: string, batchId: string, ids: number[], status: number }>} */ + const calls = [] + /** @type {typeof fetch} */ + const fetchFn = /** @type {any} */ (async (url, init) => { + const headers = /** @type {Record} */ (init?.headers ?? {}) + const body = String(init?.body ?? '') + const ids = body.split('\n').filter((l) => l.length > 0).map((l) => Number(JSON.parse(l).id)) + const status = responder ? responder({ url: String(url) }) : 202 + calls.push({ url: String(url), batchId: headers['x-hyp-batch-id'], ids, status }) + return /** @type {any} */ ({ + status, + ok: status >= 200 && status < 300, + headers: { get: () => null }, + async text() { return '' }, + body: { cancel: async () => {} }, + }) + }) + const identityClient = /** @type {any} */ ({ async getCurrentJwt() { return 'jwt' }, async refresh() {} }) + const sink = createForwardSink({ + config: /** @type {any} */ ({ url: 'http://central.test', identity: {} }), + identityClient, + query: query ?? /** @type {any} */ ({ getDataset: () => ({ sourceSignal: SIGNAL }) }), + storage, + watermarks, + log: /** @type {any} */ (noopLog()), + fetchFn, + sleepFn: async () => {}, + }) + return { sink, calls } +} + +/** Ids that were durably ACKed (2xx) across a set of recorded POSTs. 
*/ +function ackedIds(/** @type {Array<{ ids: number[], status: number }>} */ calls) { + return calls.filter((c) => c.status >= 200 && c.status < 300).flatMap((c) => c.ids) +} + +// -------------------------------------------------------------------------- +// Blob sink test rig: the REAL @hypaware/local-fs sink (its buildSink wiring, +// T5) built via the plugin's own activate()/create(), with a trivial JSON +// encoder so the written blob is decodable to the exact ids it carried. +// -------------------------------------------------------------------------- + +/** @returns {SinkEncoder} */ +function makeJsonEncoder() { + return { + format: 'json', extension: 'json', supports: [], + async encodePartition(partition, ctx) { + /** @type {number[]} */ + const ids = [] + for await (const row of ctx.rows ?? []) ids.push(Number(row.id)) + const bytes = new TextEncoder().encode(JSON.stringify({ ids })) + return { filename: `${partition.dataset}.json`, bytes, bytesWritten: bytes.byteLength, rowCount: ids.length } + }, + } +} + +/** + * @param {{ storage: any, destDir: string, stateDir: string, instanceName: string }} args + */ +async function makeBlobSink({ storage, destDir, stateDir, instanceName }) { + const sinkRegistry = createSinkRegistry() + const query = /** @type {any} */ ({ getDataset: () => ({ schema: { columns: COLS } }) }) + const ctx = /** @type {any} */ ({ + config: { exports_dir: path.join(stateDir, 'exports') }, + env: {}, + provideCapability() {}, + sinks: sinkRegistry, + query, + storage, + }) + await activateLocalFs(ctx) + const contribution = sinkRegistry.getContribution('@hypaware/local-fs', 'local-fs') + assert.ok(contribution, 'local-fs registered a sink contribution') + const sink = await contribution.create(/** @type {any} */ ({ + name: instanceName, + plugin: '@hypaware/local-fs', + config: { dir: destDir }, + paths: { rootDir: stateDir, stateDir, cacheDir: stateDir, tempDir: path.join(stateDir, 'tmp') }, + log: noopLog(), + encoder: 
makeJsonEncoder(), + })) + return sink +} + +/** + * Recursively collect the blob files written under a destination dir, newest + * first by name, returning `{ name, ids }` with the decoded ids per blob. + * + * @param {string} destDir + * @returns {Promise>} + */ +async function listBlobs(destDir) { + /** @type {Array<{ name: string, ids: number[] }>} */ + const out = [] + /** @param {string} dir */ + async function walk(dir) { + /** @type {Dirent[]} */ + let entries + try { entries = await fs.readdir(dir, { withFileTypes: true }) } catch { return } + for (const e of entries) { + const full = path.join(dir, e.name) + if (e.isDirectory()) await walk(full) + else if (e.name.endsWith('.json')) { + const parsed = JSON.parse(await fs.readFile(full, 'utf8')) + out.push({ name: e.name, ids: parsed.ids.map(Number) }) + } + } + } + await walk(destDir) + return out.sort((a, b) => a.name.localeCompare(b.name)) +} + +// ========================================================================== +// Baseline: ≈0 bytes on a no-new-rows tick, ≈N on an N-new tick. +// ========================================================================== + +test('forward sink: ≈0 bytes on a no-new-rows tick, ≈N on an N-new tick', async () => { + const cacheRoot = await makeTmpDir() + try { + const svc = createQueryStorageService({ cacheRoot }) + const spoolPath = svc.cacheTablePath(DATASET, ['all']) + await flushBatch(svc, spoolPath, rows([0, 1, 2], isoDaysAgo(1))) + const part = await logicalPartition(cacheRoot) + const watermarks = createInstanceWatermarkStore({ + paths: /** @type {any} */ ({ stateDir: path.join(cacheRoot, 'state') }), + instanceName: 'forward', + }) + const { sink, calls } = makeForwardSink({ storage: svc, watermarks }) + + // First tick: 3 new rows ship. 
+ const r1 = await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(r1.status, 'exported') + assert.deepEqual(ackedIds(calls).sort((a, b) => a - b), [0, 1, 2]) + assert.ok((r1.bytesWritten ?? 0) > 0) + + // Second tick: nothing new -> 0 chunks, 0 bytes (today the whole partition re-sends). + calls.length = 0 + const r2 = await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(r2.status, 'exported') + assert.equal(r2.bytesWritten, 0, 'no-new-rows tick transmits ≈0 bytes') + assert.equal(calls.length, 0, 'no-new-rows tick makes zero POSTs') + + // Add 2 rows -> exactly those 2 are read/sent, independent of the 3 already shipped. + calls.length = 0 + await flushBatch(svc, spoolPath, rows([3, 4], isoDaysAgo(1))) + const r3 = await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(r3.status, 'exported') + assert.deepEqual(ackedIds(calls).sort((a, b) => a - b), [3, 4], 'reads/sends only the N new rows') + } finally { + await fs.rm(cacheRoot, { recursive: true, force: true }) + } +}) + +test('blob sink: no blob on a no-new-rows tick, exactly N on an N-new tick', async () => { + const cacheRoot = await makeTmpDir() + try { + const svc = createQueryStorageService({ cacheRoot }) + const spoolPath = svc.cacheTablePath(DATASET, ['all']) + await flushBatch(svc, spoolPath, rows([0, 1, 2], isoDaysAgo(1))) + const part = await logicalPartition(cacheRoot) + const destDir = path.join(cacheRoot, 'blob-out') + const sink = await makeBlobSink({ storage: svc, destDir, stateDir: path.join(cacheRoot, 'state'), instanceName: 'archive' }) + + const r1 = await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(r1.status, 'exported') + assert.ok((r1.bytesWritten ?? 
0) > 0) + let blobs = await listBlobs(destDir) + assert.equal(blobs.length, 1) + assert.deepEqual(blobs[0].ids.sort((a, b) => a - b), [0, 1, 2]) + // The filename embeds the [sinceSeq, lastSeq] range for idempotent re-PUT. + assert.match(blobs[0].name, /^proxy\.\d+-\d+\.json$/) + + // No new rows -> no blob written, 0 bytes (skip-empty). + const r2 = await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(r2.status, 'exported') + assert.equal(r2.bytesWritten, 0) + assert.equal(r2.partitionsExported, 0) + assert.equal((await listBlobs(destDir)).length, 1, 'no second blob for a no-new-rows tick') + + // N new rows -> exactly one more blob carrying only the new ids. + await flushBatch(svc, spoolPath, rows([3, 4], isoDaysAgo(1))) + const r3 = await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(r3.status, 'exported') + blobs = await listBlobs(destDir) + assert.equal(blobs.length, 2) + const newest = blobs[blobs.length - 1] + assert.deepEqual(newest.ids.sort((a, b) => a - b), [3, 4], 'second blob holds only the N new rows') + } finally { + await fs.rm(cacheRoot, { recursive: true, force: true }) + } +}) + +// ========================================================================== +// Exactly-once across a RETENTION FRONT-PRUNE (LLP 0040 §5 acceptance 3a). +// ========================================================================== + +test('forward sink: exactly-once across a retention front-prune', async () => { + const cacheRoot = await makeTmpDir() + try { + const svc = createQueryStorageService({ cacheRoot }) + const spoolPath = svc.cacheTablePath(DATASET, ['all']) + const watermarks = createInstanceWatermarkStore({ + paths: /** @type {any} */ ({ stateDir: path.join(cacheRoot, 'state') }), + instanceName: 'forward', + }) + const { sink, calls } = makeForwardSink({ storage: svc, watermarks }) + + // Old rows shipped, watermark advances past them. 
+ await flushBatch(svc, spoolPath, rows([0, 1, 2], isoDaysAgo(45))) + await sink.exportBatch(/** @type {any} */ ({ partitions: [await logicalPartition(cacheRoot)] }), /** @type {any} */ ({})) + const acked1 = ackedIds(calls) + + // Newer rows arrive, then retention deletes the old front (all already < watermark). + await flushBatch(svc, spoolPath, rows([3, 4], isoDaysAgo(1))) + const enforcer = createRetentionEnforcer({ cacheRoot, config: { default_days: 30 } }) + const ret = await enforcer.tick() + const pruned = ret.sourceTableResults.reduce((n, r) => n + r.rowsDeleted, 0) + assert.equal(pruned, 3, 'retention front-pruned the 3 old rows') + + // A `> watermark` read is blind to the pruned rows: yields only the new suffix. + calls.length = 0 + await sink.exportBatch(/** @type {any} */ ({ partitions: [await logicalPartition(cacheRoot)] }), /** @type {any} */ ({})) + const acked2 = ackedIds(calls) + assert.deepEqual(acked2.sort((a, b) => a - b), [3, 4], 'no re-send of pruned rows, no skip of survivors') + + // No row skipped or duplicated across the whole run. 
+ const all = [...acked1, ...acked2].sort((a, b) => a - b) + assert.deepEqual(all, [0, 1, 2, 3, 4]) + assert.equal(new Set(all).size, all.length, 'exactly-once: no duplicates') + } finally { + await fs.rm(cacheRoot, { recursive: true, force: true }) + } +}) + +test('blob sink: exactly-once across a retention front-prune', async () => { + const cacheRoot = await makeTmpDir() + try { + const svc = createQueryStorageService({ cacheRoot }) + const spoolPath = svc.cacheTablePath(DATASET, ['all']) + const destDir = path.join(cacheRoot, 'blob-out') + const sink = await makeBlobSink({ storage: svc, destDir, stateDir: path.join(cacheRoot, 'state'), instanceName: 'archive' }) + + await flushBatch(svc, spoolPath, rows([0, 1, 2], isoDaysAgo(45))) + await sink.exportBatch(/** @type {any} */ ({ partitions: [await logicalPartition(cacheRoot)] }), /** @type {any} */ ({})) + + await flushBatch(svc, spoolPath, rows([3, 4], isoDaysAgo(1))) + const enforcer = createRetentionEnforcer({ cacheRoot, config: { default_days: 30 } }) + await enforcer.tick() + + await sink.exportBatch(/** @type {any} */ ({ partitions: [await logicalPartition(cacheRoot)] }), /** @type {any} */ ({})) + + const blobs = await listBlobs(destDir) + const exported = blobs.flatMap((b) => b.ids).sort((a, b) => a - b) + assert.deepEqual(exported, [0, 1, 2, 3, 4], 'every row exported exactly once across the prune') + assert.equal(new Set(exported).size, exported.length, 'exactly-once: no duplicates') + // The post-prune blob carried only the survivors, not the pruned front. + assert.deepEqual(blobs[blobs.length - 1].ids.sort((a, b) => a - b), [3, 4]) + } finally { + await fs.rm(cacheRoot, { recursive: true, force: true }) + } +}) + +// ========================================================================== +// Exactly-once across a COMPACTION GENERATION SWAP (LLP 0040 §5 acceptance 3b). 
// The seq is row-resident, so it rides verbatim into the new `table-` dir;
// the watermark is keyed by the stable LOGICAL path, so it reads straight through.
// ==========================================================================

test('forward sink: exactly-once across a compaction generation swap', async () => {
  const cacheRoot = await makeTmpDir()
  try {
    const svc = createQueryStorageService({ cacheRoot })
    const spoolPath = svc.cacheTablePath(DATASET, ['all'])
    const watermarks = createInstanceWatermarkStore({
      paths: /** @type {any} */ ({ stateDir: path.join(cacheRoot, 'state') }),
      instanceName: 'forward',
    })
    const { sink, calls } = makeForwardSink({ storage: svc, watermarks })

    await flushBatch(svc, spoolPath, rows([0, 1, 2], isoDaysAgo(1)))
    await sink.exportBatch(/** @type {any} */ ({ partitions: [await logicalPartition(cacheRoot)] }), /** @type {any} */ ({}))
    const acked1 = ackedIds(calls)

    await flushBatch(svc, spoolPath, rows([3, 4], isoDaysAgo(1)))

    // Compaction rewrites the whole partition into a fresh generation directory.
    const sourceDir = path.join(cacheRoot, 'datasets', DATASET, `source=${SOURCE}`)
    const tableDirBefore = readCursorSync(sourceDir).tableDir ?? 'table'
    const report = await maintainCache({ cacheRoot, force: true, compactOnly: true })
    assert.ok(report.totalCompacted > 0, 'compaction ran')
    const tableDirAfter = readCursorSync(sourceDir).tableDir ?? 'table'
    assert.notEqual(tableDirAfter, tableDirBefore, 'generation directory swapped')

    // The watermark (logical-keyed) reads straight through the swap: only the new suffix.
    calls.length = 0
    await sink.exportBatch(/** @type {any} */ ({ partitions: [await logicalPartition(cacheRoot)] }), /** @type {any} */ ({}))
    const acked2 = ackedIds(calls)
    assert.deepEqual(acked2.sort((a, b) => a - b), [3, 4], 'seq survived compaction: only the new suffix re-read')

    const all = [...acked1, ...acked2].sort((a, b) => a - b)
    assert.deepEqual(all, [0, 1, 2, 3, 4])
    assert.equal(new Set(all).size, all.length, 'exactly-once: no duplicates')
  } finally {
    await fs.rm(cacheRoot, { recursive: true, force: true })
  }
})

test('blob sink: exactly-once across a compaction generation swap', async () => {
  const cacheRoot = await makeTmpDir()
  try {
    const svc = createQueryStorageService({ cacheRoot })
    const spoolPath = svc.cacheTablePath(DATASET, ['all'])
    const destDir = path.join(cacheRoot, 'blob-out')
    const sink = await makeBlobSink({ storage: svc, destDir, stateDir: path.join(cacheRoot, 'state'), instanceName: 'archive' })

    await flushBatch(svc, spoolPath, rows([0, 1, 2], isoDaysAgo(1)))
    await sink.exportBatch(/** @type {any} */ ({ partitions: [await logicalPartition(cacheRoot)] }), /** @type {any} */ ({}))

    await flushBatch(svc, spoolPath, rows([3, 4], isoDaysAgo(1)))

    const sourceDir = path.join(cacheRoot, 'datasets', DATASET, `source=${SOURCE}`)
    const before = readCursorSync(sourceDir).tableDir ?? 'table'
    await maintainCache({ cacheRoot, force: true, compactOnly: true })
    const after = readCursorSync(sourceDir).tableDir ?? 'table'
    assert.notEqual(after, before, 'generation directory swapped')

    await sink.exportBatch(/** @type {any} */ ({ partitions: [await logicalPartition(cacheRoot)] }), /** @type {any} */ ({}))

    const blobs = await listBlobs(destDir)
    const exported = blobs.flatMap((b) => b.ids).sort((a, b) => a - b)
    assert.deepEqual(exported, [0, 1, 2, 3, 4], 'every row exported exactly once across the compaction')
    assert.equal(new Set(exported).size, exported.length, 'exactly-once: no duplicates')
    assert.deepEqual(blobs[blobs.length - 1].ids.sort((a, b) => a - b), [3, 4], 'post-compaction blob holds only the new suffix')
  } finally {
    await fs.rm(cacheRoot, { recursive: true, force: true })
  }
})

// ==========================================================================
// Watermark vs. driver-outbox respool composition (LLP 0040 risk #6).
// A failed tick lands the partition in the driver outbox; the next tick's
// respool re-hands the SAME partition, and the watermark ensures the replay
// reads only the un-acked suffix (not the whole partition), with a stable
// X-Hyp-Batch-Id so any in-flight redelivery is server-dedup safe.
// ==========================================================================

test('forward sink: watermark composes with the driver-outbox respool (suffix-only replay)', async () => {
  const cacheRoot = await makeTmpDir()
  const stateRoot = path.join(cacheRoot, 'state-root')
  try {
    const svc = createQueryStorageService({ cacheRoot })
    const spoolPath = svc.cacheTablePath(DATASET, ['all'])
    await flushBatch(svc, spoolPath, rows([0, 1, 2], isoDaysAgo(1)))

    const watermarks = createInstanceWatermarkStore({
      paths: /** @type {any} */ ({ stateDir: path.join(cacheRoot, 'state') }),
      instanceName: 'forward',
    })

    // The query registry the driver discovers partitions from (and the sink
    // resolves its signal from): one dataset whose discoverPartitions returns
    // the committed logical partition.
    const query = createQueryRegistry()
    const partition = await logicalPartition(cacheRoot)
    query.registerDataset(/** @type {any} */ ({
      name: DATASET,
      plugin: '@hypaware/central',
      sourceSignal: SIGNAL,
      schema: { columns: COLS },
      discoverPartitions: () => [partition],
    }))

    // Flipped between ticks to make the stubbed server accept or reject POSTs.
    let serverMode = /** @type {'ok' | 'fail'} */ ('ok')
    const { sink, calls } = makeForwardSink({
      storage: svc,
      watermarks,
      query,
      responder: () => (serverMode === 'fail' ? 500 : 202),
    })

    const sinkRegistry = createSinkRegistry()
    sinkRegistry.register({ name: 'forward', plugin: '@hypaware/central', supports: [], create: async () => sink })
    const contribution = sinkRegistry.getContribution('@hypaware/central', 'forward')
    assert.ok(contribution)
    await sinkRegistry.instantiate(/** @type {any} */ ({
      kind: 'request',
      instanceName: 'forward',
      contribution,
      config: { schedule: '* * * * *' },
      plugin: { name: '@hypaware/central', version: '1.0.0', manifest: {}, rootDir: '/fake' },
      paths: { rootDir: stateRoot, stateDir: stateRoot, cacheDir: stateRoot, tempDir: stateRoot },
      log: noopLog(),
    }))

    const driver = createSinkDriver({ sinkRegistry, queryRegistry: query, storage: svc, stateRoot })
    const now = new Date('2026-06-25T10:00:00Z')

    // Tick A: server OK -> the 3 rows are delivered, watermark advances.
    // `mark` remembers where this tick's POSTs start in the shared recording.
    let mark = calls.length
    const a = await driver.tick({ now, force: true })
    assert.equal(a.sinks[0].status, 'exported')
    const ackedA = ackedIds(calls.slice(mark))
    assert.deepEqual(ackedA.sort((x, y) => x - y), [0, 1, 2])

    // New rows arrive; the next tick fails at the server.
    await flushBatch(svc, spoolPath, rows([3, 4], isoDaysAgo(1)))

    // Tick B: server FAILS. The sink read ONLY the un-acked suffix {3,4} (the
    // watermark bounds the re-read even on the failing tick), the POST 500s,
    // and the driver spools the partition to the outbox; the watermark holds.
    serverMode = 'fail'
    mark = calls.length
    const b = await driver.tick({ now, force: true })
    assert.notEqual(b.sinks[0].status, 'exported')
    const tickBCalls = calls.slice(mark)
    assert.equal(tickBCalls.length, 1, 'exactly one (failed) chunk attempted')
    assert.deepEqual(tickBCalls[0].ids.sort((x, y) => x - y), [3, 4], 'respool read is bounded to the un-acked suffix, not the whole partition')
    // Without this, the batch-id comparison after tick C would also pass when
    // neither attempt's header was captured (undefined === undefined).
    assert.ok(tickBCalls[0].batchId, 'the failed attempt carried a batch-id header')
    const outboxDir = path.join(stateRoot, 'sinks', 'forward', 'outbox')
    const outboxFiles = await fs.readdir(outboxDir)
    assert.ok(outboxFiles.length >= 1, 'the failed batch landed in the driver outbox (respool record)')

    // Tick C: server OK again -> the outbox respool re-hands the partition and
    // the watermark replays only {3,4} with the SAME batch-id as the failed
    // attempt, so an in-flight redelivery is server-dedup safe.
    serverMode = 'ok'
    mark = calls.length
    const c = await driver.tick({ now, force: true })
    assert.equal(c.sinks[0].status, 'exported')
    const tickCCalls = calls.slice(mark)
    assert.deepEqual(ackedIds(tickCCalls).sort((x, y) => x - y), [3, 4])
    assert.equal(tickCCalls[0].batchId, tickBCalls[0].batchId, 'identical batch-id across the respool (idempotent backstop)')

    // Exactly-once over the successfully-acked deliveries.
    const allAcked = ackedIds(calls).sort((x, y) => x - y)
    assert.deepEqual(allAcked, [0, 1, 2, 3, 4])
    assert.equal(new Set(allAcked).size, allAcked.length, 'no row acked twice')
  } finally {
    await fs.rm(cacheRoot, { recursive: true, force: true })
  }
})

test('blob sink: a lost watermark after a durable PUT re-PUTs the same object key (idempotent overwrite)', async () => {
  // The blob sink's stand-in for the server ledger: the [sinceSeq,lastSeq]
  // filename. A crash between PUT and watermark-advance re-reads the same
  // suffix next tick, which re-derives the SAME filename -> idempotent
  // overwrite, never a duplicate blob. (LLP 0040 §4 / risk #6.)
  const cacheRoot = await makeTmpDir()
  try {
    const svc = createQueryStorageService({ cacheRoot })
    const spoolPath = svc.cacheTablePath(DATASET, ['all'])
    await flushBatch(svc, spoolPath, rows([0, 1, 2], isoDaysAgo(1)))
    const part = await logicalPartition(cacheRoot)

    const stateDir = path.join(cacheRoot, 'state')
    const destDir = path.join(cacheRoot, 'blob-out')
    const sink = await makeBlobSink({ storage: svc, destDir, stateDir, instanceName: 'archive' })

    // First export writes the blob and advances the watermark.
    await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({}))
    const after1 = await listBlobs(destDir)
    assert.equal(after1.length, 1)
    const firstName = after1[0].name

    // Simulate the watermark write being lost after the durable PUT by deleting
    // the persisted watermark file: the next tick believes nothing was exported.
    const wmStore = createInstanceWatermarkStore({ paths: /** @type {any} */ ({ stateDir }), instanceName: 'archive' })
    const wmFile = wmStore.filePath(wmStore.keyFor(svc.cacheRoot, part.tablePath ?? ''))
    await fs.rm(wmFile, { force: true })

    // Re-export: same suffix -> same [sinceSeq,lastSeq] -> same filename ->
    // overwrites the one blob rather than creating a duplicate.
    await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({}))
    const after2 = await listBlobs(destDir)
    assert.equal(after2.length, 1, 'idempotent re-PUT: still exactly one blob, not a duplicate')
    assert.equal(after2[0].name, firstName, 'same object key re-PUT')
    assert.deepEqual(after2[0].ids.sort((a, b) => a - b), [0, 1, 2])
  } finally {
    await fs.rm(cacheRoot, { recursive: true, force: true })
  }
})

// ==========================================================================
// One-time legacy backlog re-export (LLP 0040 §6 risk #1).
// Pre-upgrade rows carry a null `_hyp_ingest_seq`.
A fresh sink (no watermark) +// exports them ONCE; once it has a durable watermark it treats them as +// already-shipped, so the backlog never re-exports on every tick (which would +// also duplicate after a compaction reorders the body). +// ========================================================================== + +test('forward sink: a pure-legacy partition re-exports the null-seq backlog exactly once', async () => { + const cacheRoot = await makeTmpDir() + try { + const svc = createQueryStorageService({ cacheRoot }) + const part = await buildLegacyPartition(cacheRoot, [ + { id: 0, seq: null }, { id: 1, seq: null }, { id: 2, seq: null }, + ]) + const watermarks = createInstanceWatermarkStore({ + paths: /** @type {any} */ ({ stateDir: path.join(cacheRoot, 'state') }), + instanceName: 'forward', + }) + const { sink, calls } = makeForwardSink({ storage: svc, watermarks }) + + // Tick 1: the legacy backlog ships once. + const r1 = await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(r1.status, 'exported') + assert.deepEqual(ackedIds(calls).sort((a, b) => a - b), [0, 1, 2]) + assert.ok((r1.bytesWritten ?? 0) > 0) + + // Tick 2: the backlog does NOT re-export — zero POSTs, zero bytes. 
+ calls.length = 0 + const r2 = await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(r2.status, 'exported') + assert.equal(r2.bytesWritten, 0, 'legacy backlog re-exports once, not every tick') + assert.equal(calls.length, 0, 'second tick makes zero POSTs') + } finally { + await fs.rm(cacheRoot, { recursive: true, force: true }) + } +}) + +test('forward sink: a mixed legacy+real partition ships everything once, then steady-state', async () => { + const cacheRoot = await makeTmpDir() + try { + const svc = createQueryStorageService({ cacheRoot }) + const part = await buildLegacyPartition(cacheRoot, [ + { id: 0, seq: null }, { id: 1, seq: null }, // legacy + { id: 10, seq: 5n }, { id: 11, seq: 10n }, // real + ]) + const watermarks = createInstanceWatermarkStore({ + paths: /** @type {any} */ ({ stateDir: path.join(cacheRoot, 'state') }), + instanceName: 'forward', + }) + const { sink, calls } = makeForwardSink({ storage: svc, watermarks }) + + const r1 = await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(r1.status, 'exported') + assert.deepEqual(ackedIds(calls).sort((a, b) => a - b), [0, 1, 10, 11], 'first tick ships legacy + real') + + calls.length = 0 + const r2 = await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(r2.status, 'exported') + assert.equal(r2.bytesWritten, 0, 'no re-export of legacy or already-shipped real rows') + assert.equal(calls.length, 0) + } finally { + await fs.rm(cacheRoot, { recursive: true, force: true }) + } +}) + +test('blob sink: a pure-legacy partition writes one blob, then no blob', async () => { + const cacheRoot = await makeTmpDir() + try { + const svc = createQueryStorageService({ cacheRoot }) + const part = await buildLegacyPartition(cacheRoot, [ + { id: 0, seq: null }, { id: 1, seq: null }, { id: 2, seq: null }, + ]) + const destDir = path.join(cacheRoot, 'blob-out') + 
const sink = await makeBlobSink({ storage: svc, destDir, stateDir: path.join(cacheRoot, 'state'), instanceName: 'archive' }) + + const r1 = await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(r1.status, 'exported') + let blobs = await listBlobs(destDir) + assert.equal(blobs.length, 1) + assert.deepEqual(blobs[0].ids.sort((a, b) => a - b), [0, 1, 2]) + + // Second tick: the backlog is already shipped → no new blob, 0 bytes. + const r2 = await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(r2.status, 'exported') + assert.equal(r2.bytesWritten, 0) + blobs = await listBlobs(destDir) + assert.equal(blobs.length, 1, 'no second blob for the legacy backlog') + // No id is duplicated across artifacts. + const all = blobs.flatMap((b) => b.ids) + assert.equal(new Set(all).size, all.length, 'no row in two blobs') + } finally { + await fs.rm(cacheRoot, { recursive: true, force: true }) + } +}) + +test('blob sink: a mixed legacy+real partition writes one blob, then no blob', async () => { + const cacheRoot = await makeTmpDir() + try { + const svc = createQueryStorageService({ cacheRoot }) + const part = await buildLegacyPartition(cacheRoot, [ + { id: 0, seq: null }, { id: 1, seq: null }, + { id: 10, seq: 5n }, { id: 11, seq: 10n }, + ]) + const destDir = path.join(cacheRoot, 'blob-out') + const sink = await makeBlobSink({ storage: svc, destDir, stateDir: path.join(cacheRoot, 'state'), instanceName: 'archive' }) + + const r1 = await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(r1.status, 'exported') + let blobs = await listBlobs(destDir) + assert.equal(blobs.length, 1) + assert.deepEqual(blobs[0].ids.sort((a, b) => a - b), [0, 1, 10, 11]) + + const r2 = await sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(r2.status, 'exported') + assert.equal(r2.bytesWritten, 
0) + blobs = await listBlobs(destDir) + assert.equal(blobs.length, 1, 'no second blob') + const all = blobs.flatMap((b) => b.ids) + assert.equal(new Set(all).size, all.length, 'no row in two blobs') + } finally { + await fs.rm(cacheRoot, { recursive: true, force: true }) + } +}) + +// ========================================================================== +// Per-(sink INSTANCE, partition) watermark scoping (LLP 0040 §3). +// Two `@hypaware/central` instances of one plugin share a `stateDir`; their +// watermarks must NOT collide, or one instance's advance would make the other +// skip rows it never exported. `createInstanceWatermarkStore` namespaces by the +// instance name (the fix for the central sink using the per-PLUGIN store). +// ========================================================================== + +test('forward sink: two instances on one partition keep independent watermarks (no cross-instance skip)', async () => { + const cacheRoot = await makeTmpDir() + try { + const svc = createQueryStorageService({ cacheRoot }) + const spoolPath = svc.cacheTablePath(DATASET, ['all']) + await flushBatch(svc, spoolPath, rows([0, 1, 2], isoDaysAgo(1))) + const part = await logicalPartition(cacheRoot) + + // SAME plugin stateDir, two instance names — the per-plugin store would + // collapse these onto one file and let A's advance clobber B's cursor. + const stateDir = path.join(cacheRoot, 'state') + const wmA = createInstanceWatermarkStore({ paths: /** @type {any} */ ({ stateDir }), instanceName: 'fleet-a' }) + const wmB = createInstanceWatermarkStore({ paths: /** @type {any} */ ({ stateDir }), instanceName: 'fleet-b' }) + assert.notEqual( + wmA.filePath(wmA.keyFor(svc.cacheRoot, part.tablePath ?? '')), + wmB.filePath(wmB.keyFor(svc.cacheRoot, part.tablePath ?? '')), + 'each instance gets its own watermark file', + ) + + // Instance A ships all three rows and advances ITS watermark. 
+ const a = makeForwardSink({ storage: svc, watermarks: wmA }) + const ra = await a.sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(ra.status, 'exported') + assert.deepEqual(ackedIds(a.calls).sort((x, y) => x - y), [0, 1, 2]) + + // Instance B, fresh: must STILL see all three rows — A's advance must not + // have clobbered B's (independent) watermark. + const b = makeForwardSink({ storage: svc, watermarks: wmB }) + const rb = await b.sink.exportBatch(/** @type {any} */ ({ partitions: [part] }), /** @type {any} */ ({})) + assert.equal(rb.status, 'exported') + assert.deepEqual(ackedIds(b.calls).sort((x, y) => x - y), [0, 1, 2], 'instance B is not skipped by instance A') + } finally { + await fs.rm(cacheRoot, { recursive: true, force: true }) + } +}) diff --git a/test/core/sink-incremental.test.js b/test/core/sink-incremental.test.js new file mode 100644 index 0000000..7c9fc1e --- /dev/null +++ b/test/core/sink-incremental.test.js @@ -0,0 +1,175 @@ +// @ts-check + +import test from 'node:test' +import assert from 'node:assert/strict' +import fs from 'node:fs/promises' +import path from 'node:path' +import os from 'node:os' + +import { + createInstanceWatermarkStore, + openIncrementalRows, + watermarkKeyFor, + withSeqRangeFilename, +} from '../../src/core/sinks/incremental.js' + +/** @returns {Promise} */ +async function tmpDir() { + return fs.mkdtemp(path.join(os.tmpdir(), 'hyp-sink-incr-')) +} + +/** + * Storage stub: `readRowsSince` yields the registered rows honouring `seq > since` + * and emitting the monotonic high-water `after` token (null-seq rows carry it). + * + * @param {Record>>} rowsByTable + */ +function makeStorage(rowsByTable) { + return { + cacheRoot: '/cache', + /** @param {string} tp */ + tableExists: (tp) => tp in rowsByTable, + /** @param {string} tablePath @param {{ since?: { seq: string } }} [opts] */ + readRowsSince(tablePath, opts) { + const rows = rowsByTable[tablePath] ?? 
[] + const sinceSeq = opts?.since ? BigInt(opts.since.seq) : 0n + return { + async *[Symbol.asyncIterator]() { + let high = sinceSeq + for (const r of rows) { + const { _seq, ...row } = r + if (_seq !== null) { + const seq = BigInt(_seq) + if (seq <= sinceSeq) continue + if (seq > high) high = seq + } + yield { row, after: { v: 1, seq: high.toString() } } + } + }, + } + }, + } +} + +test('withSeqRangeFilename inserts the range before the final extension', () => { + assert.equal(withSeqRangeFilename('all.parquet', '0', '50'), 'all.0-50.parquet') + assert.equal(withSeqRangeFilename('source=claude.jsonl', '7', '12'), 'source=claude.7-12.jsonl') +}) + +test('withSeqRangeFilename is deterministic and preserves dots in the base name', () => { + // A partition value with a dot must keep its extension at the very end. + assert.equal(withSeqRangeFilename('date=2026.06.parquet', '1', '2'), 'date=2026.06.1-2.parquet') + // No extension ⇒ append the range. + assert.equal(withSeqRangeFilename('blob', '1', '9'), 'blob.1-9') +}) + +test('openIncrementalRows reports empty for a missing table (no blob written)', async () => { + const storage = makeStorage({}) + const reader = await openIncrementalRows(/** @type {any} */ (storage), { dataset: 'd', partition: {}, tablePath: '/cache/datasets/d/source=x' }, undefined) + assert.equal(reader.empty, true) + assert.equal(reader.rowCount, 0) + // Draining the empty stream yields nothing. 
+ const seen = [] + for await (const r of reader.rows) seen.push(r) + assert.equal(seen.length, 0) +}) + +test('openIncrementalRows reports empty for a partition with no tablePath', async () => { + const storage = makeStorage({}) + const reader = await openIncrementalRows(/** @type {any} */ (storage), { dataset: 'd', partition: {} }, undefined) + assert.equal(reader.empty, true) + assert.equal(reader.sinceSeq, '0') +}) + +test('openIncrementalRows tracks rowCount and the high-water lastAfter as the encoder drains', async () => { + const tablePath = '/cache/datasets/d/source=x' + const storage = makeStorage({ [tablePath]: [{ _seq: 5, id: 'a' }, { _seq: 9, id: 'b' }] }) + const reader = await openIncrementalRows(/** @type {any} */ (storage), { dataset: 'd', partition: {}, tablePath }, undefined) + assert.equal(reader.empty, false) + assert.equal(reader.sinceSeq, '0') + // Before draining, the counters reflect only the peeked state. + const rows = [] + for await (const r of reader.rows) rows.push(r) + assert.deepEqual(rows, [{ id: 'a' }, { id: 'b' }], 'rows are clean (no _seq)') + assert.equal(reader.rowCount, 2) + assert.equal(reader.lastAfter.seq, '9', 'lastAfter is the max seq seen') +}) + +test('openIncrementalRows honours the since filter (only seq > since)', async () => { + const tablePath = '/cache/datasets/d/source=x' + const storage = makeStorage({ [tablePath]: [{ _seq: 5, id: 'a' }, { _seq: 9, id: 'b' }] }) + const reader = await openIncrementalRows(/** @type {any} */ (storage), { dataset: 'd', partition: {}, tablePath }, { v: 1, seq: '5' }) + assert.equal(reader.empty, false) + assert.equal(reader.sinceSeq, '5') + const rows = [] + for await (const r of reader.rows) rows.push(r) + assert.deepEqual(rows, [{ id: 'b' }]) + assert.equal(reader.lastAfter.seq, '9') +}) + +test('openIncrementalRows reports empty when since already covers every row', async () => { + const tablePath = '/cache/datasets/d/source=x' + const storage = makeStorage({ [tablePath]: [{ _seq: 
5, id: 'a' }] }) + const reader = await openIncrementalRows(/** @type {any} */ (storage), { dataset: 'd', partition: {}, tablePath }, { v: 1, seq: '5' }) + assert.equal(reader.empty, true, 'no rows newer than the watermark ⇒ empty') +}) + +test('openIncrementalRows: null-seq legacy rows are emitted but never advance lastAfter', async () => { + // Null-seq (pre-upgrade) rows are always yielded but carry the watermark + // forward unchanged — the one-time migration is a re-export, never a skip. + const tablePath = '/cache/datasets/d/source=x' + const storage = makeStorage({ [tablePath]: [{ _seq: null, id: 'legacy' }] }) + const reader = await openIncrementalRows(/** @type {any} */ (storage), { dataset: 'd', partition: {}, tablePath }, { v: 1, seq: '4' }) + assert.equal(reader.empty, false) + const rows = [] + for await (const r of reader.rows) rows.push(r) + assert.deepEqual(rows, [{ id: 'legacy' }]) + assert.equal(reader.lastAfter.seq, '4', 'a null-seq row keeps the prior watermark') +}) + +test('watermarkKeyFor returns null without a tablePath and the logical key otherwise', () => { + const store = createSinkWatermarkStub() + assert.equal(watermarkKeyFor(/** @type {any} */ (store), /** @type {any} */ ({ cacheRoot: '/cache' }), { dataset: 'd', partition: {} }), null) + const key = watermarkKeyFor(/** @type {any} */ (store), /** @type {any} */ ({ cacheRoot: '/cache' }), { dataset: 'd', partition: {}, tablePath: '/cache/datasets/d/source=x' }) + assert.deepEqual(key, { dataset: 'd', partitionKey: 'source=x' }) +}) + +test('createInstanceWatermarkStore isolates instances under one plugin stateDir', async (t) => { + // PluginPaths.stateDir is per-plugin; two sink instances of the same + // destination must not share (and clobber) one watermark file. 
+ const stateDir = await tmpDir() + t.after(() => fs.rm(stateDir, { recursive: true, force: true })) + const paths = { rootDir: stateDir, stateDir, cacheDir: stateDir, tempDir: stateDir } + const a = createInstanceWatermarkStore({ paths, instanceName: 'alpha' }) + const b = createInstanceWatermarkStore({ paths, instanceName: 'beta' }) + + const key = a.keyFor('/cache', '/cache/datasets/d/source=x') + await a.write(key, { continuation: { v: 1, seq: '10' } }) + await b.write(key, { continuation: { v: 1, seq: '20' } }) + + const ra = await a.read(key) + const rb = await b.read(key) + assert.equal(ra?.continuation.seq, '10', "alpha's watermark is independent") + assert.equal(rb?.continuation.seq, '20', "beta's watermark is independent") + assert.ok(a.filePath(key).includes(path.join('sink-instances', 'alpha'))) + assert.ok(b.filePath(key).includes(path.join('sink-instances', 'beta'))) +}) + +test('createInstanceWatermarkStore requires stateDir and instanceName', () => { + assert.throws(() => createInstanceWatermarkStore({ paths: /** @type {any} */ ({}), instanceName: 'x' }), /stateDir is required/) + assert.throws(() => createInstanceWatermarkStore({ paths: /** @type {any} */ ({ stateDir: '/s' }), instanceName: '' }), /instanceName is required/) +}) + +/** Minimal store stub exposing only keyFor for watermarkKeyFor tests. 
*/ +function createSinkWatermarkStub() { + return { + keyFor(/** @type {string} */ cacheRoot, /** @type {string} */ tablePath) { + const rel = path.relative(path.join(cacheRoot, 'datasets'), tablePath) + const [dataset, ...rest] = rel.split(path.sep) + return { dataset, partitionKey: rest.join('/') } + }, + filePath: () => '', + read: async () => null, + write: async () => ({ v: 1, continuation: { v: 1, seq: '0' }, exportedRowCount: 0, updatedAt: '' }), + } +} diff --git a/test/core/sink-reads-since.test.js b/test/core/sink-reads-since.test.js new file mode 100644 index 0000000..0f714d6 --- /dev/null +++ b/test/core/sink-reads-since.test.js @@ -0,0 +1,210 @@ +// @ts-check + +import test from 'node:test' +import assert from 'node:assert/strict' +import fs from 'node:fs/promises' +import path from 'node:path' +import os from 'node:os' + +import { createQueryStorageService } from '../../src/core/cache/storage.js' +import { appendRowsToTable, scanRowsFromTable } from '../../src/core/cache/iceberg/store.js' +import { INGEST_SEQ_COLUMN } from '../../src/core/cache/streaming-reader.js' + +/** + * @import { ColumnSpec } from '../../collectivus-plugin-kernel-types.d.ts' + */ + +/** @returns {Promise} */ +async function makeTmpDir() { + return fs.mkdtemp(path.join(os.tmpdir(), 'hyp-since-')) +} + +/** @type {ColumnSpec[]} */ +const COLS = [ + { name: 'id', type: 'INT64', nullable: false }, + { name: 'msg', type: 'STRING', nullable: false }, +] + +test('readRows back-compat: no opts is unchanged, internal fields never leak', async () => { + const cacheRoot = await makeTmpDir() + const svc = createQueryStorageService({ cacheRoot }) + const spoolPath = svc.cacheTablePath('demo', ['all']) + await svc.appendRows(spoolPath, COLS, [ + { id: 1, msg: 'a' }, + { id: 2, msg: 'b' }, + { id: 3, msg: 'c' }, + ]) + await svc.flushTable(spoolPath, { reason: 'manual' }) + + // The spool re-groups rows into a committed `source=` partition; a + // sink reads from the discovered partition path, 
not the spool path. + const parts = await svc.discoverCachePartitions() + assert.equal(parts.length, 1) + const tablePath = parts[0].path + + /** @type {Record[]} */ + const all = [] + for await (const row of svc.readRows(tablePath)) all.push(row) + assert.equal(all.length, 3) + for (const row of all) { + assert.ok(!('_hyp_ingest_seq' in row)) + assert.ok(!('_hyp_cache_row_id' in row)) + assert.ok(!('_hyp_cache_batch_id' in row)) + } + + // Column projection is still honoured and still strips internals. + /** @type {Record[]} */ + const idOnly = [] + for await (const row of svc.readRows(tablePath, ['id'])) idOnly.push(row) + assert.equal(idOnly.length, 3) + for (const row of idOnly) assert.deepEqual(Object.keys(row), ['id']) + + await fs.rm(cacheRoot, { recursive: true, force: true }) +}) + +test('readRowsSince pairs each row with a monotonic after token and strips the seq', async () => { + const cacheRoot = await makeTmpDir() + const svc = createQueryStorageService({ cacheRoot }) + const spoolPath = svc.cacheTablePath('demo', ['all']) + await svc.appendRows(spoolPath, COLS, [ + { id: 1, msg: 'a' }, + { id: 2, msg: 'b' }, + { id: 3, msg: 'c' }, + ]) + await svc.flushTable(spoolPath, { reason: 'manual' }) + + const parts = await svc.discoverCachePartitions() + assert.equal(parts.length, 1) + const tablePath = parts[0].path + + /** @type {{ row: Record, after: { v: 1, seq: string } }[]} */ + const seen = [] + for await (const pair of svc.readRowsSince(tablePath, {})) seen.push(pair) + assert.equal(seen.length, 3) + + let prev = -1n + for (const { row, after } of seen) { + assert.ok(!('_hyp_ingest_seq' in row), 'seq never reaches the row payload') + assert.equal(after.v, 1) + assert.match(after.seq, /^\d+$/) + const cur = BigInt(after.seq) + assert.ok(cur >= prev, 'after token never regresses across the scan') + prev = cur + } + const watermark = seen[seen.length - 1].after + + // A second read from the watermark with no new rows yields nothing (≈0 bytes), + // via 
both the cursor-aware surface and the plain `readRows` `since`. + /** @type {unknown[]} */ + const none = [] + for await (const pair of svc.readRowsSince(tablePath, { since: watermark })) none.push(pair) + assert.equal(none.length, 0) + /** @type {unknown[]} */ + const noneFlat = [] + for await (const row of svc.readRows(tablePath, undefined, { since: watermark })) noneFlat.push(row) + assert.equal(noneFlat.length, 0) + + // After N new rows, only the N new ones are read, independent of the rest. + await svc.appendRows(spoolPath, COLS, [ + { id: 4, msg: 'd' }, + { id: 5, msg: 'e' }, + ]) + await svc.flushTable(spoolPath, { reason: 'manual' }) + + /** @type {Record[]} */ + const fresh = [] + for await (const { row, after } of svc.readRowsSince(tablePath, { since: watermark })) { + fresh.push(row) + assert.ok(BigInt(after.seq) > BigInt(watermark.seq)) + } + assert.equal(fresh.length, 2) + assert.deepEqual(fresh.map((r) => Number(r.id)).sort((a, b) => a - b), [4, 5]) + + await fs.rm(cacheRoot, { recursive: true, force: true }) +}) + +test('null-seq (legacy) rows are always treated as new and never skipped', async () => { + const root = await makeTmpDir() + const dir = path.join(root, 'legacy-table') + /** @type {ColumnSpec[]} */ + const cols = [ + { name: 'id', type: 'INT64', nullable: false }, + INGEST_SEQ_COLUMN, + ] + // A migration-era table: some rows pre-date the seq column (null), some carry + // real seqs. Built directly so the seq values are controlled exactly. + await appendRowsToTable(dir, cols, [ + { id: 1, [INGEST_SEQ_COLUMN.name]: null }, + { id: 2, [INGEST_SEQ_COLUMN.name]: 5n }, + { id: 3, [INGEST_SEQ_COLUMN.name]: 10n }, + { id: 4, [INGEST_SEQ_COLUMN.name]: null }, + ]) + + // since = 5: keep null(1), skip seq 5(2), keep seq 10(3), keep null(4). 
+ /** @type {number[]} */ + const kept = [] + for await (const row of scanRowsFromTable(dir, undefined, { since: 5n })) kept.push(Number(row.id)) + assert.deepEqual(kept, [1, 3, 4]) + + // since = 0: every row is new. + /** @type {number[]} */ + const allIds = [] + for await (const row of scanRowsFromTable(dir, undefined, { since: 0n })) allIds.push(Number(row.id)) + assert.deepEqual(allIds, [1, 2, 3, 4]) + + // Through the cursor-aware surface: a null-seq row carries the prior watermark + // forward unchanged (it does not advance the high-water seq). + const svc = createQueryStorageService({ cacheRoot: root }) + /** @type {{ id: number, after: string }[]} */ + const pairs = [] + for await (const { row, after } of svc.readRowsSince(dir, { since: { v: 1, seq: '5' } })) { + assert.ok(!(INGEST_SEQ_COLUMN.name in row)) + pairs.push({ id: Number(row.id), after: after.seq }) + } + assert.deepEqual(pairs, [ + { id: 1, after: '5' }, + { id: 3, after: '10' }, + { id: 4, after: '10' }, + ]) + + await fs.rm(root, { recursive: true, force: true }) +}) + +test('a table with no seq column at all yields everything (pure legacy)', async () => { + const root = await makeTmpDir() + const dir = path.join(root, 'no-seq-col') + await appendRowsToTable(dir, COLS, [ + { id: 1, msg: 'a' }, + { id: 2, msg: 'b' }, + ]) + + // Even with a high watermark, a table that never carried the seq column has + // only implicit null-seq rows, so all are new. 
+ const svc = createQueryStorageService({ cacheRoot: root }) + /** @type {{ id: number, after: string }[]} */ + const pairs = [] + for await (const { row, after } of svc.readRowsSince(dir, { since: { v: 1, seq: '999' } })) { + pairs.push({ id: Number(row.id), after: after.seq }) + } + assert.deepEqual(pairs, [ + { id: 1, after: '999' }, + { id: 2, after: '999' }, + ]) + + await fs.rm(root, { recursive: true, force: true }) +}) + +test('an invalid continuation token is rejected', async () => { + const cacheRoot = await makeTmpDir() + const svc = createQueryStorageService({ cacheRoot }) + const tablePath = svc.cacheTablePath('demo', ['all']) + await svc.appendRows(tablePath, COLS, [{ id: 1, msg: 'a' }]) + await svc.flushTable(tablePath, { reason: 'manual' }) + + await assert.rejects(async () => { + // @ts-expect-error — deliberately malformed token + for await (const _ of svc.readRowsSince(tablePath, { since: { v: 2, seq: '1' } })) { /* drain */ } + }, /invalid SinkContinuation/) + + await fs.rm(cacheRoot, { recursive: true, force: true }) +}) diff --git a/test/core/sink-watermarks.test.js b/test/core/sink-watermarks.test.js new file mode 100644 index 0000000..683a198 --- /dev/null +++ b/test/core/sink-watermarks.test.js @@ -0,0 +1,174 @@ +// @ts-check + +import test from 'node:test' +import assert from 'node:assert/strict' +import fs from 'node:fs/promises' +import path from 'node:path' +import os from 'node:os' + +import { + createSinkWatermarkStore, + deriveWatermarkKey, +} from '../../src/core/sinks/watermarks.js' + +/** + * @import { SinkContinuation } from '../../collectivus-plugin-kernel-types.d.ts' + */ + +/** @returns {Promise} */ +async function makeTmpDir() { + return fs.mkdtemp(path.join(os.tmpdir(), 'hyp-sink-wm-')) +} + +/** + * @param {string} seq + * @returns {SinkContinuation} + */ +function cont(seq) { + return { v: 1, seq } +} + +test('deriveWatermarkKey splits dataset from the partition path', () => { + const cacheRoot = '/cache' + const key = 
deriveWatermarkKey(cacheRoot, '/cache/datasets/proxy/source=claude') + assert.deepEqual(key, { dataset: 'proxy', partitionKey: 'source=claude' }) +}) + +test('deriveWatermarkKey is keyed by the LOGICAL path, independent of tableDir', () => { + // The logical partition path is stable; the physical `table-` directory + // inside it changes on every compaction. Callers pass the logical path, so the + // key (and thus the watermark file) reads straight through a generation swap. + const cacheRoot = '/cache' + const logical = '/cache/datasets/proxy/source=claude' + const a = deriveWatermarkKey(cacheRoot, logical) + const b = deriveWatermarkKey(cacheRoot, logical) + assert.deepEqual(a, b) + + const store = createSinkWatermarkStore({ stateDir: '/state' }) + const file = store.filePath(a) + assert.ok(!file.includes('table-'), 'watermark file must not embed a physical tableDir') + assert.equal(file, path.join('/state', 'watermarks', 'proxy', 'source=claude.json')) +}) + +test('deriveWatermarkKey preserves nested partition segments', () => { + const key = deriveWatermarkKey('/cache', '/cache/datasets/logs/source=otel/date=2026-06-25') + assert.deepEqual(key, { dataset: 'logs', partitionKey: 'source=otel/date=2026-06-25' }) + const store = createSinkWatermarkStore({ stateDir: '/state' }) + assert.equal( + store.filePath(key), + path.join('/state', 'watermarks', 'logs', 'source=otel', 'date=2026-06-25.json') + ) +}) + +test('deriveWatermarkKey sanitizes unsafe segment characters', () => { + const key = deriveWatermarkKey('/cache', '/cache/datasets/weird ds/source=a b') + assert.equal(key.dataset, 'weird_ds') + assert.equal(key.partitionKey, 'source=a_b') +}) + +test('deriveWatermarkKey falls back to a sentinel when no partition segment', () => { + const key = deriveWatermarkKey('/cache', '/cache/datasets/proxy') + assert.deepEqual(key, { dataset: 'proxy', partitionKey: '_partition' }) +}) + +test('deriveWatermarkKey rejects paths outside the datasets root', () => { + 
assert.throws(() => deriveWatermarkKey('/cache', '/elsewhere/datasets/proxy/source=x')) + assert.throws(() => deriveWatermarkKey('/cache', '/cache/_hyp_ingest_seq.json')) +}) + +test('read returns null when no watermark has been written', async () => { + const stateDir = await makeTmpDir() + const store = createSinkWatermarkStore({ stateDir }) + const key = deriveWatermarkKey('/cache', '/cache/datasets/proxy/source=claude') + assert.equal(await store.read(key), null) +}) + +test('write then read round-trips the continuation and row count', async () => { + const stateDir = await makeTmpDir() + const store = createSinkWatermarkStore({ stateDir }) + const key = deriveWatermarkKey('/cache', '/cache/datasets/proxy/source=claude') + + const written = await store.write(key, { continuation: cont('42'), exportedRowCount: 7 }) + assert.deepEqual(written.continuation, cont('42')) + assert.equal(written.exportedRowCount, 7) + assert.equal(written.v, 1) + assert.ok(written.updatedAt.length > 0) + + const read = await store.read(key) + assert.ok(read) + assert.deepEqual(read.continuation, cont('42')) + assert.equal(read.exportedRowCount, 7) +}) + +test('write advances the watermark in place (latest wins)', async () => { + const stateDir = await makeTmpDir() + const store = createSinkWatermarkStore({ stateDir }) + const key = deriveWatermarkKey('/cache', '/cache/datasets/proxy/source=claude') + + await store.write(key, { continuation: cont('10'), exportedRowCount: 10 }) + await store.write(key, { continuation: cont('25'), exportedRowCount: 25 }) + + const read = await store.read(key) + assert.ok(read) + assert.deepEqual(read.continuation, cont('25')) + assert.equal(read.exportedRowCount, 25) + + // One file per (dataset, partition) — no per-write accumulation. 
+ const dir = path.join(stateDir, 'watermarks', 'proxy') + const entries = await fs.readdir(dir) + assert.deepEqual(entries, ['source=claude.json']) +}) + +test('write is atomic write-rename and leaves no temp files', async () => { + const stateDir = await makeTmpDir() + const store = createSinkWatermarkStore({ stateDir }) + const key = deriveWatermarkKey('/cache', '/cache/datasets/proxy/source=claude') + + await store.write(key, { continuation: cont('1'), exportedRowCount: 1 }) + const dir = path.join(stateDir, 'watermarks', 'proxy') + const entries = await fs.readdir(dir) + assert.ok(entries.every((e) => !e.includes('.tmp.')), `no temp file should survive: ${entries}`) +}) + +test('write rejects a malformed continuation before touching disk', async () => { + const stateDir = await makeTmpDir() + const store = createSinkWatermarkStore({ stateDir }) + const key = deriveWatermarkKey('/cache', '/cache/datasets/proxy/source=claude') + + // Type-valid token, but `seq` is not a decimal string — only the runtime + // guard catches it. + await assert.rejects(() => store.write(key, { continuation: { v: 1, seq: 'not-a-number' } })) + await assert.rejects( + // @ts-expect-error — wrong version + () => store.write(key, { continuation: { v: 2, seq: '1' } }) + ) + // Nothing was persisted. 
+ assert.equal(await store.read(key), null) +}) + +test('read returns null on a corrupt watermark file (safe re-export, never silent skip)', async () => { + const stateDir = await makeTmpDir() + const store = createSinkWatermarkStore({ stateDir }) + const key = deriveWatermarkKey('/cache', '/cache/datasets/proxy/source=claude') + + const dest = store.filePath(key) + await fs.mkdir(path.dirname(dest), { recursive: true }) + await fs.writeFile(dest, '{ not valid json', 'utf8') + assert.equal(await store.read(key), null) + + await fs.writeFile(dest, JSON.stringify({ v: 1, continuation: { v: 1, seq: 12 } }), 'utf8') + assert.equal(await store.read(key), null, 'numeric seq is not the decimal-string contract') +}) + +test('keyFor matches deriveWatermarkKey', () => { + const store = createSinkWatermarkStore({ stateDir: '/state' }) + assert.deepEqual( + store.keyFor('/cache', '/cache/datasets/proxy/source=claude'), + deriveWatermarkKey('/cache', '/cache/datasets/proxy/source=claude') + ) +}) + +test('createSinkWatermarkStore requires a stateDir', () => { + // @ts-expect-error — exercising the runtime guard + assert.throws(() => createSinkWatermarkStore({})) +}) diff --git a/test/plugins/central-forward-chunking.test.js b/test/plugins/central-forward-chunking.test.js index 1c9d064..f6c91fc 100644 --- a/test/plugins/central-forward-chunking.test.js +++ b/test/plugins/central-forward-chunking.test.js @@ -31,6 +31,7 @@ function makeStorage(tablePath, count, rowFactory) { const factory = rowFactory ?? ((i) => ({ message_id: `m${i}`, content_text: `row ${i}` })) let flushes = 0 return { + cacheRoot: '/cache', get flushes() { return flushes }, /** @param {string} p */ tableExists: (p) => p === tablePath, @@ -42,6 +43,57 @@ function makeStorage(tablePath, count, rowFactory) { yield factory(i) } }, + // Cursor-aware sibling: row `i` carries `_hyp_ingest_seq = i + 1`, so a + // `since` watermark of seq K skips the first K rows. 
`after` is the + // running high-water as a decimal string, mirroring storage.js. + /** + * @param {string} _p + * @param {{ since?: { v: 1, seq: string } }} [opts] + */ + async *readRowsSince(_p, opts) { + const since = opts?.since ? BigInt(opts.since.seq) : 0n + for (let i = 0; i < count; i += 1) { + const seq = BigInt(i + 1) + if (seq <= since) continue + yield { row: factory(i), after: { v: 1, seq: seq.toString() } } + } + }, + } +} + +/** + * In-memory stand-in for the per-(sink instance, partition) watermark store. + * `keyFor` collapses to a single key (these tests forward one partition), and + * `write` records every advance so a test can assert per-chunk progress and the + * ship-first/advance-second ordering. + * + * @param {{ v: 1, continuation: { v: 1, seq: string }, exportedRowCount: number, updatedAt: string } | null} [initial] + */ +function makeWatermarks(initial) { + let record = initial ?? null + /** @type {Array<{ v: 1, continuation: { v: 1, seq: string }, exportedRowCount: number, updatedAt: string }>} */ + const writes = [] + return { + get record() { return record }, + get writes() { return writes }, + keyFor: () => ({ dataset: 'ai_gateway_messages', partitionKey: 'source=claude' }), + /** @param {any} _key */ + filePath: (_key) => '/state/watermarks/ai_gateway_messages/source=claude.json', + async read() { return record }, + /** + * @param {any} _key + * @param {{ continuation: { v: 1, seq: string }, exportedRowCount?: number }} update + */ + async write(_key, update) { + record = { + v: 1, + continuation: update.continuation, + exportedRowCount: update.exportedRowCount ?? 
0, + updatedAt: '2026-06-25T00:00:00.000Z', + } + writes.push(record) + return record + }, } } @@ -111,13 +163,15 @@ const TABLE = '/cache/ai_gateway_messages/source=claude' * rowFactory?: (i: number) => Record, * signal?: string | null, * sleepFn?: (ms: number, signal?: AbortSignal) => Promise, + * watermark?: { v: 1, continuation: { v: 1, seq: string }, exportedRowCount: number, updatedAt: string } | null, * }} opts */ -function buildSink({ count, responder, rowFactory, signal = 'logs', sleepFn }) { +function buildSink({ count, responder, rowFactory, signal = 'logs', sleepFn, watermark }) { const storage = makeStorage(TABLE, count, rowFactory) const identityClient = makeIdentity() const { calls, fn, drains } = makeFetch(responder) const log = makeLog() + const watermarks = makeWatermarks(watermark) // Default sleep records the requested delay and returns instantly, so // backpressure pacing is asserted without real waits; a test can pass // the real abortableSleep to exercise close()-driven abort. @@ -129,11 +183,12 @@ function buildSink({ count, responder, rowFactory, signal = 'logs', sleepFn }) { identityClient: /** @type {any} */ (identityClient), query: /** @type {any} */ (makeQuery(signal)), storage: /** @type {any} */ (storage), + watermarks: /** @type {any} */ (watermarks), log: /** @type {any} */ (log), fetchFn: fn, sleepFn: sleepFn ?? 
recordingSleep, }) - return { sink, calls, storage, identityClient, log, sleeps, drains } + return { sink, calls, storage, identityClient, log, sleeps, drains, watermarks } } const batch = { partitions: [{ dataset: 'ai_gateway_messages', tablePath: TABLE }] } @@ -439,3 +494,191 @@ test('close() aborts a chunk paused on backpressure (no shutdown wedge)', async assert.match(String(result.error), /closed/) assert.equal(calls.length, 1) // never got past the first throttled POST }) + +// ---- Incremental reads: per-(sink, partition) watermark (LLP 0040, T4) ---- + +test('a tick with no new rows transmits zero bytes and zero chunks', async () => { + // Watermark already at the partition's max seq (10 rows -> seq 10): the + // since-filtered read yields nothing, so the sink POSTs nothing. + const { sink, calls, watermarks } = buildSink({ + count: 10, + watermark: { v: 1, continuation: { v: 1, seq: '10' }, exportedRowCount: 10, updatedAt: '' }, + }) + const result = await sink.exportBatch(/** @type {any} */ (batch), /** @type {any} */ ({})) + assert.equal(result.status, 'exported') + assert.equal(result.partitionsExported, 1) + assert.equal(result.bytesWritten, 0) + assert.equal(calls.length, 0) + // nothing acked -> watermark untouched + assert.equal(watermarks.writes.length, 0) + assert.equal(watermarks.record?.continuation.seq, '10') +}) + +test('a tick after N new rows reads/sends only the new suffix and advances the watermark', async () => { + // 10 rows total, watermark at seq 7: only rows 8,9,10 are new. 
+ const { sink, calls, watermarks } = buildSink({ + count: 10, + watermark: { v: 1, continuation: { v: 1, seq: '7' }, exportedRowCount: 7, updatedAt: '' }, + }) + const result = await sink.exportBatch(/** @type {any} */ (batch), /** @type {any} */ ({})) + assert.equal(result.status, 'exported') + assert.equal(calls.length, 1) + assert.equal(calls[0].rowCount, 3) // not 10 — the prefix is skipped + // watermark advanced to the last row's seq, count carried forward + assert.equal(watermarks.record?.continuation.seq, '10') + assert.equal(watermarks.record?.exportedRowCount, 10) +}) + +test('the watermark advances once, at end-of-partition, to the high-water after', async () => { + // 12000 rows -> chunks of 5000,5000,2000. The watermark is NOT advanced per + // chunk (that would be unsafe under an unordered scan: a chunk's running-max + // `after` could skip lower-seq rows in a later chunk — LLP 0040 §4 risk #3). + // It advances exactly once, after every chunk acks, to the partition max. + const { sink, watermarks } = buildSink({ count: 12_000 }) + const result = await sink.exportBatch(/** @type {any} */ (batch), /** @type {any} */ ({})) + assert.equal(result.status, 'exported') + assert.equal(watermarks.writes.length, 1, 'one watermark write per partition, not per chunk') + assert.equal(watermarks.writes[0].continuation.seq, '12000') + assert.equal(watermarks.writes[0].exportedRowCount, 12_000) + assert.equal(watermarks.record?.continuation.seq, '12000') +}) + +test('a mid-partition failure leaves the watermark unadvanced (no partial checkpoint)', async () => { + // Fail the 2nd chunk: chunk 0 acked but chunk 1 did not. Because the watermark + // advances only at end-of-partition, a partial partition NEVER checkpoints, so + // the next tick re-reads the whole partition (the server ledger dedupes the + // already-acked prefix). Advancing to chunk 0's running-max `after` here would + // risk skipping lower-seq rows in the un-acked chunk 1 forever. 
+ let n = 0 + const { sink, watermarks } = buildSink({ count: 12_000, responder: () => (++n === 2 ? 500 : 202) }) + const result = await sink.exportBatch(/** @type {any} */ (batch), /** @type {any} */ ({})) + assert.equal(result.status, 'failed') + assert.equal(watermarks.writes.length, 0, 'no checkpoint past an un-acked chunk') + assert.equal(watermarks.record, null) +}) + +test('a respool re-reads the whole partition with STABLE batch-ids (ledger-dedups the acked prefix)', async () => { + // Cross-tick idempotency: tick 1 acks chunk 0 (seq 5000) then chunk 1 fails + // AFTER the server may have committed it. Because the partition did not + // complete, the watermark is NOT advanced, so tick 2 re-reads the whole + // partition. The re-sent prefix chunks MUST carry the batch-ids they had in + // tick 1 so the server ledger drops the redelivery; an id keyed on the + // per-tick chunk ordinal would still be stable here, but keying on the chunk + // start seq keeps it stable even when the watermark DOES advance elsewhere. + let n = 0 + const built = buildSink({ count: 12_000, responder: () => (++n === 2 ? 500 : 202) }) + const r1 = await built.sink.exportBatch(/** @type {any} */ (batch), /** @type {any} */ ({})) + assert.equal(r1.status, 'failed') + assert.equal(built.calls.length, 2) // chunk 0 (202) + chunk 1 (500), then stop + assert.equal(built.watermarks.writes.length, 0, 'partial partition does not checkpoint') + const tick1Chunk0BatchId = built.calls[0].batchId + const tick1Chunk1BatchId = built.calls[1].batchId + + // Tick 2: same sink, server now healthy. The respool re-reads the whole + // partition (no advanced watermark) and replays every chunk. 
+ const r2 = await built.sink.exportBatch(/** @type {any} */ (batch), /** @type {any} */ ({})) + assert.equal(r2.status, 'exported') + const tick2 = built.calls.slice(2) + assert.deepEqual(tick2.map((c) => c.rowCount), [5000, 5000, 2000], 'whole partition re-read') + assert.equal( + tick2[0].batchId, + tick1Chunk0BatchId, + 'acked prefix chunk re-sends with its tick-1 batch-id (server dedupes the redelivery)' + ) + assert.equal( + tick2[1].batchId, + tick1Chunk1BatchId, + 'the previously-failed chunk re-sends with a stable batch-id too' + ) + assert.equal(built.watermarks.record?.continuation.seq, '12000') +}) + +test('a fresh partition (no watermark) reads from the start and advances', async () => { + // No persisted watermark -> since undefined -> full read (the safe + // at-least-once direction), then the watermark is created. + const { sink, calls, watermarks } = buildSink({ count: 10 }) + const result = await sink.exportBatch(/** @type {any} */ (batch), /** @type {any} */ ({})) + assert.equal(result.status, 'exported') + assert.equal(calls.length, 1) + assert.equal(calls[0].rowCount, 10) + assert.equal(watermarks.record?.continuation.seq, '10') +}) + +test('an unordered scan never skips a lower-seq row when a later chunk fails (BLOCKER, LLP 0040 §4 risk #3)', async () => { + // The scan is NOT seq-ordered: a high-seq row leads the partition, so the + // running-max `after` saturates inside chunk 0 while chunk 1 holds LOWER seqs. + // Per-chunk advance would jump the watermark past chunk 1's rows on the + // chunk-0 ack; a chunk-1 failure would then strip them from every future + // `seq > watermark` read — silent permanent data loss. End-of-partition advance + // refuses to checkpoint a partial partition, so the next tick re-reads them. 
+ const TOTAL = 5006 // chunk 0 = 5000 rows (MAX_CHUNK_ROWS), chunk 1 = 6 rows + /** @type {{ id: number, seq: bigint }[]} */ + const physical = [{ id: 0, seq: 1_000_000n }] // high seq first -> running max saturates + for (let i = 1; i < 5000; i += 1) physical.push({ id: i, seq: BigInt(i) }) + for (let i = 5000; i < TOTAL; i += 1) physical.push({ id: i, seq: BigInt(i) }) // chunk 1: low seqs + + const storage = { + cacheRoot: '/cache', + /** @param {string} p */ + tableExists: (p) => p === TABLE, + async flushTable() {}, + /** + * @param {string} _p + * @param {{ since?: { v: 1, seq: string } }} [opts] + */ + async *readRowsSince(_p, opts) { + const since = opts?.since ? BigInt(opts.since.seq) : 0n + let high = since + for (const { id, seq } of physical) { + if (seq <= since) continue // mirrors the real since-filter + if (seq > high) high = seq // `after` is a RUNNING MAX, not the row's own seq + yield { row: { id }, after: { v: 1, seq: high.toString() } } + } + }, + } + + // Fail chunk 1 the FIRST time it is POSTed (tick 1); accept it on the retry. + let chunk1Failed = false + /** @type {number[]} */ + const acked = [] + /** @type {typeof fetch} */ + const fetchFn = /** @type {any} */ (async (_url, init) => { + const body = String(init?.body ?? 
'') + const ids = body.split('\n').filter((l) => l.length > 0).map((l) => Number(JSON.parse(l).id)) + const isChunk1 = ids.includes(5000) + let status = 202 + if (isChunk1 && !chunk1Failed) { chunk1Failed = true; status = 500 } + if (status === 202) acked.push(...ids) + return /** @type {any} */ ({ + status, ok: status >= 200 && status < 300, + headers: { get: () => null }, async text() { return '' }, body: { cancel: async () => {} }, + }) + }) + + const watermarks = makeWatermarks(null) + const sink = createForwardSink({ + config: /** @type {any} */ ({ url: 'http://server:8740', identity: {} }), + identityClient: /** @type {any} */ (makeIdentity()), + query: /** @type {any} */ (makeQuery('logs')), + storage: /** @type {any} */ (storage), + watermarks: /** @type {any} */ (watermarks), + log: /** @type {any} */ (makeLog()), + fetchFn, + sleepFn: async () => {}, + }) + + // Tick 1: chunk 0 acks, chunk 1 fails -> partition fails. + const r1 = await sink.exportBatch(/** @type {any} */ (batch), /** @type {any} */ ({})) + assert.equal(r1.status, 'failed') + + // Tick 2: server healthy. The whole partition re-reads (watermark unadvanced), + // so the low-seq chunk-1 rows are delivered — never skipped. 
+ const r2 = await sink.exportBatch(/** @type {any} */ (batch), /** @type {any} */ ({})) + assert.equal(r2.status, 'exported') + + for (let i = 5000; i < TOTAL; i += 1) { + assert.ok(acked.includes(i), `low-seq row ${i} from the previously-failed chunk was delivered (no skip)`) + } + assert.equal(new Set(acked).size, TOTAL, 'every row delivered exactly once across the retry') + assert.equal(watermarks.record?.continuation.seq, '1000000', 'watermark advances only after the whole partition acks') +}) diff --git a/test/plugins/local-fs-incremental-export.test.js b/test/plugins/local-fs-incremental-export.test.js new file mode 100644 index 0000000..c53068e --- /dev/null +++ b/test/plugins/local-fs-incremental-export.test.js @@ -0,0 +1,154 @@ +// @ts-check + +import test from 'node:test' +import assert from 'node:assert/strict' +import fs from 'node:fs/promises' +import path from 'node:path' +import os from 'node:os' + +import { activate } from '../../hypaware-core/plugins-workspace/local-fs/src/index.js' + +const CACHE_ROOT = '/cache' +const DATASET = 'd' +const TABLE = `${CACHE_ROOT}/datasets/${DATASET}/source=x` + +/** @returns {Promise} */ +async function tmpDir() { + return fs.mkdtemp(path.join(os.tmpdir(), 'hyp-localfs-incr-')) +} + +/** + * Storage stub whose row set for the one table is mutable across ticks, so a + * test can append rows and re-export. `readRowsSince` honours `seq > since`. + * + * @param {Array<{ _seq: number } & Record>} rows + */ +function makeStorage(rows) { + return { + cacheRoot: CACHE_ROOT, + /** @param {string} tp */ + tableExists: (tp) => tp === TABLE, + /** @param {string} tablePath @param {{ since?: { seq: string } }} [opts] */ + readRowsSince(tablePath, opts) { + const list = tablePath === TABLE ? rows : [] + const sinceSeq = opts?.since ? 
BigInt(opts.since.seq) : 0n + return { + async *[Symbol.asyncIterator]() { + let high = sinceSeq + for (const r of list) { + const seq = BigInt(r._seq) + if (seq <= sinceSeq) continue + if (seq > high) high = seq + const { _seq, ...row } = r + yield { row, after: { v: 1, seq: high.toString() } } + } + }, + } + }, + readRows() { + return { async *[Symbol.asyncIterator]() {} } + }, + } +} + +/** A jsonl-ish fake encoder that drains rows (as a real encoder must). */ +function makeEncoder() { + return { + format: 'jsonl', + extension: 'jsonl', + supports: ['queryable'], + /** @param {any} partition @param {any} ctx */ + async encodePartition(partition, ctx) { + const lines = [] + for await (const row of ctx.rows ?? []) lines.push(JSON.stringify(row)) + const bytes = new TextEncoder().encode(lines.join('\n') + (lines.length ? '\n' : '')) + return { filename: 'all.jsonl', bytes, bytesWritten: bytes.byteLength, rowCount: lines.length } + }, + } +} + +/** + * @param {{ rows: Array<{ _seq: number } & Record>, exportsDir: string, stateDir: string }} args + */ +async function buildSink({ rows, exportsDir, stateDir }) { + /** @type {any} */ + let registered + /** @type {any} */ + const ctx = { + config: { exports_dir: exportsDir }, + env: {}, + provideCapability() {}, + sinks: { register(/** @type {any} */ d) { registered = d } }, + log: { debug() {}, info() {}, warn() {}, error() {} }, + query: { getDataset: () => undefined, listDatasets: () => [] }, + storage: makeStorage(rows), + } + await activate(ctx) + const dir = path.join(exportsDir, 'out') + return registered.create({ + name: 'local', + config: { dir }, + encoder: makeEncoder(), + log: { debug() {}, info() {}, warn() {}, error() {} }, + paths: { tempDir: exportsDir, stateDir }, + }) +} + +function partition() { + return { dataset: DATASET, partition: {}, tablePath: TABLE } +} + +/** @param {string} dir */ +async function listBlobs(dir) { + const partDir = path.join(dir, DATASET, 'all') + try { + return (await 
fs.readdir(partDir)).sort() + } catch { + return [] + } +} + +test('local-fs incremental export: ranged filename, watermark advance, skip-empty, then a new range', async (t) => { + const exportsDir = await tmpDir() + const stateDir = await tmpDir() + t.after(() => fs.rm(exportsDir, { recursive: true, force: true })) + t.after(() => fs.rm(stateDir, { recursive: true, force: true })) + + const rows = [{ _seq: 1, id: 'a' }, { _seq: 2, id: 'b' }] + const sink = await buildSink({ rows, exportsDir, stateDir }) + const dir = path.join(exportsDir, 'out') + + // Tick 1: two new rows ⇒ one blob named with [0,2]. + const r1 = await sink.exportBatch({ batchId: 'b1', partitions: [partition()] }, {}) + assert.equal(r1.status, 'exported') + assert.equal(r1.partitionsExported, 1) + let blobs = await listBlobs(dir) + assert.deepEqual(blobs, ['all.0-2.jsonl'], 'first blob embeds [sinceSeq=0, lastSeq=2]') + + const wmFile = path.join(stateDir, 'sink-instances', 'local', 'watermarks', DATASET, 'source=x.json') + const wm = JSON.parse(await fs.readFile(wmFile, 'utf8')) + assert.equal(wm.continuation.seq, '2') + assert.equal(wm.exportedRowCount, 2) + + // Tick 2: no new rows ⇒ no new blob (skip-empty), watermark unchanged. + const r2 = await sink.exportBatch({ batchId: 'b2', partitions: [partition()] }, {}) + assert.equal(r2.partitionsExported, 0, 'no new rows ⇒ nothing exported') + blobs = await listBlobs(dir) + assert.deepEqual(blobs, ['all.0-2.jsonl'], 'no second blob written') + + // Tick 3: append a row ⇒ a new blob covering only (2, 5]. 
+ rows.push({ _seq: 5, id: 'c' }) + const r3 = await sink.exportBatch({ batchId: 'b3', partitions: [partition()] }, {}) + assert.equal(r3.partitionsExported, 1) + blobs = await listBlobs(dir) + assert.deepEqual(blobs, ['all.0-2.jsonl', 'all.2-5.jsonl'], 'second blob embeds [sinceSeq=2, lastSeq=5]') + const wm3 = JSON.parse(await fs.readFile(wmFile, 'utf8')) + assert.equal(wm3.continuation.seq, '5') + assert.equal(wm3.exportedRowCount, 3, 'exportedRowCount accumulates across ticks') + + // The second blob (written on tick 3) contains exactly the one new row. + const newBlob = await fs.readFile(path.join(dir, DATASET, 'all', 'all.2-5.jsonl'), 'utf8') + assert.equal(newBlob.trim(), JSON.stringify({ id: 'c' })) + + await sink.close() +}) diff --git a/test/plugins/s3-export-batch.test.js b/test/plugins/s3-export-batch.test.js index 73fb18f..4a9927e 100644 --- a/test/plugins/s3-export-batch.test.js +++ b/test/plugins/s3-export-batch.test.js @@ -2,57 +2,103 @@ import test from 'node:test' import assert from 'node:assert/strict' +import fs from 'node:fs/promises' +import path from 'node:path' +import os from 'node:os' import { activate } from '../../hypaware-core/plugins-workspace/s3/src/index.js' +const CACHE_ROOT = '/cache' + +/** @returns {Promise} */ +async function tmpStateDir() { + return fs.mkdtemp(path.join(os.tmpdir(), 'hyp-s3-export-')) +} + /** - * Drive the s3 plugin's `activate` against a captured kernel context and - * return the sink-registration descriptor it registers. Each test gets a - * fresh activation so the captured `create` is independent. `activate` - * is async (it resolves the plugin-level BlobStore before registering - * the sink), so this helper must await it before reading `registered`. + * A storage mock whose `readRowsSince` yields the rows registered for each + * logical table path, honouring the `seq > since` filter and emitting the + * monotonic high-water `after` token the real kernel produces.
Rows are + * `{ _seq, ...payload }`; `_seq` is stripped before the row is yielded. + * + * @param {Record>>} [rowsByTable] */ -async function activatePlugin() { - /** @type {any} */ - let registered - /** @type {any} */ - const ctx = { - provideCapability() {}, - sinks: { - register(descriptor) { - registered = descriptor - }, +function makeStorage(rowsByTable = {}) { + return { + cacheRoot: CACHE_ROOT, + /** @param {string} tp */ + tableExists: (tp) => Boolean(tp) && tp in rowsByTable, + /** @param {string} tablePath @param {{ since?: { seq: string } }} [opts] */ + readRowsSince(tablePath, opts) { + const rows = rowsByTable[tablePath] ?? [] + const sinceSeq = opts?.since ? BigInt(opts.since.seq) : 0n + return { + async *[Symbol.asyncIterator]() { + let high = sinceSeq + for (const r of rows) { + const seq = BigInt(r._seq) + if (seq <= sinceSeq) continue + if (seq > high) high = seq + const { _seq, ...row } = r + yield { row, after: { v: 1, seq: high.toString() } } + } + }, + } }, - log: { debug() {}, info() {}, warn() {}, error() {} }, - query: { getDataset: () => undefined, listDatasets: () => [] }, - storage: { - tableExists: () => false, - readRows: () => ({ async *[Symbol.asyncIterator]() {} }), + readRows() { + return { async *[Symbol.asyncIterator]() {} } }, } - await activate(ctx) - if (!registered) throw new Error('plugin did not register a sink') - return registered } -function makeEncoder() { +/** A fake encoder that drains `ctx.rows` (as every real encoder must). */ +function makeEncoder(format = 'jsonl') { return { - format: 'jsonl', - extension: 'jsonl', + format, + extension: format, supports: ['queryable'], - async encodePartition(partition) { + /** @param {any} partition @param {any} ctx */ + async encodePartition(partition, ctx) { + let rowCount = 0 + for await (const _row of ctx.rows ?? 
[]) rowCount++ const bytes = new TextEncoder().encode('{}\n') return { - filename: `${partition.dataset}.jsonl`, + filename: `${partition.dataset}.${format}`, bytes, bytesWritten: bytes.byteLength, - rowCount: 0, + rowCount, } }, } } -function makeSinkCtx({ clientFactory }) { +/** + * Activate the s3 plugin against a captured context backed by `storage`, + * returning the registered sink descriptor. + * + * @param {ReturnType} storage + * @param {{ query?: any }} [opts] + */ +async function activatePlugin(storage, opts = {}) { + /** @type {any} */ + let registered + /** @type {any} */ + const ctx = { + provideCapability() {}, + sinks: { register(/** @type {any} */ d) { registered = d } }, + log: { debug() {}, info() {}, warn() {}, error() {} }, + query: opts.query ?? { getDataset: () => undefined, listDatasets: () => [] }, + storage, + } + await activate(ctx) + if (!registered) throw new Error('plugin did not register a sink') + return registered +} + +/** + * @param {{ clientFactory: any, stateDir: string, encoder?: any }} args + */ +function makeSinkCtx({ clientFactory, stateDir, encoder }) { return { name: 'test', config: { @@ -60,21 +106,31 @@ function makeSinkCtx({ clientFactory }) { prefix: 'p', __clientFactory: clientFactory, }, - encoder: makeEncoder(), + encoder: encoder ?? makeEncoder(), log: { debug() {}, info() {}, warn() {}, error() {} }, - paths: { tempDir: '/tmp' }, + paths: { tempDir: '/tmp', stateDir }, } } -test('exportBatch terminal failure: retryPartitions excludes already-uploaded partitions', async () => { - const registration = await activatePlugin() +/** A partition whose logical path sits under the cache datasets root. 
*/ +function partitionFor(dataset) { + return { dataset, partition: {}, tablePath: `${CACHE_ROOT}/datasets/${dataset}/source=x` } +} + +test('exportBatch terminal failure: retryPartitions excludes already-uploaded partitions', async (t) => { + const stateDir = await tmpStateDir() + t.after(() => fs.rm(stateDir, { recursive: true, force: true })) + const storage = makeStorage({ + [`${CACHE_ROOT}/datasets/p1/source=x`]: [{ _seq: 1, id: 1 }], + [`${CACHE_ROOT}/datasets/p2/source=x`]: [{ _seq: 1, id: 1 }], + }) + const registration = await activatePlugin(storage) /** @type {Array<{ Key: string }>} */ const putCalls = [] const fakeClient = { - async putObject(input) { + async putObject(/** @type {any} */ input) { putCalls.push({ Key: input.Key }) if (putCalls.length === 1) return {} // p1 succeeds - // p2 hits a terminal config error (s3_access_denied) const err = Object.assign(new Error('Access Denied'), { name: 'AccessDenied', $metadata: { httpStatusCode: 403 }, @@ -85,42 +141,47 @@ test('exportBatch terminal failure: retryPartitions excludes already-uploaded pa } const clientFactory = async () => ({ client: fakeClient, credential_source_kind: 'injected' }) - const sink = await registration.create(makeSinkCtx({ clientFactory })) - - const p1 = { dataset: 'p1', partition: {}, tablePath: '' } - const p2 = { dataset: 'p2', partition: {}, tablePath: '' } - const result = await sink.exportBatch({ batchId: 'b1', partitions: [p1, p2] }, {}) + const sink = await registration.create(makeSinkCtx({ clientFactory, stateDir })) + const result = await sink.exportBatch( + { batchId: 'b1', partitions: [partitionFor('p1'), partitionFor('p2')] }, + {} + ) assert.equal(result.status, 'failed', 'terminal failure must produce failed status') assert.equal(result.partitionsExported, 1, 'p1 was uploaded successfully') - assert.ok(Array.isArray(result.retryPartitions), 'retryPartitions must be set on terminal failure so the driver does not outbox successful uploads') + 
assert.ok(Array.isArray(result.retryPartitions), 'retryPartitions must be set on terminal failure') assert.equal(result.retryPartitions.length, 1, 'only the failed partition should be retried') - assert.equal(result.retryPartitions[0].dataset, 'p2', 'the failed partition is p2, not the already-uploaded p1') + assert.equal(result.retryPartitions[0].dataset, 'p2', 'the failed partition is p2') assert.ok(typeof result.error === 'string' && result.error.length > 0, 'error description present') await sink.close() }) -test('exportBatch partial failure: retryPartitions has only the failed partition', async () => { - const registration = await activatePlugin() +test('exportBatch partial failure: retryPartitions has only the failed partition', async (t) => { + const stateDir = await tmpStateDir() + t.after(() => fs.rm(stateDir, { recursive: true, force: true })) + const storage = makeStorage({ + [`${CACHE_ROOT}/datasets/p1/source=x`]: [{ _seq: 1, id: 1 }], + [`${CACHE_ROOT}/datasets/p2/source=x`]: [{ _seq: 1, id: 1 }], + }) + const registration = await activatePlugin(storage) let call = 0 const fakeClient = { async putObject() { call += 1 if (call === 1) return {} // p1 succeeds // Non-terminal error. Driver expects to retry just p2. 
- const err = Object.assign(new Error('slow down'), { name: 'SlowDown' }) - throw err + throw Object.assign(new Error('slow down'), { name: 'SlowDown' }) }, destroy() {}, } const clientFactory = async () => ({ client: fakeClient, credential_source_kind: 'injected' }) - const sink = await registration.create(makeSinkCtx({ clientFactory })) - - const p1 = { dataset: 'p1', partition: {}, tablePath: '' } - const p2 = { dataset: 'p2', partition: {}, tablePath: '' } - const result = await sink.exportBatch({ batchId: 'b1', partitions: [p1, p2] }, {}) + const sink = await registration.create(makeSinkCtx({ clientFactory, stateDir })) + const result = await sink.exportBatch( + { batchId: 'b1', partitions: [partitionFor('p1'), partitionFor('p2')] }, + {} + ) assert.equal(result.status, 'partial') assert.equal(result.partitionsExported, 1) @@ -131,30 +192,23 @@ test('exportBatch partial failure: retryPartitions has only the failed partition await sink.close() }) -test('exportBatch forwards dataset cluster columns to the encoder', async () => { +test('exportBatch forwards dataset cluster columns to the encoder', async (t) => { // The s3 sink must derive cluster columns from the dataset's Iceberg // partition fields and pass them to the encoder (same as local-fs), so the // Parquet encoder keeps wide repeated columns dictionary-encoded. - /** @type {any} */ - let registered - /** @type {any} */ - const ctx = { - provideCapability() {}, - sinks: { register(/** @type {any} */ d) { registered = d } }, - log: { debug() {}, info() {}, warn() {}, error() {} }, - query: { - getDataset: (/** @type {string} */ name) => - name === 'ai_gateway_messages' - ? 
{ cachePartitioning: { iceberg: { fields: [{ column: 'conversation_id' }, { column: 'date' }] } } } - : undefined, - listDatasets: () => [], - }, - storage: { - tableExists: () => false, - readRows: () => ({ async *[Symbol.asyncIterator]() {} }), - }, + const stateDir = await tmpStateDir() + t.after(() => fs.rm(stateDir, { recursive: true, force: true })) + const storage = makeStorage({ + [`${CACHE_ROOT}/datasets/ai_gateway_messages/source=x`]: [{ _seq: 1, id: 1 }], + }) + const query = { + getDataset: (/** @type {string} */ name) => + name === 'ai_gateway_messages' + ? { cachePartitioning: { iceberg: { fields: [{ column: 'conversation_id' }, { column: 'date' }] } } } + : undefined, + listDatasets: () => [], } - await activate(ctx) + const registration = await activatePlugin(storage, { query }) /** @type {any} */ let seenCtx @@ -164,25 +218,22 @@ test('exportBatch forwards dataset cluster columns to the encoder', async () => supports: ['queryable'], async encodePartition(/** @type {any} */ _p, /** @type {any} */ encodeCtx) { seenCtx = encodeCtx + let rowCount = 0 + for await (const _row of encodeCtx.rows ?? 
[]) rowCount++ const bytes = new TextEncoder().encode('x') - return { filename: 'f.parquet', bytes, bytesWritten: 1, rowCount: 0 } + return { filename: 'f.parquet', bytes, bytesWritten: 1, rowCount } }, } const fakeClient = { async putObject() { return {} }, destroy() {} } - const sinkCtx = { - name: 'test', - config: { - bucket: 'b', - prefix: 'p', - __clientFactory: async () => ({ client: fakeClient, credential_source_kind: 'injected' }), - }, - encoder: spyEncoder, - log: { debug() {}, info() {}, warn() {}, error() {} }, - paths: { tempDir: '/tmp' }, - } - const sink = await registered.create(sinkCtx) + const sink = await registration.create( + makeSinkCtx({ + clientFactory: async () => ({ client: fakeClient, credential_source_kind: 'injected' }), + stateDir, + encoder: spyEncoder, + }) + ) await sink.exportBatch( - { batchId: 'b1', partitions: [{ dataset: 'ai_gateway_messages', partition: {}, tablePath: '' }] }, + { batchId: 'b1', partitions: [partitionFor('ai_gateway_messages')] }, {} ) assert.deepEqual(seenCtx?.clusterColumns, ['conversation_id', 'date'], @@ -190,18 +241,16 @@ test('exportBatch forwards dataset cluster columns to the encoder', async () => await sink.close() }) -test('exportBatch all-success: no retryPartitions field', async () => { - const registration = await activatePlugin() - const fakeClient = { - async putObject() { return {} }, - destroy() {}, - } +test('exportBatch all-success: no retryPartitions field', async (t) => { + const stateDir = await tmpStateDir() + t.after(() => fs.rm(stateDir, { recursive: true, force: true })) + const storage = makeStorage({ [`${CACHE_ROOT}/datasets/p1/source=x`]: [{ _seq: 1, id: 1 }] }) + const registration = await activatePlugin(storage) + const fakeClient = { async putObject() { return {} }, destroy() {} } const clientFactory = async () => ({ client: fakeClient, credential_source_kind: 'injected' }) - const sink = await registration.create(makeSinkCtx({ clientFactory })) - - const p1 = { dataset: 'p1', 
partition: {}, tablePath: '' } - const result = await sink.exportBatch({ batchId: 'b1', partitions: [p1] }, {}) + const sink = await registration.create(makeSinkCtx({ clientFactory, stateDir })) + const result = await sink.exportBatch({ batchId: 'b1', partitions: [partitionFor('p1')] }, {}) assert.equal(result.status, 'exported') assert.equal(result.partitionsExported, 1) @@ -209,3 +258,102 @@ test('exportBatch all-success: no retryPartitions field', async () => { await sink.close() }) + +test('exportBatch skips a partition with no new rows: no PUT, no blob', async (t) => { + // @ref LLP 0040 §5 acceptance 1 — empty new-row set writes ≈0 bytes. + const stateDir = await tmpStateDir() + t.after(() => fs.rm(stateDir, { recursive: true, force: true })) + // Registered table, but readRowsSince yields nothing. + const storage = makeStorage({ [`${CACHE_ROOT}/datasets/p1/source=x`]: [] }) + const registration = await activatePlugin(storage) + let puts = 0 + const fakeClient = { async putObject() { puts++; return {} }, destroy() {} } + const sink = await registration.create( + makeSinkCtx({ clientFactory: async () => ({ client: fakeClient, credential_source_kind: 'injected' }), stateDir }) + ) + const result = await sink.exportBatch({ batchId: 'b1', partitions: [partitionFor('p1')] }, {}) + + assert.equal(puts, 0, 'no object is PUT when there are no new rows') + assert.equal(result.status, 'exported') + assert.equal(result.partitionsExported, 0, 'nothing exported on an empty new-row set') + assert.equal(result.bytesWritten, 0) + assert.equal(result.retryPartitions, undefined) + await sink.close() +}) + +test('exportBatch embeds the [sinceSeq,lastSeq] range in the object key and advances the watermark', async (t) => { + // @ref LLP 0040 §4 — [sinceSeq,lastSeq] filename + watermark advance. 
+ const stateDir = await tmpStateDir() + t.after(() => fs.rm(stateDir, { recursive: true, force: true })) + const tablePath = `${CACHE_ROOT}/datasets/p1/source=x` + const storage = makeStorage({ tablePath: [] }) // placeholder, replaced below + // @ts-ignore — install two rows with seqs 5 and 7 under the real key. + storage.tableExists = (tp) => tp === tablePath + // @ts-ignore + storage.readRowsSince = (tp, opts) => { + const rows = tp === tablePath ? [{ _seq: 5, id: 'a' }, { _seq: 7, id: 'b' }] : [] + const sinceSeq = opts?.since ? BigInt(opts.since.seq) : 0n + return { + async *[Symbol.asyncIterator]() { + let high = sinceSeq + for (const r of rows) { + const seq = BigInt(r._seq) + if (seq <= sinceSeq) continue + if (seq > high) high = seq + const { _seq, ...row } = r + yield { row, after: { v: 1, seq: high.toString() } } + } + }, + } + } + const registration = await activatePlugin(storage) + /** @type {string[]} */ + const keys = [] + const fakeClient = { async putObject(/** @type {any} */ i) { keys.push(i.Key); return {} }, destroy() {} } + const sink = await registration.create( + makeSinkCtx({ clientFactory: async () => ({ client: fakeClient, credential_source_kind: 'injected' }), stateDir }) + ) + + const first = await sink.exportBatch({ batchId: 'b1', partitions: [partitionFor('p1')] }, {}) + assert.equal(first.partitionsExported, 1) + assert.equal(keys.length, 1) + assert.ok(/\/p1\.0-7\.jsonl$/.test(keys[0]), `object key embeds [0,7]: ${keys[0]}`) + + // Watermark persisted at seq 7. + const wmFile = path.join(stateDir, 'sink-instances', 'test', 'watermarks', 'p1', 'source=x.json') + const wm = JSON.parse(await fs.readFile(wmFile, 'utf8')) + assert.equal(wm.continuation.seq, '7', 'watermark advanced to the max exported seq') + assert.equal(wm.exportedRowCount, 2) + + // Second tick with no new rows ⇒ skip, no extra PUT. 
+ const second = await sink.exportBatch({ batchId: 'b2', partitions: [partitionFor('p1')] }, {}) + assert.equal(second.partitionsExported, 0, 'no new rows ⇒ nothing exported on the second tick') + assert.equal(keys.length, 1, 'no second PUT when the watermark already covers every row') + + await sink.close() +}) + +test('exportBatch re-PUTs the same object key when the watermark is lost (idempotent crash retry)', async (t) => { + // @ref LLP 0040 §5 acceptance 4 — a crash before the watermark write re-PUTs + // the same key (same since ⇒ same rows ⇒ same [sinceSeq,lastSeq] filename). + const stateDir = await tmpStateDir() + t.after(() => fs.rm(stateDir, { recursive: true, force: true })) + const storage = makeStorage({ [`${CACHE_ROOT}/datasets/p1/source=x`]: [{ _seq: 3, id: 'a' }] }) + const registration = await activatePlugin(storage) + /** @type {string[]} */ + const keys = [] + const fakeClient = { async putObject(/** @type {any} */ i) { keys.push(i.Key); return {} }, destroy() {} } + const sink = await registration.create( + makeSinkCtx({ clientFactory: async () => ({ client: fakeClient, credential_source_kind: 'injected' }), stateDir }) + ) + + await sink.exportBatch({ batchId: 'b1', partitions: [partitionFor('p1')] }, {}) + assert.equal(keys.length, 1) + // Simulate the watermark write being lost (crash between PUT and advance). + await fs.rm(path.join(stateDir, 'sink-instances', 'test', 'watermarks', 'p1', 'source=x.json'), { force: true }) + await sink.exportBatch({ batchId: 'b2', partitions: [partitionFor('p1')] }, {}) + + assert.equal(keys.length, 2, 'the row is re-PUT after a lost watermark') + assert.equal(keys[0], keys[1], 'the re-PUT targets the same object key (idempotent overwrite)') + await sink.close() +})