diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..3c7ef75 --- /dev/null +++ b/.prettierignore @@ -0,0 +1,6 @@ +dist +node_modules +tests/data + +# Vendored third-party source — keep byte-identical to upstream +src/vendor diff --git a/package-lock.json b/package-lock.json index ab00b56..ca2ca8f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,8 +10,7 @@ "license": "MIT", "dependencies": { "@msgpack/msgpack": "^3.1.3", - "flatbuffers": "^25.9.23", - "fzstd": "^0.1.1" + "flatbuffers": "^25.9.23" }, "devDependencies": { "@types/node": "^20.0.0", @@ -1339,12 +1338,6 @@ "node": "^8.16.0 || ^10.6.0 || >=11.0.0" } }, - "node_modules/fzstd": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/fzstd/-/fzstd-0.1.1.tgz", - "integrity": "sha512-dkuVSOKKwh3eas5VkJy1AW1vFpet8TA/fGmVA5krThl8YcOVE/8ZIoEA1+U1vEn5ckxxhLirSdY837azmbaNHA==", - "license": "MIT" - }, "node_modules/get-func-name": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.2.tgz", diff --git a/package.json b/package.json index e56e7ca..2121d9e 100644 --- a/package.json +++ b/package.json @@ -45,8 +45,7 @@ "license": "MIT", "dependencies": { "@msgpack/msgpack": "^3.1.3", - "flatbuffers": "^25.9.23", - "fzstd": "^0.1.1" + "flatbuffers": "^25.9.23" }, "devDependencies": { "@types/node": "^20.0.0", diff --git a/src/format/flatbuffers/generated/snapshot-info.ts b/src/format/flatbuffers/generated/snapshot-info.ts index 4262f56..a4dc347 100644 --- a/src/format/flatbuffers/generated/snapshot-info.ts +++ b/src/format/flatbuffers/generated/snapshot-info.ts @@ -66,4 +66,19 @@ export class SnapshotInfo { const offset = this.bb!.__offset(this.bb_pos, 12); return offset ? this.bb!.__vector_len(this.bb_pos + offset) : 0; } + + prunedAncestorTxLogs(index: number, obj?: ObjectId12): ObjectId12 | null { + const offset = this.bb!.__offset(this.bb_pos, 14); + return offset + ? (obj || new ObjectId12()).__init( + this.bb!.__vector(this.bb_pos + offset) + index * 12, + this.bb!, + ) + : null; + } + + prunedAncestorTxLogsLength(): number { + const offset = this.bb!.__offset(this.bb_pos, 14); + return offset ? this.bb!.__vector_len(this.bb_pos + offset) : 0; + } } diff --git a/src/format/flatbuffers/manifest-parser.ts b/src/format/flatbuffers/manifest-parser.ts index f24c919..11582e5 100644 --- a/src/format/flatbuffers/manifest-parser.ts +++ b/src/format/flatbuffers/manifest-parser.ts @@ -5,6 +5,7 @@ */ import { ByteBuffer } from "flatbuffers"; +import { decompress } from "../../vendor/fzstd/index.js"; import { Manifest as FbsManifest } from "./generated/manifest.js"; import { ArrayManifest as FbsArrayManifest } from "./generated/array-manifest.js"; import { ChunkRef as FbsChunkRef } from "./generated/chunk-ref.js"; @@ -18,6 +19,99 @@ import { type ObjectId8, } from "./types.js"; +/** compression_algorithm value for zstd dictionary-compressed locations. */ +const COMPRESSION_ALG_ZSTD_DICT = 1; + +/** Upper bound on a decompressed location, matching the Rust + * `MAX_DECOMPRESSED_LOCATION_SIZE`. */ +const MAX_DECOMPRESSED_LOCATION_SIZE = 1024; + +/** + * Bytes that a zstd decoder allocates up front for `frame`, read from the frame + * header: the advertised frame content size, falling back to the window size. + * Mirrors the allocation in the vendored fzstd `rzfh`. Requires `frame` to be a + * single complete frame covering the whole input; returns `Infinity` for a + * missing/short/unrecognized header, a malformed block, or any trailing bytes + * (a concatenated second frame), so callers reject rather than trust it. + * + * Used to bound allocation *before* decompressing an untrusted + * `compressed_location`: the size check on the decompressed output alone is too + * late, since the decoder would already have allocated per the advertised size, + * and it decodes every concatenated frame. + */ +function zstdFrameAllocSize(frame: Uint8Array): number { + // Magic number 0xFD2FB528 (little-endian), then the frame header descriptor. + if ( + frame.length < 5 || + frame[0] !== 0x28 || + frame[1] !== 0xb5 || + frame[2] !== 0x2f || + frame[3] !== 0xfd + ) { + return Infinity; + } + const flg = frame[4]; + const singleSegment = (flg >> 5) & 1; + const contentChecksum = (flg >> 2) & 1; + const dictIdFlag = flg & 3; + const contentSizeFlag = flg >> 6; + + let pos = 5; + let windowSize = 0; + if (!singleSegment) { + if (pos >= frame.length) return Infinity; + const wd = frame[pos++]; + // Arithmetic (not bitwise `1 <<`/`>> 3`): the exponent can reach 41, which + // would overflow 32-bit JS shifts and let a crafted window descriptor + // compute a bogus small size that slips past the bound below. + const base = 2 ** (10 + (wd >> 3)); + windowSize = base + (base / 8) * (wd & 7); + } + pos += dictIdFlag === 3 ? 4 : dictIdFlag; // skip dictionary id + + // Frame content size: 0/1/2/4/8 bytes per the flags (matching fzstd's rzfh). + const fcsBytes = contentSizeFlag ? 1 << contentSizeFlag : singleSegment; + let contentSize = 0; + if (fcsBytes) { + if (pos + fcsBytes > frame.length) return Infinity; + for (let i = 0; i < fcsBytes; i++) { + contentSize += frame[pos + i] * 2 ** (8 * i); + } + if (contentSizeFlag === 1) contentSize += 256; + if (singleSegment) windowSize = contentSize; + } + pos += fcsBytes; + + // Walk the data blocks to the frame end. `decompress()` decodes *every* + // concatenated frame and allocates per frame, so a first-frame-only size + // check is bypassable by appending a frame that advertises a huge size. + // Require exactly one complete frame covering the whole input; reject + // trailing bytes (a second frame) or any malformed block. + for (;;) { + if (pos + 3 > frame.length) return Infinity; + const blockHeader = + frame[pos] | (frame[pos + 1] << 8) | (frame[pos + 2] << 16); + pos += 3; + const lastBlock = blockHeader & 1; + const blockType = (blockHeader >> 1) & 3; + if (blockType === 3) return Infinity; // reserved block type + pos += blockType === 1 ? 1 : blockHeader >> 3; // RLE block content is 1 byte + if (pos > frame.length) return Infinity; + if (lastBlock) break; + } + if (contentChecksum) pos += 4; + if (pos !== frame.length) return Infinity; // trailing/concatenated frame + + return contentSize || windowSize; +} + +/** + * Decode a ChunkRef's `compressed_location` byte vector into a location string. + * Returns `null` when this manifest has no location dictionary (locations are + * stored uncompressed). + */ +type LocationDecoder = (compressed: Uint8Array) => string; + /** Parse a Manifest from FlatBuffer data */ export function parseManifest(data: Uint8Array): Manifest { const bb = new ByteBuffer(data); @@ -30,20 +124,73 @@ export function parseManifest(data: Uint8Array): Manifest { idObj.bb!.bytes().slice(idObj.bb_pos, idObj.bb_pos + 12), ); + // Build the location decoder once per manifest from its zstd dictionary. + const decodeLocation = makeLocationDecoder(fbsManifest); + // Parse arrays const arraysLength = fbsManifest.arraysLength(); const arrays: ArrayManifest[] = []; for (let i = 0; i < arraysLength; i++) { const fbsArray = fbsManifest.arrays(i); if (fbsArray) { - arrays.push(parseArrayManifest(fbsArray)); + arrays.push(parseArrayManifest(fbsArray, decodeLocation)); } } return { id, arrays }; } -function parseArrayManifest(fbsArray: FbsArrayManifest): ArrayManifest { +/** + * Build the function that turns a ChunkRef's `compressed_location` into a + * location string, using the manifest's zstd dictionary. + * + * Mirrors the Rust `Manifest::decompressor`: a dictionary is only used when + * `compression_algorithm == ZSTD_DICT` and `location_dictionary` is present. + * When a `compressed_location` is encountered without a dictionary, decoding + * throws (matching the Rust `MissingLocationCompressionDictionary` error). + * + * Note: `manifest.fbs` prose says `compression_algorithm == 0` means + * `compressed_location` holds raw (uncompressed) bytes, but that path is not + * implemented in the Rust reference — its writer only emits `compressed_location` + * for ZSTD_DICT (otherwise it writes the plain `location` field) and its reader + * errors here. We follow the reference behavior, not the (inconsistent) schema + * comment. See earth-mover/icechunk manifest.fbs vs manifest.rs. + */ +function makeLocationDecoder(fbsManifest: FbsManifest): LocationDecoder { + const dictionary = + fbsManifest.compressionAlgorithm() === COMPRESSION_ALG_ZSTD_DICT + ? fbsManifest.locationDictionaryArray() + : null; + // `fatal` makes invalid UTF-8 throw, matching the Rust `String::from_utf8`. + const textDecoder = new TextDecoder("utf-8", { fatal: true }); + return (compressed: Uint8Array): string => { + if (!dictionary) { + throw new Error( + "ChunkRef has a compressed_location but the manifest has no location dictionary", + ); + } + // Reject before decompressing: a malformed frame can advertise a huge size + // and make the decoder allocate far beyond the bound (or OOM) before the + // post-decompress length check below would run. + if (zstdFrameAllocSize(compressed) > MAX_DECOMPRESSED_LOCATION_SIZE) { + throw new Error( + `Compressed location declares a size over ${MAX_DECOMPRESSED_LOCATION_SIZE} bytes`, + ); + } + const decompressed = decompress(compressed, undefined, dictionary); + if (decompressed.length > MAX_DECOMPRESSED_LOCATION_SIZE) { + throw new Error( + `Decompressed location exceeds ${MAX_DECOMPRESSED_LOCATION_SIZE} bytes`, + ); + } + return textDecoder.decode(decompressed); + }; +} + +function parseArrayManifest( + fbsArray: FbsArrayManifest, + decodeLocation: LocationDecoder, +): ArrayManifest { // Parse node_id (required) const nodeIdObj = fbsArray.nodeId(); if (!nodeIdObj) @@ -58,14 +205,17 @@ function parseArrayManifest(fbsArray: FbsArrayManifest): ArrayManifest { for (let i = 0; i < refsLength; i++) { const fbsRef = fbsArray.refs(i); if (fbsRef) { - refs.push(parseChunkRef(fbsRef)); + refs.push(parseChunkRef(fbsRef, decodeLocation)); } } return { nodeId, refs }; } -function parseChunkRef(fbsRef: FbsChunkRef): ChunkRef { +function parseChunkRef( + fbsRef: FbsChunkRef, + decodeLocation: LocationDecoder, +): ChunkRef { // Parse index (required, vector of uint32) const indexLength = fbsRef.indexLength(); const index: number[] = []; @@ -89,8 +239,18 @@ function parseChunkRef(fbsRef: FbsChunkRef): ChunkRef { ) : null; - // Parse location (optional string) - const location = fbsRef.location(); + // Resolve the virtual location. Per the Rust `ref_to_payload` priority + // (chunk_id -> compressed_location -> location), a dictionary-compressed + // location takes precedence over the plain `location` string and is decoded + // here at parse time. `chunk_id` refs are native, not virtual, so they never + // carry a location — skip decoding for them. + let location = fbsRef.location(); + if (chunkId === null) { + const compressed = fbsRef.compressedLocationArray(); + if (compressed) { + location = decodeLocation(compressed); + } + } // Parse checksum fields const checksumEtag = fbsRef.checksumEtag(); @@ -193,12 +353,14 @@ function binarySearchChunkRef( return null; } -/** Extract the payload type from a ChunkRef */ +/** + * Extract the payload type from a ChunkRef. + * + * Priority matches the Rust `ref_to_payload`: chunk_id (native) -> + * location (virtual; already decoded from compressed_location at parse time) + * -> inline. + */ export function getChunkPayload(ref: ChunkRef): ChunkPayload { - if (ref.inline !== null) { - return { type: "inline", data: ref.inline }; - } - if (ref.chunkId !== null) { return { type: "native", @@ -219,5 +381,9 @@ export function getChunkPayload(ref: ChunkRef): ChunkPayload { }; } - throw new Error("Invalid ChunkRef: no inline, chunkId, or location"); + if (ref.inline !== null) { + return { type: "inline", data: ref.inline }; + } + + throw new Error("Invalid ChunkRef: no chunkId, location, or inline"); } diff --git a/src/format/flatbuffers/repo-parser.ts b/src/format/flatbuffers/repo-parser.ts index 23af086..ca30480 100644 --- a/src/format/flatbuffers/repo-parser.ts +++ b/src/format/flatbuffers/repo-parser.ts @@ -7,7 +7,7 @@ import { ByteBuffer } from "flatbuffers"; import * as flexbuffers from "flatbuffers/js/flexbuffers.js"; import { Repo as FbsRepo } from "./generated/repo.js"; -import { decompress } from "fzstd"; +import { decompress } from "../../vendor/fzstd/index.js"; import { parseHeader, getDataAfterHeader, diff --git a/src/reader/session.ts b/src/reader/session.ts index 68d42b1..b5f3789 100644 --- a/src/reader/session.ts +++ b/src/reader/session.ts @@ -7,7 +7,7 @@ import type { Storage, RequestOptions, } from "../storage/storage.js"; -import { decompress } from "fzstd"; +import { decompress } from "../vendor/fzstd/index.js"; import { LRUCache } from "../cache/lru.js"; import { singleFlight, type SingleFlight } from "../cache/single-flight.js"; import { diff --git a/src/vendor/fzstd/LICENSE b/src/vendor/fzstd/LICENSE new file mode 100644 index 0000000..d844c26 --- /dev/null +++ b/src/vendor/fzstd/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Arjun Barrett + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/src/vendor/fzstd/README.md b/src/vendor/fzstd/README.md new file mode 100644 index 0000000..c1994f5 --- /dev/null +++ b/src/vendor/fzstd/README.md @@ -0,0 +1,80 @@ +# Vendored `fzstd` + +This directory contains a vendored copy of [`fzstd`](https://github.com/101arrowz/fzstd) +(MIT, © 2020 Arjun Barrett) used for zstd decompression — both the outer +metadata-file decompression and the dictionary-compressed virtual chunk +locations in manifests. + +## Why it's vendored (not an npm dependency) + +- **Released `fzstd` (0.1.1) has no dictionary support.** Its `decompress()` + takes no dictionary argument, and icechunk's `compressed_location` fields are + zstd frames compressed against a per-manifest dictionary — undecodable + without it. Dictionary support exists only in the unmerged + [fzstd#18](https://github.com/101arrowz/fzstd/pull/18). +- **The PR branch can't be cleanly `npm install`ed** — its `package.json` + `main`/`module` point at build outputs that aren't committed, and there's no + `prepare` hook, so a git/`github:` install resolves to missing files. +- **`zstdify` (the other dictionary-capable JS zstd lib) silently corrupts + data** — it returned wrong bytes (same length, no error) for ~32% of real + icechunk metadata fixtures, at both 1.4.0 and `main` (the PR#3 "fix" does not + address it). `fzstd` is byte-exact vs the reference `zstd` CLI on all of them. + +So we vendor the dictionary-capable source and decompress with it everywhere. + +## Upstream source + +| | | +|---|---| +| Repo | `101arrowz/fzstd` | +| Branch | `handlerug:push-qxkytkvukoqs` (= the head of fzstd#18) | +| Commit | `819999b` (dictionary support + the aliasing fix, see below) | +| File | `src/index.ts` → `index.ts` here (with a header prepended) | +| License | `LICENSE` (copied verbatim) | + +`index.ts` is `@ts-nocheck`'d and excluded from Prettier (`.prettierignore`) to +stay byte-identical to upstream. + +## Local modifications + +**None.** This copy is byte-identical to the branch. + +A dictionary/output-buffer aliasing bug in the original PR (silent corruption +when the dictionary-content length equals the decompressed size) was found here +and fixed upstream via [handlerug/fzstd#1](https://github.com/handlerug/fzstd/pull/1), +now merged into the branch — so there is no longer a local patch to re-apply. +Regression coverage lives in +`tests/format/flatbuffers/manifest-dictionary.test.ts` ("decodes a frame whose +output aliases the dictionary buffer"). + +> Note: the size-bound (`MAX_DECOMPRESSED_LOCATION_SIZE`) and UTF-8 strictness +> guards are **not** here — they live in the consumer +> (`src/format/flatbuffers/manifest-parser.ts`), since bounding untrusted input +> is the caller's job, not the decoder's. + +## Re-vendoring (updating the copy) + +```sh +# Fetch the upstream source for the pinned branch/commit: +gh api "repos/handlerug/fzstd/contents/src/index.ts?ref=push-qxkytkvukoqs" \ + --jq '.content' | base64 -d > /tmp/fzstd-index.ts +``` + +Then: +1. Re-prepend the provenance header (top of the current `index.ts`), updating + the pinned commit. +2. Re-run the suite: `npm test` (the aliasing + dictionary tests must pass). + +## When this can be deleted + +Once fzstd#18 **and** the aliasing fix land upstream and a release is published +to npm: delete this directory, add `fzstd@` to `dependencies`, and +repoint the four imports (`src/reader/session.ts`, +`src/format/flatbuffers/{repo,manifest}-parser.ts`, and the two parser tests). + +## Related + +- Consumer: `src/format/flatbuffers/manifest-parser.ts` (dictionary decode, + decode priority matching the Rust `ref_to_payload`, size/UTF-8 guards). +- Test fixtures: `tests/fixtures/dictionary/` (generated with the `zstd` CLI; + generation commands documented in the test file headers). diff --git a/src/vendor/fzstd/index.ts b/src/vendor/fzstd/index.ts new file mode 100644 index 0000000..2784600 --- /dev/null +++ b/src/vendor/fzstd/index.ts @@ -0,0 +1,836 @@ +/** + * Vendored from fzstd — https://github.com/101arrowz/fzstd + * MIT License, Copyright (c) 2020 Arjun Barrett. See ./LICENSE. + * + * Source: the unmerged dictionary-decompression PR #18 + * (https://github.com/101arrowz/fzstd/pull/18), branch + * `handlerug:push-qxkytkvukoqs` @ 819999b, which adds the optional `dic` + * (dictionary) argument to `decompress()` and the `Decompress` stream. + * + * Why vendored: icechunk's dictionary-compressed virtual chunk locations need + * zstd dictionary decompression, which released fzstd 0.1.1 lacks. PR #18 is + * still unmerged/unreleased and the fork can't be cleanly npm-installed (no + * committed dist, no prepare hook). The alternative decoder (zstdify) silently + * corrupts ~32% of real icechunk metadata, so fzstd is the only correct option. + * See ../../format/flatbuffers/manifest-parser.ts. + * + * This copy is byte-identical to the branch — no local edits. The branch now + * includes the dictionary/output-buffer aliasing fix (handlerug/fzstd#1, which + * we contributed), so there is no longer a local patch to re-apply. To update, + * re-vendor src/index.ts + LICENSE from the branch. + * + * Verified: byte-exact vs released fzstd on all 244 icechunk fixtures, plus + * correct dictionary decode vs the reference `zstd` CLI. + */ +/* eslint-disable */ +// @ts-nocheck + +// Some numerical data is initialized as -1 even when it doesn't need initialization to help the JIT infer types + +// aliases for shorter compressed code (most minifers don't do this) +const ab = ArrayBuffer, u8 = Uint8Array, u16 = Uint16Array, i16 = Int16Array, u32 = Uint32Array, i32 = Int32Array; + +// Huffman decoding table +interface HDT { + // initial bits + b: number; + // symbols + s: Uint8Array; + // num bits + n: Uint8Array; +} + +// FSE decoding table +interface FSEDT extends HDT { + // next state + t: Uint16Array; +} + +// decompress Zstandard state +interface DZstdState { + // byte + b: number; + // out byte + y: number; + // dictionary ID + d: number; + // window + w: Uint8Array; + // max block size + m: number; + // uncompressed size + u: number; + // has checksum + c: number; + // offsets + o: Int32Array; + // window head + e: number; + // last huffman decoding table + h?: HDT; + // last FSE decoding tables + t?: [FSEDT, FSEDT, FSEDT]; + // last block + l: number; +} + +const slc = (v: Uint8Array, s: number, e?: number) => { + if (u8.prototype.slice) return u8.prototype.slice.call(v, s, e); + if (s == null || s < 0) s = 0; + if (e == null || e > v.length) e = v.length; + const n = new u8(e - s); + n.set(v.subarray(s, e)); + return n; +}; + +const fill = (v: Uint8Array, n: number, s?: number, e?: number) => { + if (u8.prototype.fill) return u8.prototype.fill.call(v, n, s, e); + if (s == null || s < 0) s = 0; + if (e == null || e > v.length) e = v.length; + for (; s < e; ++s) v[s] = n; + return v; +}; + +const cpw = (v: Uint8Array, t: number, s?: number, e?: number) => { + if (u8.prototype.copyWithin) return u8.prototype.copyWithin.call(v, t, s, e); + if (s == null || s < 0) s = 0; + if (e == null || e > v.length) e = v.length; + while (s < e) { + v[t++] = v[s++]; + } +}; + +/** + * Codes for errors generated within this library + */ +export const ZstdErrorCode = { + InvalidData: 0, + WindowSizeTooLarge: 1, + InvalidBlockType: 2, + FSEAccuracyTooHigh: 3, + DistanceTooFarBack: 4, + UnexpectedEOF: 5, + WrongDictionary: 6 +} as const; + +type ZEC = (typeof ZstdErrorCode)[keyof typeof ZstdErrorCode]; + +// error codes +const ec: Record = [ + 'invalid zstd data', + 'window size too large (>2046MB)', + 'invalid block type', + 'FSE accuracy too high', + 'match distance too far back', + 'unexpected EOF', + 'wrong dictionary' +]; + +/** + * An error generated within this library + */ +export interface ZstdError extends Error { + /** + * The code associated with this error + */ + code: ZEC; +} + +const err = (ind: ZEC, msg?: string | 0, nt?: 1) => { + const e: Partial = new Error(msg || ec[ind]); + e.code = ind; + if (Error.captureStackTrace) Error.captureStackTrace(e, err); + if (!nt) throw e; + return e as ZstdError; +}; + +const rb = (d: Uint8Array, b: number, n: number) => { + let i = 0, o = 0; + for (; i < n; ++i) o |= d[b++] << (i << 3); + return o; +}; + +const b4 = (d: Uint8Array, b: number) => (d[b] | (d[b + 1] << 8) | (d[b + 2] << 16) | (d[b + 3] << 24)) >>> 0; + +// apply a dictionary to decompressor state +const rdic = (dic: Uint8Array, st: DZstdState) => { + if (b4(dic, 0) == 0xec30a437) { + if (st.d && st.d !== b4(dic, 4)) err(6); + let bt = 8; + [bt, st.h] = rhu(dic, bt); + let mlt: FSEDT, olt: FSEDT, llt: FSEDT; + [bt, olt] = rfse(dic, bt, 8); + [bt, mlt] = rfse(dic, bt, 9); + [bt, llt] = rfse(dic, bt, 9); + st.t = [mlt, olt, llt]; + st.o = new i32([b4(dic, bt), b4(dic, bt + 4), b4(dic, bt + 8)]); + dic = dic.subarray(bt + 12); + } + st.w = dic.slice(); + st.e = dic.length; +}; + +// read Zstandard frame header +const rzfh = (dat: Uint8Array, w?: Uint8Array | 1): number | DZstdState => { + const n3 = dat[0] | (dat[1] << 8) | (dat[2] << 16); + if (n3 == 0x2FB528 && dat[3] == 253) { + // Zstandard + const flg = dat[4]; + // single segment checksum dict flag frame content flag + const ss = (flg >> 5) & 1, cc = (flg >> 2) & 1, df = flg & 3, fcf = flg >> 6; + if (flg & 8) err(0); + // byte + let bt = 6 - ss; + // dict bytes + const db = df == 3 ? 4 : df; + // dictionary id + const di = rb(dat, bt, db); + bt += db; + // frame size bytes + const fsb = fcf ? (1 << fcf) : ss; + // frame source size + const fss = rb(dat, bt, fsb) + ((fcf == 1) && 256); + // window size + let ws = fss; + if (!ss) { + // window descriptor + const wb = 1 << (10 + (dat[5] >> 3)); + ws = wb + (wb >> 3) * (dat[5] & 7); + } + if (ws > 2145386496) err(1); + const buf = new u8((w == 1 ? (fss || ws) : w ? 0 : ws) + 12); + buf[0] = 1, buf[4] = 4, buf[8] = 8; + return { + b: bt + fsb, + y: 0, + l: 0, + d: di, + w: (w && w != 1) ? w : buf.subarray(12), + e: ws, + o: new i32(buf.buffer, 0, 3), + u: fss, + c: cc, + m: Math.min(131072, ws) + }; + } else if (((n3 >> 4) | (dat[3] << 20)) == 0x184D2A5) { + // skippable + return b4(dat, 4) + 8; + } + err(0); +}; + +// most significant bit for nonzero +const msb = (val: number) => { + let bits = 0; + for (; (1 << bits) <= val; ++bits); + return bits - 1; +}; + +// read finite state entropy +const rfse = (dat: Uint8Array, bt: number, mal: number): [number, FSEDT] => { + // table pos + let tpos = (bt << 3) + 4; + // accuracy log + const al = (dat[bt] & 15) + 5; + if (al > mal) err(3); + // size + const sz = 1 << al; + // probabilities symbols repeat index high threshold + let probs = sz, sym = -1, re = -1, i = -1, ht = sz; + // optimization: single allocation is much faster + const buf = new ab(512 + (sz << 2)); + const freq = new i16(buf, 0, 256); + // same view as freq + const dstate = new u16(buf, 0, 256); + const nstate = new u16(buf, 512, sz); + const bb1 = 512 + (sz << 1); + const syms = new u8(buf, bb1, sz); + const nbits = new u8(buf, bb1 + sz); + while (sym < 255 && probs > 0) { + const bits = msb(probs + 1); + const cbt = tpos >> 3; + // mask + const msk = (1 << (bits + 1)) - 1; + let val = ((dat[cbt] | (dat[cbt + 1] << 8) | (dat[cbt + 2] << 16)) >> (tpos & 7)) & msk; + // mask (1 fewer bit) + const msk1fb = (1 << bits) - 1; + // max small value + const msv = msk - probs - 1; + // small value + const sval = val & msk1fb; + if (sval < msv) tpos += bits, val = sval; + else { + tpos += bits + 1; + if (val > msk1fb) val -= msv; + } + freq[++sym] = --val; + if (val == -1) { + probs += val; + syms[--ht] = sym; + } else probs -= val; + if (!val) { + do { + // repeat byte + const rbt = tpos >> 3; + re = ((dat[rbt] | (dat[rbt + 1] << 8)) >> (tpos & 7)) & 3; + tpos += 2; + sym += re; + } while (re == 3); + } + } + if (sym > 255 || probs) err(0); + let sympos = 0; + // sym step (coprime with sz - formula from zstd source) + const sstep = (sz >> 1) + (sz >> 3) + 3; + // sym mask + const smask = sz - 1; + for (let s = 0; s <= sym; ++s) { + const sf = freq[s]; + if (sf < 1) { + dstate[s] = -sf; + continue; + } + // This is split into two loops in zstd to avoid branching, but as JS is higher-level that is unnecessary + for (i = 0; i < sf; ++i) { + syms[sympos] = s; + do { + sympos = (sympos + sstep) & smask; + } while (sympos >= ht); + } + } + // After spreading symbols, should be zero again + if (sympos) err(0); + for (i = 0; i < sz; ++i) { + // next state + const ns = dstate[syms[i]]++; + // num bits + const nb = nbits[i] = al - msb(ns); + nstate[i] = (ns << nb) - sz; + } + return [(tpos + 7) >> 3, { + b: al, + s: syms, + n: nbits, + t: nstate + }]; +}; + +// read huffman +const rhu = (dat: Uint8Array, bt: number): [number, HDT] => { + // index weight count + let i = 0, wc = -1; + // buffer header byte + const buf = new u8(292), hb = dat[bt]; + // huffman weights + const hw = buf.subarray(0, 256); + // rank count + const rc = buf.subarray(256, 268); + // rank index + const ri = new u16(buf.buffer, 268); + // NOTE: at this point bt is 1 less than expected + if (hb < 128) { + // end byte, fse decode table + const [ebt, fdt] = rfse(dat, bt + 1, 6); + bt += hb; + const epos = ebt << 3; + // last byte + const lb = dat[bt]; + if (!lb) err(0); + // state1 state2 state1 bits state2 bits + let st1 = 0, st2 = 0, btr1 = fdt.b, btr2 = btr1; + // fse pos + // pre-increment to account for original deficit of 1 + let fpos = (++bt << 3) - 8 + msb(lb); + for (;;) { + fpos -= btr1; + if (fpos < epos) break; + let cbt = fpos >> 3; + st1 += ((dat[cbt] | (dat[cbt + 1] << 8)) >> (fpos & 7)) & ((1 << btr1) - 1); + hw[++wc] = fdt.s[st1]; + fpos -= btr2; + if (fpos < epos) break; + cbt = fpos >> 3; + st2 += ((dat[cbt] | (dat[cbt + 1] << 8)) >> (fpos & 7)) & ((1 << btr2) - 1); + hw[++wc] = fdt.s[st2]; + btr1 = fdt.n[st1]; + st1 = fdt.t[st1]; + btr2 = fdt.n[st2]; + st2 = fdt.t[st2]; + } + if (++wc > 255) err(0); + } else { + wc = hb - 127; + for (; i < wc; i += 2) { + const byte = dat[++bt]; + hw[i] = byte >> 4; + hw[i + 1] = byte & 15; + } + ++bt; + } + // weight exponential sum + let wes = 0; + for (i = 0; i < wc; ++i) { + const wt = hw[i]; + // bits must be at most 11, same as weight + if (wt > 11) err(0); + wes += wt && (1 << (wt - 1)); + } + // max bits + const mb = msb(wes) + 1; + // table size + const ts = 1 << mb; + // remaining sum + const rem = ts - wes; + // must be power of 2 + if (rem & (rem - 1)) err(0); + hw[wc++] = msb(rem) + 1; + for (i = 0; i < wc; ++i) { + const wt = hw[i]; + ++rc[hw[i] = wt && (mb + 1 - wt)]; + } + // huf buf + const hbuf = new u8(ts << 1); + // symbols num bits + const syms = hbuf.subarray(0, ts), nb = hbuf.subarray(ts); + ri[mb] = 0; + for (i = mb; i > 0; --i) { + const pv = ri[i]; + fill(nb, i, pv, ri[i - 1] = pv + rc[i] * (1 << (mb - i))); + } + if (ri[0] != ts) err(0); + for (i = 0; i < wc; ++i) { + const bits = hw[i]; + if (bits) { + const code = ri[bits]; + fill(syms, i, code, ri[bits] = code + (1 << (mb - bits))); + } + } + return [bt, { + n: nb, + b: mb, + s: syms + }]; +}; + +// Tables generated using this: +// https://gist.github.com/101arrowz/a979452d4355992cbf8f257cbffc9edd + +// default literal length table +const dllt = /*#__PURE__*/ rfse(/*#__PURE__*/ new u8([ + 81, 16, 99, 140, 49, 198, 24, 99, 12, 33, 196, 24, 99, 102, 102, 134, 70, 146, 4 +]), 0, 6)[1]; + +// default match length table +const dmlt = /*#__PURE__*/ rfse(/*#__PURE__*/ new u8([ + 33, 20, 196, 24, 99, 140, 33, 132, 16, 66, 8, 33, 132, 16, 66, 8, 33, 68, 68, 68, 68, 68, 68, 68, 68, 36, 9 +]), 0, 6)[1]; + +// default offset code table +const doct = /*#__PURE__ */ rfse(/*#__PURE__*/ new u8([ + 32, 132, 16, 66, 102, 70, 68, 68, 68, 68, 36, 73, 2 +]), 0, 5)[1]; + +// bits to baseline +const b2bl = (b: Uint8Array, s: number) => { + const len = b.length, bl = new i32(len); + for (let i = 0; i < len; ++i) { + bl[i] = s; + s += 1 << b[i]; + } + return bl; +}; + +// literal length bits +const llb = /*#__PURE__ */ new u8((/*#__PURE__ */ new i32([ + 0, 0, 0, 0, 16843009, 50528770, 134678020, 202050057, 269422093 +])).buffer, 0, 36); + +// literal length baseline +const llbl = /*#__PURE__ */ b2bl(llb, 0); + +// match length bits +const mlb = /*#__PURE__ */ new u8((/*#__PURE__ */ new i32([ + 0, 0, 0, 0, 0, 0, 0, 0, 16843009, 50528770, 117769220, 185207048, 252579084, 16 +])).buffer, 0, 53); + +// match length baseline +const mlbl = /*#__PURE__ */ b2bl(mlb, 3); + +// decode huffman stream +const dhu = (dat: Uint8Array, out: Uint8Array, hu: HDT) => { + const len = dat.length, ss = out.length, lb = dat[len - 1], msk = (1 << hu.b) - 1, eb = -hu.b; + if (!lb) err(0); + let st = 0, btr = hu.b, pos = (len << 3) - 8 + msb(lb) - btr, i = -1; + for (; pos > eb && i < ss;) { + const cbt = pos >> 3; + const val = (dat[cbt] | (dat[cbt + 1] << 8) | (dat[cbt + 2] << 16)) >> (pos & 7); + st = ((st << btr) | val) & msk; + out[++i] = hu.s[st]; + pos -= (btr = hu.n[st]); + } + if (pos != eb || i + 1 != ss) err(0); +}; + +// decode huffman stream 4x +// TODO: use workers to parallelize +const dhu4 = (dat: Uint8Array, out: Uint8Array, hu: HDT) => { + let bt = 6; + const ss = out.length, sz1 = (ss + 3) >> 2, sz2 = sz1 << 1, sz3 = sz1 + sz2; + dhu(dat.subarray(bt, bt += dat[0] | (dat[1] << 8)), out.subarray(0, sz1), hu); + dhu(dat.subarray(bt, bt += dat[2] | (dat[3] << 8)), out.subarray(sz1, sz2), hu); + dhu(dat.subarray(bt, bt += dat[4] | (dat[5] << 8)), out.subarray(sz2, sz3), hu); + dhu(dat.subarray(bt), out.subarray(sz3), hu); +}; + +// read Zstandard block +const rzb = (dat: Uint8Array, st: DZstdState, out?: Uint8Array) => { + let bt = st.b; + // byte 0 block type + const b0 = dat[bt], btype = (b0 >> 1) & 3; + st.l = b0 & 1; + const sz = (b0 >> 3) | (dat[bt + 1] << 5) | (dat[bt + 2] << 13); + // end byte for block + const ebt = (bt += 3) + sz; + if (btype == 1) { + if (bt >= dat.length) return; + st.b = bt + 1; + if (out) { + fill(out, dat[bt], st.y, st.y += sz); + return out; + } + return fill(new u8(sz), dat[bt]); + } + if (ebt > dat.length) return; + if (btype == 0) { + st.b = ebt; + if (out) { + out.set(dat.subarray(bt, ebt), st.y); + st.y += sz; + return out; + } + return slc(dat, bt, ebt); + } + if (btype == 2) { + // byte 3 lit btype size format + const b3 = dat[bt], lbt = b3 & 3, sf = (b3 >> 2) & 3; + // lit src size lit cmp sz 4 streams + let lss = b3 >> 4, lcs = 0, s4 = 0; + if (lbt < 2) { + if (sf & 1) lss |= (dat[++bt] << 4) | ((sf & 2) && (dat[++bt] << 12)); + else lss = b3 >> 3; + } else { + s4 = sf; + if (sf < 2) lss |= ((dat[++bt] & 63) << 4), lcs = (dat[bt] >> 6) | (dat[++bt] << 2); + else if (sf == 2) lss |= (dat[++bt] << 4) | ((dat[++bt] & 3) << 12), lcs = (dat[bt] >> 2) | (dat[++bt] << 6); + else lss |= (dat[++bt] << 4) | ((dat[++bt] & 63) << 12), lcs = (dat[bt] >> 6) | (dat[++bt] << 2) | (dat[++bt] << 10); + } + ++bt; + // add literals to end - can never overlap with backreferences because unused literals always appended + let buf = out ? out.subarray(st.y, st.y + st.m) : new u8(st.m); + // starting point for literals + let spl = buf.length - lss; + if (lbt == 0) buf.set(dat.subarray(bt, bt += lss), spl); + else if (lbt == 1) fill(buf, dat[bt++], spl); + else { + // huffman table + let hu = st.h; + if (lbt == 2) { + const hud = rhu(dat, bt); + // subtract description length + lcs += bt - (bt = hud[0]); + st.h = hu = hud[1]; + } + else if (!hu) err(0); + (s4 ? dhu4 : dhu)(dat.subarray(bt, bt += lcs), buf.subarray(spl), hu); + } + // num sequences + let ns = dat[bt++]; + if (ns) { + if (ns == 255) ns = (dat[bt++] | (dat[bt++] << 8)) + 0x7F00; + else if (ns > 127) ns = ((ns - 128) << 8) | dat[bt++]; + // symbol compression modes + const scm = dat[bt++]; + if (scm & 3) err(0); + const dts: [FSEDT, FSEDT, FSEDT] = [dmlt, doct, dllt]; + for (let i = 2; i > -1; --i) { + const md = (scm >> ((i << 1) + 2)) & 3; + if (md == 1) { + // rle buf + const rbuf = new u8([0, 0, dat[bt++]]); + dts[i] = { + s: rbuf.subarray(2, 3), + n: rbuf.subarray(0, 1), + t: new u16(rbuf.buffer, 0, 1), + b: 0 + }; + } else if (md == 2) { + // accuracy log 8 for offsets, 9 for others + [bt, dts[i]] = rfse(dat, bt, 9 - (i & 1)); + } else if (md == 3) { + if (!st.t) err(0); + dts[i] = st.t[i]; + } + } + const [mlt, oct, llt] = st.t = dts; + const lb = dat[ebt - 1]; + if (!lb) err(0); + let spos = (ebt << 3) - 8 + msb(lb) - llt.b, cbt = spos >> 3, oubt = 0; + let lst = ((dat[cbt] | (dat[cbt + 1] << 8)) >> (spos & 7)) & ((1 << llt.b) - 1); + cbt = (spos -= oct.b) >> 3; + let ost = ((dat[cbt] | (dat[cbt + 1] << 8)) >> (spos & 7)) & ((1 << oct.b) - 1); + cbt = (spos -= mlt.b) >> 3; + let mst = ((dat[cbt] | (dat[cbt + 1] << 8)) >> (spos & 7)) & ((1 << mlt.b) - 1); + for (++ns; --ns;) { + const llc = llt.s[lst]; + const lbtr = llt.n[lst]; + const mlc = mlt.s[mst]; + const mbtr = mlt.n[mst]; + const ofc = oct.s[ost]; + const obtr = oct.n[ost]; + + cbt = (spos -= ofc) >> 3; + const ofp = 1 << ofc; + let off = ofp + (((dat[cbt] | (dat[cbt + 1] << 8) | (dat[cbt + 2] << 16) | (dat[cbt + 3] << 24)) >>> (spos & 7)) & (ofp - 1)); + cbt = (spos -= mlb[mlc]) >> 3; + let ml = mlbl[mlc] + (((dat[cbt] | (dat[cbt + 1] << 8) | (dat[cbt + 2] << 16)) >> (spos & 7)) & ((1 << mlb[mlc]) - 1)); + cbt = (spos -= llb[llc]) >> 3; + const ll = llbl[llc] + (((dat[cbt] | (dat[cbt + 1] << 8) | (dat[cbt + 2] << 16)) >> (spos & 7)) & ((1 << llb[llc]) - 1)); + + cbt = (spos -= lbtr) >> 3; + lst = llt.t[lst] + (((dat[cbt] | (dat[cbt + 1] << 8)) >> (spos & 7)) & ((1 << lbtr) - 1)); + cbt = (spos -= mbtr) >> 3; + mst = mlt.t[mst] + (((dat[cbt] | (dat[cbt + 1] << 8)) >> (spos & 7)) & ((1 << mbtr) - 1)); + cbt = (spos -= obtr) >> 3; + ost = oct.t[ost] + (((dat[cbt] | (dat[cbt + 1] << 8)) >> (spos & 7)) & ((1 << obtr) - 1)); + + if (off > 3) { + st.o[2] = st.o[1]; + st.o[1] = st.o[0]; + st.o[0] = off -= 3; + } else { + const idx = off - ((ll != 0) as unknown as number); + if (idx) { + off = idx == 3 ? st.o[0] - 1 : st.o[idx]; + if (idx > 1) st.o[2] = st.o[1]; + st.o[1] = st.o[0]; + st.o[0] = off; + } else off = st.o[0]; + } + for (let i = 0; i < ll; ++i) { + buf[oubt + i] = buf[spl + i]; + } + oubt += ll, spl += ll; + let stin = oubt - off; + if (stin < 0) { + let len = -stin; + const bs = st.e + stin; + if (len > ml) len = ml; + for (let i = 0; i < len; ++i) { + buf[oubt + i] = st.w[bs + i]; + } + oubt += len, ml -= len, stin = 0; + } + for (let i = 0; i < ml; ++i) { + buf[oubt + i] = buf[stin + i]; + } + oubt += ml; + } + if (oubt != spl) { + while (spl < buf.length) { + buf[oubt++] = buf[spl++]; + } + } else oubt = buf.length; + if (out) st.y += oubt; + else buf = slc(buf, 0, oubt); + } else if (out) { + st.y += lss; + if (spl) { + for (let i = 0; i < lss; ++i) { + buf[i] = buf[spl + i]; + } + } + } else if (spl) buf = slc(buf, spl); + st.b = ebt; + return buf; + } + err(2); +}; + +// concat +const cct = (bufs: Uint8Array[], ol: number) => { + if (bufs.length == 1) return bufs[0]; + const buf = new u8(ol); + for (let i = 0, b = 0; i < bufs.length; ++i) { + const chk = bufs[i]; + buf.set(chk, b); + b += chk.length; + } + return buf; +}; + +/** + * Decompresses Zstandard data + * @param dat The input data + * @param buf The output buffer. If unspecified, the function will allocate + * exactly enough memory to fit the decompressed data. If your + * data has multiple frames and you know the output size, specifying + * it will yield better performance. + * @param dic The dictionary buffer. May be a Zstandard-formatted dictionary + * (as generated by zstd --train) or raw content. + * @returns The decompressed data + */ +export function decompress(dat: Uint8Array, buf?: Uint8Array, dic?: Uint8Array) { + const bufs: Uint8Array[] = [], nb = +!buf as 0 | 1; + let bt = 0, ol = 0; + for (; dat.length;) { + const st = rzfh(dat, nb || buf); + if (typeof st == 'object') { + if (st.d && !dic) err(6); + if (dic) rdic(dic, st); + if (nb) { + buf = null; + if (!dic && st.w.length == st.u) { + bufs.push(buf = st.w); + ol += st.u; + } + } else { + bufs.push(buf); + st.e = 0; + } + for (; !st.l;) { + const blk = rzb(dat, st, buf); + if (!blk) err(5); + if (buf) st.e = st.y; + else { + bufs.push(blk); + ol += blk.length; + cpw(st.w, 0, blk.length); + st.w.set(blk, st.w.length - blk.length); + } + } + bt = st.b + (st.c * 4); + } else bt = st; + dat = dat.subarray(bt); + } + return cct(bufs, ol); +} + +/** + * Callback to handle data in Zstandard streams + * @param data The data that was (de)compressed + * @param final Whether this is the last chunk in the stream + */ +export type ZstdStreamHandler = (data: Uint8Array, final?: boolean) => unknown; + +/** + * Decompressor for Zstandard streamed data + */ +export class Decompress { + private s: DZstdState | number; + private c: Uint8Array[]; + private l: number; + private z: number; + private d: Uint8Array; + /** + * Creates a Zstandard decompressor + * @param ondata The handler for stream data + */ + constructor(ondata?: ZstdStreamHandler) { + this.ondata = ondata; + this.c = []; + this.l = 0; + this.z = 0; + } + + /** + * Loads a dictionary + * @param dic The dictionary buffer. May be a Zstandard-formatted dictionary + * (as generated by zstd --train) or raw content. + */ + loadDictionary(dic: Uint8Array) { + this.d = dic; + } + + /** + * Pushes data to be decompressed + * @param chunk The chunk of data to push + * @param final Whether or not this is the last chunk in the stream + */ + push(chunk: Uint8Array, final?: boolean) { + if (typeof this.s == 'number') { + const sub = Math.min(chunk.length, this.s as number); + chunk = chunk.subarray(sub); + (this.s as number) -= sub; + } + const sl = chunk.length; + const ncs = sl + this.l; + if (!this.s) { + if (final) { + if (!ncs) { + this.ondata(new u8(0), true); + return; + } + // min for frame + one block + if (ncs < 5) err(5); + } else if (ncs < 18) { + this.c.push(chunk); + this.l = ncs; + return; + } + if (this.l) { + this.c.push(chunk); + chunk = cct(this.c, ncs); + this.c = []; + this.l = 0; + } + if (typeof (this.s = rzfh(chunk)) == 'number') return this.push(chunk, final); + if (this.s.d && !this.d) err(6); + if (this.d) rdic(this.d, this.s); + } + if (typeof this.s != 'number') { + if (ncs < (this.z || 3)) { + if (final) err(5); + this.c.push(chunk); + this.l = ncs; + return; + } + if (this.l) { + this.c.push(chunk); + chunk = cct(this.c, ncs); + this.c = []; + this.l = 0; + } + if (!this.z && ncs < (this.z = (chunk[(this.s as DZstdState).b] & 2) ? 4 : 3 + ((chunk[(this.s as DZstdState).b] >> 3) | (chunk[(this.s as DZstdState).b + 1] << 5) | (chunk[(this.s as DZstdState).b + 2] << 13)))) { + if (final) err(5); + this.c.push(chunk); + this.l = ncs; + return; + } else this.z = 0; + for (;;) { + const blk = rzb(chunk, this.s as DZstdState); + if (!blk) { + if (final) err(5); + const adc = chunk.subarray((this.s as DZstdState).b); + (this.s as DZstdState).b = 0; + this.c.push(adc), this.l += adc.length; + return; + } else { + this.ondata(blk, false); + cpw((this.s as DZstdState).w, 0, blk.length); + (this.s as DZstdState).w.set(blk, (this.s as DZstdState).w.length - blk.length); + } + if ((this.s as DZstdState).l) { + const rest = chunk.subarray((this.s as DZstdState).b); + this.s = (this.s as DZstdState).c * 4; + this.push(rest, final); + return; + } + } + } else if (final) err(5); + } + + /** + * Handler called whenever data is decompressed + */ + ondata: ZstdStreamHandler; +} \ No newline at end of file diff --git a/tests/fixtures/dictionary/alias-compressed.zst b/tests/fixtures/dictionary/alias-compressed.zst new file mode 100644 index 0000000..47646fd Binary files /dev/null and b/tests/fixtures/dictionary/alias-compressed.zst differ diff --git a/tests/fixtures/dictionary/alias-dict.bin b/tests/fixtures/dictionary/alias-dict.bin new file mode 100644 index 0000000..2b53753 Binary files /dev/null and b/tests/fixtures/dictionary/alias-dict.bin differ diff --git a/tests/fixtures/dictionary/compressed-location.zst b/tests/fixtures/dictionary/compressed-location.zst new file mode 100644 index 0000000..beabac9 Binary files /dev/null and b/tests/fixtures/dictionary/compressed-location.zst differ diff --git a/tests/fixtures/dictionary/location-dict.bin b/tests/fixtures/dictionary/location-dict.bin new file mode 100644 index 0000000..692bd95 Binary files /dev/null and b/tests/fixtures/dictionary/location-dict.bin differ diff --git a/tests/fixtures/dictionary/location.txt b/tests/fixtures/dictionary/location.txt new file mode 100644 index 0000000..3b6cef0 --- /dev/null +++ b/tests/fixtures/dictionary/location.txt @@ -0,0 +1 @@ +s3://my-noaa-bucket/gfs/2026/06/17/run00/temperature/chunk_000042.nc \ No newline at end of file diff --git a/tests/fixtures/dictionary/oversized-location.txt b/tests/fixtures/dictionary/oversized-location.txt new file mode 100644 index 0000000..88fbbce --- /dev/null +++ b/tests/fixtures/dictionary/oversized-location.txt @@ -0,0 +1 @@ +s3://my-noaa-bucket/gfs/2026/06/17/run00/temperature/chunk_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx \ No newline at end of file diff --git a/tests/fixtures/dictionary/oversized-location.zst b/tests/fixtures/dictionary/oversized-location.zst new file mode 100644 index 0000000..3366083 Binary files /dev/null and b/tests/fixtures/dictionary/oversized-location.zst differ diff --git a/tests/format/flatbuffers/manifest-dictionary.test.ts b/tests/format/flatbuffers/manifest-dictionary.test.ts new file mode 100644 index 0000000..5b1accb --- /dev/null +++ b/tests/format/flatbuffers/manifest-dictionary.test.ts @@ -0,0 +1,335 @@ +/** + * Tests for dictionary-compressed virtual chunk locations. + * + * icechunk (earth-mover/icechunk#1776) can zstd-compress the often-repetitive + * S3 URLs in virtual manifests against a per-manifest trained dictionary: + * `Manifest.location_dictionary` holds the dictionary and each + * `ChunkRef.compressed_location` holds a dictionary-compressed location. + * + * The fixtures under tests/fixtures/dictionary/ were produced offline with the + * reference `zstd` CLI (v1.5.7): + * zstd --train corpus/* -o location-dict.bin --maxdict=2048 + * zstd -D location-dict.bin location.txt -o compressed-location.zst + * so this test exercises the real zstd dictionary format end-to-end through the + * vendored fzstd decoder (../../../src/vendor/fzstd), independent of any single + * library's round-trip. + */ + +import { describe, it, expect } from "vitest"; +import { readFileSync } from "node:fs"; +import { fileURLToPath } from "node:url"; +import * as flatbuffers from "flatbuffers"; +import { + parseManifest, + findChunkRef, + getChunkPayload, +} from "../../../src/format/flatbuffers/manifest-parser.js"; +import { decompress } from "../../../src/vendor/fzstd/index.js"; +import type { ObjectId8 } from "../../../src/format/flatbuffers/types.js"; + +// --- Offline-generated zstd dictionary fixtures (see file header) --- +function fixture(name: string): Uint8Array { + return new Uint8Array( + readFileSync( + fileURLToPath( + new URL(`../../fixtures/dictionary/${name}`, import.meta.url), + ), + ), + ); +} + +const DICTIONARY = fixture("location-dict.bin"); +const COMPRESSED = fixture("compressed-location.zst"); +const LOCATION = new TextDecoder().decode(fixture("location.txt")); + +// Crafted so the (raw) dictionary-content length equals the decompressed size, +// which triggered a buffer-aliasing corruption in upstream fzstd PR#18. The +// location content is identical to the dictionary, so decode output must equal +// the dictionary bytes. +const ALIAS_DICT = fixture("alias-dict.bin"); +const ALIAS_COMPRESSED = fixture("alias-compressed.zst"); + +function mockNodeId(seed: number): ObjectId8 { + const id = new Uint8Array(8); + for (let i = 0; i < 8; i++) id[i] = (seed + i) % 256; + return id as ObjectId8; +} + +// --- Minimal flatbuffer Manifest builder (generated classes are reader-only) --- + +/** Write an inline struct (N raw bytes, alignment 1) and return its offset. */ +function buildStruct(builder: flatbuffers.Builder, bytes: Uint8Array): number { + builder.prep(1, bytes.length); + for (let i = bytes.length - 1; i >= 0; i--) builder.writeInt8(bytes[i]); + return builder.offset(); +} + +function buildIndexVector( + builder: flatbuffers.Builder, + data: number[], +): number { + builder.startVector(4, data.length, 4); + for (let i = data.length - 1; i >= 0; i--) builder.addInt32(data[i]); + return builder.endVector(); +} + +interface RefSpec { + index: number[]; + offset: number; + length: number; + location?: string; + compressed?: Uint8Array; +} + +function buildChunkRef(builder: flatbuffers.Builder, ref: RefSpec): number { + // Child offsets must be created before the table is started. + const indexOff = buildIndexVector(builder, ref.index); + const locOff = ref.location != null ? builder.createString(ref.location) : 0; + const compOff = ref.compressed ? builder.createByteVector(ref.compressed) : 0; + + // ChunkRef vtable slots: index=0, inline=1, offset=2, length=3, chunk_id=4, + // location=5, checksum_etag=6, checksum_last_modified=7, + // compressed_location=8, extra=9. + builder.startObject(10); + builder.addFieldOffset(0, indexOff, 0); + builder.addFieldInt64(2, BigInt(ref.offset), BigInt(0)); + builder.addFieldInt64(3, BigInt(ref.length), BigInt(0)); + if (locOff) builder.addFieldOffset(5, locOff, 0); + if (compOff) builder.addFieldOffset(8, compOff, 0); + return builder.endObject(); +} + +function buildArrayManifest( + builder: flatbuffers.Builder, + nodeId: Uint8Array, + refOffsets: number[], +): number { + builder.startVector(4, refOffsets.length, 4); + for (let i = refOffsets.length - 1; i >= 0; i--) + builder.addOffset(refOffsets[i]); + const refsOff = builder.endVector(); + + // ArrayManifest vtable slots: node_id=0, refs=1, extra=2. + builder.startObject(3); + builder.addFieldOffset(1, refsOff, 0); + const nodeOff = buildStruct(builder, nodeId); + builder.addFieldStruct(0, nodeOff, 0); + return builder.endObject(); +} + +interface ManifestSpec { + arrays: { nodeId: Uint8Array; refs: RefSpec[] }[]; + dictionary?: Uint8Array; + compressionAlgorithm?: number; +} + +function buildManifest(spec: ManifestSpec): Uint8Array { + const builder = new flatbuffers.Builder(1024); + const arrayOffsets = spec.arrays.map((a) => + buildArrayManifest( + builder, + a.nodeId, + a.refs.map((r) => buildChunkRef(builder, r)), + ), + ); + + builder.startVector(4, arrayOffsets.length, 4); + for (let i = arrayOffsets.length - 1; i >= 0; i--) + builder.addOffset(arrayOffsets[i]); + const arraysOff = builder.endVector(); + + const dictOff = spec.dictionary + ? builder.createByteVector(spec.dictionary) + : 0; + + // Manifest vtable slots: id=0, arrays=1, location_dictionary=2, + // compression_algorithm=3, extra=4. compression_algorithm default is 1. + builder.startObject(5); + builder.addFieldOffset(1, arraysOff, 0); + if (dictOff) builder.addFieldOffset(2, dictOff, 0); + if (spec.compressionAlgorithm != null) + builder.addFieldInt8(3, spec.compressionAlgorithm, 1); + const idOff = buildStruct(builder, new Uint8Array(12).fill(7)); + builder.addFieldStruct(0, idOff, 0); + const manifestOff = builder.endObject(); + + builder.finish(manifestOff); + return builder.asUint8Array(); +} + +describe("dictionary-compressed locations", () => { + it("vendored fzstd decodes a real zstd dictionary frame", () => { + const decoded = new TextDecoder().decode( + decompress(COMPRESSED, undefined, DICTIONARY), + ); + expect(decoded).toBe(LOCATION); + }); + + it("builder round-trips a plain (uncompressed) location", () => { + // Self-check: proves the hand-rolled flatbuffer builder is correct before + // we trust it for the compressed case. + const nodeId = mockNodeId(1); + const data = buildManifest({ + arrays: [ + { + nodeId, + refs: [{ index: [0], offset: 10, length: 20, location: "s3://b/k" }], + }, + ], + }); + const manifest = parseManifest(data); + const ref = findChunkRef(manifest, nodeId, [0]); + expect(ref).not.toBeNull(); + const payload = getChunkPayload(ref!); + expect(payload.type).toBe("virtual"); + if (payload.type === "virtual") { + expect(payload.location).toBe("s3://b/k"); + expect(payload.offset).toBe(10); + expect(payload.length).toBe(20); + } + }); + + it("decodes compressed_location via the manifest dictionary", () => { + const nodeId = mockNodeId(2); + const data = buildManifest({ + dictionary: DICTIONARY, + compressionAlgorithm: 1, + arrays: [ + { + nodeId, + refs: [ + { index: [0], offset: 1024, length: 512, compressed: COMPRESSED }, + ], + }, + ], + }); + const manifest = parseManifest(data); + const ref = findChunkRef(manifest, nodeId, [0]); + expect(ref).not.toBeNull(); + const payload = getChunkPayload(ref!); + expect(payload.type).toBe("virtual"); + if (payload.type === "virtual") { + expect(payload.location).toBe(LOCATION); + expect(payload.offset).toBe(1024); + expect(payload.length).toBe(512); + } + }); + + it("throws when a compressed_location has no manifest dictionary", () => { + const nodeId = mockNodeId(3); + const data = buildManifest({ + // No dictionary, algorithm left at the default. + arrays: [ + { + nodeId, + refs: [{ index: [0], offset: 0, length: 1, compressed: COMPRESSED }], + }, + ], + }); + expect(() => parseManifest(data)).toThrow(/location dictionary/); + }); + + it("rejects a location that decompresses beyond the size bound", () => { + // Matches the Rust MAX_DECOMPRESSED_LOCATION_SIZE (1024 bytes). This real + // frame honestly advertises its 1100-byte content size, so the pre-decode + // header guard rejects it before decompression. + const oversized = fixture("oversized-location.zst"); + const nodeId = mockNodeId(4); + const data = buildManifest({ + dictionary: DICTIONARY, + compressionAlgorithm: 1, + arrays: [ + { + nodeId, + refs: [{ index: [0], offset: 0, length: 1, compressed: oversized }], + }, + ], + }); + expect(() => parseManifest(data)).toThrow(/declares a size over 1024/); + }); + + it("decodes a frame whose output aliases the dictionary buffer (fzstd patch)", () => { + // Without the vendored fast-path fix this silently corrupts the output. + const out = decompress(ALIAS_COMPRESSED, undefined, ALIAS_DICT); + expect(Array.from(out)).toEqual(Array.from(ALIAS_DICT)); + }); + + it("rejects a compressed_location whose header advertises an oversized frame", () => { + // Hand-crafted zstd frame header: magic + single-segment + 4-byte content + // size of 2,000,000 and no real blocks. Must be rejected before the decoder + // allocates for the advertised size. + const malicious = new Uint8Array([ + 0x28, + 0xb5, + 0x2f, + 0xfd, // magic + 0xa0, // single_segment=1, content_size_flag=2 (4-byte FCS) + 0x80, + 0x84, + 0x1e, + 0x00, // content size = 2,000,000 (LE) + ]); + const nodeId = mockNodeId(5); + const data = buildManifest({ + dictionary: DICTIONARY, + compressionAlgorithm: 1, + arrays: [ + { + nodeId, + refs: [{ index: [0], offset: 0, length: 1, compressed: malicious }], + }, + ], + }); + expect(() => parseManifest(data)).toThrow(/declares a size over 1024/); + }); + + it("rejects a frame with a huge window descriptor (32-bit shift overflow)", () => { + // No single-segment / content size; a window descriptor with exponent 41. + // A bitwise `1 << 41` would wrap to 512 and wrongly pass the bound — the + // arithmetic computation must reject it. + const malicious = new Uint8Array([ + 0x28, + 0xb5, + 0x2f, + 0xfd, // magic + 0x00, // FHD: not single-segment, no content size + 0xf8, // window descriptor: exponent 31 -> 2**41 bytes + ]); + const nodeId = mockNodeId(6); + const data = buildManifest({ + dictionary: DICTIONARY, + compressionAlgorithm: 1, + arrays: [ + { + nodeId, + refs: [{ index: [0], offset: 0, length: 1, compressed: malicious }], + }, + ], + }); + expect(() => parseManifest(data)).toThrow(/declares a size over 1024/); + }); + + it("rejects a compressed_location with a trailing/concatenated frame", () => { + // A valid small frame (the real fixture) followed by a second frame that + // advertises a huge size. fzstd would decode both and allocate for the + // second, so the guard must reject anything that isn't exactly one frame. + const secondFrame = new Uint8Array([ + 0x28, 0xb5, 0x2f, 0xfd, 0xa0, 0x80, 0x84, 0x1e, 0x00, + ]); + const concatenated = new Uint8Array([...COMPRESSED, ...secondFrame]); + const nodeId = mockNodeId(7); + const data = buildManifest({ + dictionary: DICTIONARY, + compressionAlgorithm: 1, + arrays: [ + { + nodeId, + refs: [ + { index: [0], offset: 0, length: 1, compressed: concatenated }, + ], + }, + ], + }); + expect(() => parseManifest(data)).toThrow(/declares a size over 1024/); + }); +}); diff --git a/tests/format/flatbuffers/manifest-parser.test.ts b/tests/format/flatbuffers/manifest-parser.test.ts index 2be5383..26f92b9 100644 --- a/tests/format/flatbuffers/manifest-parser.test.ts +++ b/tests/format/flatbuffers/manifest-parser.test.ts @@ -504,8 +504,9 @@ describe("getChunkPayload", () => { expect(() => getChunkPayload(invalidRef)).toThrow("Invalid ChunkRef"); }); - it("should prefer inline over native", () => { - // Edge case: ref has both inline and chunkId (shouldn't happen, but test precedence) + it("should prefer native (chunk_id) over inline", () => { + // Edge case: ref has both chunkId and inline (shouldn't happen, but test + // precedence). Priority follows the Rust `ref_to_payload`: chunk_id first. const ref: ChunkRef = { index: [0], inline: new Uint8Array([1, 2, 3]), @@ -518,6 +519,6 @@ describe("getChunkPayload", () => { }; const payload = getChunkPayload(ref); - expect(payload.type).toBe("inline"); + expect(payload.type).toBe("native"); }); }); diff --git a/tests/format/flatbuffers/snapshot-parser.test.ts b/tests/format/flatbuffers/snapshot-parser.test.ts index 75f42e2..2f6a249 100644 --- a/tests/format/flatbuffers/snapshot-parser.test.ts +++ b/tests/format/flatbuffers/snapshot-parser.test.ts @@ -10,7 +10,7 @@ import { describe, it, expect } from "vitest"; import { readFileSync, readdirSync } from "node:fs"; import { join, dirname } from "node:path"; import { fileURLToPath } from "node:url"; -import { decompress } from "fzstd"; +import { decompress } from "../../../src/vendor/fzstd/index.js"; import { parseSnapshot } from "../../../src/format/flatbuffers/snapshot-parser.js"; import { parseManifest } from "../../../src/format/flatbuffers/manifest-parser.js"; import { diff --git a/tests/format/flatbuffers/transaction-log-parser.test.ts b/tests/format/flatbuffers/transaction-log-parser.test.ts index 9c71b5f..389b3c3 100644 --- a/tests/format/flatbuffers/transaction-log-parser.test.ts +++ b/tests/format/flatbuffers/transaction-log-parser.test.ts @@ -9,7 +9,7 @@ import { describe, it, expect } from "vitest"; import { readFileSync, readdirSync } from "node:fs"; import { join, dirname } from "node:path"; import { fileURLToPath } from "node:url"; -import { decompress } from "fzstd"; +import { decompress } from "../../../src/vendor/fzstd/index.js"; import { parseTransactionLog } from "../../../src/format/flatbuffers/transaction-log-parser.js"; import { parseHeader,