diff --git a/README.md b/README.md index 48e4bab..f8b3476 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,12 @@ ![hypgrep](hypgrep.jpg) +[![npm](https://img.shields.io/npm/v/hypgrep)](https://www.npmjs.com/package/hypgrep) +[![minzipped](https://img.shields.io/bundlephobia/minzip/hypgrep)](https://www.npmjs.com/package/hypgrep) +[![workflow status](https://github.com/hyparam/hypgrep/actions/workflows/ci.yml/badge.svg)](https://github.com/hyparam/hypgrep/actions) [![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT) ![coverage](https://img.shields.io/badge/Coverage-95-darkred) +[![dependencies](https://img.shields.io/badge/Dependencies-3-blueviolet)](https://www.npmjs.com/package/hypgrep?activeTab=dependencies) Build a compact n-gram search index for a Parquet file using [`hyparquet`](https://github.com/hyparam/hyparquet) and [`hyparquet-writer`](https://github.com/hyparam/hyparquet-writer). Queries are case-insensitive substring matches — grep semantics over a precomputed index. @@ -15,6 +19,19 @@ Enable efficient grep-style search on large Parquet datasets from any client wit Perfect for serverless architectures where you want to offer search capabilities without managing infrastructure. +## Benchmarks + +Full-text search over 3,199,860 real LLM conversations ([WildChat-4.8M](https://huggingface.co/datasets/allenai/WildChat-4.8M)), run against the same data on every engine. Every competitor was queried over the network, the way it is actually deployed. hypgrep keeps the index in object storage and runs the query in the client, so there is no server and no idle cost. 
+ +| Engine | Index size | Warm query (p50) | All-in / mo | Server | +|---|---:|---:|---:|---| +| **hypgrep** | 1.20 GB | 237 ms | **~$0.33** | none | +| Elasticsearch | 27.2 GB | 66 ms | $371 | r5.2xlarge 24/7 | +| Quickwit | 28.8 GB | 133 ms | $63 | t3.large 24/7 | +| Athena | none | 5,490 ms | $0.065/query | serverless | + +The always-on engines win raw latency by keeping a hot index in RAM, which is what the monthly bill pays for. hypgrep trades that for zero idle cost, a smaller footprint, and no infrastructure. + ## CLI usage Build an index: diff --git a/package.json b/package.json index 20ccb6c..3e0bc21 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,7 @@ { "name": "hypgrep", "version": "0.2.1", + "description": "Compact full-text grep search index for Parquet files", "author": "Hyperparam", "homepage": "https://hyperparam.app", "license": "MIT", diff --git a/src/constants.js b/src/constants.js index f13f913..32f314c 100644 --- a/src/constants.js +++ b/src/constants.js @@ -18,3 +18,14 @@ export const defaultIndexRowGroupSize = 40000 // block contains every common short window; n=5 dramatically reduces the // candidate-block set for selective substrings. export const defaultNgramLength = 5 + +// Extra (non-alphanumeric) characters kept inside n-gram runs. Alphanumerics are +// always kept; every other character is a run boundary unless listed here. The +// default keeps the JSON delimiters " : { } so n-grams span them and `"role":` +// stays intact, which makes structured/JSON greps highly selective at almost no +// index-size cost (measured on WildChat: ~1.04x the size of a bare alphanumeric +// index, versus ~2.6x for keeping all punctuation). The exact set is recorded +// per index in kv metadata (`hypgrep.ngram_chars`) so queries tokenize +// identically; an index with no such key (built before this) is read as the +// empty set, i.e. plain alphanumeric, and keeps working unchanged. 
+export const defaultNgramChars = '"{}:' diff --git a/src/createIndex.js b/src/createIndex.js index 646d329..ba632d2 100644 --- a/src/createIndex.js +++ b/src/createIndex.js @@ -1,6 +1,6 @@ import { parquetMetadataAsync, parquetReadObjects } from 'hyparquet' import { ParquetWriter, schemaFromColumnData } from 'hyparquet-writer' -import { defaultBlockSize, defaultIndexRowGroupSize, defaultNgramLength, hypGrepVersion } from './constants.js' +import { defaultBlockSize, defaultIndexRowGroupSize, defaultNgramChars, defaultNgramLength, hypGrepVersion } from './constants.js' import { extractNgrams } from './ngrams.js' import { assertNonNegativeSafeInteger, assertPositiveSafeInteger, getTextColumnsFromSchema } from './utils.js' @@ -21,10 +21,14 @@ export async function createIndex({ blockSize = defaultBlockSize, indexRowGroupSize = defaultIndexRowGroupSize, ngramLength = defaultNgramLength, + ngramChars = defaultNgramChars, }) { assertPositiveSafeInteger(blockSize, 'blockSize') assertPositiveSafeInteger(indexRowGroupSize, 'indexRowGroupSize') assertPositiveSafeInteger(ngramLength, 'ngramLength') + if (typeof ngramChars !== 'string' || /[a-z0-9\s]/i.test(ngramChars)) { + throw new Error('ngramChars must be a string of non-alphanumeric, non-whitespace characters') + } assertNonNegativeSafeInteger(sourceFile.byteLength, 'sourceFile.byteLength') const metadata = sourceMetadata ?? 
await parquetMetadataAsync(sourceFile) @@ -60,7 +64,7 @@ export async function createIndex({ columns: textColumns, }) - const blockNgrams = collectBlockNgrams(rows, textColumns, ngramLength) + const blockNgrams = collectBlockNgrams(rows, textColumns, ngramLength, ngramChars) for (const ngram of blockNgrams) { const prefix = ngram.slice(0, PREFIX_LENGTH) let postings = buckets.get(prefix) @@ -80,6 +84,10 @@ export async function createIndex({ { key: 'hypgrep.version', value: String(hypGrepVersion) }, { key: 'hypgrep.block_size', value: String(blockSize) }, { key: 'hypgrep.ngram_length', value: String(ngramLength) }, + // Record the kept-character set so queryIndex tokenizes identically. An index + // missing this key (built before structural tokenization) is read as '', + // i.e. plain alphanumeric, and keeps working unchanged. + { key: 'hypgrep.ngram_chars', value: ngramChars }, { key: 'hypgrep.text_columns', value: textColumns.join(',') }, { key: 'hypgrep.source_rows', value: String(numRows) }, // Can save network requests on the source file @@ -150,9 +158,10 @@ export async function createIndex({ * @param {Record[]} rows * @param {string[]} textColumns * @param {number} n + * @param {string} chars extra characters kept inside n-gram runs * @returns {Set} */ -function collectBlockNgrams(rows, textColumns, n) { +function collectBlockNgrams(rows, textColumns, n, chars) { /** @type {Set} */ const ngrams = new Set() for (const row of rows) { @@ -160,7 +169,7 @@ function collectBlockNgrams(rows, textColumns, n) { for (const columnName of textColumns) { const value = row[columnName] if (typeof value !== 'string' || value.length < n) continue - for (const g of extractNgrams(value, n)) { + for (const g of extractNgrams(value, n, chars)) { ngrams.add(g) } } diff --git a/src/ngrams.js b/src/ngrams.js index 1e1fd79..9ef05c4 100644 --- a/src/ngrams.js +++ b/src/ngrams.js @@ -1,23 +1,47 @@ /** * N-gram extraction for grep-style substring matching. 
* - * Text is lowercased and split on non-alphanumeric boundaries, then every - * n-character window of each alphanumeric run is emitted as an n-gram. + * Text is lowercased and split into runs, then every n-character window of each + * run is emitted as an n-gram. Alphanumerics are always part of a run; the + * `chars` argument lists extra punctuation characters that are also kept inside + * runs (so n-grams span them) rather than treated as boundaries. The default, + * `defaultNgramChars`, keeps the JSON delimiters " : { } so structured greps like + * `"role":` stay selective. Passing '' reproduces a bare alphanumeric tokenizer. + * + * Index and query MUST use the same `chars`, or their n-grams won't line up; + * createIndex records the set in kv metadata and queryIndex reads it back. + */ +import { defaultNgramChars } from './constants.js' + +/** + * Build the run-boundary regex: it matches one or more consecutive characters that are + * neither a-z, 0-9, nor listed in `chars`. A new RegExp object is returned on every call. + * NOTE(review): split() works on its own copy of the pattern and does not advance `lastIndex` on this object, so the `g` flag looks redundant rather than required; confirm. + * + * @param {string} chars extra (non-alphanumeric) characters kept inside runs; an empty string selects the plain alphanumeric pattern + * @returns {RegExp} global (`g`) pattern matching each stretch of run-boundary characters */ +function boundaryPattern(chars) { + if (!chars) return /[^a-z0-9]+/g + // Escape the characters that are special inside a regex character class. + const escaped = chars.replace(/[\]\\^-]/g, c => '\\' + c) + return new RegExp(`[^a-z0-9${escaped}]+`, 'g') +} /** * Extract the set of distinct n-grams in a string. 
* * @param {string} text * @param {number} n + * @param {string} [chars] extra characters kept inside runs (default: JSON delimiters) * @returns {Set} */ -export function extractNgrams(text, n) { +export function extractNgrams(text, n, chars = defaultNgramChars) { /** @type {Set} */ const out = new Set() if (typeof text !== 'string' || text.length < n) return out const lower = text.toLowerCase() - for (const run of lower.split(/[^a-z0-9]+/g)) { + for (const run of lower.split(boundaryPattern(chars))) { for (let i = 0; i + n <= run.length; i += 1) { out.add(run.slice(i, i + n)) } @@ -31,10 +55,11 @@ export function extractNgrams(text, n) { * * @param {string} query * @param {number} n + * @param {string} [chars] extra characters kept inside runs (default: JSON delimiters) * @returns {string[]} */ -export function queryNgrams(query, n) { - return Array.from(extractNgrams(query, n)) +export function queryNgrams(query, n, chars = defaultNgramChars) { + return Array.from(extractNgrams(query, n, chars)) } /** @@ -44,13 +69,14 @@ export function queryNgrams(query, n) { * * @param {string[]} literals * @param {number} n + * @param {string} [chars] extra characters kept inside runs (default: JSON delimiters) * @returns {string[]} */ -export function literalsToNgrams(literals, n) { +export function literalsToNgrams(literals, n, chars = defaultNgramChars) { /** @type {Set} */ const out = new Set() for (const lit of literals) { - for (const g of extractNgrams(lit, n)) out.add(g) + for (const g of extractNgrams(lit, n, chars)) out.add(g) } return Array.from(out) } diff --git a/src/queryIndex.js b/src/queryIndex.js index 38bd15b..62216e9 100644 --- a/src/queryIndex.js +++ b/src/queryIndex.js @@ -32,17 +32,19 @@ export async function queryIndex({ query, indexFile, indexMetadata }) { // Read index kv metadata indexMetadata ??= await parquetMetadataAsync(indexFile) const kvMetadata = indexMetadata.key_value_metadata || [] - const { blockSize, ngramLength, textColumns, 
sourceByteLength, sourceRows } = parseKvMetadata(kvMetadata) + const { blockSize, ngramLength, ngramChars, textColumns, sourceByteLength, sourceRows } = parseKvMetadata(kvMetadata) // A "branch" is a conjunction of n-grams that ALL must appear in a block. // A query matches a block if ANY branch is fully satisfied (DNF). // // - String query: one branch, the n-grams of the string itself. // - RegExp query: one branch per top-level alternation arm. + // The query MUST be tokenized with the same kept-character set the index was + // built with, or its n-grams won't line up with the postings. /** @type {string[][]} */ const branches = query instanceof RegExp - ? extractRegexLiterals(query).map(lits => literalsToNgrams(lits, ngramLength)) - : [queryNgrams(query, ngramLength)] + ? extractRegexLiterals(query).map(lits => literalsToNgrams(lits, ngramLength, ngramChars)) + : [queryNgrams(query, ngramLength, ngramChars)] // If any branch is empty, that branch matches anything — falling back to a // full scan is correct and bounded. @@ -119,6 +121,9 @@ export function parseKvMetadata(kvMetadata) { let blockSize /** @type {number | undefined} */ let ngramLength + // Extra characters kept inside n-gram runs. Absent in pre-structural indexes, + // which were tokenized as plain alphanumeric, so '' reproduces that exactly. 
+ let ngramChars = '' /** @type {string[]} */ let textColumns = [] /** @type {number | undefined} */ @@ -133,6 +138,9 @@ export function parseKvMetadata(kvMetadata) { if (key === 'hypgrep.ngram_length') { ngramLength = Number(value) } + if (key === 'hypgrep.ngram_chars' && typeof value === 'string') { + ngramChars = value + } if (key === 'hypgrep.version') { version = Number(value) if (version !== hypGrepVersion) { @@ -172,5 +180,5 @@ export function parseKvMetadata(kvMetadata) { assertNonNegativeSafeInteger(sourceRows, 'hypgrep.source_rows') assertNonNegativeSafeInteger(sourceByteLength, 'hypgrep.source_bytelength') - return { blockSize, ngramLength, textColumns, sourceByteLength, sourceRows } + return { blockSize, ngramLength, ngramChars, textColumns, sourceByteLength, sourceRows } } diff --git a/src/types.d.ts b/src/types.d.ts index 0f02d70..a7ccb71 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -8,6 +8,7 @@ export interface CreateIndexOptions { blockSize?: number // number of rows per logical block indexRowGroupSize?: number // row group size in the index file ngramLength?: number // n-gram size used to build the index (default 5) + ngramChars?: string // extra non-alphanumeric characters kept inside n-gram runs so n-grams span them (default: the JSON delimiters " : { }); '' yields a plain alphanumeric tokenizer } export interface QueryIndexOptions { @@ -61,6 +62,7 @@ export interface BlockResult { export interface HypGrepMetadata { blockSize: number // number of rows per logical block ngramLength: number // n-gram size used to build the index + ngramChars: string // extra characters kept inside n-gram runs ('' for pre-structural indexes) textColumns: string[] // list of indexed text columns sourceRows: number // number of rows in the source parquet file sourceByteLength: number // byte length of the source parquet file diff --git a/test/createIndex.test.js b/test/createIndex.test.js index e4cd271..b64bcf7 100644 --- a/test/createIndex.test.js +++ 
b/test/createIndex.test.js @@ -27,18 +27,19 @@ describe('createIndex', () => { expect(existsSync(TEST_INDEX)).toBe(true) const indexBuffer = await asyncBufferFromFile(TEST_INDEX) - expect(indexBuffer.byteLength).toBe(2494) + expect(indexBuffer.byteLength).toBe(2522) const indexMetadata = await parquetMetadataAsync(indexBuffer) expect(indexMetadata.row_groups.length).toBe(7) expect(indexMetadata.num_rows).toBe(676n) - expect(indexMetadata.key_value_metadata?.length).toBe(6) + expect(indexMetadata.key_value_metadata?.length).toBe(7) const kv = indexMetadata.key_value_metadata expect(kv?.[0]).toEqual({ key: 'hypgrep.version', value: '0' }) expect(kv?.[1]).toEqual({ key: 'hypgrep.block_size', value: '200' }) expect(kv?.[2]).toEqual({ key: 'hypgrep.ngram_length', value: '5' }) - expect(kv?.[3]).toEqual({ key: 'hypgrep.text_columns', value: 'id' }) - expect(kv?.[4]).toEqual({ key: 'hypgrep.source_rows', value: '676' }) - expect(kv?.[5]).toEqual({ key: 'hypgrep.source_bytelength', value: String(sourceFile.byteLength) }) + expect(kv?.[3]).toEqual({ key: 'hypgrep.ngram_chars', value: '"{}:' }) + expect(kv?.[4]).toEqual({ key: 'hypgrep.text_columns', value: 'id' }) + expect(kv?.[5]).toEqual({ key: 'hypgrep.source_rows', value: '676' }) + expect(kv?.[6]).toEqual({ key: 'hypgrep.source_bytelength', value: String(sourceFile.byteLength) }) }) it('should reject invalid sizing options', async () => { diff --git a/test/ngramChars.test.js b/test/ngramChars.test.js new file mode 100644 index 0000000..adc1cf6 --- /dev/null +++ b/test/ngramChars.test.js @@ -0,0 +1,107 @@ +import { afterEach, describe, expect, it } from 'vitest' +import { asyncBufferFromFile, parquetMetadataAsync } from 'hyparquet' +import { fileWriter, parquetWriteFile } from 'hyparquet-writer' +import { existsSync, unlinkSync } from 'fs' +import { createIndex } from '../src/createIndex.js' +import { parquetFind } from '../src/parquetFind.js' +import { parseKvMetadata, queryIndex } from '../src/queryIndex.js' + +const 
SRC = 'test/files/ngramchars.source.parquet' +const IDX = 'test/files/ngramchars.index.parquet' +const IDX_PLAIN = 'test/files/ngramchars.plain.index.parquet' + +// 20 rows, one per block. Two rows carry the literal structural string +// {"role":"system",...}; the rest mention "role" and "system" as plain prose +// words. A plain alphanumeric tokenizer (dropping the quotes/colon) can't tell a +// structural match from prose, but the default structural tokenizer can. +function writeSource() { + const rows = [] + for (let i = 0; i < 20; i += 1) { + if (i === 3 || i === 11) rows.push(`{"role":"system","content":"message ${i}"}`) + else rows.push(`the role of the system in document ${i} is to assist the user`) + } + parquetWriteFile({ filename: SRC, columnData: [{ name: 'text', data: rows }] }) +} + +/** + * @param {string} query query string handed to queryIndex + * @param {string} idx path of the index file to open + * @returns {Promise} resolves to the number of blocks in the queryIndex result, or 0 when that result is falsy + */ +async function candBlocks(query, idx) { + const r = await queryIndex({ query, indexFile: await asyncBufferFromFile(idx) }) + return r ? 
r.blocks.length : 0 +} + +/** + * @param {string} query query string handed to parquetFind + * @param {string} idx path of the index file to open + * @returns {Promise} resolves to the `__index__` of every row parquetFind yields, sorted in ascending numeric order + */ +async function find(query, idx) { + const rows = [] + for await (const row of parquetFind({ + url: SRC, + sourceFile: await asyncBufferFromFile(SRC), + indexFile: await asyncBufferFromFile(idx), + query, + })) rows.push(row.__index__) + return rows.sort((a, b) => a - b) +} + +describe('structural tokenizer (default kept characters)', () => { + afterEach(() => { + for (const f of [SRC, IDX, IDX_PLAIN]) if (existsSync(f)) unlinkSync(f) + }) + + it('the default prunes a structural JSON query; an empty kept-set does not', async () => { + writeSource() + await createIndex({ sourceFile: await asyncBufferFromFile(SRC), indexFile: fileWriter(IDX), blockSize: 1 }) + await createIndex({ sourceFile: await asyncBufferFromFile(SRC), indexFile: fileWriter(IDX_PLAIN), blockSize: 1, ngramChars: '' }) + + const q = '"role":"system"' + // default keeps " and :, isolating the 2 real blocks; the plain index can't + // prune (system/role appear everywhere as bare words). 
+ expect(await candBlocks(q, IDX)).toBe(2) + expect(await candBlocks(q, IDX_PLAIN)).toBe(20) + + // both return exactly the right rows; the per-row filter guarantees it + expect(await find(q, IDX)).toEqual([3, 11]) + expect(await find(q, IDX_PLAIN)).toEqual([3, 11]) + }) + + it('records the kept characters in index metadata', async () => { + writeSource() + await createIndex({ sourceFile: await asyncBufferFromFile(SRC), indexFile: fileWriter(IDX), blockSize: 1 }) + const kv = (await parquetMetadataAsync(await asyncBufferFromFile(IDX))).key_value_metadata || [] + expect(kv.find(k => k.key === 'hypgrep.ngram_chars')).toEqual({ key: 'hypgrep.ngram_chars', value: '"{}:' }) + expect(parseKvMetadata(kv).ngramChars).toBe('"{}:') + }) + + it('reads a pre-structural index (no ngram_chars key) as plain alphanumeric', () => { + // an index built before this change has no ngram_chars key + const kv = [ + { key: 'hypgrep.version', value: '0' }, + { key: 'hypgrep.block_size', value: '100' }, + { key: 'hypgrep.ngram_length', value: '5' }, + { key: 'hypgrep.text_columns', value: 'text' }, + { key: 'hypgrep.source_rows', value: '10' }, + { key: 'hypgrep.source_bytelength', value: '100' }, + ] + expect(parseKvMetadata(kv).ngramChars).toBe('') + }) + + it('rejects ngramChars containing alphanumerics or whitespace', async () => { + writeSource() + await expect(createIndex({ + sourceFile: await asyncBufferFromFile(SRC), + indexFile: fileWriter(IDX), + ngramChars: 'a', + })).rejects.toThrow('ngramChars') + await expect(createIndex({ + sourceFile: await asyncBufferFromFile(SRC), + indexFile: fileWriter(IDX), + ngramChars: ': ', + })).rejects.toThrow('ngramChars') + }) +}) diff --git a/test/ngrams.test.js b/test/ngrams.test.js index 31f531f..f6fa17b 100644 --- a/test/ngrams.test.js +++ b/test/ngrams.test.js @@ -31,6 +31,35 @@ describe('extractNgrams', () => { }) }) +describe('extractNgrams kept characters', () => { + it('keeps the JSON delimiters " : { } by default', () => { + // both 
words are too short for n=5 alone, so a bare alphanumeric tokenizer + // would extract nothing; keeping " and : lets n-grams span the delimiter + const grams = extractNgrams('"role":"user"', 5) + expect(grams.has('le":"')).toBe(true) + expect(grams.has('"role')).toBe(true) + expect(extractNgrams('{"a":1', 3).has('{"a')).toBe(true) // spans the brace + }) + + it('still splits on punctuation that is not kept', () => { + // commas, periods, apostrophes are NOT kept, so prose vocabulary stays small + expect(extractNgrams('foo,bar', 3)).toEqual(new Set(['foo', 'bar'])) + expect(extractNgrams('don\'t', 3)).toEqual(new Set(['don'])) + }) + + it('an empty kept-set reproduces a plain alphanumeric tokenizer', () => { + expect(extractNgrams('"role":"user"', 5, '')).toEqual(new Set()) + expect(extractNgrams('foo,bar baz', 3, '')).toEqual(new Set(['foo', 'bar', 'baz'])) + }) + + it('escapes regex-special kept characters', () => { + // ] and - are special inside a character class; they must still work as kept + const grams = extractNgrams('a]b-c', 3, ']-') + expect(grams.has('a]b')).toBe(true) + expect(grams.has('b-c')).toBe(true) + }) +}) + describe('queryNgrams', () => { it('returns empty array for short queries', () => { expect(queryNgrams('ab', 3)).toEqual([])