hyparam · platypii · Jun 29, 2026 · Jun 29, 2026
diff --git a/README.md b/README.md
@@ -2,8 +2,12 @@
 
 ![hypgrep](hypgrep.jpg)
 
+[![npm](https://img.shields.io/npm/v/hypgrep)](https://www.npmjs.com/package/hypgrep)
+[![minzipped](https://img.shields.io/bundlephobia/minzip/hypgrep)](https://www.npmjs.com/package/hypgrep)
+[![workflow status](https://github.com/hyparam/hypgrep/actions/workflows/ci.yml/badge.svg)](https://github.com/hyparam/hypgrep/actions)
 [![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT)
 ![coverage](https://img.shields.io/badge/Coverage-95-darkred)
+[![dependencies](https://img.shields.io/badge/Dependencies-3-blueviolet)](https://www.npmjs.com/package/hypgrep?activeTab=dependencies)
 
 Build a compact n-gram search index for a Parquet file using [`hyparquet`](https://github.com/hyparam/hyparquet) and [`hyparquet-writer`](https://github.com/hyparam/hyparquet-writer). Queries are case-insensitive substring matches — grep semantics over a precomputed index.
 
@@ -15,6 +19,19 @@ Enable efficient grep-style search on large Parquet datasets from any client wit
 
 Perfect for serverless architectures where you want to offer search capabilities without managing infrastructure.
 
+## Benchmarks
+
+Full-text search over 3,199,860 real LLM conversations ([WildChat-4.8M](https://huggingface.co/datasets/allenai/WildChat-4.8M)), run against the same data on every engine. Every competitor was queried over the network, the way it is actually deployed. hypgrep keeps the index in object storage and runs the query in the client, so there is no server and no idle cost.
+
+| Engine | Index size | Warm query (p50) | All-in / mo | Server |
+|---|---:|---:|---:|---|
+| **hypgrep** | 1.20 GB | 237 ms | **~$0.33** | none |
+| Elasticsearch | 27.2 GB | 66 ms | $371 | r5.2xlarge 24/7 |
+| Quickwit | 28.8 GB | 133 ms | $63 | t3.large 24/7 |
+| Athena | none | 5,490 ms | $0.065/query | serverless |
+
+The always-on engines win on raw latency by keeping a hot index in RAM, which is what their monthly bill pays for. hypgrep trades that for zero idle cost, a smaller footprint, and no infrastructure.
+
 ## CLI usage
 
 Build an index:

diff --git a/package.json b/package.json
@@ -1,6 +1,7 @@
 {
   "name": "hypgrep",
   "version": "0.2.1",
+  "description": "Compact full-text grep search index for Parquet files",
   "author": "Hyperparam",
   "homepage": "https://hyperparam.app",
   "license": "MIT",

diff --git a/src/constants.js b/src/constants.js
@@ -18,3 +18,14 @@ export const defaultIndexRowGroupSize = 40000
 // block contains every common short window; n=5 dramatically reduces the
 // candidate-block set for selective substrings.
 export const defaultNgramLength = 5
+
+// Extra (non-alphanumeric) characters kept inside n-gram runs. Alphanumerics are
+// always kept; every other character is a run boundary unless listed here. The
+// default keeps the JSON delimiters " : { } so n-grams span them and `"role":`
+// stays intact, which makes structured/JSON greps highly selective at almost no
+// index-size cost (measured on WildChat: ~1.04x the size of a bare alphanumeric
+// index, versus ~2.6x for keeping all punctuation). The exact set is recorded
+// per index in kv metadata (`hypgrep.ngram_chars`) so queries tokenize
+// identically; an index with no such key (built before this option existed) is
+// read as the empty set, i.e. plain alphanumeric, and keeps working unchanged.
+export const defaultNgramChars = '"{}:'
diff --git a/src/createIndex.js b/src/createIndex.js
@@ -1,6 +1,6 @@
 import { parquetMetadataAsync, parquetReadObjects } from 'hyparquet'
 import { ParquetWriter, schemaFromColumnData } from 'hyparquet-writer'
-import { defaultBlockSize, defaultIndexRowGroupSize, defaultNgramLength, hypGrepVersion } from './constants.js'
+import { defaultBlockSize, defaultIndexRowGroupSize, defaultNgramChars, defaultNgramLength, hypGrepVersion } from './constants.js'
 import { extractNgrams } from './ngrams.js'
 import { assertNonNegativeSafeInteger, assertPositiveSafeInteger, getTextColumnsFromSchema } from './utils.js'
 
@@ -21,10 +21,14 @@ export async function createIndex({
   blockSize = defaultBlockSize,
   indexRowGroupSize = defaultIndexRowGroupSize,
   ngramLength = defaultNgramLength,
+  ngramChars = defaultNgramChars,
 }) {
   assertPositiveSafeInteger(blockSize, 'blockSize')
   assertPositiveSafeInteger(indexRowGroupSize, 'indexRowGroupSize')
   assertPositiveSafeInteger(ngramLength, 'ngramLength')
+  if (typeof ngramChars !== 'string' || /[a-z0-9\s]/i.test(ngramChars)) {
+    throw new Error('ngramChars must be a string of non-alphanumeric, non-whitespace characters')
+  }
   assertNonNegativeSafeInteger(sourceFile.byteLength, 'sourceFile.byteLength')
 
   const metadata = sourceMetadata ?? await parquetMetadataAsync(sourceFile)
@@ -60,7 +64,7 @@ export async function createIndex({
       columns: textColumns,
     })
 
-    const blockNgrams = collectBlockNgrams(rows, textColumns, ngramLength)
+    const blockNgrams = collectBlockNgrams(rows, textColumns, ngramLength, ngramChars)
     for (const ngram of blockNgrams) {
       const prefix = ngram.slice(0, PREFIX_LENGTH)
       let postings = buckets.get(prefix)
@@ -80,6 +84,10 @@ export async function createIndex({
     { key: 'hypgrep.version', value: String(hypGrepVersion) },
     { key: 'hypgrep.block_size', value: String(blockSize) },
     { key: 'hypgrep.ngram_length', value: String(ngramLength) },
+    // Record the kept-character set so queryIndex tokenizes identically. An index
+    // missing this key (built before structural tokenization) is read as '',
+    // i.e. plain alphanumeric, and keeps working unchanged.
+    { key: 'hypgrep.ngram_chars', value: ngramChars },
     { key: 'hypgrep.text_columns', value: textColumns.join(',') },
     { key: 'hypgrep.source_rows', value: String(numRows) },
     // Can save network requests on the source file
@@ -150,17 +158,18 @@ export async function createIndex({
  * @param {Record<string, any>[]} rows
  * @param {string[]} textColumns
  * @param {number} n
+ * @param {string} chars extra characters kept inside n-gram runs
  * @returns {Set<string>}
  */
-function collectBlockNgrams(rows, textColumns, n) {
+function collectBlockNgrams(rows, textColumns, n, chars) {
   /** @type {Set<string>} */
   const ngrams = new Set()
   for (const row of rows) {
     if (!row) continue
     for (const columnName of textColumns) {
       const value = row[columnName]
       if (typeof value !== 'string' || value.length < n) continue
-      for (const g of extractNgrams(value, n)) {
+      for (const g of extractNgrams(value, n, chars)) {
         ngrams.add(g)
       }
     }

diff --git a/src/ngrams.js b/src/ngrams.js
@@ -1,23 +1,47 @@
 /**
  * N-gram extraction for grep-style substring matching.
  *
- * Text is lowercased and split on non-alphanumeric boundaries, then every
- * n-character window of each alphanumeric run is emitted as an n-gram.
+ * Text is lowercased and split into runs, then every n-character window of each
+ * run is emitted as an n-gram. Alphanumerics are always part of a run; the
+ * `chars` argument lists extra punctuation characters that are also kept inside
+ * runs (so n-grams span them) rather than treated as boundaries. The default,
+ * `defaultNgramChars`, keeps the JSON delimiters " : { } so structured greps like
+ * `"role":` stay selective. Passing '' reproduces a bare alphanumeric tokenizer.
+ *
+ * Index and query MUST use the same `chars`, or their n-grams won't line up;
+ * createIndex records the set in kv metadata and queryIndex reads it back.
+ */
+import { defaultNgramChars } from './constants.js'
+
+/**
+ * Build the run-boundary regex: it matches one or more characters that are
+ * neither a-z/0-9 nor listed in `chars`. Every call returns a new RegExp object,
+ * so no `lastIndex` state from the `g` flag is shared between callers.
+ *
+ * @param {string} chars extra (non-alphanumeric) characters kept inside runs
+ * @returns {RegExp}
  */
+function boundaryPattern(chars) {
+  if (!chars) return /[^a-z0-9]+/g
+  // Escape ] \ ^ and -, which are special inside a regex character class.
+  const escaped = chars.replace(/[\]\\^-]/g, c => '\\' + c)
+  return new RegExp(`[^a-z0-9${escaped}]+`, 'g')
+}
 
 /**
  * Extract the set of distinct n-grams in a string.
  *
  * @param {string} text
  * @param {number} n
+ * @param {string} [chars] extra characters kept inside runs (default: JSON delimiters)
  * @returns {Set<string>}
  */
-export function extractNgrams(text, n) {
+export function extractNgrams(text, n, chars = defaultNgramChars) {
   /** @type {Set<string>} */
   const out = new Set()
   if (typeof text !== 'string' || text.length < n) return out
   const lower = text.toLowerCase()
-  for (const run of lower.split(/[^a-z0-9]+/g)) {
+  for (const run of lower.split(boundaryPattern(chars))) {
     for (let i = 0; i + n <= run.length; i += 1) {
       out.add(run.slice(i, i + n))
     }
@@ -31,10 +55,11 @@ export function extractNgrams(text, n) {
  *
  * @param {string} query
  * @param {number} n
+ * @param {string} [chars] extra characters kept inside runs (default: JSON delimiters)
  * @returns {string[]}
  */
-export function queryNgrams(query, n) {
-  return Array.from(extractNgrams(query, n))
+export function queryNgrams(query, n, chars = defaultNgramChars) {
+  return Array.from(extractNgrams(query, n, chars))
 }
 
 /**
@@ -44,13 +69,14 @@ export function queryNgrams(query, n) {
  *
  * @param {string[]} literals
  * @param {number} n
+ * @param {string} [chars] extra characters kept inside runs (default: JSON delimiters)
  * @returns {string[]}
  */
-export function literalsToNgrams(literals, n) {
+export function literalsToNgrams(literals, n, chars = defaultNgramChars) {
   /** @type {Set<string>} */
   const out = new Set()
   for (const lit of literals) {
-    for (const g of extractNgrams(lit, n)) out.add(g)
+    for (const g of extractNgrams(lit, n, chars)) out.add(g)
   }
   return Array.from(out)
 }
diff --git a/src/queryIndex.js b/src/queryIndex.js
@@ -32,17 +32,19 @@ export async function queryIndex({ query, indexFile, indexMetadata }) {
   // Read index kv metadata
   indexMetadata ??= await parquetMetadataAsync(indexFile)
   const kvMetadata = indexMetadata.key_value_metadata || []
-  const { blockSize, ngramLength, textColumns, sourceByteLength, sourceRows } = parseKvMetadata(kvMetadata)
+  const { blockSize, ngramLength, ngramChars, textColumns, sourceByteLength, sourceRows } = parseKvMetadata(kvMetadata)
 
   // A "branch" is a conjunction of n-grams that ALL must appear in a block.
   // A query matches a block if ANY branch is fully satisfied (DNF).
   //
   // - String query: one branch, the n-grams of the string itself.
   // - RegExp query: one branch per top-level alternation arm.
+  // The query MUST be tokenized with the same kept-character set the index was
+  // built with, or its n-grams won't line up with the postings.
   /** @type {string[][]} */
   const branches = query instanceof RegExp
-    ? extractRegexLiterals(query).map(lits => literalsToNgrams(lits, ngramLength))
-    : [queryNgrams(query, ngramLength)]
+    ? extractRegexLiterals(query).map(lits => literalsToNgrams(lits, ngramLength, ngramChars))
+    : [queryNgrams(query, ngramLength, ngramChars)]
 
   // If any branch is empty, that branch matches anything — falling back to a
   // full scan is correct and bounded.
@@ -119,6 +121,9 @@ export function parseKvMetadata(kvMetadata) {
   let blockSize
   /** @type {number | undefined} */
   let ngramLength
+  // Extra characters kept inside n-gram runs. Absent in pre-structural indexes,
+  // which were tokenized as plain alphanumeric, so '' reproduces that exactly.
+  let ngramChars = ''
   /** @type {string[]} */
   let textColumns = []
   /** @type {number | undefined} */
@@ -133,6 +138,9 @@ export function parseKvMetadata(kvMetadata) {
     if (key === 'hypgrep.ngram_length') {
       ngramLength = Number(value)
     }
+    if (key === 'hypgrep.ngram_chars' && typeof value === 'string') {
+      ngramChars = value
+    }
     if (key === 'hypgrep.version') {
       version = Number(value)
       if (version !== hypGrepVersion) {
@@ -172,5 +180,5 @@ export function parseKvMetadata(kvMetadata) {
   assertNonNegativeSafeInteger(sourceRows, 'hypgrep.source_rows')
   assertNonNegativeSafeInteger(sourceByteLength, 'hypgrep.source_bytelength')
 
-  return { blockSize, ngramLength, textColumns, sourceByteLength, sourceRows }
+  return { blockSize, ngramLength, ngramChars, textColumns, sourceByteLength, sourceRows }
 }
diff --git a/src/types.d.ts b/src/types.d.ts
@@ -8,6 +8,7 @@ export interface CreateIndexOptions {
   blockSize?: number // number of rows per logical block
   indexRowGroupSize?: number // row group size in the index file
   ngramLength?: number // n-gram size used to build the index (default 5)
+  ngramChars?: string // extra non-alphanumeric characters kept inside n-gram runs so n-grams span them (default: the JSON delimiters " : { }); '' yields a plain alphanumeric tokenizer
 }
 
 export interface QueryIndexOptions {
@@ -61,6 +62,7 @@ export interface BlockResult {
 export interface HypGrepMetadata {
   blockSize: number // number of rows per logical block
   ngramLength: number // n-gram size used to build the index
+  ngramChars: string // extra characters kept inside n-gram runs ('' for pre-structural indexes)
   textColumns: string[] // list of indexed text columns
   sourceRows: number // number of rows in the source parquet file
   sourceByteLength: number // byte length of the source parquet file

diff --git a/test/createIndex.test.js b/test/createIndex.test.js
@@ -27,18 +27,19 @@ describe('createIndex', () => {
 
     expect(existsSync(TEST_INDEX)).toBe(true)
     const indexBuffer = await asyncBufferFromFile(TEST_INDEX)
-    expect(indexBuffer.byteLength).toBe(2494)
+    expect(indexBuffer.byteLength).toBe(2522)
     const indexMetadata = await parquetMetadataAsync(indexBuffer)
     expect(indexMetadata.row_groups.length).toBe(7)
     expect(indexMetadata.num_rows).toBe(676n)
-    expect(indexMetadata.key_value_metadata?.length).toBe(6)
+    expect(indexMetadata.key_value_metadata?.length).toBe(7)
     const kv = indexMetadata.key_value_metadata
     expect(kv?.[0]).toEqual({ key: 'hypgrep.version', value: '0' })
     expect(kv?.[1]).toEqual({ key: 'hypgrep.block_size', value: '200' })
     expect(kv?.[2]).toEqual({ key: 'hypgrep.ngram_length', value: '5' })
-    expect(kv?.[3]).toEqual({ key: 'hypgrep.text_columns', value: 'id' })
-    expect(kv?.[4]).toEqual({ key: 'hypgrep.source_rows', value: '676' })
-    expect(kv?.[5]).toEqual({ key: 'hypgrep.source_bytelength', value: String(sourceFile.byteLength) })
+    expect(kv?.[3]).toEqual({ key: 'hypgrep.ngram_chars', value: '"{}:' })
+    expect(kv?.[4]).toEqual({ key: 'hypgrep.text_columns', value: 'id' })
+    expect(kv?.[5]).toEqual({ key: 'hypgrep.source_rows', value: '676' })
+    expect(kv?.[6]).toEqual({ key: 'hypgrep.source_bytelength', value: String(sourceFile.byteLength) })
   })
 
   it('should reject invalid sizing options', async () => {