Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@

![hypgrep](hypgrep.jpg)

[![npm](https://img.shields.io/npm/v/hypgrep)](https://www.npmjs.com/package/hypgrep)
[![minzipped](https://img.shields.io/bundlephobia/minzip/hypgrep)](https://www.npmjs.com/package/hypgrep)
[![workflow status](https://github.com/hyparam/hypgrep/actions/workflows/ci.yml/badge.svg)](https://github.com/hyparam/hypgrep/actions)
[![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT)
![coverage](https://img.shields.io/badge/Coverage-95-darkred)
[![dependencies](https://img.shields.io/badge/Dependencies-3-blueviolet)](https://www.npmjs.com/package/hypgrep?activeTab=dependencies)

Build a compact n-gram search index for a Parquet file using [`hyparquet`](https://github.com/hyparam/hyparquet) and [`hyparquet-writer`](https://github.com/hyparam/hyparquet-writer). Queries are case-insensitive substring matches — grep semantics over a precomputed index.

Expand All @@ -15,6 +19,19 @@ Enable efficient grep-style search on large Parquet datasets from any client wit

Perfect for serverless architectures where you want to offer search capabilities without managing infrastructure.

## Benchmarks

Full-text search over 3,199,860 real LLM conversations ([WildChat-4.8M](https://huggingface.co/datasets/allenai/WildChat-4.8M)), run against the same data on every engine. Every competitor was queried over the network, the way it is actually deployed. hypgrep keeps the index in object storage and runs the query in the client, so there is no server and no idle cost.

| Engine | Index size | Warm query (p50) | All-in / mo | Server |
|---|---:|---:|---:|---|
| **hypgrep** | 1.20 GB | 237 ms | **~$0.33** | none |
| Elasticsearch | 27.2 GB | 66 ms | $371 | r5.2xlarge 24/7 |
| Quickwit | 28.8 GB | 133 ms | $63 | t3.large 24/7 |
| Athena | none | 5,490 ms | $0.065/query | serverless |

The always-on engines win raw latency by keeping a hot index in RAM, which is what the monthly bill pays for. hypgrep trades that for zero idle cost, a smaller footprint, and no infrastructure.

## CLI usage

Build an index:
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
"name": "hypgrep",
"version": "0.2.1",
"description": "Compact full-text grep search index for Parquet files",
"author": "Hyperparam",
"homepage": "https://hyperparam.app",
"license": "MIT",
Expand Down
11 changes: 11 additions & 0 deletions src/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,14 @@ export const defaultIndexRowGroupSize = 40000
// block contains every common short window; n=5 dramatically reduces the
// candidate-block set for selective substrings.
export const defaultNgramLength = 5

// Extra (non-alphanumeric) characters kept inside n-gram runs. Alphanumerics are
// always kept; every other character is a run boundary unless listed here. The
// default keeps the JSON delimiters " : { } so n-grams span them and `"role":`
// stays intact, which makes structured/JSON greps highly selective at almost no
// index-size cost (measured on WildChat: ~1.04x the size of a bare alphanumeric
// index, versus ~2.6x for keeping all punctuation). The exact set is recorded
// per index in kv metadata (`hypgrep.ngram_chars`) so queries tokenize
// identically; an index without that key (built before the key was introduced)
// is read as the empty set, i.e. plain alphanumeric, and keeps working unchanged.
export const defaultNgramChars = '"{}:'
17 changes: 13 additions & 4 deletions src/createIndex.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { parquetMetadataAsync, parquetReadObjects } from 'hyparquet'
import { ParquetWriter, schemaFromColumnData } from 'hyparquet-writer'
import { defaultBlockSize, defaultIndexRowGroupSize, defaultNgramLength, hypGrepVersion } from './constants.js'
import { defaultBlockSize, defaultIndexRowGroupSize, defaultNgramChars, defaultNgramLength, hypGrepVersion } from './constants.js'
import { extractNgrams } from './ngrams.js'
import { assertNonNegativeSafeInteger, assertPositiveSafeInteger, getTextColumnsFromSchema } from './utils.js'

Expand All @@ -21,10 +21,14 @@ export async function createIndex({
blockSize = defaultBlockSize,
indexRowGroupSize = defaultIndexRowGroupSize,
ngramLength = defaultNgramLength,
ngramChars = defaultNgramChars,
}) {
assertPositiveSafeInteger(blockSize, 'blockSize')
assertPositiveSafeInteger(indexRowGroupSize, 'indexRowGroupSize')
assertPositiveSafeInteger(ngramLength, 'ngramLength')
if (typeof ngramChars !== 'string' || /[a-z0-9\s]/i.test(ngramChars)) {
throw new Error('ngramChars must be a string of non-alphanumeric, non-whitespace characters')
}
assertNonNegativeSafeInteger(sourceFile.byteLength, 'sourceFile.byteLength')

const metadata = sourceMetadata ?? await parquetMetadataAsync(sourceFile)
Expand Down Expand Up @@ -60,7 +64,7 @@ export async function createIndex({
columns: textColumns,
})

const blockNgrams = collectBlockNgrams(rows, textColumns, ngramLength)
const blockNgrams = collectBlockNgrams(rows, textColumns, ngramLength, ngramChars)
for (const ngram of blockNgrams) {
const prefix = ngram.slice(0, PREFIX_LENGTH)
let postings = buckets.get(prefix)
Expand All @@ -80,6 +84,10 @@ export async function createIndex({
{ key: 'hypgrep.version', value: String(hypGrepVersion) },
{ key: 'hypgrep.block_size', value: String(blockSize) },
{ key: 'hypgrep.ngram_length', value: String(ngramLength) },
// Record the kept-character set so queryIndex tokenizes identically. An index
// missing this key (built before structural tokenization) is read as '',
// i.e. plain alphanumeric, and keeps working unchanged.
{ key: 'hypgrep.ngram_chars', value: ngramChars },
{ key: 'hypgrep.text_columns', value: textColumns.join(',') },
{ key: 'hypgrep.source_rows', value: String(numRows) },
// Can save network requests on the source file
Expand Down Expand Up @@ -150,17 +158,18 @@ export async function createIndex({
* @param {Record<string, any>[]} rows
* @param {string[]} textColumns
* @param {number} n
* @param {string} chars extra characters kept inside n-gram runs
* @returns {Set<string>}
*/
function collectBlockNgrams(rows, textColumns, n) {
function collectBlockNgrams(rows, textColumns, n, chars) {
/** @type {Set<string>} */
const ngrams = new Set()
for (const row of rows) {
if (!row) continue
for (const columnName of textColumns) {
const value = row[columnName]
if (typeof value !== 'string' || value.length < n) continue
for (const g of extractNgrams(value, n)) {
for (const g of extractNgrams(value, n, chars)) {
ngrams.add(g)
}
}
Expand Down
42 changes: 34 additions & 8 deletions src/ngrams.js
Original file line number Diff line number Diff line change
@@ -1,23 +1,47 @@
/**
* N-gram extraction for grep-style substring matching.
*
* Text is lowercased and split on non-alphanumeric boundaries, then every
* n-character window of each alphanumeric run is emitted as an n-gram.
* Text is lowercased and split into runs, then every n-character window of each
* run is emitted as an n-gram. Alphanumerics are always part of a run; the
* `chars` argument lists extra punctuation characters that are also kept inside
* runs (so n-grams span them) rather than treated as boundaries. The default,
* `defaultNgramChars`, keeps the JSON delimiters " : { } so structured greps like
* `"role":` stay selective. Passing '' reproduces a bare alphanumeric tokenizer.
*
* Index and query MUST use the same `chars`, or their n-grams won't line up;
* createIndex records the set in kv metadata and queryIndex reads it back.
*/
import { defaultNgramChars } from './constants.js'

/**
 * Compile the regex that matches the gaps between n-gram runs.
 *
 * A run consists of lowercase ASCII letters, digits, and any character listed
 * in `chars`; one or more consecutive characters outside that set form a single
 * boundary match. A new RegExp object is returned on every call, so no regex
 * state is shared between callers.
 *
 * @param {string} chars extra (non-alphanumeric) characters kept inside runs
 * @returns {RegExp}
 */
function boundaryPattern(chars) {
  // Only ] \ ^ - carry special meaning inside a character class, so those are
  // the ones that get a backslash; an empty set contributes nothing.
  const kept = chars ? chars.replace(/[\]\\^-]/g, ch => `\\${ch}`) : ''
  return new RegExp(`[^a-z0-9${kept}]+`, 'g')
}

/**
* Extract the set of distinct n-grams in a string.
*
* @param {string} text
* @param {number} n
* @param {string} [chars] extra characters kept inside runs (default: JSON delimiters)
* @returns {Set<string>}
*/
export function extractNgrams(text, n) {
export function extractNgrams(text, n, chars = defaultNgramChars) {
/** @type {Set<string>} */
const out = new Set()
if (typeof text !== 'string' || text.length < n) return out
const lower = text.toLowerCase()
for (const run of lower.split(/[^a-z0-9]+/g)) {
for (const run of lower.split(boundaryPattern(chars))) {
for (let i = 0; i + n <= run.length; i += 1) {
out.add(run.slice(i, i + n))
}
Expand All @@ -31,10 +55,11 @@ export function extractNgrams(text, n) {
*
* @param {string} query
* @param {number} n
* @param {string} [chars] extra characters kept inside runs (default: JSON delimiters)
* @returns {string[]}
*/
export function queryNgrams(query, n) {
return Array.from(extractNgrams(query, n))
export function queryNgrams(query, n, chars = defaultNgramChars) {
return Array.from(extractNgrams(query, n, chars))
}

/**
Expand All @@ -44,13 +69,14 @@ export function queryNgrams(query, n) {
*
* @param {string[]} literals
* @param {number} n
* @param {string} [chars] extra characters kept inside runs (default: JSON delimiters)
* @returns {string[]}
*/
export function literalsToNgrams(literals, n) {
export function literalsToNgrams(literals, n, chars = defaultNgramChars) {
/** @type {Set<string>} */
const out = new Set()
for (const lit of literals) {
for (const g of extractNgrams(lit, n)) out.add(g)
for (const g of extractNgrams(lit, n, chars)) out.add(g)
}
return Array.from(out)
}
16 changes: 12 additions & 4 deletions src/queryIndex.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,19 @@ export async function queryIndex({ query, indexFile, indexMetadata }) {
// Read index kv metadata
indexMetadata ??= await parquetMetadataAsync(indexFile)
const kvMetadata = indexMetadata.key_value_metadata || []
const { blockSize, ngramLength, textColumns, sourceByteLength, sourceRows } = parseKvMetadata(kvMetadata)
const { blockSize, ngramLength, ngramChars, textColumns, sourceByteLength, sourceRows } = parseKvMetadata(kvMetadata)

// A "branch" is a conjunction of n-grams that ALL must appear in a block.
// A query matches a block if ANY branch is fully satisfied (DNF).
//
// - String query: one branch, the n-grams of the string itself.
// - RegExp query: one branch per top-level alternation arm.
// The query MUST be tokenized with the same kept-character set the index was
// built with, or its n-grams won't line up with the postings.
/** @type {string[][]} */
const branches = query instanceof RegExp
? extractRegexLiterals(query).map(lits => literalsToNgrams(lits, ngramLength))
: [queryNgrams(query, ngramLength)]
? extractRegexLiterals(query).map(lits => literalsToNgrams(lits, ngramLength, ngramChars))
: [queryNgrams(query, ngramLength, ngramChars)]

// If any branch is empty, that branch matches anything — falling back to a
// full scan is correct and bounded.
Expand Down Expand Up @@ -119,6 +121,9 @@ export function parseKvMetadata(kvMetadata) {
let blockSize
/** @type {number | undefined} */
let ngramLength
// Extra characters kept inside n-gram runs. Absent in pre-structural indexes,
// which were tokenized as plain alphanumeric, so '' reproduces that exactly.
let ngramChars = ''
/** @type {string[]} */
let textColumns = []
/** @type {number | undefined} */
Expand All @@ -133,6 +138,9 @@ export function parseKvMetadata(kvMetadata) {
if (key === 'hypgrep.ngram_length') {
ngramLength = Number(value)
}
if (key === 'hypgrep.ngram_chars' && typeof value === 'string') {
ngramChars = value
}
if (key === 'hypgrep.version') {
version = Number(value)
if (version !== hypGrepVersion) {
Expand Down Expand Up @@ -172,5 +180,5 @@ export function parseKvMetadata(kvMetadata) {
assertNonNegativeSafeInteger(sourceRows, 'hypgrep.source_rows')
assertNonNegativeSafeInteger(sourceByteLength, 'hypgrep.source_bytelength')

return { blockSize, ngramLength, textColumns, sourceByteLength, sourceRows }
return { blockSize, ngramLength, ngramChars, textColumns, sourceByteLength, sourceRows }
}
2 changes: 2 additions & 0 deletions src/types.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ export interface CreateIndexOptions {
blockSize?: number // number of rows per logical block
indexRowGroupSize?: number // row group size in the index file
ngramLength?: number // n-gram size used to build the index (default 5)
ngramChars?: string // extra non-alphanumeric characters kept inside n-gram runs so n-grams span them (default: the JSON delimiters " : { }); '' yields a plain alphanumeric tokenizer
}

export interface QueryIndexOptions {
Expand Down Expand Up @@ -61,6 +62,7 @@ export interface BlockResult {
export interface HypGrepMetadata {
blockSize: number // number of rows per logical block
ngramLength: number // n-gram size used to build the index
ngramChars: string // extra characters kept inside n-gram runs ('' for pre-structural indexes)
textColumns: string[] // list of indexed text columns
sourceRows: number // number of rows in the source parquet file
sourceByteLength: number // byte length of the source parquet file
Expand Down
11 changes: 6 additions & 5 deletions test/createIndex.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,19 @@ describe('createIndex', () => {

expect(existsSync(TEST_INDEX)).toBe(true)
const indexBuffer = await asyncBufferFromFile(TEST_INDEX)
expect(indexBuffer.byteLength).toBe(2494)
expect(indexBuffer.byteLength).toBe(2522)
const indexMetadata = await parquetMetadataAsync(indexBuffer)
expect(indexMetadata.row_groups.length).toBe(7)
expect(indexMetadata.num_rows).toBe(676n)
expect(indexMetadata.key_value_metadata?.length).toBe(6)
expect(indexMetadata.key_value_metadata?.length).toBe(7)
const kv = indexMetadata.key_value_metadata
expect(kv?.[0]).toEqual({ key: 'hypgrep.version', value: '0' })
expect(kv?.[1]).toEqual({ key: 'hypgrep.block_size', value: '200' })
expect(kv?.[2]).toEqual({ key: 'hypgrep.ngram_length', value: '5' })
expect(kv?.[3]).toEqual({ key: 'hypgrep.text_columns', value: 'id' })
expect(kv?.[4]).toEqual({ key: 'hypgrep.source_rows', value: '676' })
expect(kv?.[5]).toEqual({ key: 'hypgrep.source_bytelength', value: String(sourceFile.byteLength) })
expect(kv?.[3]).toEqual({ key: 'hypgrep.ngram_chars', value: '"{}:' })
expect(kv?.[4]).toEqual({ key: 'hypgrep.text_columns', value: 'id' })
expect(kv?.[5]).toEqual({ key: 'hypgrep.source_rows', value: '676' })
expect(kv?.[6]).toEqual({ key: 'hypgrep.source_bytelength', value: String(sourceFile.byteLength) })
})

it('should reject invalid sizing options', async () => {
Expand Down
Loading