Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
407 changes: 315 additions & 92 deletions ordvec-python/src/lib.rs

Large diffs are not rendered by default.

16 changes: 11 additions & 5 deletions ordvec-python/tests/test_bitmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,11 +159,17 @@ def test_top_m_candidates_deterministic_across_repeated_calls():
assert set(runs[0]) == {int(i) for i in indices[0].tolist()}


def test_add_float64_is_rejected():
idx = Bitmap(dim=64, n_top=8)
v64 = np.random.default_rng(0).standard_normal((4, 64)).astype(np.float64)
with pytest.raises(TypeError):
idx.add(v64)
def test_add_float64_is_coerced():
    """A Bitmap built from float64 rows matches one built from the same float32 rows.

    The float64 array is a widening cast of the float32 data; both indexes
    must report the same length and return identical ids from ``search``.
    """
    gen = np.random.default_rng(0)
    docs_f32 = gen.standard_normal((20, 64)).astype(np.float32)

    from_f32 = Bitmap(dim=64, n_top=8)
    from_f32.add(docs_f32)

    from_f64 = Bitmap(dim=64, n_top=8)
    from_f64.add(docs_f32.astype(np.float64))

    assert len(from_f32) == len(from_f64) == 20

    queries = gen.standard_normal((3, 64)).astype(np.float32)
    ids_f32 = from_f32.search(queries, k=5)[1]
    ids_f64 = from_f64.search(queries, k=5)[1]
    np.testing.assert_array_equal(ids_f32, ids_f64)


def test_dim_above_u16_max_rejected():
Expand Down
189 changes: 189 additions & 0 deletions ordvec-python/tests/test_input_dtype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
"""Embedding-input dtype/layout boundary contract (``as_f32_1d`` / ``as_f32_2d``).

ordvec normalises real-valued vector input to float32 at the FFI boundary — the
premise is *float vector in -> rank/sign transform*, so float32 is the internal
working dtype, not a contract the caller must pre-satisfy. The policy is uniform
across the four index types because every embedding entry point routes through
the same two choke-point helpers.

Accepted: float16 / float32 / float64, C-contiguous, finite after coercion
Rejected: bool, integers, complex, object, string                -> TypeError
          wrong ndim (scalar / 3-D)                               -> TypeError
          non-contiguous (transpose / stride)                     -> ValueError
              (never silently copied — the copy decision stays with the caller)
          non-finite after coercion (NaN / inf / f64 > f32::MAX)  -> ValueError

Why bool/int are rejected rather than coerced: a ``{0.0, 1.0}`` or narrow-integer
vector rank-transforms to an index-tie artefact (silent retrieval garbage), so
those are a deliberate usage-error guard, not an ergonomic gap. Candidate *IDs*
are a different boundary (labels, not measurements) and DO accept int64 — see
test_input_guards.py.
"""
from __future__ import annotations

import numpy as np
import pytest

from ordvec import Bitmap, Rank, RankQuant, SignBitmap

# Index types that the parametrized tests in this module run against.
INDEX_CLASSES = [Rank, RankQuant, Bitmap, SignBitmap]


def _make(index_cls):
    """Construct a ``dim=64`` index of the given class.

    ``RankQuant`` is built with ``bits=2`` and ``Bitmap`` with ``n_top=16``;
    any other class is called with ``dim=64`` only.
    """
    extra_kwargs = {}
    if index_cls is RankQuant:
        extra_kwargs["bits"] = 2
    elif index_cls is Bitmap:
        extra_kwargs["n_top"] = 16
    return index_cls(dim=64, **extra_kwargs)


def f32(n, dim=64, seed=0):
    """Return a reproducible ``(n, dim)`` float32 array of standard-normal draws.

    A fresh generator seeded with ``seed`` is created on every call, so equal
    arguments always produce equal arrays.
    """
    generator = np.random.default_rng(seed)
    samples = generator.standard_normal((n, dim))
    return samples.astype(np.float32)


# -------------------------------------------------------------------
# Accepted: float dtypes, coerced to f32.
# -------------------------------------------------------------------


@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64])
def test_float_dtypes_accepted(index_cls, dtype):
    """``add`` accepts float16, float32 and float64 rows on every index type."""
    index = _make(index_cls)
    rows = np.ascontiguousarray(f32(8).astype(dtype))
    index.add(rows)
    assert len(index) == 8


@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
def test_float64_coercion_is_faithful(index_cls):
    """Adding the same values as float32 and as float64 yields equally sized indexes.

    The intent is that both builds produce the same index, since the float64
    array is an exact widening of the float32 data.
    """
    base = f32(12)

    built_from_f32 = _make(index_cls)
    built_from_f32.add(base)

    built_from_f64 = _make(index_cls)
    built_from_f64.add(base.astype(np.float64))

    # NOTE(review): only the lengths are compared here; the stored contents of
    # the two indexes are not checked against each other by this test.
    assert len(built_from_f32) == len(built_from_f64) == 12


# -------------------------------------------------------------------
# Rejected dtypes -> TypeError (bool/int/complex/object/string).
# -------------------------------------------------------------------


@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
@pytest.mark.parametrize(
    "dtype",
    [
        np.int8,
        np.int32,
        np.int64,
        np.uint8,
        np.uint32,
        np.uint64,
        bool,
        np.complex64,
        np.complex128,
        object,
    ],
)
def test_nonfloat_dtypes_rejected(index_cls, dtype):
    """Integer, bool, complex and object arrays make ``add`` raise TypeError."""
    with pytest.raises(TypeError):
        index = _make(index_cls)
        index.add(np.ones((8, 64), dtype=dtype))


@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
def test_string_dtype_rejected(index_cls):
    """An array of strings makes ``add`` raise TypeError."""
    with pytest.raises(TypeError):
        index = _make(index_cls)
        index.add(np.full((8, 64), "x"))


# -------------------------------------------------------------------
# Rejected ndim -> TypeError (scalar / 3-D).
# -------------------------------------------------------------------


@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
def test_scalar_rejected(index_cls):
    """A NumPy float32 scalar (zero-dimensional input) makes ``add`` raise TypeError."""
    with pytest.raises(TypeError):
        index = _make(index_cls)
        index.add(np.float32(1.0))


@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
def test_3d_rejected(index_cls):
    """A three-dimensional float32 array makes ``add`` raise TypeError."""
    with pytest.raises(TypeError):
        index = _make(index_cls)
        index.add(np.zeros((2, 3, 64), dtype=np.float32))


@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_wrong_width_rejected_before_coercion(index_cls, dtype):
    """A 128-wide array is rejected by a dim-64 index for float32 and float64 alike.

    The intended contract is that width is validated on the original array
    before any dtype coercion, so a wrong-shaped float64 array is not copied
    to float32 merely to be rejected. What is asserted here is that both
    dtypes raise a ValueError whose message matches "dimension".
    """
    wrong_width = np.ones((4, 128), dtype=dtype)  # 128 columns, index dim is 64
    with pytest.raises(ValueError, match="dimension"):
        index = _make(index_cls)
        index.add(wrong_width)


# -------------------------------------------------------------------
# Rejected layout -> ValueError, checked BEFORE coercion so a float64 transpose
# is never silently laundered into a contiguous float32 (hidden copy).
# -------------------------------------------------------------------


@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_transpose_rejected_not_silently_copied(index_cls, dtype):
    """A Fortran-ordered (8, 64) array is rejected with a "C-contiguous" ValueError."""
    fortran_rows = np.asfortranarray(f32(8).astype(dtype))
    # Guard the fixture itself: the layout must really be non-C-contiguous.
    assert not fortran_rows.flags["C_CONTIGUOUS"]
    with pytest.raises(ValueError, match="C-contiguous"):
        index = _make(index_cls)
        index.add(fortran_rows)


# -------------------------------------------------------------------
# Rejected values -> ValueError, finite check AFTER coercion.
# -------------------------------------------------------------------


@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
@pytest.mark.parametrize("bad", [np.nan, np.inf, -np.inf])
def test_nonfinite_rejected(index_cls, bad):
    """A single NaN or infinite entry makes ``add`` raise a "finite" ValueError."""
    rows = f32(8)
    rows[2, 5] = bad  # poison one element of an otherwise valid batch
    with pytest.raises(ValueError, match="finite"):
        index = _make(index_cls)
        index.add(rows)


@pytest.mark.filterwarnings("ignore:overflow encountered in cast")
@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
def test_float64_overflow_to_inf_rejected(index_cls):
    """A float64 value too large for float32 is rejected with a "finite" ValueError.

    1e300 is finite as float64 but exceeds the float32 range, so it becomes
    +inf when narrowed. The overflow warning from the cast is filtered out by
    the decorator above.
    """
    rows = f32(8).astype(np.float64)
    rows[0, 0] = 1e300
    with pytest.raises(ValueError, match="finite"):
        index = _make(index_cls)
        index.add(rows)


# -------------------------------------------------------------------
# The 1-D query path (as_f32_1d) shares the contract.
# -------------------------------------------------------------------


def test_query_float64_accepted_and_faithful():
    """A float64 query returns the same ``top_m_candidates`` as its float32 original."""
    index = Bitmap(dim=64, n_top=16)
    index.add(f32(30))

    query_f32 = f32(1, seed=7)[0]
    query_f64 = query_f32.astype(np.float64)

    from_f32 = index.top_m_candidates(query_f32, m=10)
    from_f64 = index.top_m_candidates(query_f64, m=10)
    np.testing.assert_array_equal(from_f32, from_f64)


def test_query_bool_rejected():
    """A boolean 1-D query makes ``SignBitmap.top_m_candidates`` raise TypeError."""
    index = SignBitmap(dim=64)
    index.add(f32(10))
    with pytest.raises(TypeError):
        index.top_m_candidates(np.ones(64, dtype=bool), m=5)


def test_query_noncontiguous_rejected():
    """A strided 1-D query is rejected with a "C-contiguous" ValueError."""
    index = Bitmap(dim=64, n_top=16)
    index.add(f32(10))

    # Every second element of a 128-long row: length 64, but stride 2.
    wide_row = np.ascontiguousarray(f32(1, dim=128)[0])
    every_other = wide_row[::2]
    assert not every_other.flags["C_CONTIGUOUS"]

    with pytest.raises(ValueError, match="C-contiguous"):
        index.top_m_candidates(every_other, m=5)
122 changes: 122 additions & 0 deletions ordvec-python/tests/test_input_guards.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,128 @@ def test_subset_in_range_candidates_still_work():
assert int(ids[0]) == 0 # self-query → self ranks first


# -------------------------------------------------------------------
# Candidate / doc-id dtype acceptance. The core takes u32 ids, but NumPy
# index arrays are int64 by default (np.arange, np.where()[0], fancy
# indexing, np.argpartition). The binding accepts any integer dtype and
# converts to u32 with checked bounds rather than rejecting non-uint32
# with an opaque "ndarray cannot be cast as ndarray" TypeError.
# -------------------------------------------------------------------


# Every integer dtype a candidate set might realistically arrive in.
# Used as the parametrize list for the candidate-dtype equivalence test; the
# order of entries determines the order of the generated test cases.
INT_DTYPES = [
    np.uint32,  # ordvec's own top_m_candidates output (zero-copy fast path)
    np.int64,  # NumPy default — np.arange / np.array([...]) / np.where()[0]
    np.int32,
    np.uint64,
    np.int16,
    np.uint16,
    np.int8,
    np.uint8,
]


@pytest.mark.parametrize("dtype", INT_DTYPES)
def test_subset_candidate_dtype_accepted_and_equivalent(dtype):
    """Candidate ids of any integer dtype give the same result as uint32 ids.

    The ids are kept at or below 41 so that they are representable in every
    parametrized dtype, including int8 (maximum 127).
    """
    corpus = unit_vectors(50, 128, seed=0)
    index = RankQuant(dim=128, bits=2)
    index.add(corpus)

    reference_ids = np.array([0, 7, 13, 25, 41], dtype=np.uint32)
    expected_scores, expected_ids = index.search_asymmetric_subset(
        corpus[0], reference_ids, k=4
    )

    cast_ids = reference_ids.astype(dtype)
    scores, ids = index.search_asymmetric_subset(corpus[0], cast_ids, k=4)

    np.testing.assert_array_equal(ids, expected_ids)
    np.testing.assert_array_equal(scores, expected_scores)


def test_subset_candidate_natural_numpy_idioms_accepted():
    """Candidate sets built with common NumPy idioms are accepted as int64 arrays."""
    corpus = unit_vectors(50, 128, seed=0)
    index = RankQuant(dim=128, bits=2)
    index.add(corpus)

    idiomatic_candidate_sets = (
        np.arange(20),
        np.where(np.arange(50) % 5 == 0)[0],
        np.argpartition(np.arange(50)[::-1], 15)[:15],
    )
    for candidates in idiomatic_candidate_sets:
        # Confirm the fixture really has the dtype this test is about.
        # NOTE(review): NumPy's default integer is platform dependent on older
        # releases (reportedly int32 on Windows before NumPy 2.0) — confirm the
        # supported platforms if this assertion ever fails there.
        assert candidates.dtype == np.int64
        scores, ids = index.search_asymmetric_subset(corpus[0], candidates, k=3)
        assert scores.shape == (3,)
        assert ids.shape == (3,)


def test_subset_noncontiguous_uint32_candidates_accepted():
    """A strided (non-contiguous) uint32 candidate view is accepted, not rejected."""
    corpus = unit_vectors(50, 128, seed=0)
    index = RankQuant(dim=128, bits=2)
    index.add(corpus)

    # Every third element of the even ids below 48: a uint32 view with a stride.
    even_ids = np.arange(0, 48, 2, dtype=np.uint32)
    strided = even_ids[::3]
    assert not strided.flags["C_CONTIGUOUS"]

    scores, ids = index.search_asymmetric_subset(corpus[0], strided, k=3)
    assert scores.shape == (3,)
    assert ids.shape == (3,)


def test_subset_negative_candidate_raises_value_error():
    """A negative candidate id raises ValueError instead of wrapping to a large u32."""
    corpus = unit_vectors(50, 128, seed=0)
    index = RankQuant(dim=128, bits=2)
    index.add(corpus)

    with_negative = np.array([0, -1, 5], dtype=np.int64)
    with pytest.raises(ValueError, match="out of range for a u32"):
        index.search_asymmetric_subset(corpus[0], with_negative, k=2)


def test_subset_overflow_candidate_raises_value_error():
    """A candidate id of 2**32 or more raises ValueError.

    Without the check, 2**32 + 5 would wrap to 5 and score the wrong document.
    """
    corpus = unit_vectors(50, 128, seed=0)
    index = RankQuant(dim=128, bits=2)
    index.add(corpus)

    too_large = np.array([0, 2**32 + 5], dtype=np.int64)
    with pytest.raises(ValueError, match="out of range for a u32"):
        index.search_asymmetric_subset(corpus[0], too_large, k=2)


def test_subset_out_of_range_int64_candidate_raises_index_error():
    """An int64 candidate id beyond the index length raises IndexError."""
    corpus = unit_vectors(50, 128, seed=0)
    index = RankQuant(dim=128, bits=2)
    index.add(corpus)

    beyond_length = np.array([0, 999], dtype=np.int64)  # index holds 50 rows
    with pytest.raises(IndexError, match="out of range"):
        index.search_asymmetric_subset(corpus[0], beyond_length, k=2)


def test_subset_float_candidates_raise_type_error():
    """Float candidate ids raise a TypeError mentioning "integer", with no truncation."""
    corpus = unit_vectors(50, 128, seed=0)
    index = RankQuant(dim=128, bits=2)
    index.add(corpus)

    float_ids = np.array([0.0, 1.0, 2.0], dtype=np.float32)
    with pytest.raises(TypeError, match="integer"):
        index.search_asymmetric_subset(corpus[0], float_ids, k=2)


def test_body_overlap_doc_ids_int64_accepted():
    """``Bitmap.body_overlap_scores_subset`` takes sorted int64 ids and rejects unsorted ones."""
    corpus = unit_vectors(50, 128, seed=0)
    bitmap = Bitmap(dim=128, n_top=32)
    bitmap.add(corpus)
    query_bitmap = bitmap.build_query_bitmap_fp32(corpus[0])

    # Ascending int64 ids are accepted and yield one score per id.
    ascending = np.array([2, 4, 8, 16, 32], dtype=np.int64)
    overlap = bitmap.body_overlap_scores_subset(query_bitmap, ascending)
    assert overlap.shape == (5,)

    # The ordering requirement still applies to int64 input.
    with pytest.raises(ValueError, match="sorted"):
        bitmap.body_overlap_scores_subset(
            query_bitmap, np.array([16, 2, 4], dtype=np.int64)
        )


# -------------------------------------------------------------------
# Wrong array width (ncols/len != dim) -> ValueError, not silent
# misalignment or a reshape panic. The core derives n = len/dim and only
Expand Down
20 changes: 13 additions & 7 deletions ordvec-python/tests/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,10 +155,16 @@ def test_swap_remove_shrinks_length():
assert len(idx) == 9


def test_add_float64_is_rejected():
# pyo3 numpy binding is strict on dtype — float64 is not silently
# up/down-converted; the caller must convert.
idx = Rank(dim=64)
v64 = np.random.default_rng(0).standard_normal((4, 64)).astype(np.float64)
with pytest.raises(TypeError):
idx.add(v64)
def test_add_float64_is_coerced():
    """A Rank index built from float64 rows matches one built from the same float32 rows.

    float64 is NumPy's default dtype; the float64 array here is a widening
    cast of the float32 data. Both indexes must report the same length and
    return identical ids from ``search``.
    """
    gen = np.random.default_rng(0)
    docs_f32 = gen.standard_normal((9, 64)).astype(np.float32)

    from_f32 = Rank(dim=64)
    from_f32.add(docs_f32)

    from_f64 = Rank(dim=64)
    from_f64.add(docs_f32.astype(np.float64))

    assert len(from_f32) == len(from_f64) == 9

    queries = gen.standard_normal((3, 64)).astype(np.float32)
    ids_f32 = from_f32.search(queries, k=5)[1]
    ids_f64 = from_f64.search(queries, k=5)[1]
    np.testing.assert_array_equal(ids_f32, ids_f64)
22 changes: 15 additions & 7 deletions ordvec-python/tests/test_rank_quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,13 +122,21 @@ def test_load_rejects_nonexistent_file():


@pytest.mark.parametrize("bits", [1, 2, 4])
def test_add_float64_is_rejected(bits):
# pyo3 numpy binding is strict on dtype — float64 is not silently
# converted, the caller must convert.
idx = RankQuant(dim=64, bits=bits)
v64 = np.random.default_rng(0).standard_normal((4, 64)).astype(np.float64)
with pytest.raises(TypeError):
idx.add(v64)
def test_add_float64_is_coerced(bits):
    """A RankQuant index built from float64 rows matches the float32 build.

    The float64 array is a widening cast of the float32 data. Both indexes
    must report the same length and return identical ids from
    ``search_asymmetric`` at the given ``bits`` setting.
    """
    gen = np.random.default_rng(0)
    docs_f32 = gen.standard_normal((9, 64)).astype(np.float32)

    from_f32 = RankQuant(dim=64, bits=bits)
    from_f32.add(docs_f32)

    from_f64 = RankQuant(dim=64, bits=bits)
    from_f64.add(docs_f32.astype(np.float64))

    assert len(from_f32) == len(from_f64) == 9

    queries = gen.standard_normal((3, 64)).astype(np.float32)
    ids_f32 = from_f32.search_asymmetric(queries, k=5)[1]
    ids_f64 = from_f64.search_asymmetric(queries, k=5)[1]
    np.testing.assert_array_equal(ids_f32, ids_f64)


@pytest.mark.parametrize("bits", [1, 2, 4])
Expand Down
Loading
Loading