Project-Navi · Navi Bot (project-navi-bot) · May 28, 2026 · May 27, 2026 · May 28, 2026 · May 28, 2026
@@ -159,11 +159,17 @@ def test_top_m_candidates_deterministic_across_repeated_calls():
     assert set(runs[0]) == {int(i) for i in indices[0].tolist()}
 
 
-def test_add_float64_is_rejected():
-    idx = Bitmap(dim=64, n_top=8)
-    v64 = np.random.default_rng(0).standard_normal((4, 64)).astype(np.float64)
-    with pytest.raises(TypeError):
-        idx.add(v64)
+def test_add_float64_is_coerced():
+    # float64 accepted and coerced to float32 at the boundary; same index as f32.
+    rng = np.random.default_rng(0)
+    v32 = rng.standard_normal((20, 64)).astype(np.float32)
+    a = Bitmap(dim=64, n_top=8)
+    a.add(v32)
+    b = Bitmap(dim=64, n_top=8)
+    b.add(v32.astype(np.float64))
+    assert len(a) == len(b) == 20
+    q = rng.standard_normal((3, 64)).astype(np.float32)
+    np.testing.assert_array_equal(a.search(q, k=5)[1], b.search(q, k=5)[1])
 
 
 def test_dim_above_u16_max_rejected():

@@ -0,0 +1,189 @@
+"""Embedding-input dtype/layout boundary contract (``as_f32_1d`` / ``as_f32_2d``).
+
+ordvec normalises real-valued vector input to float32 at the FFI boundary — the
+premise is *float vector in -> rank/sign transform*, so float32 is the internal
+working dtype, not a contract the caller must pre-satisfy. The policy is uniform
+across the four index types because every embedding entry point routes through
+the same two choke-point helpers.
+
+    Accepted:  float16 / float32 / float64, C-contiguous, finite after coercion
+    Rejected:  bool, integers, complex, object, string        -> TypeError
+               wrong ndim (scalar / 3-D)                       -> TypeError
+               non-contiguous (transpose / stride)             -> ValueError
+                 (never silently copied — the copy decision stays with the caller)
+               non-finite after coercion (NaN / inf / f64 > f32::MAX)  -> ValueError
+
+Why bool/int are rejected rather than coerced: a ``{0.0, 1.0}`` or narrow-integer
+vector rank-transforms to an index-tie artefact (silent retrieval garbage), so
+those are a deliberate usage-error guard, not an ergonomic gap. Candidate *IDs*
+are a different boundary (labels, not measurements) and DO accept int64 — see
+test_input_guards.py.
+"""
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from ordvec import Bitmap, Rank, RankQuant, SignBitmap
+
+INDEX_CLASSES = [Rank, RankQuant, Bitmap, SignBitmap]
+
+
+def _make(index_cls):
+    if index_cls is RankQuant:
+        return RankQuant(dim=64, bits=2)
+    if index_cls is Bitmap:
+        return Bitmap(dim=64, n_top=16)
+    return index_cls(dim=64)
+
+
+def f32(n, dim=64, seed=0):
+    return np.random.default_rng(seed).standard_normal((n, dim)).astype(np.float32)
+
+
+# -------------------------------------------------------------------
+# Accepted: float dtypes, coerced to f32.
+# -------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
+@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64])
+def test_float_dtypes_accepted(index_cls, dtype):
+    idx = _make(index_cls)
+    idx.add(np.ascontiguousarray(f32(8).astype(dtype)))
+    assert len(idx) == 8
+
+
+@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
+def test_float64_coercion_is_faithful(index_cls):
+    # f64 and f32 builds of the same values should yield the same index (the rank/sign
+    # transform is order/sign-only; f64->f32 rounding is monotonic). Only len() is asserted here.
+    v32 = f32(12)
+    a = _make(index_cls)
+    a.add(v32)
+    b = _make(index_cls)
+    b.add(v32.astype(np.float64))
+    assert len(a) == len(b) == 12
+
+
+# -------------------------------------------------------------------
+# Rejected dtypes -> TypeError (bool/int/complex/object/string).
+# -------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
+@pytest.mark.parametrize(
+    "dtype",
+    [np.int8, np.int32, np.int64, np.uint8, np.uint32, np.uint64, bool, np.complex64, np.complex128, object],
+)
+def test_nonfloat_dtypes_rejected(index_cls, dtype):
+    with pytest.raises(TypeError):
+        _make(index_cls).add(np.ones((8, 64), dtype=dtype))
+
+
+@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
+def test_string_dtype_rejected(index_cls):
+    with pytest.raises(TypeError):
+        _make(index_cls).add(np.full((8, 64), "x"))
+
+
+# -------------------------------------------------------------------
+# Rejected ndim -> TypeError (scalar / 3-D).
+# -------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
+def test_scalar_rejected(index_cls):
+    with pytest.raises(TypeError):
+        _make(index_cls).add(np.float32(1.0))
+
+
+@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
+def test_3d_rejected(index_cls):
+    with pytest.raises(TypeError):
+        _make(index_cls).add(np.zeros((2, 3, 64), dtype=np.float32))
+
+
+@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_wrong_width_rejected_before_coercion(index_cls, dtype):
+    # Width is a cheap metadata check validated on the ORIGINAL array *before* any
+    # dtype coercion: a wrong-width float64 is rejected with the same ValueError as
+    # float32, without first allocating its float32 twin (a wrong-shaped large
+    # array must not be copied just to be rejected).
+    bad = np.ones((4, 128), dtype=dtype)  # width 128 != dim 64
+    with pytest.raises(ValueError, match="dimension"):
+        _make(index_cls).add(bad)
+
+
+# -------------------------------------------------------------------
+# Rejected layout -> ValueError, checked BEFORE coercion so a float64 transpose
+# is never silently laundered into a contiguous float32 (hidden copy).
+# -------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_transpose_rejected_not_silently_copied(index_cls, dtype):
+    v = np.asfortranarray(f32(8).astype(dtype))  # (8, 64) F-order -> non-C-contiguous
+    assert not v.flags["C_CONTIGUOUS"]
+    with pytest.raises(ValueError, match="C-contiguous"):
+        _make(index_cls).add(v)
+
+
+# -------------------------------------------------------------------
+# Rejected values -> ValueError, finite check AFTER coercion.
+# -------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
+@pytest.mark.parametrize("bad", [np.nan, np.inf, -np.inf])
+def test_nonfinite_rejected(index_cls, bad):
+    v = f32(8)
+    v[2, 5] = bad
+    with pytest.raises(ValueError, match="finite"):
+        _make(index_cls).add(v)
+
+
+@pytest.mark.filterwarnings("ignore:overflow encountered in cast")
+@pytest.mark.parametrize("index_cls", INDEX_CLASSES)
+def test_float64_overflow_to_inf_rejected(index_cls):
+    # 1e300 is finite in float64 but rounds to +inf in float32 (NumPy's cast emits
+    # a RuntimeWarning, ignored here) — the finite check runs on the POST-coercion
+    # f32, so this is caught, not silently indexed.
+    v = f32(8).astype(np.float64)
+    v[0, 0] = 1e300
+    with pytest.raises(ValueError, match="finite"):
+        _make(index_cls).add(v)
+
+
+# -------------------------------------------------------------------
+# The 1-D query path (as_f32_1d) shares the contract.
+# -------------------------------------------------------------------
+
+
+def test_query_float64_accepted_and_faithful():
+    corpus = f32(30)
+    a = Bitmap(dim=64, n_top=16)
+    a.add(corpus)
+    q32 = f32(1, seed=7)[0]
+    np.testing.assert_array_equal(
+        a.top_m_candidates(q32, m=10),
+        a.top_m_candidates(q32.astype(np.float64), m=10),
+    )
+
+
+def test_query_bool_rejected():
+    a = SignBitmap(dim=64)
+    a.add(f32(10))
+    with pytest.raises(TypeError):
+        a.top_m_candidates(np.ones(64, dtype=bool), m=5)
+
+
+def test_query_noncontiguous_rejected():
+    a = Bitmap(dim=64, n_top=16)
+    a.add(f32(10))
+    strided = np.ascontiguousarray(f32(1, dim=128)[0])[::2]  # len 64, non-contiguous
+    assert not strided.flags["C_CONTIGUOUS"]
+    with pytest.raises(ValueError, match="C-contiguous"):
+        a.top_m_candidates(strided, m=5)
@@ -226,6 +226,128 @@ def test_subset_in_range_candidates_still_work():
     assert int(ids[0]) == 0  # self-query → self ranks first
 
 
+# -------------------------------------------------------------------
+# Candidate / doc-id dtype acceptance. The core takes u32 ids, but NumPy
+# index arrays are int64 by default (np.arange, np.where()[0], fancy
+# indexing, np.argpartition). The binding accepts any integer dtype and
+# converts to u32 with checked bounds rather than rejecting non-uint32
+# with an opaque "ndarray cannot be cast as ndarray" TypeError.
+# -------------------------------------------------------------------
+
+
+# Every integer dtype a candidate set might realistically arrive in.
+INT_DTYPES = [
+    np.uint32,  # ordvec's own top_m_candidates output (zero-copy fast path)
+    np.int64,  # NumPy default — np.arange / np.array([...]) / np.where()[0]
+    np.int32,
+    np.uint64,
+    np.int16,
+    np.uint16,
+    np.int8,
+    np.uint8,
+]
+
+
+@pytest.mark.parametrize("dtype", INT_DTYPES)
+def test_subset_candidate_dtype_accepted_and_equivalent(dtype):
+    # Any integer dtype is accepted and yields results identical to the
+    # uint32 reference. (Regression guard: int64/int32/uint64 previously raised
+    # TypeError.) The ids stay small enough to fit int8 (max 127).
+    vectors = unit_vectors(50, 128, seed=0)
+    idx = RankQuant(dim=128, bits=2)
+    idx.add(vectors)
+    ref = np.array([0, 7, 13, 25, 41], dtype=np.uint32)
+    s_ref, id_ref = idx.search_asymmetric_subset(vectors[0], ref, k=4)
+
+    s, ids = idx.search_asymmetric_subset(vectors[0], ref.astype(dtype), k=4)
+    np.testing.assert_array_equal(ids, id_ref)
+    np.testing.assert_array_equal(s, s_ref)
+
+
+def test_subset_candidate_natural_numpy_idioms_accepted():
+    # How users really build candidate sets: NumPy default signed ints (int64; arange is int32 on Windows NumPy<2).
+    vectors = unit_vectors(50, 128, seed=0)
+    idx = RankQuant(dim=128, bits=2)
+    idx.add(vectors)
+    for candidates in (
+        np.arange(20),
+        np.where(np.arange(50) % 5 == 0)[0],
+        np.argpartition(np.arange(50)[::-1], 15)[:15],
+    ):
+        assert np.issubdtype(candidates.dtype, np.signedinteger)  # confirm the trap dtype (not uint32)
+        scores, ids = idx.search_asymmetric_subset(vectors[0], candidates, k=3)
+        assert scores.shape == (3,) and ids.shape == (3,)
+
+
+def test_subset_noncontiguous_uint32_candidates_accepted():
+    # A strided uint32 view (non-contiguous) is copied through the checked
+    # path rather than rejected — the contiguous fast path is just an
+    # optimisation, not a requirement, for candidate ids.
+    vectors = unit_vectors(50, 128, seed=0)
+    idx = RankQuant(dim=128, bits=2)
+    idx.add(vectors)
+    strided = np.arange(0, 48, 2, dtype=np.uint32)[::3]
+    assert not strided.flags["C_CONTIGUOUS"]
+    scores, ids = idx.search_asymmetric_subset(vectors[0], strided, k=3)
+    assert scores.shape == (3,) and ids.shape == (3,)
+
+
+def test_subset_negative_candidate_raises_value_error():
+    # Fail-loud: a negative id must NOT silently wrap to a huge u32
+    # (np.array([-1]).astype(np.uint32) -> 4294967295). Reject with a clear ValueError.
+    vectors = unit_vectors(50, 128, seed=0)
+    idx = RankQuant(dim=128, bits=2)
+    idx.add(vectors)
+    candidates = np.array([0, -1, 5], dtype=np.int64)
+    with pytest.raises(ValueError, match="out of range for a u32"):
+        idx.search_asymmetric_subset(vectors[0], candidates, k=2)
+
+
+def test_subset_overflow_candidate_raises_value_error():
+    # Fail-loud: an id >= 2**32 must NOT silently wrap (2**32 + 5 -> 5) and
+    # score the wrong document. Reject with a clear ValueError.
+    vectors = unit_vectors(50, 128, seed=0)
+    idx = RankQuant(dim=128, bits=2)
+    idx.add(vectors)
+    candidates = np.array([0, 2**32 + 5], dtype=np.int64)
+    with pytest.raises(ValueError, match="out of range for a u32"):
+        idx.search_asymmetric_subset(vectors[0], candidates, k=2)
+
+
+def test_subset_out_of_range_int64_candidate_raises_index_error():
+    # The >= len(index) check applies regardless of input dtype.
+    vectors = unit_vectors(50, 128, seed=0)
+    idx = RankQuant(dim=128, bits=2)
+    idx.add(vectors)
+    candidates = np.array([0, 999], dtype=np.int64)
+    with pytest.raises(IndexError, match="out of range"):
+        idx.search_asymmetric_subset(vectors[0], candidates, k=2)
+
+
+def test_subset_float_candidates_raise_type_error():
+    # A non-integer dtype is a clear TypeError, not a silent truncation.
+    vectors = unit_vectors(50, 128, seed=0)
+    idx = RankQuant(dim=128, bits=2)
+    idx.add(vectors)
+    candidates = np.array([0.0, 1.0, 2.0], dtype=np.float32)
+    with pytest.raises(TypeError, match="integer"):
+        idx.search_asymmetric_subset(vectors[0], candidates, k=2)
+
+
+def test_body_overlap_doc_ids_int64_accepted():
+    # Bitmap.body_overlap_scores_subset shares the same coercion: int64
+    # (sorted) doc_ids are accepted; the ascending-order policy still holds.
+    vectors = unit_vectors(50, 128, seed=0)
+    bm = Bitmap(dim=128, n_top=32)
+    bm.add(vectors)
+    qb = bm.build_query_bitmap_fp32(vectors[0])
+    ids_sorted = np.array([2, 4, 8, 16, 32], dtype=np.int64)
+    out = bm.body_overlap_scores_subset(qb, ids_sorted)
+    assert out.shape == (5,)
+    with pytest.raises(ValueError, match="sorted"):
+        bm.body_overlap_scores_subset(qb, np.array([16, 2, 4], dtype=np.int64))
+
+
 # -------------------------------------------------------------------
 # Wrong array width (ncols/len != dim) -> ValueError, not silent
 # misalignment or a reshape panic. The core derives n = len/dim and only

@@ -155,10 +155,16 @@ def test_swap_remove_shrinks_length():
     assert len(idx) == 9
 
 
-def test_add_float64_is_rejected():
-    # pyo3 numpy binding is strict on dtype — float64 is not silently
-    # up/down-converted; the caller must convert.
-    idx = Rank(dim=64)
-    v64 = np.random.default_rng(0).standard_normal((4, 64)).astype(np.float64)
-    with pytest.raises(TypeError):
-        idx.add(v64)
+def test_add_float64_is_coerced():
+    # ordvec normalizes real-valued input to float32 at the boundary: float64
+    # (NumPy's default) is accepted and coerced, producing the same index as the
+    # explicitly-f32 array. Rank discards magnitude, so coercion is lossless here.
+    rng = np.random.default_rng(0)
+    v32 = rng.standard_normal((9, 64)).astype(np.float32)
+    a = Rank(dim=64)
+    a.add(v32)
+    b = Rank(dim=64)
+    b.add(v32.astype(np.float64))
+    assert len(a) == len(b) == 9
+    q = rng.standard_normal((3, 64)).astype(np.float32)
+    np.testing.assert_array_equal(a.search(q, k=5)[1], b.search(q, k=5)[1])
@@ -122,13 +122,21 @@ def test_load_rejects_nonexistent_file():
 
 
 @pytest.mark.parametrize("bits", [1, 2, 4])
-def test_add_float64_is_rejected(bits):
-    # pyo3 numpy binding is strict on dtype — float64 is not silently
-    # converted, the caller must convert.
-    idx = RankQuant(dim=64, bits=bits)
-    v64 = np.random.default_rng(0).standard_normal((4, 64)).astype(np.float64)
-    with pytest.raises(TypeError):
-        idx.add(v64)
+def test_add_float64_is_coerced(bits):
+    # float64 is accepted and coerced to float32 at the boundary. The asymmetric
+    # LUT keeps the query floats but scores against f32-quantised docs, so f64
+    # precision beyond f32 is meaningless — same results as an f32 index.
+    rng = np.random.default_rng(0)
+    v32 = rng.standard_normal((9, 64)).astype(np.float32)
+    a = RankQuant(dim=64, bits=bits)
+    a.add(v32)
+    b = RankQuant(dim=64, bits=bits)
+    b.add(v32.astype(np.float64))
+    assert len(a) == len(b) == 9
+    q = rng.standard_normal((3, 64)).astype(np.float32)
+    np.testing.assert_array_equal(
+        a.search_asymmetric(q, k=5)[1], b.search_asymmetric(q, k=5)[1]
+    )
 
 
 @pytest.mark.parametrize("bits", [1, 2, 4])