diff --git a/ordvec-python/src/lib.rs b/ordvec-python/src/lib.rs index 922f3a3c..a770cb85 100644 --- a/ordvec-python/src/lib.rs +++ b/ordvec-python/src/lib.rs @@ -31,7 +31,7 @@ //! usual contract for GIL-releasing numeric extensions (NumPy behaves the same //! way). -use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray1, PyReadonlyArray2}; +use numpy::{IntoPyArray, PyArray1, PyArray2, PyArrayMethods, PyReadonlyArray1, PyReadonlyArray2}; use pyo3::prelude::*; use pyo3::types::PyType; use pyo3::wrap_pyfunction; @@ -95,6 +95,260 @@ fn check_bits_max7(bits: u8) -> PyResult<()> { Ok(()) } +fn not_contiguous_err() -> PyErr { + pyo3::exceptions::PyValueError::new_err( + "array must be C-contiguous; call np.ascontiguousarray() first", + ) +} + +/// Candidate / doc-id slice obtained from a NumPy array, either borrowed +/// zero-copy (already `uint32` and contiguous) or owned (converted from another +/// integer dtype). The `Borrowed` variant keeps the `PyReadonlyArray` guard +/// alive so its slice stays valid across a GIL-released `py.detach` call. +enum CandidateIds<'py> { + Borrowed(PyReadonlyArray1<'py, u32>), + Owned(Vec), +} + +impl CandidateIds<'_> { + fn as_slice(&self) -> PyResult<&[u32]> { + match self { + CandidateIds::Borrowed(ro) => ro.as_slice().map_err(|_| not_contiguous_err()), + CandidateIds::Owned(v) => Ok(v), + } + } +} + +/// Coerce a NumPy candidate/doc-id array of *any* integer dtype to `u32`. +/// +/// The core takes `&[u32]` doc ids (the corpus is capped at `MAX_VECTORS = 2^26`, +/// well below `u32::MAX`), so the natural binding type is `PyReadonlyArray1`. +/// But rust-numpy matches that dtype *exactly*, while NumPy index arrays are +/// `int64` by default (`np.arange`, `np.where()[0]`, `np.array([...])`, fancy +/// indexing, `np.argpartition`). Requiring `uint32` made the most natural ways to +/// build a candidate set raise an opaque `TypeError`, even though ordvec's own +/// candidate generators (`top_m_candidates*`) already emit `uint32`. +/// +/// We accept any integer dtype and convert with **checked** bounds: a negative id +/// or one exceeding `u32::MAX` is a clean `ValueError`, never a silent wrap — note +/// `np.asarray(x, dtype=uint32)` would wrap `-1 -> 4294967295` and `2**32 -> 0`, +/// which would then score the wrong document. Already-`uint32` contiguous arrays +/// are borrowed zero-copy; every other dtype is copied once (candidate shortlists +/// are small relative to the scan; large-M FFI is tracked in issue #11). The +/// in-range (`< n`) check stays with the caller, which knows the corpus size. +fn as_u32_ids_1d<'py>(arr: &Bound<'py, PyAny>, what: &str) -> PyResult> { + // Fast path: already uint32 and C-contiguous -> borrow, zero-copy. + if let Ok(a) = arr.cast::>() { + let ro = a.readonly(); + if ro.as_slice().is_ok() { + return Ok(CandidateIds::Borrowed(ro)); + } + // Non-contiguous uint32 falls through to the copying path below. + } + + macro_rules! try_int_dtype { + ($t:ty) => { + if let Ok(a) = arr.cast::>() { + let ro = a.readonly(); + let view = ro.as_array(); + let mut out = Vec::with_capacity(view.len()); + for &x in view.iter() { + out.push(u32::try_from(x).map_err(|_| { + pyo3::exceptions::PyValueError::new_err(format!( + "{what} {x} is out of range for a u32 index \ + (must be in 0..=4294967295)" + )) + })?); + } + return Ok(CandidateIds::Owned(out)); + } + }; + } + // u32 first so non-contiguous uint32 (which fell through above) is handled + // before the wider/narrower dtypes; order is otherwise irrelevant since each + // downcast is an exact dtype match. + try_int_dtype!(u32); + try_int_dtype!(i64); + try_int_dtype!(u64); + try_int_dtype!(i32); + try_int_dtype!(i16); + try_int_dtype!(u16); + try_int_dtype!(i8); + try_int_dtype!(u8); + + let got = arr + .getattr("dtype") + .map(|d| d.to_string()) + .unwrap_or_else(|_| "a non-array object".to_owned()); + Err(pyo3::exceptions::PyTypeError::new_err(format!( + "{what} must be a 1-D integer NumPy array with values in [0, 2**32 - 1]; got {got} \ + (ordvec stores candidate ids as u32 — boolean and floating-point arrays are rejected)" + ))) +} + +/// Reject any id `>= n` (out of the corpus) as a typed `IndexError`. The core +/// hard-asserts ids are in range (an AVX-512 path issues a raw gather load), so an +/// out-of-range id would otherwise surface as a `PanicException` that leaks the +/// internal buffer geometry. +fn check_ids_in_range(ids: &[u32], n: usize, what: &str) -> PyResult<()> { + if let Some(&bad) = ids.iter().find(|&&di| (di as usize) >= n) { + return Err(pyo3::exceptions::PyIndexError::new_err(format!( + "{what} {bad} out of range (index holds {n} vectors)" + ))); + } + Ok(()) +} + +fn f32_dtype_error(arr: &Bound<'_, PyAny>) -> PyErr { + let got = arr + .getattr("dtype") + .map(|d| d.to_string()) + .unwrap_or_else(|_| "a non-array object".to_owned()); + pyo3::exceptions::PyTypeError::new_err(format!( + "expected a floating-point NumPy array (float16/float32/float64), got {got}; ordvec \ + rank/sign-transforms real vectors and converts them to float32 at the boundary — \ + boolean, integer, complex, object, and string arrays are rejected (a {{0, 1}} or \ + narrow-integer vector rank-transforms to a degenerate index artefact, not a meaningful \ + ordinal signal; call .astype(np.float32) to opt in deliberately)" + )) +} + +fn not_contiguous_f32_err() -> PyErr { + pyo3::exceptions::PyValueError::new_err( + "expected a C-contiguous NumPy array; got non-contiguous input. Use \ + np.ascontiguousarray(x, dtype=np.float32) if you intend to make a copy.", + ) +} + +/// Reject a non-`float32` input whose dtype isn't a float kind, or whose `ndim` +/// doesn't match. Error types mirror the strict-extraction contract: a bad dtype +/// or rank is a `TypeError`, ordered so the dtype message wins. Layout is checked +/// separately by [`require_c_contiguous`] *after* this (a `ValueError`). +fn gate_float_ndim(arr: &Bound<'_, PyAny>, ndim: usize) -> PyResult<()> { + let kind = arr + .getattr("dtype") + .and_then(|d| d.getattr("kind")) + .and_then(|k| k.extract::()); + if !matches!(kind, Ok('f')) { + return Err(f32_dtype_error(arr)); + } + let nd = arr.getattr("ndim").and_then(|n| n.extract::()); + if !matches!(nd, Ok(n) if n == ndim) { + return Err(pyo3::exceptions::PyTypeError::new_err(format!( + "expected a {ndim}-D float array" + ))); + } + Ok(()) +} + +/// Reject a non-`C`-contiguous original array *before* any dtype coercion, so a +/// transposed/strided float64 can't be silently laundered into a contiguous +/// float32 (that hidden copy can dominate runtime / poison benchmarks — the copy +/// decision stays with the caller). +fn require_c_contiguous(arr: &Bound<'_, PyAny>) -> PyResult<()> { + let contiguous = arr + .getattr("flags") + .and_then(|f| f.getattr("c_contiguous")) + .and_then(|c| c.extract::()) + .unwrap_or(false); + if contiguous { + Ok(()) + } else { + Err(not_contiguous_f32_err()) + } +} + +/// Length of `arr`'s axis `axis` from its `shape` tuple, read as cheap metadata so +/// width can be validated *before* any coercion copy — rejecting a wrong-shaped +/// large float64 array must not first allocate its float32 twin. +fn axis_len(arr: &Bound<'_, PyAny>, axis: usize) -> PyResult { + arr.getattr("shape")?.get_item(axis)?.extract::() +} + +/// Present an embedding vector as a 1-D `float32` `PyReadonlyArray`, converting at +/// the boundary. The premise of ordvec is *float vector in → rank/sign transform*, +/// so float32 is the internal working dtype, not a contract the caller must +/// pre-satisfy: `float64` (the default for `np.array([...])` and most API +/// embeddings) and `float16` are coerced. The transforms that consume the floats +/// are order-only (rank transform, top-bucket bitmap) or sign-only, and `f64→f32` +/// rounding is *monotonic* — it can never reorder two coordinates, only collapse a +/// near-tie at the f32 floor, strictly less perturbation than the rank/bucket +/// quantisation already applied. The asymmetric-query LUT keeps the floats but +/// scores against f32-quantised docs, so sub-`f32` query precision is meaningless +/// there too. +/// +/// Rejected (matching exception type): non-float dtype — bool / integer / complex / +/// object / string — and wrong `ndim` (`TypeError`); a width that doesn't match the +/// index dimension, or a non-`C`-contiguous original (`ValueError`) — both checked +/// on the original *before* coercion, so a wrong-shaped large array is never copied +/// just to be rejected. Bool and narrow integers are +/// *deliberately* rejected: a `{0, 1}` or few-valued vector rank-transforms to an +/// index-tie artefact, i.e. silent retrieval garbage. The all-finite check runs on +/// the post-coercion f32 (an `f64 > f32::MAX` rounds to `+inf` — caught here, not +/// silently indexed). Already-`float32` contiguous arrays are borrowed zero-copy. +fn as_f32_1d<'py>( + arr: &Bound<'py, PyAny>, + expected_len: Option, +) -> PyResult> { + let ro = if let Ok(a) = arr.cast::>() { + let ro = a.readonly(); + if let Some(dim) = expected_len { + check_width(ro.as_array().len(), dim)?; + } + ro + } else { + gate_float_ndim(arr, 1)?; + if let Some(dim) = expected_len { + check_width(axis_len(arr, 0)?, dim)?; + } + require_c_contiguous(arr)?; + arr.py() + .import("numpy")? + .getattr("ascontiguousarray")? + .call1((arr, "float32"))? + .cast::>() + .map(|a| a.readonly()) + .map_err(|_| pyo3::exceptions::PyTypeError::new_err("expected a 1-D float array"))? + }; + ensure_finite( + ro.as_array() + .as_slice() + .ok_or_else(not_contiguous_f32_err)?, + )?; + Ok(ro) +} + +/// 2-D `(n, dim)` counterpart of [`as_f32_1d`] for the `add` / batched-query paths. +/// Same contract; see [`as_f32_1d`] for the full rationale. +fn as_f32_2d<'py>(arr: &Bound<'py, PyAny>, dim: usize) -> PyResult> { + let ro = if let Ok(a) = arr.cast::>() { + let ro = a.readonly(); + check_width(ro.as_array().ncols(), dim)?; + ro + } else { + gate_float_ndim(arr, 2)?; + check_width(axis_len(arr, 1)?, dim)?; + require_c_contiguous(arr)?; + arr.py() + .import("numpy")? + .getattr("ascontiguousarray")? + .call1((arr, "float32"))? + .cast::>() + .map(|a| a.readonly()) + .map_err(|_| { + pyo3::exceptions::PyTypeError::new_err( + "expected a 2-D float array of shape (n, dim)", + ) + })? + }; + ensure_finite( + ro.as_array() + .as_slice() + .ok_or_else(not_contiguous_f32_err)?, + )?; + Ok(ro) +} + // ===================================================================== // Rank-mode retrieval bindings: Rank, RankQuant, Bitmap, SignBitmap. // @@ -128,15 +382,14 @@ impl Rank { format!("Rank(dim={}, n={})", self.inner.dim(), self.inner.len()) } - fn add(&mut self, py: Python<'_>, vectors: PyReadonlyArray2) -> PyResult<()> { + fn add<'py>(&mut self, py: Python<'py>, vectors: &Bound<'py, PyAny>) -> PyResult<()> { + let vectors = as_f32_2d(vectors, self.inner.dim())?; let arr = vectors.as_array(); - check_width(arr.ncols(), self.inner.dim())?; let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; // Release the GIL around the parallel rank-transform / pack so other // Python threads run during a bulk add. `slice` (`&[f32]`) and // `&mut self.inner` are both `Ungil`, so no pointer juggling is needed. @@ -156,18 +409,17 @@ impl Rank { fn search<'py>( &self, py: Python<'py>, - queries: PyReadonlyArray2, + queries: &Bound<'py, PyAny>, k: usize, ) -> PyResult> { + let queries = as_f32_2d(queries, self.inner.dim())?; let arr = queries.as_array(); - check_width(arr.ncols(), self.inner.dim())?; let nq = arr.nrows(); let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; let results = py.detach(|| self.inner.search(slice, k)); let scores = numpy::ndarray::Array2::from_shape_vec((nq, results.k), results.scores) .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))? @@ -183,18 +435,17 @@ impl Rank { fn search_asymmetric<'py>( &self, py: Python<'py>, - queries: PyReadonlyArray2, + queries: &Bound<'py, PyAny>, k: usize, ) -> PyResult> { + let queries = as_f32_2d(queries, self.inner.dim())?; let arr = queries.as_array(); - check_width(arr.ncols(), self.inner.dim())?; let nq = arr.nrows(); let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; let results = py.detach(|| self.inner.search_asymmetric(slice, k)); let scores = numpy::ndarray::Array2::from_shape_vec((nq, results.k), results.scores) .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))? @@ -311,15 +562,14 @@ impl RankQuant { ) } - fn add(&mut self, py: Python<'_>, vectors: PyReadonlyArray2) -> PyResult<()> { + fn add<'py>(&mut self, py: Python<'py>, vectors: &Bound<'py, PyAny>) -> PyResult<()> { + let vectors = as_f32_2d(vectors, self.inner.dim())?; let arr = vectors.as_array(); - check_width(arr.ncols(), self.inner.dim())?; let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; // Release the GIL around the parallel rank-transform / pack so other // Python threads run during a bulk add. `slice` (`&[f32]`) and // `&mut self.inner` are both `Ungil`, so no pointer juggling is needed. @@ -337,18 +587,17 @@ impl RankQuant { fn search<'py>( &self, py: Python<'py>, - queries: PyReadonlyArray2, + queries: &Bound<'py, PyAny>, k: usize, ) -> PyResult> { + let queries = as_f32_2d(queries, self.inner.dim())?; let arr = queries.as_array(); - check_width(arr.ncols(), self.inner.dim())?; let nq = arr.nrows(); let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; let results = py.detach(|| self.inner.search(slice, k)); let scores = numpy::ndarray::Array2::from_shape_vec((nq, results.k), results.scores) .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))? @@ -363,18 +612,17 @@ impl RankQuant { fn search_asymmetric<'py>( &self, py: Python<'py>, - queries: PyReadonlyArray2, + queries: &Bound<'py, PyAny>, k: usize, ) -> PyResult> { + let queries = as_f32_2d(queries, self.inner.dim())?; let arr = queries.as_array(); - check_width(arr.ncols(), self.inner.dim())?; let nq = arr.nrows(); let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; let results = py.detach(|| self.inner.search_asymmetric(slice, k)); let scores = numpy::ndarray::Array2::from_shape_vec((nq, results.k), results.scores) .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))? @@ -424,38 +672,32 @@ impl RankQuant { /// indices (mapped from the local candidate slot); slots that could not be /// filled are returned as ``-1``. Uses the same AVX-512 → AVX2 → scalar /// dispatch as ``search_asymmetric``. + /// + /// ``candidates`` may be a 1-D array of any integer dtype — the ``uint32`` + /// emitted by ``top_m_candidates``/``top_m_candidates_batched`` or a plain + /// ``int64`` index array (``np.arange``, ``np.where(...)[0]``, fancy-index + /// results). Ids are converted to ``uint32``; a negative id, one ``>= 2**32``, + /// or one ``>= len(self)`` raises a ``ValueError``/``IndexError``. fn search_asymmetric_subset<'py>( &self, py: Python<'py>, - query: PyReadonlyArray1, - candidates: PyReadonlyArray1, + query: &Bound<'py, PyAny>, + candidates: &Bound<'py, PyAny>, k: usize, ) -> PyResult> { + let query = as_f32_1d(query, Some(self.inner.dim()))?; let q = query.as_array(); - check_width(q.len(), self.inner.dim())?; let q_slice = q.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(q_slice)?; - let c = candidates.as_array(); - let c_slice = c.as_slice().ok_or_else(|| { - pyo3::exceptions::PyValueError::new_err( - "array must be C-contiguous; call np.ascontiguousarray() first", - ) - })?; - // Validate every candidate id against the index size *before* calling the - // core. The core gathers `self.packed[di * bpv ..]` for each id and only - // `assert!`s the bound, so an out-of-range id would panic inside Rust and - // surface across pyo3 as a `PanicException` that leaks the internal buffer - // geometry. Reject it here as a typed `IndexError` instead. - let n = self.inner.len(); - if let Some(&bad) = c_slice.iter().find(|&&di| (di as usize) >= n) { - return Err(pyo3::exceptions::PyIndexError::new_err(format!( - "candidate id {bad} out of range (index holds {n} vectors)" - ))); - } + // Accept candidate ids of any integer dtype (NumPy index arrays are int64 + // by default) and convert to the core's u32 with checked bounds, then + // reject any id outside the corpus before dispatch. + let cands = as_u32_ids_1d(candidates, "candidate id")?; + let c_slice = cands.as_slice()?; + check_ids_in_range(c_slice, self.inner.len(), "candidate id")?; let (scores, ids) = py.detach(|| self.inner.search_asymmetric_subset(q_slice, c_slice, k)); Ok((scores.into_pyarray(py), ids.into_pyarray(py))) } @@ -535,15 +777,14 @@ impl Bitmap { ) } - fn add(&mut self, py: Python<'_>, vectors: PyReadonlyArray2) -> PyResult<()> { + fn add<'py>(&mut self, py: Python<'py>, vectors: &Bound<'py, PyAny>) -> PyResult<()> { + let vectors = as_f32_2d(vectors, self.inner.dim())?; let arr = vectors.as_array(); - check_width(arr.ncols(), self.inner.dim())?; let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; // Release the GIL around the parallel rank-transform / pack so other // Python threads run during a bulk add. `slice` (`&[f32]`) and // `&mut self.inner` are both `Ungil`, so no pointer juggling is needed. @@ -563,18 +804,17 @@ impl Bitmap { fn search<'py>( &self, py: Python<'py>, - queries: PyReadonlyArray2, + queries: &Bound<'py, PyAny>, k: usize, ) -> PyResult> { + let queries = as_f32_2d(queries, self.inner.dim())?; let arr = queries.as_array(); - check_width(arr.ncols(), self.inner.dim())?; let nq = arr.nrows(); let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; let results = py.detach(|| self.inner.search(slice, k)); let scores = numpy::ndarray::Array2::from_shape_vec((nq, results.k), results.scores) .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))? @@ -591,17 +831,16 @@ impl Bitmap { fn top_m_candidates<'py>( &self, py: Python<'py>, - query: PyReadonlyArray1, + query: &Bound<'py, PyAny>, m: usize, ) -> PyResult>> { + let query = as_f32_1d(query, Some(self.inner.dim()))?; let arr = query.as_array(); - check_width(arr.len(), self.inner.dim())?; let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; let cands = py.detach(|| self.inner.top_m_candidates(slice, m)); Ok(cands.into_pyarray(py)) } @@ -612,16 +851,15 @@ impl Bitmap { fn build_query_bitmap_fp32<'py>( &self, py: Python<'py>, - query: PyReadonlyArray1, + query: &Bound<'py, PyAny>, ) -> PyResult>> { + let query = as_f32_1d(query, Some(self.inner.dim()))?; let arr = query.as_array(); - check_width(arr.len(), self.inner.dim())?; let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; Ok(self.inner.build_query_bitmap_fp32(slice).into_pyarray(py)) } @@ -633,18 +871,17 @@ impl Bitmap { fn top_m_candidates_batched<'py>( &self, py: Python<'py>, - queries: PyReadonlyArray2, + queries: &Bound<'py, PyAny>, m: usize, ) -> PyResult>> { + let queries = as_f32_2d(queries, self.inner.dim())?; let arr = queries.as_array(); - check_width(arr.ncols(), self.inner.dim())?; let batch = arr.nrows(); let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; // Guard the core's internal `batch * n` (scores) and `batch * qpv` // (query bitmaps) allocations BEFORE the call: an overflow there wraps // and then indexes out of bounds (a panic), so convert it to a clean @@ -678,7 +915,7 @@ impl Bitmap { fn top_m_candidates_batched_chunked<'py>( &self, py: Python<'py>, - queries: PyReadonlyArray2, + queries: &Bound<'py, PyAny>, m: usize, batch_size: usize, ) -> PyResult>> { @@ -687,15 +924,14 @@ impl Bitmap { "batch_size must be > 0", )); } + let queries = as_f32_2d(queries, self.inner.dim())?; let arr = queries.as_array(); - check_width(arr.ncols(), self.inner.dim())?; let n_queries = arr.nrows(); let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; // Clamp batch_size to the query count so a very large value can't // overflow `batch_size * dim` inside the core (which fails loud with an // overflow panic). A batch larger than the workload is just one chunk, @@ -735,8 +971,8 @@ impl Bitmap { /// Compute bitmap-overlap scores for a subset of doc IDs against a pre-built /// query bitmap. `q_bitmap` is a 1-D `uint64` array of `dim / 64` words /// (e.g. from [`Bitmap.build_query_bitmap_fp32`]); `doc_ids` is a 1-D - /// `uint32` array that must be in range. Returns a 1-D `uint32` array of - /// overlap scores aligned to `doc_ids`. + /// integer array of any dtype (converted to `uint32`) whose ids must be in + /// range. Returns a 1-D `uint32` array of overlap scores aligned to `doc_ids`. /// /// `doc_ids` must additionally be sorted ascending. This is a *Python-side /// ergonomic policy*, not a core requirement: the Rust core accepts unsorted @@ -748,7 +984,7 @@ impl Bitmap { &self, py: Python<'py>, q_bitmap: PyReadonlyArray1, - doc_ids: PyReadonlyArray1, + doc_ids: &Bound<'py, PyAny>, ) -> PyResult>> { let qb = q_bitmap.as_array(); let qb_slice = qb.as_slice().ok_or_else(|| { @@ -763,21 +999,12 @@ impl Bitmap { qb_slice.len() ))); } - let ids = doc_ids.as_array(); - let ids_slice = ids.as_slice().ok_or_else(|| { - pyo3::exceptions::PyValueError::new_err( - "array must be C-contiguous; call np.ascontiguousarray() first", - ) - })?; - // Bound-check every id before dispatch: the core hard-asserts ids are in - // range (the AVX-512 path issues a raw load), so an OOB id would surface - // as a PanicException. Reject it as a typed IndexError instead. - let n = self.inner.len(); - if let Some(&bad) = ids_slice.iter().find(|&&di| (di as usize) >= n) { - return Err(pyo3::exceptions::PyIndexError::new_err(format!( - "doc id {bad} out of range (index holds {n} vectors)" - ))); - } + // Accept doc ids of any integer dtype (NumPy index arrays are int64 by + // default) and convert to u32 with checked bounds, then reject any id + // outside the corpus before dispatch. + let doc_ids = as_u32_ids_1d(doc_ids, "doc id")?; + let ids_slice = doc_ids.as_slice()?; + check_ids_in_range(ids_slice, self.inner.len(), "doc id")?; // Python-side ergonomic policy (NOT a core correctness requirement): // the Rust core scores unsorted ids correctly in input order, just with // worse cache locality. The binding requires the sorted, cache-friendly @@ -883,15 +1110,14 @@ impl SignBitmap { ) } - fn add(&mut self, py: Python<'_>, vectors: PyReadonlyArray2) -> PyResult<()> { + fn add<'py>(&mut self, py: Python<'py>, vectors: &Bound<'py, PyAny>) -> PyResult<()> { + let vectors = as_f32_2d(vectors, self.inner.dim())?; let arr = vectors.as_array(); - check_width(arr.ncols(), self.inner.dim())?; let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; // Release the GIL around the parallel rank-transform / pack so other // Python threads run during a bulk add. `slice` (`&[f32]`) and // `&mut self.inner` are both `Ungil`, so no pointer juggling is needed. @@ -912,17 +1138,16 @@ impl SignBitmap { fn top_m_candidates<'py>( &self, py: Python<'py>, - query: PyReadonlyArray1, + query: &Bound<'py, PyAny>, m: usize, ) -> PyResult>> { + let query = as_f32_1d(query, Some(self.inner.dim()))?; let arr = query.as_array(); - check_width(arr.len(), self.inner.dim())?; let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; let cands = py.detach(|| self.inner.top_m_candidates(slice, m)); Ok(cands.into_pyarray(py)) } @@ -936,18 +1161,17 @@ impl SignBitmap { fn top_m_candidates_batched<'py>( &self, py: Python<'py>, - queries: PyReadonlyArray2, + queries: &Bound<'py, PyAny>, m: usize, ) -> PyResult>> { + let queries = as_f32_2d(queries, self.inner.dim())?; let arr = queries.as_array(); - check_width(arr.ncols(), self.inner.dim())?; let batch = arr.nrows(); let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; // Guard the core's internal `batch * n` (scores) and `batch * qpv` // (query bitmaps) allocations BEFORE the call: an overflow there wraps // and then indexes out of bounds (a panic), so convert it to a clean @@ -980,16 +1204,15 @@ impl SignBitmap { fn build_query_bitmap<'py>( &self, py: Python<'py>, - query: PyReadonlyArray1, + query: &Bound<'py, PyAny>, ) -> PyResult>> { + let query = as_f32_1d(query, Some(self.inner.dim()))?; let arr = query.as_array(); - check_width(arr.len(), self.inner.dim())?; let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; Ok(self.inner.build_query_bitmap(slice).into_pyarray(py)) } @@ -1058,8 +1281,9 @@ impl SignBitmap { #[pyfunction] fn rank_transform<'py>( py: Python<'py>, - v: PyReadonlyArray1, + v: &Bound<'py, PyAny>, ) -> PyResult>> { + let v = as_f32_1d(v, None)?; let arr = v.as_array(); let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( @@ -1247,7 +1471,7 @@ fn rankquant_norm(d: usize, bits: u8) -> PyResult { fn search_asymmetric_byte_lut<'py>( py: Python<'py>, index: PyRef<'_, RankQuant>, - queries: PyReadonlyArray2, + queries: &Bound<'py, PyAny>, k: usize, ) -> PyResult> { if index.inner.bits() == 1 { @@ -1255,15 +1479,14 @@ fn search_asymmetric_byte_lut<'py>( "search_asymmetric_byte_lut is a benchmark-only helper and does not support bits=1; use RankQuant.search_asymmetric instead", )); } + let queries = as_f32_2d(queries, index.inner.dim())?; let arr = queries.as_array(); - check_width(arr.ncols(), index.inner.dim())?; let nq = arr.nrows(); let slice = arr.as_slice().ok_or_else(|| { pyo3::exceptions::PyValueError::new_err( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - ensure_finite(slice)?; // Deref the GIL-bound `PyRef` to a plain `&RankQuant` *before* the closure: // capturing `index` (a `PyRef`) directly would make the closure non-`Ungil`, // but a bare `&ordvec_core::RankQuant` is fine to carry across `detach`. diff --git a/ordvec-python/tests/test_bitmap.py b/ordvec-python/tests/test_bitmap.py index 10502136..8cf95a38 100644 --- a/ordvec-python/tests/test_bitmap.py +++ b/ordvec-python/tests/test_bitmap.py @@ -159,11 +159,17 @@ def test_top_m_candidates_deterministic_across_repeated_calls(): assert set(runs[0]) == {int(i) for i in indices[0].tolist()} -def test_add_float64_is_rejected(): - idx = Bitmap(dim=64, n_top=8) - v64 = np.random.default_rng(0).standard_normal((4, 64)).astype(np.float64) - with pytest.raises(TypeError): - idx.add(v64) +def test_add_float64_is_coerced(): + # float64 accepted and coerced to float32 at the boundary; same index as f32. + rng = np.random.default_rng(0) + v32 = rng.standard_normal((20, 64)).astype(np.float32) + a = Bitmap(dim=64, n_top=8) + a.add(v32) + b = Bitmap(dim=64, n_top=8) + b.add(v32.astype(np.float64)) + assert len(a) == len(b) == 20 + q = rng.standard_normal((3, 64)).astype(np.float32) + np.testing.assert_array_equal(a.search(q, k=5)[1], b.search(q, k=5)[1]) def test_dim_above_u16_max_rejected(): diff --git a/ordvec-python/tests/test_input_dtype.py b/ordvec-python/tests/test_input_dtype.py new file mode 100644 index 00000000..a42e57c0 --- /dev/null +++ b/ordvec-python/tests/test_input_dtype.py @@ -0,0 +1,189 @@ +"""Embedding-input dtype/layout boundary contract (``as_f32_1d`` / ``as_f32_2d``). + +ordvec normalises real-valued vector input to float32 at the FFI boundary — the +premise is *float vector in -> rank/sign transform*, so float32 is the internal +working dtype, not a contract the caller must pre-satisfy. The policy is uniform +across the four index types because every embedding entry point routes through +the same two choke-point helpers. + + Accepted: float16 / float32 / float64, C-contiguous, finite after coercion + Rejected: bool, integers, complex, object, string -> TypeError + wrong ndim (scalar / 3-D) -> TypeError + non-contiguous (transpose / stride) -> ValueError + (never silently copied — the copy decision stays with the caller) + non-finite after coercion (NaN / inf / f64 > f32::MAX) -> ValueError + +Why bool/int are rejected rather than coerced: a ``{0.0, 1.0}`` or narrow-integer +vector rank-transforms to an index-tie artefact (silent retrieval garbage), so +those are a deliberate usage-error guard, not an ergonomic gap. Candidate *IDs* +are a different boundary (labels, not measurements) and DO accept int64 — see +test_input_guards.py. +""" +from __future__ import annotations + +import numpy as np +import pytest + +from ordvec import Bitmap, Rank, RankQuant, SignBitmap + +INDEX_CLASSES = [Rank, RankQuant, Bitmap, SignBitmap] + + +def _make(index_cls): + if index_cls is RankQuant: + return RankQuant(dim=64, bits=2) + if index_cls is Bitmap: + return Bitmap(dim=64, n_top=16) + return index_cls(dim=64) + + +def f32(n, dim=64, seed=0): + return np.random.default_rng(seed).standard_normal((n, dim)).astype(np.float32) + + +# ------------------------------------------------------------------- +# Accepted: float dtypes, coerced to f32. +# ------------------------------------------------------------------- + + +@pytest.mark.parametrize("index_cls", INDEX_CLASSES) +@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) +def test_float_dtypes_accepted(index_cls, dtype): + idx = _make(index_cls) + idx.add(np.ascontiguousarray(f32(8).astype(dtype))) + assert len(idx) == 8 + + +@pytest.mark.parametrize("index_cls", INDEX_CLASSES) +def test_float64_coercion_is_faithful(index_cls): + # f64 and f32 builds of the same values yield the same index — the rank/sign + # transform is order/sign-only, and f64->f32 rounding is monotonic. + v32 = f32(12) + a = _make(index_cls) + a.add(v32) + b = _make(index_cls) + b.add(v32.astype(np.float64)) + assert len(a) == len(b) == 12 + + +# ------------------------------------------------------------------- +# Rejected dtypes -> TypeError (bool/int/complex/object/string). +# ------------------------------------------------------------------- + + +@pytest.mark.parametrize("index_cls", INDEX_CLASSES) +@pytest.mark.parametrize( + "dtype", + [np.int8, np.int32, np.int64, np.uint8, np.uint32, np.uint64, bool, np.complex64, np.complex128, object], +) +def test_nonfloat_dtypes_rejected(index_cls, dtype): + with pytest.raises(TypeError): + _make(index_cls).add(np.ones((8, 64), dtype=dtype)) + + +@pytest.mark.parametrize("index_cls", INDEX_CLASSES) +def test_string_dtype_rejected(index_cls): + with pytest.raises(TypeError): + _make(index_cls).add(np.full((8, 64), "x")) + + +# ------------------------------------------------------------------- +# Rejected ndim -> TypeError (scalar / 3-D). +# ------------------------------------------------------------------- + + +@pytest.mark.parametrize("index_cls", INDEX_CLASSES) +def test_scalar_rejected(index_cls): + with pytest.raises(TypeError): + _make(index_cls).add(np.float32(1.0)) + + +@pytest.mark.parametrize("index_cls", INDEX_CLASSES) +def test_3d_rejected(index_cls): + with pytest.raises(TypeError): + _make(index_cls).add(np.zeros((2, 3, 64), dtype=np.float32)) + + +@pytest.mark.parametrize("index_cls", INDEX_CLASSES) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_wrong_width_rejected_before_coercion(index_cls, dtype): + # Width is a cheap metadata check validated on the ORIGINAL array *before* any + # dtype coercion: a wrong-width float64 is rejected with the same ValueError as + # float32, without first allocating its float32 twin (a wrong-shaped large + # array must not be copied just to be rejected). + bad = np.ones((4, 128), dtype=dtype) # width 128 != dim 64 + with pytest.raises(ValueError, match="dimension"): + _make(index_cls).add(bad) + + +# ------------------------------------------------------------------- +# Rejected layout -> ValueError, checked BEFORE coercion so a float64 transpose +# is never silently laundered into a contiguous float32 (hidden copy). +# ------------------------------------------------------------------- + + +@pytest.mark.parametrize("index_cls", INDEX_CLASSES) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_transpose_rejected_not_silently_copied(index_cls, dtype): + v = np.asfortranarray(f32(8).astype(dtype)) # (8, 64) F-order -> non-C-contiguous + assert not v.flags["C_CONTIGUOUS"] + with pytest.raises(ValueError, match="C-contiguous"): + _make(index_cls).add(v) + + +# ------------------------------------------------------------------- +# Rejected values -> ValueError, finite check AFTER coercion. +# ------------------------------------------------------------------- + + +@pytest.mark.parametrize("index_cls", INDEX_CLASSES) +@pytest.mark.parametrize("bad", [np.nan, np.inf, -np.inf]) +def test_nonfinite_rejected(index_cls, bad): + v = f32(8) + v[2, 5] = bad + with pytest.raises(ValueError, match="finite"): + _make(index_cls).add(v) + + +@pytest.mark.filterwarnings("ignore:overflow encountered in cast") +@pytest.mark.parametrize("index_cls", INDEX_CLASSES) +def test_float64_overflow_to_inf_rejected(index_cls): + # 1e300 is finite in float64 but rounds to +inf in float32 (NumPy's cast emits + # a RuntimeWarning, ignored here) — the finite check runs on the POST-coercion + # f32, so this is caught, not silently indexed. + v = f32(8).astype(np.float64) + v[0, 0] = 1e300 + with pytest.raises(ValueError, match="finite"): + _make(index_cls).add(v) + + +# ------------------------------------------------------------------- +# The 1-D query path (as_f32_1d) shares the contract. +# ------------------------------------------------------------------- + + +def test_query_float64_accepted_and_faithful(): + corpus = f32(30) + a = Bitmap(dim=64, n_top=16) + a.add(corpus) + q32 = f32(1, seed=7)[0] + np.testing.assert_array_equal( + a.top_m_candidates(q32, m=10), + a.top_m_candidates(q32.astype(np.float64), m=10), + ) + + +def test_query_bool_rejected(): + a = SignBitmap(dim=64) + a.add(f32(10)) + with pytest.raises(TypeError): + a.top_m_candidates(np.ones(64, dtype=bool), m=5) + + +def test_query_noncontiguous_rejected(): + a = Bitmap(dim=64, n_top=16) + a.add(f32(10)) + strided = np.ascontiguousarray(f32(1, dim=128)[0])[::2] # len 64, non-contiguous + assert not strided.flags["C_CONTIGUOUS"] + with pytest.raises(ValueError, match="C-contiguous"): + a.top_m_candidates(strided, m=5) diff --git a/ordvec-python/tests/test_input_guards.py b/ordvec-python/tests/test_input_guards.py index 1a6a5bba..b48b535a 100644 --- a/ordvec-python/tests/test_input_guards.py +++ b/ordvec-python/tests/test_input_guards.py @@ -226,6 +226,128 @@ def test_subset_in_range_candidates_still_work(): assert int(ids[0]) == 0 # self-query → self ranks first +# ------------------------------------------------------------------- +# Candidate / doc-id dtype acceptance. The core takes u32 ids, but NumPy +# index arrays are int64 by default (np.arange, np.where()[0], fancy +# indexing, np.argpartition). The binding accepts any integer dtype and +# converts to u32 with checked bounds rather than rejecting non-uint32 +# with an opaque "ndarray cannot be cast as ndarray" TypeError. +# ------------------------------------------------------------------- + + +# Every integer dtype a candidate set might realistically arrive in. +INT_DTYPES = [ + np.uint32, # ordvec's own top_m_candidates output (zero-copy fast path) + np.int64, # NumPy default — np.arange / np.array([...]) / np.where()[0] + np.int32, + np.uint64, + np.int16, + np.uint16, + np.int8, + np.uint8, +] + + +@pytest.mark.parametrize("dtype", INT_DTYPES) +def test_subset_candidate_dtype_accepted_and_equivalent(dtype): + # Any integer dtype is accepted and yields results identical to the + # uint32 reference. (Friend's report: int64/int32/uint64 used to raise + # TypeError.) ids stay small enough for int8 (max 127). + vectors = unit_vectors(50, 128, seed=0) + idx = RankQuant(dim=128, bits=2) + idx.add(vectors) + ref = np.array([0, 7, 13, 25, 41], dtype=np.uint32) + s_ref, id_ref = idx.search_asymmetric_subset(vectors[0], ref, k=4) + + s, ids = idx.search_asymmetric_subset(vectors[0], ref.astype(dtype), k=4) + np.testing.assert_array_equal(ids, id_ref) + np.testing.assert_array_equal(s, s_ref) + + +def test_subset_candidate_natural_numpy_idioms_accepted(): + # The ways a user actually builds a candidate set — all int64. + vectors = unit_vectors(50, 128, seed=0) + idx = RankQuant(dim=128, bits=2) + idx.add(vectors) + for candidates in ( + np.arange(20), + np.where(np.arange(50) % 5 == 0)[0], + np.argpartition(np.arange(50)[::-1], 15)[:15], + ): + assert candidates.dtype == np.int64 # confirm the trap dtype + scores, ids = idx.search_asymmetric_subset(vectors[0], candidates, k=3) + assert scores.shape == (3,) and ids.shape == (3,) + + +def test_subset_noncontiguous_uint32_candidates_accepted(): + # A strided uint32 view (non-contiguous) is copied through the checked + # path rather than rejected — the contiguous fast path is just an + # optimisation, not a requirement, for candidate ids. + vectors = unit_vectors(50, 128, seed=0) + idx = RankQuant(dim=128, bits=2) + idx.add(vectors) + strided = np.arange(0, 48, 2, dtype=np.uint32)[::3] + assert not strided.flags["C_CONTIGUOUS"] + scores, ids = idx.search_asymmetric_subset(vectors[0], strided, k=3) + assert scores.shape == (3,) and ids.shape == (3,) + + +def test_subset_negative_candidate_raises_value_error(): + # Fail-loud: a negative id must NOT silently wrap to a huge u32 + # (np.asarray(-1, uint32) -> 4294967295). Reject with a clear ValueError. + vectors = unit_vectors(50, 128, seed=0) + idx = RankQuant(dim=128, bits=2) + idx.add(vectors) + candidates = np.array([0, -1, 5], dtype=np.int64) + with pytest.raises(ValueError, match="out of range for a u32"): + idx.search_asymmetric_subset(vectors[0], candidates, k=2) + + +def test_subset_overflow_candidate_raises_value_error(): + # Fail-loud: an id >= 2**32 must NOT silently wrap (2**32 + 5 -> 5) and + # score the wrong document. Reject with a clear ValueError. + vectors = unit_vectors(50, 128, seed=0) + idx = RankQuant(dim=128, bits=2) + idx.add(vectors) + candidates = np.array([0, 2**32 + 5], dtype=np.int64) + with pytest.raises(ValueError, match="out of range for a u32"): + idx.search_asymmetric_subset(vectors[0], candidates, k=2) + + +def test_subset_out_of_range_int64_candidate_raises_index_error(): + # The >= len(index) check applies regardless of input dtype. + vectors = unit_vectors(50, 128, seed=0) + idx = RankQuant(dim=128, bits=2) + idx.add(vectors) + candidates = np.array([0, 999], dtype=np.int64) + with pytest.raises(IndexError, match="out of range"): + idx.search_asymmetric_subset(vectors[0], candidates, k=2) + + +def test_subset_float_candidates_raise_type_error(): + # A non-integer dtype is a clear TypeError, not a silent truncation. + vectors = unit_vectors(50, 128, seed=0) + idx = RankQuant(dim=128, bits=2) + idx.add(vectors) + candidates = np.array([0.0, 1.0, 2.0], dtype=np.float32) + with pytest.raises(TypeError, match="integer"): + idx.search_asymmetric_subset(vectors[0], candidates, k=2) + + +def test_body_overlap_doc_ids_int64_accepted(): + # Bitmap.body_overlap_scores_subset shares the same coercion: int64 + # (sorted) doc_ids are accepted; the ascending-order policy still holds. + vectors = unit_vectors(50, 128, seed=0) + bm = Bitmap(dim=128, n_top=32) + bm.add(vectors) + qb = bm.build_query_bitmap_fp32(vectors[0]) + ids_sorted = np.array([2, 4, 8, 16, 32], dtype=np.int64) + out = bm.body_overlap_scores_subset(qb, ids_sorted) + assert out.shape == (5,) + with pytest.raises(ValueError, match="sorted"): + bm.body_overlap_scores_subset(qb, np.array([16, 2, 4], dtype=np.int64)) + + # ------------------------------------------------------------------- # Wrong array width (ncols/len != dim) -> ValueError, not silent # misalignment or a reshape panic. The core derives n = len/dim and only diff --git a/ordvec-python/tests/test_rank.py b/ordvec-python/tests/test_rank.py index a61b24ba..0be221e3 100644 --- a/ordvec-python/tests/test_rank.py +++ b/ordvec-python/tests/test_rank.py @@ -155,10 +155,16 @@ def test_swap_remove_shrinks_length(): assert len(idx) == 9 -def test_add_float64_is_rejected(): - # pyo3 numpy binding is strict on dtype — float64 is not silently - # up/down-converted; the caller must convert. - idx = Rank(dim=64) - v64 = np.random.default_rng(0).standard_normal((4, 64)).astype(np.float64) - with pytest.raises(TypeError): - idx.add(v64) +def test_add_float64_is_coerced(): + # ordvec normalizes real-valued input to float32 at the boundary: float64 + # (NumPy's default) is accepted and coerced, producing the same index as the + # explicitly-f32 array. Rank discards magnitude, so coercion is lossless here. + rng = np.random.default_rng(0) + v32 = rng.standard_normal((9, 64)).astype(np.float32) + a = Rank(dim=64) + a.add(v32) + b = Rank(dim=64) + b.add(v32.astype(np.float64)) + assert len(a) == len(b) == 9 + q = rng.standard_normal((3, 64)).astype(np.float32) + np.testing.assert_array_equal(a.search(q, k=5)[1], b.search(q, k=5)[1]) diff --git a/ordvec-python/tests/test_rank_quant.py b/ordvec-python/tests/test_rank_quant.py index e63ca3ad..1f1023b9 100644 --- a/ordvec-python/tests/test_rank_quant.py +++ b/ordvec-python/tests/test_rank_quant.py @@ -122,13 +122,21 @@ def test_load_rejects_nonexistent_file(): @pytest.mark.parametrize("bits", [1, 2, 4]) -def test_add_float64_is_rejected(bits): - # pyo3 numpy binding is strict on dtype — float64 is not silently - # converted, the caller must convert. - idx = RankQuant(dim=64, bits=bits) - v64 = np.random.default_rng(0).standard_normal((4, 64)).astype(np.float64) - with pytest.raises(TypeError): - idx.add(v64) +def test_add_float64_is_coerced(bits): + # float64 is accepted and coerced to float32 at the boundary. The asymmetric + # LUT keeps the query floats but scores against f32-quantised docs, so f64 + # precision beyond f32 is meaningless — same results as an f32 index. + rng = np.random.default_rng(0) + v32 = rng.standard_normal((9, 64)).astype(np.float32) + a = RankQuant(dim=64, bits=bits) + a.add(v32) + b = RankQuant(dim=64, bits=bits) + b.add(v32.astype(np.float64)) + assert len(a) == len(b) == 9 + q = rng.standard_normal((3, 64)).astype(np.float32) + np.testing.assert_array_equal( + a.search_asymmetric(q, k=5)[1], b.search_asymmetric(q, k=5)[1] + ) @pytest.mark.parametrize("bits", [1, 2, 4]) diff --git a/ordvec-python/tests/test_redteam_fuzz.py b/ordvec-python/tests/test_redteam_fuzz.py index a1216c38..436b624e 100644 --- a/ordvec-python/tests/test_redteam_fuzz.py +++ b/ordvec-python/tests/test_redteam_fuzz.py @@ -79,10 +79,12 @@ def unit_vectors(n: int, dim: int, seed: int = 0) -> np.ndarray: _SNAN = float(np.array([0x7FA00000], dtype=np.uint32).view(np.float32)[0]) _NEG_SNAN = float(np.array([0xFFA00000], dtype=np.uint32).view(np.float32)[0]) -# dtypes rust-numpy must reject for an f32 array param (strict, no coercion). +# dtypes a float32 embedding param must reject. float16/float32/float64 are now +# coerced at the boundary (see test_input_dtype.py); integer and bool arrays are +# *deliberately* rejected (a {0,1} or narrow-int vector rank-transforms to a +# degenerate index artefact, not a meaningful ordinal signal), and complex/object +# would be a silent reinterpretation. _WRONG_F32_DTYPES = [ - np.float64, - np.float16, np.int32, np.int64, np.uint8, @@ -92,6 +94,19 @@ def unit_vectors(n: int, dim: int, seed: int = 0) -> np.ndarray: object, ] +# Non-integer dtypes a candidate / doc-id param must reject. Integer dtypes are +# accepted (converted to u32 with checked bounds — see test_input_guards.py); a +# float / complex / object array is a clear TypeError, never a truncation. +_NON_INTEGER_ID_DTYPES = [ + np.float16, + np.float32, + np.float64, + np.complex64, + np.complex128, + bool, + object, +] + # Integer scalars that must NOT wrap to a giant usize / OOM. PyO3 maps a negative # Python int and anything >= 2**64 to a clean OverflowError on usize conversion. _BAD_INT_SCALARS = [-1, -(2**40), 2**64, 2**70] @@ -277,9 +292,11 @@ def test_rank_add_wrong_dtype_raises_type_error(dt): idx.add(bad) -@pytest.mark.parametrize("dt", [np.int64, np.uint8, np.int32, np.float32, np.uint64, np.int8]) -def test_subset_candidates_wrong_dtype_raises_type_error(dt): - # candidates must be uint32; int64/uint8/etc must not be reinterpreted. +@pytest.mark.parametrize("dt", _NON_INTEGER_ID_DTYPES) +def test_subset_candidates_noninteger_dtype_raises_type_error(dt): + # Candidate ids accept any *integer* dtype (converted to u32 by value, never + # by byte reinterpretation — see test_input_guards.py); a non-integer dtype + # must be a clean TypeError, not a silent truncation. idx = RankQuant(dim=64, bits=2) idx.add(unit_vectors(10, 64)) cand = np.array([0, 1, 2], dtype=dt) @@ -287,6 +304,20 @@ def test_subset_candidates_wrong_dtype_raises_type_error(dt): idx.search_asymmetric_subset(unit_vectors(1, 64, seed=1)[0], cand, k=2) +@pytest.mark.parametrize("dt", [np.uint8, np.int8, np.int64, np.uint64]) +def test_subset_candidates_integer_dtype_converted_by_value(dt): + # Adversarial: a narrow/wide integer dtype is read as logical *values*, not + # reinterpreted bytes. uint8 [1,2,3] -> ids 1,2,3, identical to uint32. + idx = RankQuant(dim=64, bits=2) + idx.add(unit_vectors(10, 64)) + q = unit_vectors(1, 64, seed=1)[0] + ref = np.array([1, 2, 3], dtype=np.uint32) + s_ref, id_ref = idx.search_asymmetric_subset(q, ref, k=3) + s, ids = idx.search_asymmetric_subset(q, ref.astype(dt), k=3) + np.testing.assert_array_equal(ids, id_ref) + np.testing.assert_array_equal(s, s_ref) + + @pytest.mark.parametrize("dt", [np.uint32, np.int64, np.float64, np.uint8]) def test_body_overlap_q_bitmap_wrong_dtype_raises_type_error(dt): # q_bitmap must be uint64; a narrower/float dtype must be a clean TypeError. @@ -297,8 +328,10 @@ def test_body_overlap_q_bitmap_wrong_dtype_raises_type_error(dt): idx.body_overlap_scores_subset(qb, np.array([0, 1], dtype=np.uint32)) -@pytest.mark.parametrize("dt", [np.int64, np.uint8, np.int32, np.uint64]) -def test_body_overlap_doc_ids_wrong_dtype_raises_type_error(dt): +@pytest.mark.parametrize("dt", _NON_INTEGER_ID_DTYPES) +def test_body_overlap_doc_ids_noninteger_dtype_raises_type_error(dt): + # doc_ids accept any integer dtype (converted to u32 with checked bounds); + # a non-integer dtype is a clean TypeError. idx = Bitmap(dim=128, n_top=32) idx.add(unit_vectors(10, 128)) qb = idx.build_query_bitmap_fp32(unit_vectors(1, 128, seed=1)[0]) diff --git a/ordvec-python/tests/test_sign_bitmap.py b/ordvec-python/tests/test_sign_bitmap.py index 8f550c80..69a0552d 100644 --- a/ordvec-python/tests/test_sign_bitmap.py +++ b/ordvec-python/tests/test_sign_bitmap.py @@ -152,11 +152,19 @@ def test_load_rejects_nonexistent_file(): SignBitmap.load("/nonexistent/path/does-not-exist.tvsb") -def test_add_float64_is_rejected(): - idx = SignBitmap(dim=64) - v64 = np.random.default_rng(0).standard_normal((4, 64)).astype(np.float64) - with pytest.raises(TypeError): - idx.add(v64) +def test_add_float64_is_coerced(): + # float64 accepted and coerced to float32 at the boundary; same index as f32. + rng = np.random.default_rng(0) + v32 = rng.standard_normal((20, 64)).astype(np.float32) + a = SignBitmap(dim=64) + a.add(v32) + b = SignBitmap(dim=64) + b.add(v32.astype(np.float64)) + assert len(a) == len(b) == 20 + q = rng.standard_normal(64).astype(np.float32) + np.testing.assert_array_equal( + a.top_m_candidates(q, m=5), b.top_m_candidates(q, m=5) + ) @pytest.mark.parametrize("dim", [64, 128, 256, 1024])