array2d
diff --git a/‎README.md‎
Lines changed: 6 additions & 5 deletions b/‎README.md‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎numpy/core.h‎
Lines changed: 224 additions & 0 deletions b/‎numpy/core.h‎
Lines changed: 224 additions & 0 deletions
@@ -4,7 +4,7 @@
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 [![C++17](https://img.shields.io/badge/C%2B%2B-17-blue.svg)](https://en.cppreference.com/w/cpp/17)
 [![CMake](https://img.shields.io/badge/CMake-%3E%3D3.16-green.svg)](https://cmake.org/)
-[![Tests](https://img.shields.io/badge/tests-792%20bit--exact-brightgreen.svg)](tests/test_all.py)
+[![Tests](https://img.shields.io/badge/tests-900%20bit--exact-brightgreen.svg)](tests/test_all.py)
 [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](CONTRIBUTING.md)
 
 ## Background
@@ -17,7 +17,7 @@ We created `numpycpp` to keep NumPy's familiar usage patterns while letting C++
 
 `numpycpp` is a **header-only C++ library** implementing numpy's core API (`numpy.*`, `numpy.linalg.*`, `numpy.einsum`) with **bit-level precision alignment**. Raw pointer + size interface. Zero external dependencies — pure C++17 standard library.
 
-All APIs are tested against Python numpy under strict bit-level comparison: every IEEE 754 float bit must match exactly (754 tests, float64 + float32, including NaN passthrough, signed-zero, ±∞, and domain-error cases).
+All APIs are tested against Python numpy under strict bit-level comparison: every IEEE 754 float bit must match exactly (900 tests, float64 + float32, including NaN passthrough, signed-zero, ±∞, domain-error cases, and advanced indexing).
 
 **Bit-exact math** is achieved by resolving numpy's own math functions from `_multiarray_umath.so` at runtime. The SVML bridge auto-detects your CPU and selects the same path numpy uses: AVX‑512 SVML (`__svml_exp8`) when available, or scalar `npy_exp`/`npy_log`/etc. otherwise. AVX‑512 intrinsics are isolated behind `__attribute__((target))` — the binary is safe on any x86_64 CPU (no SIGILL). Every transcendental function produces the exact same IEEE 754 bits as numpy on **all architectures**.
 
@@ -93,7 +93,7 @@ Add `-Ipath/to/numpycpp` to your compiler flags and include the headers directly
 
 The test suite verifies **bit-level precision alignment** between every C++ function and Python numpy.
 No tolerance, no `atol`/`rtol` — raw IEEE 754 bits must match exactly.
-754 tests: float64 + float32, including NaN passthrough, signed-zero, ±∞, domain errors, and AVX-512 boundary sizes.
+900 tests: float64 + float32, including NaN passthrough, signed-zero, ±∞, domain errors, advanced indexing, and AVX-512 boundary sizes.
 
 ```bash
 # build
@@ -173,7 +173,7 @@ target_compile_options(<target> PRIVATE
 ### Alignment status
 
 The table below reflects the current bit-level parity between `numpycpp` C++ and Python numpy.
-All 754 tests pass under strict IEEE 754 bit comparison (float64 + float32).
+All 900 tests pass under strict IEEE 754 bit comparison (float64 + float32).
 
 ✅ = bit-exact on ALL architectures (SVML bridge with runtime CPU dispatch).
 
@@ -185,6 +185,7 @@ All 754 tests pass under strict IEEE 754 bit comparison (float64 + float32).
 | Logical           | ✅ | ✅ | bool-only (and/or/not/xor) |
 | Special values    | ✅ | ✅ | isnan, isinf, isfinite |
 | Manipulation      | ✅ | ✅ | diff, stack, concatenate, transpose, slice, roll, flip, repeat, tile, where |
+| **Advanced indexing** | ✅ | ✅ | take (any axis), compress (bool mask gather), nd_slice (step/reverse), put, boolean_assign, nd_slice_assign |
 | Sorting           | ✅ | ✅ | argsort, argmax, argmin |
 | Setops / interp   | ✅ | ✅ | isin, intersect1d, interp, safe_divide |
 | Access / convert  | ✅ | ✅ | array_get, asarray, to_vector |
@@ -226,7 +227,7 @@ numpycpp/
 │   └── einsum_py.h
 ├── tests/                    # bit-level precision tests + test module
 │   ├── module.cpp            # pybind11 module for testing
-│   ├── test_all.py           # single entry — all APIs, 754 tests, float64+float32
+│   ├── test_all.py           # single entry — all APIs, 900 tests, float64+float32
 │   ├── conftest.py           # silent-mode output suppression
 │   └── CMakeLists.txt        # test-module build
 ├── CMakeLists.txt            # build & .deb packaging
 
@@ -796,6 +796,230 @@ inline ptrdiff_t argmin(const T* data, size_t n) {
     return mi;
 }
 
+// ============================================================================
+// Advanced / Fancy indexing
+// ============================================================================
+
+/// Number of elements selected by slice(start, stop, step).
+///
+/// Matches Python's len(range(start, stop, step)) for bounds that the caller
+/// has already normalized (-1 is accepted as the "before index 0" stop
+/// sentinel when step is negative):
+///   • step > 0: ceil((stop - start) / step), or 0 when stop <= start
+///   • step < 0: ceil((start - stop) / -step), or 0 when start <= stop
+///
+/// Example: slice(9, -1, -1).indices(10) → start=9, stop=-1, step=-1 → 10 elements
+///
+/// @param start  first selected index
+/// @param stop   exclusive end bound
+/// @param step   distance between selected indices; 0 is not a valid step
+/// @return       element count (never negative); 0 for an empty slice
+///               and also for step == 0
+///
+/// The function is noexcept, so an invalid zero step cannot be reported by
+/// throwing; it yields an empty slice instead of dividing by zero.
+inline ptrdiff_t slice_len(ptrdiff_t start, ptrdiff_t stop, ptrdiff_t step) noexcept {
+    if (step == 0) return 0;  // invalid step: report "empty" rather than divide by zero
+
+    // ceil(span / |step|) is written as (span - 1) / |step| + 1 so that the
+    // step is never added to the span (which could overflow for huge steps).
+    if (step > 0) {
+        return (stop > start) ? (stop - start - 1) / step + 1 : 0;
+    }
+    return (start > stop) ? (start - stop - 1) / (-step) + 1 : 0;
+}
+
+/// Gather elements along one axis — counterpart of numpy.take(a, indices, axis).
+///
+/// Any negative `axis` selects the axis=None behaviour: `src` is treated as a
+/// flat array of product(shape) elements and `dst` receives `ni` elements.
+/// (Note this differs from numpy, where axis=-1 means "last axis".)
+///
+/// For axis >= 0 the output is C-contiguous with shape
+///   shape[:axis] + (ni,) + shape[axis+1:]
+///
+/// @param src      C-contiguous input described by shape[0..ndim)
+/// @param dst      caller-allocated output buffer (must not overlap src,
+///                 since whole rows are copied with memcpy)
+/// @param shape    dimension sizes of src
+/// @param ndim     number of entries in shape
+/// @param axis     axis to gather along, or negative for the flattened case
+/// @param indices  positions to gather; a negative index has the axis length
+///                 (or the flat size) added to it once
+/// @param ni       number of indices; 0 makes the call a no-op
+///
+/// No bounds checking is performed: an index that is still out of range
+/// after wrapping, or an axis >= ndim, reads outside the buffers.
+template<typename T>
+inline void take(const T* src, T* dst,
+                 const ptrdiff_t* shape, int ndim, int axis,
+                 const ptrdiff_t* indices, size_t ni) {
+    if (ni == 0) return;
+
+    if (axis < 0) {
+        // Flattened gather: wrap negatives against the total element count.
+        ptrdiff_t flat_size = 1;
+        for (int d = 0; d < ndim; ++d) flat_size *= shape[d];
+
+        for (size_t k = 0; k < ni; ++k) {
+            const ptrdiff_t raw = indices[k];
+            dst[k] = src[(raw < 0) ? raw + flat_size : raw];
+        }
+        return;
+    }
+
+    // View src as (outer, axis_len, inner): each index picks one contiguous
+    // run of `inner` elements inside every outer slab.
+    ptrdiff_t outer = 1;
+    for (int d = 0; d < axis; ++d) outer *= shape[d];
+    const ptrdiff_t axis_len = shape[axis];
+    ptrdiff_t inner = 1;
+    for (int d = axis + 1; d < ndim; ++d) inner *= shape[d];
+
+    const size_t    run_bytes = static_cast<size_t>(inner) * sizeof(T);
+    const ptrdiff_t src_slab  = axis_len * inner;
+    const ptrdiff_t dst_slab  = static_cast<ptrdiff_t>(ni) * inner;
+
+    for (ptrdiff_t o = 0; o < outer; ++o) {
+        const T* in_slab  = src + o * src_slab;
+        T*       out_slab = dst + o * dst_slab;
+        for (size_t k = 0; k < ni; ++k) {
+            ptrdiff_t pick = indices[k];
+            if (pick < 0) pick += axis_len;
+            std::memcpy(out_slab + static_cast<ptrdiff_t>(k) * inner,
+                        in_slab + pick * inner,
+                        run_bytes);
+        }
+    }
+}
+
+/// Boolean-mask gather over a flat array — numpy.compress(condition, a, axis=None).
+///
+/// Copies src[i] to the next free slot of dst for every i in [0, n) whose
+/// mask[i] is true, preserving order.
+///
+/// @param src   input elements (n of them)
+/// @param dst   output buffer; the caller must provide room for as many
+///              elements as there are true entries in mask
+/// @param mask  selection flags, one per input element
+/// @param n     number of elements in src and mask
+/// @return      how many elements were written to dst
+template<typename T>
+inline size_t compress(const T* src, T* dst, const bool* mask, size_t n) {
+    T* out = dst;
+    for (size_t i = 0; i != n; ++i) {
+        if (!mask[i]) continue;
+        *out++ = src[i];
+    }
+    return static_cast<size_t>(out - dst);
+}
+
+/// Internal helper shared by the stepped slice() / slice_assign() overloads.
+///
+/// Walks the elements selected by a[s0:e0:k0, s1:e1:k1, ...] in C-order of
+/// the *output* and calls visit(out_idx, in_idx) for each one, where
+///   out_idx = running position in the (dense) slice result, and
+///   in_idx  = flat offset of that element in the C-contiguous source array.
+///
+/// Nothing is visited when ndim <= 0 or when any dimension selects no
+/// elements. starts/stops/steps follow the slice_len() convention
+/// (pre-normalized; stop = -1 is the "before index 0" sentinel for a
+/// negative step). Offsets are not bounds-checked against `shape`.
+template<typename Visit>
+inline void for_each_sliced_offset(const ptrdiff_t* shape, int ndim,
+                                   const ptrdiff_t* starts, const ptrdiff_t* stops,
+                                   const ptrdiff_t* steps, Visit&& visit) {
+    if (ndim <= 0) return;  // also keeps a negative ndim away from the vector sizes below
+    const size_t rank = static_cast<size_t>(ndim);
+
+    // Selected extent per dimension; one empty dimension empties the slice.
+    std::vector<ptrdiff_t> extent(rank);
+    ptrdiff_t total = 1;
+    for (int d = 0; d < ndim; ++d) {
+        extent[d] = slice_len(starts[d], stops[d], steps[d]);
+        if (extent[d] <= 0) return;
+        total *= extent[d];
+    }
+
+    // jump[d]   : change of the flat source offset for one output step along d
+    // rewind[d] : distance covered by a full pass along d (undone on carry)
+    // offset    : flat source offset of the first selected element
+    std::vector<ptrdiff_t> jump(rank);
+    std::vector<ptrdiff_t> rewind(rank);
+    ptrdiff_t offset = 0;
+    ptrdiff_t stride = 1;  // C-contiguous stride of dimension d
+    for (int d = ndim - 1; d >= 0; --d) {
+        jump[d]   = steps[d] * stride;
+        rewind[d] = jump[d] * extent[d];
+        offset   += starts[d] * stride;
+        if (d > 0) stride *= shape[d];  // shape[0] never contributes to a stride
+    }
+
+    // Incremental multi-index counter: advance the innermost dimension and
+    // carry outwards, instead of re-deriving the multi-index with div/mod
+    // for every element.
+    std::vector<ptrdiff_t> pos(rank, 0);
+    for (ptrdiff_t out_idx = 0; out_idx < total; ++out_idx) {
+        visit(out_idx, offset);
+        for (int d = ndim - 1; d >= 0; --d) {
+            offset += jump[d];
+            if (++pos[d] < extent[d]) break;
+            offset -= rewind[d];
+            pos[d] = 0;
+        }
+    }
+}
+
+/// N-D slice with per-dimension start/stop/step (pre-normalized by caller;
+/// stop=-1 is the valid "before index 0" sentinel for negative-step slices).
+///
+/// Output shape[d] = slice_len(starts[d], stops[d], steps[d]).
+/// dst must be pre-allocated to product(output_shape) elements and receives
+/// the selected elements densely packed in C-order.
+///
+/// Overload of numpy::slice() that accepts per-dimension step vectors.
+/// Covers  a[s0:e0:k0, s1:e1:k1, ...]  for arbitrary N-D arrays.
+///
+/// @param src     C-contiguous source array with dimensions shape[0..ndim)
+/// @param dst     output buffer
+/// @param shape   dimension sizes of src
+/// @param ndim    number of dimensions; ndim <= 0 is a no-op
+/// @param starts  first selected index per dimension
+/// @param stops   exclusive end bound per dimension
+/// @param steps   step per dimension (non-zero)
+///
+/// If any dimension selects no elements, dst is left untouched.
+template<typename T>
+inline void slice(const T* src, T* dst,
+                  const ptrdiff_t* shape, int ndim,
+                  const ptrdiff_t* starts, const ptrdiff_t* stops,
+                  const ptrdiff_t* steps) {
+    for_each_sliced_offset(shape, ndim, starts, stops, steps,
+        [src, dst](ptrdiff_t out_idx, ptrdiff_t in_idx) {
+            dst[out_idx] = src[in_idx];
+        });
+}
+
+/// N-D slice assignment with scalar: dst[slice] = value (in-place).
+/// Overload of slice_assign() that accepts per-dimension step vectors.
+/// starts/stops/steps are pre-normalized (same convention as slice()).
+///
+/// @param dst     C-contiguous array with dimensions shape[0..ndim), modified in place
+/// @param shape   dimension sizes of dst
+/// @param ndim    number of dimensions; ndim <= 0 is a no-op
+/// @param starts  first selected index per dimension
+/// @param stops   exclusive end bound per dimension
+/// @param steps   step per dimension (non-zero)
+/// @param value   value stored into every selected element
+template<typename T>
+inline void slice_assign(T* dst,
+                          const ptrdiff_t* shape, int ndim,
+                          const ptrdiff_t* starts, const ptrdiff_t* stops,
+                          const ptrdiff_t* steps, T value) {
+    for_each_sliced_offset(shape, ndim, starts, stops, steps,
+        [dst, &value](ptrdiff_t /*out_idx*/, ptrdiff_t in_idx) {
+            dst[in_idx] = value;
+        });
+}
+
+/// N-D slice assignment with array: dst[slice] = values (in-place).
+/// Overload of slice_assign() with an array of values (one per slice element).
+/// values must contain exactly product(output_shape) elements in C-order,
+/// where output_shape[d] = slice_len(starts[d], stops[d], steps[d]).
+///
+/// @param dst     C-contiguous array with dimensions shape[0..ndim), modified in place
+/// @param shape   dimension sizes of dst
+/// @param ndim    number of dimensions; ndim <= 0 is a no-op
+/// @param starts  first selected index per dimension
+/// @param stops   exclusive end bound per dimension
+/// @param steps   step per dimension (non-zero)
+/// @param values  replacement elements, consumed in C-order of the slice
+template<typename T>
+inline void slice_assign(T* dst,
+                          const ptrdiff_t* shape, int ndim,
+                          const ptrdiff_t* starts, const ptrdiff_t* stops,
+                          const ptrdiff_t* steps, const T* values) {
+    for_each_sliced_offset(shape, ndim, starts, stops, steps,
+        [dst, values](ptrdiff_t out_idx, ptrdiff_t in_idx) {
+            dst[in_idx] = values[out_idx];
+        });
+}
+
+/// Scatter into a flat array — modelled on numpy.put(a, indices, values).
+///
+/// For each k in [0, ni): stores values[k] at dst[indices[k]].
+/// A negative index has n added to it once. An index that is still outside
+/// [0, n) after that is silently ignored — it neither raises (numpy's
+/// default mode='raise') nor is clamped to the nearest valid position
+/// (numpy's mode='clip').
+///
+/// @param dst      flat destination array, modified in place
+/// @param n        number of elements in dst
+/// @param indices  target positions (ni of them)
+/// @param values   values to store, one per index (values[k] pairs with
+///                 indices[k]; there is no cycling of a shorter value list)
+/// @param ni       number of index/value pairs
+template<typename T>
+inline void put(T* dst, size_t n,
+                const ptrdiff_t* indices, const T* values, size_t ni) {
+    const ptrdiff_t extent = static_cast<ptrdiff_t>(n);
+    for (size_t k = 0; k != ni; ++k) {
+        const ptrdiff_t raw = indices[k];
+        const ptrdiff_t pos = (raw < 0) ? raw + extent : raw;
+        if (pos < 0 || pos >= extent) continue;  // out of range: drop this pair
+        dst[static_cast<size_t>(pos)] = values[k];
+    }
+}
+
+/// Masked fill with a scalar — numpy.putmask(a, mask, scalar).
+/// Equivalent to Python: a[mask] = scalar
+///
+/// @param dst    array to modify in place (n elements)
+/// @param mask   selection flags, one per element of dst
+/// @param n      number of elements in dst and mask
+/// @param value  value written to every dst[i] whose mask[i] is true;
+///               all other elements are left unchanged
+template<typename T>
+inline void putmask(T* dst, const bool* mask, size_t n, T value) {
+    for (size_t i = 0; i != n; ++i) {
+        if (!mask[i]) continue;
+        dst[i] = value;
+    }
+}
+
+/// Masked assignment from an array of values.
+/// Equivalent to Python: a[mask] = array_of_values
+///
+/// Values are consumed sequentially: the first true mask position receives
+/// values[0], the second receives values[1], and so on. (This is the
+/// boolean-index assignment rule; numpy.putmask itself would instead pick
+/// the value at the same position i as the masked element.)
+///
+/// @param dst     array to modify in place (n elements)
+/// @param mask    selection flags, one per element of dst
+/// @param n       number of elements in dst and mask
+/// @param values  replacement values; must hold at least as many elements
+///                as there are true entries in mask
+template<typename T>
+inline void putmask(T* dst, const bool* mask, size_t n, const T* values) {
+    const T* next = values;
+    for (size_t i = 0; i != n; ++i) {
+        if (mask[i]) {
+            dst[i] = *next;
+            ++next;
+        }
+    }
+}
+
 // ============================================================================
 // Set operations
 // ============================================================================