Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions src/ailego/math_batch/cosine_distance_batch.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ namespace zvec::ailego::DistanceBatch {
template <typename T, size_t BatchSize, size_t PrefetchStep, typename = void>
struct CosineDistanceBatch;

template <typename T, size_t BatchSize, size_t PrefetchStep, typename = void>
struct MinusInnerProductDistanceBatch;


template <typename T, size_t BatchSize, size_t PrefetchStep, typename>
struct CosineDistanceBatch {
using ValueType = typename std::remove_cv<T>::type;
Expand Down Expand Up @@ -54,5 +58,28 @@ struct CosineDistanceBatch {
}
};

template <typename T, size_t BatchSize, size_t PrefetchStep, typename>
struct MinusInnerProductDistanceBatch {
using ValueType = typename std::remove_cv<T>::type;

static inline void ComputeBatch(const ValueType **vecs,
const ValueType *query, size_t num_vecs,
size_t dim, float *results) {
InnerProductDistanceBatch<ValueType, BatchSize, PrefetchStep>::ComputeBatch(
vecs, query, num_vecs, dim, results);

for (size_t i = 0; i < num_vecs; ++i) {
results[i] = -results[i];
}
}

using IPImplType =
InnerProductDistanceBatch<ValueType, BatchSize, PrefetchStep>;

static void QueryPreprocess(void *query, size_t dim) {
return IPImplType::QueryPreprocess(query, dim);
}
};


} // namespace zvec::ailego::DistanceBatch
6 changes: 6 additions & 0 deletions src/ailego/math_batch/distance_batch.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ struct BaseDistance {
ValueType, BatchSize, PrefetchStep>::ComputeBatch(m, q, num, dim,
out);
}
if constexpr (std::is_same_v<DistanceType<ValueType, 1, 1>,
MinusInnerProductMatrix<ValueType, 1, 1>>) {
return DistanceBatch::MinusInnerProductDistanceBatch<
ValueType, BatchSize, PrefetchStep>::ComputeBatch(m, q, num, dim,
out);
}

_ComputeBatch(m, q, num, dim, out);
}
Expand Down
18 changes: 18 additions & 0 deletions src/ailego/math_batch/inner_product_distance_batch.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <zvec/ailego/utility/type_helper.h>
#include "inner_product_distance_batch_impl.h"
#include "inner_product_distance_batch_impl_fp16.h"
#include "inner_product_distance_batch_impl_int4.h"
#include "inner_product_distance_batch_impl_int8.h"

namespace zvec::ailego::DistanceBatch {
Expand Down Expand Up @@ -130,6 +131,23 @@ struct InnerProductDistanceBatchImpl<ailego::Float16, BatchSize> {
}
};

template <size_t BatchSize>
struct InnerProductDistanceBatchImpl<uint8_t, BatchSize> {
using ValueType = uint8_t;
static void compute_one_to_many(
const uint8_t *query, const uint8_t **ptrs,
std::array<const uint8_t *, BatchSize> &prefetch_ptrs, size_t dim,
float *sums) {
#if defined(__AVX2__)
if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) {
return compute_one_to_many_avx2_int4<BatchSize>(query, ptrs,
prefetch_ptrs, dim, sums);
}
#endif
return compute_one_to_many_fallback(query, ptrs, prefetch_ptrs, dim, sums);
}
};

template <typename T, size_t BatchSize, size_t PrefetchStep, typename>
struct InnerProductDistanceBatch {
using ValueType = typename std::remove_cv<T>::type;
Expand Down
98 changes: 98 additions & 0 deletions src/ailego/math_batch/inner_product_distance_batch_impl_int4.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// Copyright 2025-present the zvec project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

namespace zvec::ailego::DistanceBatch {

#if defined(__AVX2__)

static const __m256i MASK_INT4_AVX = _mm256_set1_epi32(0x0f0f0f0f);
static const AILEGO_ALIGNED(32) int8_t Int4ConvertTable[32] = {
0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1,
0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1};
static const __m256i INT4_LOOKUP_AVX =
_mm256_load_si256((const __m256i *)Int4ConvertTable);
static const __m256i ONES_INT16_AVX = _mm256_set1_epi32(0x00010001);

template <size_t dp_batch>
static void compute_one_to_many_avx2_int4(
const uint8_t *query, const uint8_t **ptrs,
std::array<const uint8_t *, dp_batch> &prefetch_ptrs, size_t dimensionality,
float *results) {
dimensionality >>= 1;
__m256i accs[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
accs[i] = _mm256_setzero_si256();
}
size_t dim = 0;
for (; dim + 32 <= dimensionality; dim += 32) {
__m256i q = _mm256_loadu_si256((const __m256i *)(query + dim));
__m256i q0 = _mm256_shuffle_epi8(INT4_LOOKUP_AVX,
_mm256_and_si256(q, MASK_INT4_AVX));
__m256i q1 = _mm256_shuffle_epi8(
INT4_LOOKUP_AVX,
_mm256_and_si256(_mm256_srli_epi16(q, 4), MASK_INT4_AVX));
__m256i q0_abs = _mm256_abs_epi8(q0);
__m256i q1_abs = _mm256_abs_epi8(q1);
__m256i data_regs[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
data_regs[i] = _mm256_loadu_si256((const __m256i *)(ptrs[i] + dim));
}
if (prefetch_ptrs[0]) {
for (size_t i = 0; i < dp_batch; ++i) {
ailego_prefetch(prefetch_ptrs[i] + dim);
}
}
for (size_t i = 0; i < dp_batch; ++i) {
__m256i data0 = _mm256_shuffle_epi8(
INT4_LOOKUP_AVX, _mm256_and_si256(data_regs[i], MASK_INT4_AVX));
__m256i data1 = _mm256_shuffle_epi8(
INT4_LOOKUP_AVX,
_mm256_and_si256(_mm256_srli_epi16(data_regs[i], 4), MASK_INT4_AVX));
data0 = _mm256_sign_epi8(data0, q0);
data1 = _mm256_sign_epi8(data1, q1);
data0 = _mm256_madd_epi16(_mm256_maddubs_epi16(q0_abs, data0),
ONES_INT16_AVX);
data1 = _mm256_madd_epi16(_mm256_maddubs_epi16(q1_abs, data1),
ONES_INT16_AVX);
accs[i] = _mm256_add_epi32(_mm256_add_epi32(data0, data1), accs[i]);
}
}
std::array<int, dp_batch> temp_results;
for (size_t i = 0; i < dp_batch; ++i) {
__m128i lo = _mm256_castsi256_si128(accs[i]);
__m128i hi = _mm256_extracti128_si256(accs[i], 1);
__m128i sum128 = _mm_add_epi32(lo, hi);
sum128 = _mm_hadd_epi32(sum128, sum128);
sum128 = _mm_hadd_epi32(sum128, sum128);
temp_results[i] = _mm_cvtsi128_si32(sum128);
}
for (; dim < dimensionality; ++dim) {
uint8_t q = query[dim];
for (size_t i = 0; i < dp_batch; ++i) {
uint8_t m = ptrs[i][dim];
temp_results[i] +=
Int4MulTable[(((m) << 4) & 0xf0) | (((q) >> 0) & 0xf)] +
Int4MulTable[(((m) >> 0) & 0xf0) | (((q) >> 4) & 0xf)];
}
}
for (size_t i = 0; i < dp_batch; ++i) {
results[i] = static_cast<float>(temp_results[i]);
}
}

#endif

} // namespace zvec::ailego::DistanceBatch
14 changes: 7 additions & 7 deletions src/core/metric/inner_product_metric.cc
Original file line number Diff line number Diff line change
Expand Up @@ -354,20 +354,20 @@ class InnerProductMetric : public IndexMetric {
switch (data_type_) {
case IndexMeta::DataType::DT_FP32:
return reinterpret_cast<IndexMetric::MatrixBatchDistanceHandle>(
ailego::BaseDistance<ailego::MinusInnerProductMatrix, float, 1,
1>::ComputeBatch);
ailego::BaseDistance<ailego::MinusInnerProductMatrix, float, 12,
2>::ComputeBatch);
case IndexMeta::DataType::DT_FP16:
return reinterpret_cast<IndexMetric::MatrixBatchDistanceHandle>(
ailego::BaseDistance<ailego::MinusInnerProductMatrix,
ailego::Float16, 1, 1>::ComputeBatch);
ailego::Float16, 12, 2>::ComputeBatch);
case IndexMeta::DataType::DT_INT8:
return reinterpret_cast<IndexMetric::MatrixBatchDistanceHandle>(
ailego::BaseDistance<ailego::MinusInnerProductMatrix, int8_t, 1,
1>::ComputeBatch);
ailego::BaseDistance<ailego::MinusInnerProductMatrix, int8_t, 12,
2>::ComputeBatch);
case IndexMeta::DataType::DT_INT4:
return reinterpret_cast<IndexMetric::MatrixBatchDistanceHandle>(
ailego::BaseDistance<ailego::MinusInnerProductMatrix, uint8_t, 1,
1>::ComputeBatch);
ailego::BaseDistance<ailego::MinusInnerProductMatrix, uint8_t, 12,
2>::ComputeBatch);
default:
return nullptr;
}
Expand Down
6 changes: 5 additions & 1 deletion src/core/metric/quantized_integer_metric.cc
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,11 @@ class QuantizedIntegerMetric : public IndexMetric {
const override {
if (origin_metric_type_ == MetricType::kCosine &&
meta_.data_type() == IndexMeta::DataType::DT_INT8) {
return CosineMinusInnerProductDistanceBatchWithScoreUnquantized<
return CosineDistanceBatchWithScoreUnquantized<
int8_t, 1, 1>::GetQueryPreprocessFunc();
} else if (origin_metric_type_ == MetricType::kInnerProduct &&
meta_.data_type() == IndexMeta::DataType::DT_INT8) {
return MinusInnerProductDistanceBatchWithScoreUnquantized<
int8_t, 1, 1>::GetQueryPreprocessFunc();
}

Expand Down
96 changes: 80 additions & 16 deletions src/core/metric/quantized_integer_metric_batch.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,17 @@ template <typename T, size_t BatchSize, size_t PrefetchStep>
struct MinusInnerProductDistanceBatchWithScoreUnquantized;

template <typename T, size_t BatchSize, size_t PrefetchStep>
struct CosineMinusInnerProductDistanceBatchWithScoreUnquantized;
struct CosineDistanceBatchWithScoreUnquantized;

template <typename T, size_t BatchSize, size_t PrefetchStep>
struct SquaredEuclideanDistanceBatchWithScoreUnquantized;

template <typename T, size_t BatchSize, size_t PrefetchStep>
struct MipsSquaredEuclideanDistanceBatchWithScoreUnquantized;

template <typename T, size_t BatchSize, size_t PrefetchStep>
struct InternalMinusInnerProductDistanceBatchWithScoreUnquantized;


template <template <typename, size_t, size_t> class DistanceType,
typename ValueType, size_t BatchSize, size_t PrefetchStep,
Expand All @@ -52,7 +55,13 @@ struct BaseDistanceBatchWithScoreUnquantized {
// }
if constexpr (std::is_same_v<DistanceType<ValueType, 1, 1>,
CosineMinusInnerProduct<ValueType, 1, 1>>) {
return CosineMinusInnerProductDistanceBatchWithScoreUnquantized<
return CosineDistanceBatchWithScoreUnquantized<
ValueType, BatchSize, PrefetchStep>::ComputeBatch(m, q, num, dim,
out);
}
if constexpr (std::is_same_v<DistanceType<ValueType, 1, 1>,
MinusInnerProduct<ValueType, 1, 1>>) {
return MinusInnerProductDistanceBatchWithScoreUnquantized<
ValueType, BatchSize, PrefetchStep>::ComputeBatch(m, q, num, dim,
out);
}
Expand All @@ -62,16 +71,15 @@ struct BaseDistanceBatchWithScoreUnquantized {
};

//===========================================================
// CosineMinusInnerProductDistanceBatchWithScoreUnquantized
// CosineDistanceBatchWithScoreUnquantized
//===========================================================

// Compute CosineMinusInnerProduct for quantized INT8
template <size_t BatchSize, size_t PrefetchStep>
struct CosineMinusInnerProductDistanceBatchWithScoreUnquantized<
int8_t, BatchSize, PrefetchStep> {
using ImplType =
MinusInnerProductDistanceBatchWithScoreUnquantized<int8_t, BatchSize,
PrefetchStep>;
struct CosineDistanceBatchWithScoreUnquantized<int8_t, BatchSize,
PrefetchStep> {
using ImplType = InternalMinusInnerProductDistanceBatchWithScoreUnquantized<
int8_t, BatchSize, PrefetchStep>;

static inline void ComputeBatch(const int8_t **vecs, const int8_t *query,
size_t num_vecs, size_t dim, float *results) {
Expand All @@ -94,25 +102,25 @@ struct CosineMinusInnerProductDistanceBatchWithScoreUnquantized<

// Compute CosineMinusInnerProduct for quantized INT4
template <size_t BatchSize, size_t PrefetchStep>
struct CosineMinusInnerProductDistanceBatchWithScoreUnquantized<
uint8_t, BatchSize, PrefetchStep> {
struct CosineDistanceBatchWithScoreUnquantized<uint8_t, BatchSize,
PrefetchStep> {
static inline void ComputeBatch(const uint8_t **vecs, const uint8_t *query,
size_t num_vecs, size_t dim, float *results) {
size_t original_dim = dim - 40;
MinusInnerProductDistanceBatchWithScoreUnquantized<
InternalMinusInnerProductDistanceBatchWithScoreUnquantized<
uint8_t, BatchSize, PrefetchStep>::ComputeBatch(vecs, query, num_vecs,
original_dim, results);
}
};

//===========================================================
// MinusInnerProductDistanceBatchWithScoreUnquantized
// InternalMinusInnerProductDistanceBatchWithScoreUnquantized
//===========================================================

// Compute MinusInnerProduct for quantized INT8
template <size_t BatchSize, size_t PrefetchStep>
struct MinusInnerProductDistanceBatchWithScoreUnquantized<int8_t, BatchSize,
PrefetchStep> {
struct InternalMinusInnerProductDistanceBatchWithScoreUnquantized<
int8_t, BatchSize, PrefetchStep> {
using ImplType =
ailego::DistanceBatch::InnerProductDistanceBatch<int8_t, BatchSize,
PrefetchStep>;
Expand Down Expand Up @@ -150,8 +158,8 @@ struct MinusInnerProductDistanceBatchWithScoreUnquantized<int8_t, BatchSize,

// Compute MinusInnerProduct for quantized INT4
template <size_t BatchSize, size_t PrefetchStep>
struct MinusInnerProductDistanceBatchWithScoreUnquantized<uint8_t, BatchSize,
PrefetchStep> {
struct InternalMinusInnerProductDistanceBatchWithScoreUnquantized<
uint8_t, BatchSize, PrefetchStep> {
static inline void ComputeBatch(const uint8_t **vecs, const uint8_t *query,
size_t num_vecs, size_t dim, float *results) {
const size_t original_dim = dim;
Expand Down Expand Up @@ -179,6 +187,62 @@ struct MinusInnerProductDistanceBatchWithScoreUnquantized<uint8_t, BatchSize,
}
};

//===========================================================
// MinusInnerProductDistanceBatchWithScoreUnquantized
//===========================================================

template <size_t BatchSize, size_t PrefetchStep>
struct MinusInnerProductDistanceBatchWithScoreUnquantized<int8_t, BatchSize,
PrefetchStep> {
using ImplType = InternalMinusInnerProductDistanceBatchWithScoreUnquantized<
int8_t, BatchSize, PrefetchStep>;
static inline void ComputeBatch(const int8_t **vecs, const int8_t *query,
size_t num_vecs, size_t dim, float *results) {
const size_t original_dim = dim - 16;
ImplType::ComputeBatch(vecs, query, num_vecs, original_dim, results);
}

static ailego::DistanceBatch::DistanceBatchQueryPreprocessFunc
GetQueryPreprocessFunc() {
return QueryPreprocess;
}

static void QueryPreprocess(void *query, size_t dim) {
if (auto func = ImplType::GetQueryPreprocessFunc(); func != nullptr) {
return func(query, dim - 16);
}
}
};


template <size_t BatchSize, size_t PrefetchStep>
struct MinusInnerProductDistanceBatchWithScoreUnquantized<uint8_t, BatchSize,
PrefetchStep> {
static inline void ComputeBatch(const uint8_t **vecs, const uint8_t *query,
size_t num_vecs, size_t dim, float *results) {
const size_t original_dim = dim - 32;
const size_t d = original_dim;
const size_t p = d >> 1;
ailego::DistanceBatch::InnerProductDistanceBatch<
uint8_t, BatchSize, PrefetchStep>::ComputeBatch(vecs, query, num_vecs,
original_dim, results);
const float *q_tail = reinterpret_cast<const float *>(
reinterpret_cast<const uint8_t *>(query) + p);
float qa = q_tail[0];
float qb = q_tail[1];
float qs = q_tail[2];
for (size_t i = 0; i < num_vecs; ++i) {
const float *m_tail = reinterpret_cast<const float *>(
reinterpret_cast<const uint8_t *>(vecs[i]) + p);
float ma = m_tail[0];
float mb = m_tail[1];
float ms = m_tail[2];
float &result = results[i];
result = -(ma * qa * result + mb * qa * qs + qb * ma * ms + d * qb * mb);
}
}
};

//===========================================================
// SquaredEuclideanDistanceBatchWithScoreUnquantized
//===========================================================
Expand Down
Loading