From 32016aad8fc97ec578fc120738b0c0b64212fbb1 Mon Sep 17 00:00:00 2001 From: Phong Nguyen Date: Thu, 14 May 2026 19:52:33 +0000 Subject: [PATCH 1/4] tscore: SIMD ts::memcpy_tolower; use in URL.cc cache-key fast path The bulk ASCII tolower loop used to canonicalize the scheme and host portions of a URL before hashing into the cache key runs at ~1.5 GB/s scalar (one byte and one ParseRules table lookup per iteration). The work is trivially data-parallel and there is no per-byte branching, so a SIMD kernel that lowercases a whole register at once gives a straightforward speedup once the input is long enough to amortize the vector setup. Add a header-only helper ts::memcpy_tolower under include/tscore/ink_memcpy_tolower.h with a compile-time-selected cascade of SIMD bodies: 64-byte AVX-512BW, 32-byte AVX2, 16-byte SSE2 on x86_64, plus 16-byte NEON on ARMv8. Wider bodies fall through to narrower drain loops, so the worst-case scalar tail is always <16 bytes. Selection is purely compile-time; runtime ifunc dispatch is left for a follow-up. The AVX-512BW body uses _mm512_mask_add_epi8 to fuse the conditional "+0x20 where upper" into a single op, and a masked load/store handles 1..63 leftover bytes in a single SIMD pass (inspired by Tony Finch's copytolower64.c, https://dotat.at/cgi/git/vectolower.git/). The whole AVX-512BW block is gated at n >= 64 because the masked load/store has ~7 ns of fixed setup that loses to the narrower paths for short inputs; below 64 bytes we fall through to the AVX2 + SSE2 cascade. Semantics match the existing ParseRules::ink_tolower table exactly: bytes in 'A'..'Z' map to 'a'..'z', all others (including 0x80..0xFF) pass through unchanged. Replace the static inline memcpy_tolower in src/proxy/hdrs/URL.cc with this helper. Baseline x86_64 builds use the 16-byte SSE2 path; builds that opt into a wider -march (x86-64-v3 = AVX2, x86-64-v4 = AVX-512BW) get the wider bodies automatically. Sub-16-byte inputs (e.g. short HTTP schemes like "http") use the scalar tail and see no perf change. Measured throughput on a 2.0 GHz Ice Lake Xeon Gold 6338, mean ns: size scalar SSE2 AVX2 AVX-512BW ---- ------ ---- ---- --------- 16 B 10.4 2.15 1.75 1.98 32 B 15.4 2.90 2.24 2.31 64 B 28.0 4.43 2.85 2.61 256 B 113 13.87 7.57 6.20 1024 B 425 50.47 24.23 17.49 Speedup vs scalar at 1024 B: SSE2 8.4x, AVX2 17.5x, AVX-512BW 24.3x. A new microbenchmark under tools/benchmark covers correctness across sizes 0..257 (bracketing each SIMD body size) plus an exhaustive byte- value sweep that guards against any future widening of the case-fold range, alongside scalar-vs-SIMD throughput numbers and a config-print case that emits the selected ISA path. Co-Authored-By: Claude Opus 4.7 (1M context) --- include/tscore/ink_memcpy_tolower.h | 163 +++++++++++++++++++ src/proxy/hdrs/URL.cc | 15 +- tools/benchmark/CMakeLists.txt | 3 + tools/benchmark/benchmark_memcpy_tolower.cc | 167 ++++++++++++++++++++ 4 files changed, 336 insertions(+), 12 deletions(-) create mode 100644 include/tscore/ink_memcpy_tolower.h create mode 100644 tools/benchmark/benchmark_memcpy_tolower.cc diff --git a/include/tscore/ink_memcpy_tolower.h b/include/tscore/ink_memcpy_tolower.h new file mode 100644 index 00000000000..3c93559ae5e --- /dev/null +++ b/include/tscore/ink_memcpy_tolower.h @@ -0,0 +1,163 @@ +/** @file + + SIMD-accelerated bulk ASCII tolower copy. + + Used on the URL canonicalization fast path for cache-key digests + (src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast). The scalar loop is + the bottleneck for hosts and schemes long enough to vectorize; for + shorter inputs the scalar tail handles them with no SIMD overhead. + + Semantics match a byte-at-a-time loop using ParseRules::ink_tolower(): + + - Bytes in 'A'..'Z' (0x41..0x5A) have bit 5 set, mapping them to + 'a'..'z'. All other bytes (including 0x80..0xFF) pass through + unchanged. There is no UTF-8 case folding. + + - The destination is written byte-for-byte; src and dst must point + to non-overlapping regions of size at least @n bytes. + + Implementation note: the bodies are stacked widest-first and each + drains its block size before falling through to the next. A build + with AVX-512BW gets the 64-byte body as the main loop, then at most + one 32-byte AVX2 iteration and one 16-byte SSE2 iteration to drain + the remainder before the scalar tail handles 0-15 bytes. Builds + without the wider ISAs simply skip those blocks. Selection is purely + compile-time; no runtime dispatch. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +#pragma once + +#include +#include + +#if defined(__AVX512BW__) || defined(__AVX2__) || defined(__SSE2__) +#include +#elif defined(__ARM_NEON) || defined(__aarch64__) +#include +#endif + +namespace ts +{ + +inline void +memcpy_tolower(char *dst, const char *src, std::size_t n) noexcept +{ +#if defined(__AVX512BW__) + // AVX-512BW: 64 bytes per iteration with two key optimizations over the + // narrower paths: + // - _mm512_mask_add_epi8 fuses the "+0x20 where upper" into a single + // op (no separate maskz_set1 + or). + // - A masked load/store handles the 1..63-byte tail in a single SIMD + // pass, so we don't need to cascade to AVX2/SSE2 to drain the + // remainder. + // + // The masked tail does carry ~7 ns of fixed setup cost, which loses to + // the cascade on short inputs. Gating the whole block on n >= 64 means + // tiny inputs fall through to the AVX2/SSE2 path below, where they keep + // the speedup that path already provides. + // + // Inspired by Tony Finch's copytolower64.c + // (https://dotat.at/cgi/git/vectolower.git/). + if (n >= 64) { + const __m512i A_vec = _mm512_set1_epi8('A'); + const __m512i Z_vec = _mm512_set1_epi8('Z'); + const __m512i delta = _mm512_set1_epi8('a' - 'A'); + do { + __m512i bytes = _mm512_loadu_epi8(src); + __mmask64 is_upper = _mm512_cmpge_epi8_mask(bytes, A_vec) & _mm512_cmple_epi8_mask(bytes, Z_vec); + _mm512_storeu_epi8(dst, _mm512_mask_add_epi8(bytes, is_upper, bytes, delta)); + src += 64; + dst += 64; + n -= 64; + } while (n >= 64); + if (n != 0) { + auto len_mask = static_cast<__mmask64>((~0ULL) >> (64 - n)); + __m512i bytes = _mm512_maskz_loadu_epi8(len_mask, src); + __mmask64 is_upper = _mm512_cmpge_epi8_mask(bytes, A_vec) & _mm512_cmple_epi8_mask(bytes, Z_vec); + _mm512_mask_storeu_epi8(dst, len_mask, _mm512_mask_add_epi8(bytes, is_upper, bytes, delta)); + } + return; + } +#endif + +#if defined(__AVX2__) + // 32 bytes per iteration. Same compare-and-OR pattern as SSE2. + { + const __m256i a_minus_one = _mm256_set1_epi8('A' - 1); + const __m256i z_plus_one = _mm256_set1_epi8('Z' + 1); + const __m256i bit5 = _mm256_set1_epi8(0x20); + while (n >= 32) { + __m256i bytes = _mm256_loadu_si256(reinterpret_cast(src)); + __m256i ge_A = _mm256_cmpgt_epi8(bytes, a_minus_one); + __m256i le_Z = _mm256_cmpgt_epi8(z_plus_one, bytes); + __m256i mask = _mm256_and_si256(_mm256_and_si256(ge_A, le_Z), bit5); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), _mm256_or_si256(bytes, mask)); + src += 32; + dst += 32; + n -= 32; + } + } +#endif + +#if defined(__SSE2__) + // 16 bytes per iteration. Signed compare works for ASCII A-Z because all + // letters live below 0x80; high bytes (0x80..0xFF) compare as negative + // and correctly miss the [A,Z] range so they pass through unchanged. + { + const __m128i a_minus_one = _mm_set1_epi8('A' - 1); + const __m128i z_plus_one = _mm_set1_epi8('Z' + 1); + const __m128i bit5 = _mm_set1_epi8(0x20); + while (n >= 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast(src)); + __m128i ge_A = _mm_cmpgt_epi8(bytes, a_minus_one); + __m128i le_Z = _mm_cmpgt_epi8(z_plus_one, bytes); + __m128i mask = _mm_and_si128(_mm_and_si128(ge_A, le_Z), bit5); + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_or_si128(bytes, mask)); + src += 16; + dst += 16; + n -= 16; + } + } +#elif defined(__ARM_NEON) || defined(__aarch64__) + // 16 bytes per iteration; unsigned compare available natively. + { + const uint8x16_t a_minus_one = vdupq_n_u8('A' - 1); + const uint8x16_t z_plus_one = vdupq_n_u8('Z' + 1); + const uint8x16_t bit5 = vdupq_n_u8(0x20); + while (n >= 16) { + uint8x16_t bytes = vld1q_u8(reinterpret_cast(src)); + uint8x16_t ge_A = vcgtq_u8(bytes, a_minus_one); + uint8x16_t le_Z = vcltq_u8(bytes, z_plus_one); + uint8x16_t mask = vandq_u8(vandq_u8(ge_A, le_Z), bit5); + vst1q_u8(reinterpret_cast(dst), vorrq_u8(bytes, mask)); + src += 16; + dst += 16; + n -= 16; + } + } +#endif + + while (n--) { + auto c = static_cast(*src++); + *dst++ = (c >= 'A' && c <= 'Z') ? static_cast(c | 0x20) : static_cast(c); + } +} + +} // namespace ts diff --git a/src/proxy/hdrs/URL.cc b/src/proxy/hdrs/URL.cc index 68d84b5d481..c8eff4dd69f 100644 --- a/src/proxy/hdrs/URL.cc +++ b/src/proxy/hdrs/URL.cc @@ -25,6 +25,7 @@ #include #include "tscore/ink_platform.h" #include "tscore/ink_memory.h" +#include "tscore/ink_memcpy_tolower.h" #include "proxy/hdrs/URL.h" #include "proxy/hdrs/MIME.h" #include "proxy/hdrs/HTTP.h" @@ -1684,16 +1685,6 @@ url_describe(HdrHeapObjImpl *raw, bool /* recurse ATS_UNUSED */) * * ***********************************************************************/ -static inline void -memcpy_tolower(char *d, const char *s, int n) -{ - while (n--) { - *d = ParseRules::ink_tolower(*s); - s++; - d++; - } -} - // fast path for CryptoHash, HTTP, no user/password/params/query, // no buffer overflow, no unescaping needed @@ -1704,7 +1695,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash char *p; p = buffer; - memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme); + ts::memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme); p += url->m_len_scheme; *p++ = ':'; *p++ = '/'; @@ -1713,7 +1704,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash *p++ = ':'; // no password *p++ = '@'; - memcpy_tolower(p, url->m_ptr_host, url->m_len_host); + ts::memcpy_tolower(p, url->m_ptr_host, url->m_len_host); p += url->m_len_host; *p++ = '/'; memcpy(p, url->m_ptr_path, url->m_len_path); diff --git a/tools/benchmark/CMakeLists.txt b/tools/benchmark/CMakeLists.txt index 49f25fad1c1..84fc111925e 100644 --- a/tools/benchmark/CMakeLists.txt +++ b/tools/benchmark/CMakeLists.txt @@ -36,6 +36,9 @@ target_link_libraries(benchmark_SharedMutex PRIVATE Catch2::Catch2 ts::tscore li add_executable(benchmark_Random benchmark_Random.cc) target_link_libraries(benchmark_Random PRIVATE Catch2::Catch2WithMain ts::tscore) +add_executable(benchmark_memcpy_tolower benchmark_memcpy_tolower.cc) +target_link_libraries(benchmark_memcpy_tolower PRIVATE Catch2::Catch2WithMain ts::tscore) + add_executable(benchmark_HostDB benchmark_HostDB.cc) target_link_libraries( benchmark_HostDB diff --git a/tools/benchmark/benchmark_memcpy_tolower.cc b/tools/benchmark/benchmark_memcpy_tolower.cc new file mode 100644 index 00000000000..e2b45b48b5d --- /dev/null +++ b/tools/benchmark/benchmark_memcpy_tolower.cc @@ -0,0 +1,167 @@ +/** @file + + Micro benchmark for ts::memcpy_tolower against a byte-at-a-time scalar + loop equivalent to the prior URL.cc::memcpy_tolower definition. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +#define CATCH_CONFIG_ENABLE_BENCHMARKING + +#include +#include + +#include "tscore/ink_memcpy_tolower.h" +#include "tscore/ParseRules.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace +{ + +// Sizes chosen to mirror the URL.cc hot path: +// 4-8B - common HTTP scheme strings ("http", "https") +// 16-32B - typical host names +// 64-256B - long host names / cache-key segments +// 1024B - stress the inner loop +constexpr std::array kSizes{4, 8, 16, 24, 32, 64, 256, 1024}; + +// Same character distribution we expect from URL/host input: ASCII letters +// (mixed case), digits, and the small set of non-alpha bytes that legitimately +// appear in URLs. +std::vector +make_mixed_case_ascii(std::size_t n, std::uint64_t seed = 0xABCDEFULL) +{ + std::mt19937_64 rng(seed); + std::vector v(n); + for (std::size_t i = 0; i < n; ++i) { + auto r = static_cast(rng() & 0x3FU); + if (r < 26U) { + v[i] = static_cast('A' + r); + } else if (r < 52U) { + v[i] = static_cast('a' + (r - 26U)); + } else { + static constexpr char kNonAlpha[] = "0123456789-_./:"; + v[i] = kNonAlpha[r % (sizeof(kNonAlpha) - 1U)]; + } + } + return v; +} + +// Mirror of the prior static inline memcpy_tolower() from URL.cc, kept here +// as the baseline the SIMD path is expected to beat. +inline void +memcpy_tolower_scalar(char *d, const char *s, std::size_t n) noexcept +{ + while (n--) { + *d = ParseRules::ink_tolower(*s); + ++s; + ++d; + } +} + +} // namespace + +TEST_CASE("active SIMD configuration", "[tolower][config]") +{ + // Print the compile-time ISA path so the benchmark output makes the + // selected configuration obvious. Cascades stack: AVX-512BW builds also + // emit the AVX2 and SSE2 drain loops; AVX2 builds emit the SSE2 drain. + std::cout << "ts::memcpy_tolower compiled with: "; +#if defined(__AVX512BW__) + std::cout << "AVX-512BW (64B body + masked tail, gated at n>=64) + AVX2 + SSE2 cascade"; +#elif defined(__AVX2__) + std::cout << "AVX2 (32B body) + SSE2 (16B drain)"; +#elif defined(__SSE2__) + std::cout << "SSE2 (16B body)"; +#elif defined(__ARM_NEON) || defined(__aarch64__) + std::cout << "NEON (16B body)"; +#else + std::cout << "scalar only"; +#endif + std::cout << '\n'; + SUCCEED(); +} + +TEST_CASE("ts::memcpy_tolower matches scalar reference", "[tolower][correctness]") +{ + // Cover sizes that bracket the 16-byte SIMD body: smaller-than, equal-to, + // a couple of multiples, and several offsets between multiples. + for (std::size_t sz : std::array{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 64, 257}) { + auto input = make_mixed_case_ascii(sz, 0xC0FFEE + sz); + std::vector expected(sz); + std::vector actual(sz); + + memcpy_tolower_scalar(expected.data(), input.data(), sz); + ts::memcpy_tolower(actual.data(), input.data(), sz); + + CAPTURE(sz); + REQUIRE(actual == expected); + } +} + +TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[tolower][correctness]") +{ + // Every byte value from 0..255 should round-trip unchanged unless it is in + // 'A'..'Z', in which case it should map to 'a'..'z'. This catches anyone + // who later tries to "speed things up" by widening the range to Latin-1. + std::array input; + for (std::size_t i = 0; i < 256; ++i) { + input[i] = static_cast(i); + } + std::array output; + ts::memcpy_tolower(output.data(), reinterpret_cast(input.data()), input.size()); + + for (std::size_t i = 0; i < 256; ++i) { + auto in = static_cast(i); + auto out = static_cast(output[i]); + auto exp = (in >= 'A' && in <= 'Z') ? static_cast(in | 0x20) : in; + CAPTURE(i); + REQUIRE(out == exp); + } +} + +TEST_CASE("memcpy_tolower throughput", "[bench][tolower]") +{ + for (std::size_t sz : kSizes) { + auto input = make_mixed_case_ascii(sz); + std::vector output_scalar(sz); + std::vector output_simd(sz); + + std::string scalar_name = "scalar " + std::to_string(sz) + "B"; + BENCHMARK(scalar_name.c_str()) + { + memcpy_tolower_scalar(output_scalar.data(), input.data(), sz); + return output_scalar[0]; + }; + + std::string simd_name = "ts::mct " + std::to_string(sz) + "B"; + BENCHMARK(simd_name.c_str()) + { + ts::memcpy_tolower(output_simd.data(), input.data(), sz); + return output_simd[0]; + }; + } +} From faeb167c8a55026b8e91e483d08770d64de31ba9 Mon Sep 17 00:00:00 2001 From: Phong Nguyen Date: Thu, 21 May 2026 20:18:47 +0000 Subject: [PATCH 2/4] tscore/QPACK: address #13167 review feedback - Move ts::memcpy_tolower correctness coverage out of the ENABLE_BENCHMARKS-gated benchmark and into a new src/tscore/unit_tests/test_ink_memcpy_tolower.cc so ctest exercises the scalar and SIMD paths in every build. Covers boundary sizes bracketing each SIMD body width, the exhaustive 0..255 byte-value sweep, and the in-place (dst == src) form (Copilot). - Fix the implementation-note comment on ts::memcpy_tolower to describe the actual AVX-512BW control flow (gated main loop + masked-tail load/store + early return), and document that in-place (dst == src) is supported on every path (Copilot). - Add a Catch::Benchmark::keep_memory barrier in benchmark_memcpy_tolower so the compiler can no longer DCE the inlined stores past the first observed byte (Copilot). - Migrate the in-place tolower loop in src/proxy/http3/QPACK.cc::_encode_header to ts::memcpy_tolower, demonstrating the in-place contract (bryancall). - Add Tony Finch's copytolower64.c attribution to NOTICE (masaori335). Co-Authored-By: Claude Opus 4.7 (1M context) --- NOTICE | 7 + include/tscore/ink_memcpy_tolower.h | 31 ++-- src/proxy/http3/QPACK.cc | 5 +- src/tscore/CMakeLists.txt | 1 + .../unit_tests/test_ink_memcpy_tolower.cc | 133 ++++++++++++++++++ tools/benchmark/benchmark_memcpy_tolower.cc | 49 ++----- 6 files changed, 173 insertions(+), 53 deletions(-) create mode 100644 src/tscore/unit_tests/test_ink_memcpy_tolower.cc diff --git a/NOTICE b/NOTICE index 31787fecbf0..ed57a6c6cad 100644 --- a/NOTICE +++ b/NOTICE @@ -118,3 +118,10 @@ LS-HPACK provides functionality to encode and decode HTTP headers using HPACK compression mechanism specified in RFC 7541. Copyright (c) 2018 - 2023 LiteSpeed Technologies Inc, (MIT License) https://github.com/litespeedtech/ls-hpack.git + +~~ + +include/tscore/ink_memcpy_tolower.h AVX-512BW kernel design (fused +mask_add and masked-tail load/store) is adapted from Tony Finch's +copytolower64.c (0BSD OR MIT-0). +https://dotat.at/cgi/git/vectolower.git/ diff --git a/include/tscore/ink_memcpy_tolower.h b/include/tscore/ink_memcpy_tolower.h index 3c93559ae5e..578ef3606b5 100644 --- a/include/tscore/ink_memcpy_tolower.h +++ b/include/tscore/ink_memcpy_tolower.h @@ -13,16 +13,27 @@ 'a'..'z'. All other bytes (including 0x80..0xFF) pass through unchanged. There is no UTF-8 case folding. - - The destination is written byte-for-byte; src and dst must point - to non-overlapping regions of size at least @n bytes. - - Implementation note: the bodies are stacked widest-first and each - drains its block size before falling through to the next. A build - with AVX-512BW gets the 64-byte body as the main loop, then at most - one 32-byte AVX2 iteration and one 16-byte SSE2 iteration to drain - the remainder before the scalar tail handles 0-15 bytes. Builds - without the wider ISAs simply skip those blocks. Selection is purely - compile-time; no runtime dispatch. + - In-place use (dst == src) is supported: every SIMD body loads a + full block into a register before storing back, and the AVX-512BW + masked tail uses masked-load/masked-store at the same offset. + Partial overlap where dst != src is not supported. + + Implementation note: selection is purely compile-time; no runtime + dispatch. The bodies are stacked widest-first. + + - AVX-512BW builds: when n >= 64, a 64-byte main loop handles the + bulk and a single masked load/store finishes any 1..63-byte tail, + then we return. When n < 64, we fall through to the AVX2 + SSE2 + cascade below so tiny inputs avoid the masked tail's fixed setup + cost. + + - AVX2 builds: a 32-byte main loop drains to a 16-byte SSE2 step + and then to a scalar tail of 0..15 bytes. + + - SSE2 / NEON builds: a single 16-byte main loop drains to a + scalar tail. + + - Other targets: scalar only. @section license License diff --git a/src/proxy/http3/QPACK.cc b/src/proxy/http3/QPACK.cc index dfdd2d278b3..e5145ecf18d 100644 --- a/src/proxy/http3/QPACK.cc +++ b/src/proxy/http3/QPACK.cc @@ -25,6 +25,7 @@ #include "proxy/hdrs/XPACK.h" #include "proxy/http3/QPACK.h" #include "tscore/ink_defs.h" +#include "tscore/ink_memcpy_tolower.h" #include "tscore/ink_memory.h" #define QPACKDebug(fmt, ...) Dbg(dbg_ctl_qpack, "[%s] " fmt, this->_qc->cids().data(), ##__VA_ARGS__) @@ -369,9 +370,7 @@ QPACK::_encode_header(const MIMEField &field, uint16_t base_index, IOBufferBlock { auto name{field.name_get()}; char *lowered_name = this->_arena.str_store(name.data(), name.length()); - for (size_t i = 0; i < name.length(); i++) { - lowered_name[i] = ParseRules::ink_tolower(lowered_name[i]); - } + ts::memcpy_tolower(lowered_name, lowered_name, name.length()); auto value{field.value_get()}; // TODO Set never_index flag on/off according to encoding headers diff --git a/src/tscore/CMakeLists.txt b/src/tscore/CMakeLists.txt index 7790adc87dd..6c87ba2b0bb 100644 --- a/src/tscore/CMakeLists.txt +++ b/src/tscore/CMakeLists.txt @@ -159,6 +159,7 @@ if(BUILD_TESTING) unit_tests/test_Tokenizer.cc unit_tests/test_arena.cc unit_tests/test_ink_inet.cc + unit_tests/test_ink_memcpy_tolower.cc unit_tests/test_ink_memory.cc unit_tests/test_ink_string.cc unit_tests/test_layout.cc diff --git a/src/tscore/unit_tests/test_ink_memcpy_tolower.cc b/src/tscore/unit_tests/test_ink_memcpy_tolower.cc new file mode 100644 index 00000000000..1a33b13c681 --- /dev/null +++ b/src/tscore/unit_tests/test_ink_memcpy_tolower.cc @@ -0,0 +1,133 @@ +/** @file + + Unit tests for ts::memcpy_tolower. + + Runs as part of the standard test_tscore binary so the helper's SIMD + and scalar paths are exercised by ctest in every build, not just when + ENABLE_BENCHMARKS is set. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +#include + +#include "tscore/ink_memcpy_tolower.h" +#include "tscore/ParseRules.h" + +#include +#include +#include +#include + +namespace +{ + +// Same mixed-case ASCII distribution we use in the benchmark, so the unit +// tests exercise inputs that look like real URL/header bytes. +std::vector +make_mixed_case_ascii(std::size_t n, std::uint64_t seed) +{ + std::mt19937_64 rng(seed); + std::vector v(n); + for (std::size_t i = 0; i < n; ++i) { + auto r = static_cast(rng() & 0x3FU); + if (r < 26U) { + v[i] = static_cast('A' + r); + } else if (r < 52U) { + v[i] = static_cast('a' + (r - 26U)); + } else { + static constexpr char kNonAlpha[] = "0123456789-_./:"; + v[i] = kNonAlpha[r % (sizeof(kNonAlpha) - 1U)]; + } + } + return v; +} + +// Byte-at-a-time reference, equivalent to the prior static-inline +// memcpy_tolower in URL.cc. Anything ts::memcpy_tolower produces must match +// this for every input we test. +void +memcpy_tolower_reference(char *d, const char *s, std::size_t n) noexcept +{ + while (n--) { + *d = ParseRules::ink_tolower(*s); + ++s; + ++d; + } +} + +} // namespace + +TEST_CASE("ts::memcpy_tolower matches scalar reference", "[ts_memcpy_tolower]") +{ + // Bracket every SIMD body width (16/32/64) with both equal-to and + // offset-from-multiple lengths so the cascade transitions and the + // AVX-512BW masked tail are all exercised. + for (std::size_t sz : std::array{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 63, 64, 65, 257}) { + auto input = make_mixed_case_ascii(sz, 0xC0FFEE + sz); + std::vector expected(sz); + std::vector actual(sz); + + memcpy_tolower_reference(expected.data(), input.data(), sz); + ts::memcpy_tolower(actual.data(), input.data(), sz); + + CAPTURE(sz); + REQUIRE(actual == expected); + } +} + +TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[ts_memcpy_tolower]") +{ + // Every byte value 0..255 should round-trip unchanged unless it is in + // 'A'..'Z', in which case it should map to 'a'..'z'. Guards against any + // future "speed-up" that widens the case-fold range past ASCII. + std::array input; + for (std::size_t i = 0; i < 256; ++i) { + input[i] = static_cast(i); + } + std::array output; + ts::memcpy_tolower(output.data(), reinterpret_cast(input.data()), input.size()); + + for (std::size_t i = 0; i < 256; ++i) { + auto in = static_cast(i); + auto out = static_cast(output[i]); + auto exp = (in >= 'A' && in <= 'Z') ? static_cast(in | 0x20) : in; + CAPTURE(i); + REQUIRE(out == exp); + } +} + +TEST_CASE("ts::memcpy_tolower supports in-place (dst == src)", "[ts_memcpy_tolower]") +{ + // In-place use must match what an out-of-place call would have produced. + // Run across the same boundary sizes as the basic correctness case so the + // SIMD bodies and the AVX-512BW masked load/store are all exercised + // in-place. + for (std::size_t sz : std::array{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 63, 64, 65, 257}) { + auto input = make_mixed_case_ascii(sz, 0xBADF00D + sz); + std::vector expected(sz); + std::vector in_place(input); + + memcpy_tolower_reference(expected.data(), input.data(), sz); + ts::memcpy_tolower(in_place.data(), in_place.data(), sz); + + CAPTURE(sz); + REQUIRE(in_place == expected); + } +} diff --git a/tools/benchmark/benchmark_memcpy_tolower.cc b/tools/benchmark/benchmark_memcpy_tolower.cc index e2b45b48b5d..079c0fdcb2f 100644 --- a/tools/benchmark/benchmark_memcpy_tolower.cc +++ b/tools/benchmark/benchmark_memcpy_tolower.cc @@ -26,6 +26,7 @@ #include #include +#include #include "tscore/ink_memcpy_tolower.h" #include "tscore/ParseRules.h" @@ -87,8 +88,7 @@ memcpy_tolower_scalar(char *d, const char *s, std::size_t n) noexcept TEST_CASE("active SIMD configuration", "[tolower][config]") { // Print the compile-time ISA path so the benchmark output makes the - // selected configuration obvious. Cascades stack: AVX-512BW builds also - // emit the AVX2 and SSE2 drain loops; AVX2 builds emit the SSE2 drain. + // selected configuration obvious. std::cout << "ts::memcpy_tolower compiled with: "; #if defined(__AVX512BW__) std::cout << "AVX-512BW (64B body + masked tail, gated at n>=64) + AVX2 + SSE2 cascade"; @@ -105,44 +105,6 @@ TEST_CASE("active SIMD configuration", "[tolower][config]") SUCCEED(); } -TEST_CASE("ts::memcpy_tolower matches scalar reference", "[tolower][correctness]") -{ - // Cover sizes that bracket the 16-byte SIMD body: smaller-than, equal-to, - // a couple of multiples, and several offsets between multiples. - for (std::size_t sz : std::array{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 64, 257}) { - auto input = make_mixed_case_ascii(sz, 0xC0FFEE + sz); - std::vector expected(sz); - std::vector actual(sz); - - memcpy_tolower_scalar(expected.data(), input.data(), sz); - ts::memcpy_tolower(actual.data(), input.data(), sz); - - CAPTURE(sz); - REQUIRE(actual == expected); - } -} - -TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[tolower][correctness]") -{ - // Every byte value from 0..255 should round-trip unchanged unless it is in - // 'A'..'Z', in which case it should map to 'a'..'z'. This catches anyone - // who later tries to "speed things up" by widening the range to Latin-1. - std::array input; - for (std::size_t i = 0; i < 256; ++i) { - input[i] = static_cast(i); - } - std::array output; - ts::memcpy_tolower(output.data(), reinterpret_cast(input.data()), input.size()); - - for (std::size_t i = 0; i < 256; ++i) { - auto in = static_cast(i); - auto out = static_cast(output[i]); - auto exp = (in >= 'A' && in <= 'Z') ? static_cast(in | 0x20) : in; - CAPTURE(i); - REQUIRE(out == exp); - } -} - TEST_CASE("memcpy_tolower throughput", "[bench][tolower]") { for (std::size_t sz : kSizes) { @@ -150,10 +112,16 @@ TEST_CASE("memcpy_tolower throughput", "[bench][tolower]") std::vector output_scalar(sz); std::vector output_simd(sz); + // Catch::Benchmark::keep_memory clobbers the buffer in the compiler's + // model, forcing it to materialize every byte we wrote. Without this an + // optimizing compiler can shrink or DCE the inline body's stores past + // the first element we observed. + std::string scalar_name = "scalar " + std::to_string(sz) + "B"; BENCHMARK(scalar_name.c_str()) { memcpy_tolower_scalar(output_scalar.data(), input.data(), sz); + Catch::Benchmark::keep_memory(output_scalar.data()); return output_scalar[0]; }; @@ -161,6 +129,7 @@ TEST_CASE("memcpy_tolower throughput", "[bench][tolower]") BENCHMARK(simd_name.c_str()) { ts::memcpy_tolower(output_simd.data(), input.data(), sz); + Catch::Benchmark::keep_memory(output_simd.data()); return output_simd[0]; }; } From df8e07bb90e9ed6cfe7564cbdd71b89a6afa029f Mon Sep 17 00:00:00 2001 From: Phong Nguyen Date: Thu, 21 May 2026 20:59:37 +0000 Subject: [PATCH 3/4] tscore: rename memcpy_tolower to ts::ascii::tolower_{copy,inplace} memcpy_tolower carried two warts: the "memcpy" prefix implied non-overlapping by convention with libc memcpy (we explicitly support the in-place case), and the unqualified name didn't surface the ASCII-only semantics. Rename the helper to ts::ascii::tolower_copy and add a thin ts::ascii::tolower_inplace(buf, n) wrapper so call sites that operate on a single buffer read naturally instead of passing the same pointer twice. Rename the header to include/tscore/ink_ascii_tolower.h, the unit test to src/tscore/unit_tests/test_ink_ascii_tolower.cc, and the benchmark to tools/benchmark/benchmark_ascii_tolower.cc to match. Update the two existing call sites (URL.cc fast-path scheme/host and QPACK::_encode_header in-place name lowercasing) accordingly. No behavior change: the helper bodies are unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- NOTICE | 2 +- ...k_memcpy_tolower.h => ink_ascii_tolower.h} | 37 ++++++++++++------- src/proxy/hdrs/URL.cc | 6 +-- src/proxy/http3/QPACK.cc | 4 +- src/tscore/CMakeLists.txt | 2 +- ...y_tolower.cc => test_ink_ascii_tolower.cc} | 33 ++++++++--------- tools/benchmark/CMakeLists.txt | 4 +- ..._tolower.cc => benchmark_ascii_tolower.cc} | 20 +++++----- 8 files changed, 59 insertions(+), 49 deletions(-) rename include/tscore/{ink_memcpy_tolower.h => ink_ascii_tolower.h} (84%) rename src/tscore/unit_tests/{test_ink_memcpy_tolower.cc => test_ink_ascii_tolower.cc} (76%) rename tools/benchmark/{benchmark_memcpy_tolower.cc => benchmark_ascii_tolower.cc} (85%) diff --git a/NOTICE b/NOTICE index ed57a6c6cad..c50ea163f76 100644 --- a/NOTICE +++ b/NOTICE @@ -121,7 +121,7 @@ https://github.com/litespeedtech/ls-hpack.git ~~ -include/tscore/ink_memcpy_tolower.h AVX-512BW kernel design (fused +include/tscore/ink_ascii_tolower.h AVX-512BW kernel design (fused mask_add and masked-tail load/store) is adapted from Tony Finch's copytolower64.c (0BSD OR MIT-0). https://dotat.at/cgi/git/vectolower.git/ diff --git a/include/tscore/ink_memcpy_tolower.h b/include/tscore/ink_ascii_tolower.h similarity index 84% rename from include/tscore/ink_memcpy_tolower.h rename to include/tscore/ink_ascii_tolower.h index 578ef3606b5..d98aafb48dc 100644 --- a/include/tscore/ink_memcpy_tolower.h +++ b/include/tscore/ink_ascii_tolower.h @@ -3,9 +3,11 @@ SIMD-accelerated bulk ASCII tolower copy. Used on the URL canonicalization fast path for cache-key digests - (src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast). The scalar loop is - the bottleneck for hosts and schemes long enough to vectorize; for - shorter inputs the scalar tail handles them with no SIMD overhead. + (src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast) and any other place + that needs to fold ASCII to lowercase over a small-to-moderate + buffer. The scalar byte-at-a-time loop is the bottleneck for hosts + and schemes long enough to vectorize; for shorter inputs the scalar + tail handles them with no SIMD overhead. Semantics match a byte-at-a-time loop using ParseRules::ink_tolower(): @@ -13,13 +15,14 @@ 'a'..'z'. All other bytes (including 0x80..0xFF) pass through unchanged. There is no UTF-8 case folding. - - In-place use (dst == src) is supported: every SIMD body loads a - full block into a register before storing back, and the AVX-512BW - masked tail uses masked-load/masked-store at the same offset. - Partial overlap where dst != src is not supported. + - In-place use (dst == src) is supported on every path. Each SIMD + body loads a full block into a register before storing back at + the same offset, and the AVX-512BW masked tail does masked-load + / masked-store at the same offset. Partial overlap where + dst != src but the ranges intersect is not supported. Implementation note: selection is purely compile-time; no runtime - dispatch. The bodies are stacked widest-first. + dispatch. Bodies are stacked widest-first. - AVX-512BW builds: when n >= 64, a 64-byte main loop handles the bulk and a single masked load/store finishes any 1..63-byte tail, @@ -64,11 +67,11 @@ #include #endif -namespace ts +namespace ts::ascii { inline void -memcpy_tolower(char *dst, const char *src, std::size_t n) noexcept +tolower_copy(char *dst, const char *src, std::size_t n) noexcept { #if defined(__AVX512BW__) // AVX-512BW: 64 bytes per iteration with two key optimizations over the @@ -84,8 +87,7 @@ memcpy_tolower(char *dst, const char *src, std::size_t n) noexcept // tiny inputs fall through to the AVX2/SSE2 path below, where they keep // the speedup that path already provides. // - // Inspired by Tony Finch's copytolower64.c - // (https://dotat.at/cgi/git/vectolower.git/). + // Adapted from Tony Finch's copytolower64.c (see NOTICE). if (n >= 64) { const __m512i A_vec = _mm512_set1_epi8('A'); const __m512i Z_vec = _mm512_set1_epi8('Z'); @@ -171,4 +173,13 @@ memcpy_tolower(char *dst, const char *src, std::size_t n) noexcept } } -} // namespace ts +// Thin sugar over tolower_copy for the in-place case. Makes call sites +// like ts::ascii::tolower_inplace(buf, n) read naturally instead of +// ts::ascii::tolower_copy(buf, buf, n). +inline void +tolower_inplace(char *buf, std::size_t n) noexcept +{ + tolower_copy(buf, buf, n); +} + +} // namespace ts::ascii diff --git a/src/proxy/hdrs/URL.cc b/src/proxy/hdrs/URL.cc index c8eff4dd69f..1c9eb4170b2 100644 --- a/src/proxy/hdrs/URL.cc +++ b/src/proxy/hdrs/URL.cc @@ -25,7 +25,7 @@ #include #include "tscore/ink_platform.h" #include "tscore/ink_memory.h" -#include "tscore/ink_memcpy_tolower.h" +#include "tscore/ink_ascii_tolower.h" #include "proxy/hdrs/URL.h" #include "proxy/hdrs/MIME.h" #include "proxy/hdrs/HTTP.h" @@ -1695,7 +1695,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash char *p; p = buffer; - ts::memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme); + ts::ascii::tolower_copy(p, url->m_ptr_scheme, url->m_len_scheme); p += url->m_len_scheme; *p++ = ':'; *p++ = '/'; @@ -1704,7 +1704,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash *p++ = ':'; // no password *p++ = '@'; - ts::memcpy_tolower(p, url->m_ptr_host, url->m_len_host); + ts::ascii::tolower_copy(p, url->m_ptr_host, url->m_len_host); p += url->m_len_host; *p++ = '/'; memcpy(p, url->m_ptr_path, url->m_len_path); diff --git a/src/proxy/http3/QPACK.cc b/src/proxy/http3/QPACK.cc index e5145ecf18d..92ac2b024f6 100644 --- a/src/proxy/http3/QPACK.cc +++ b/src/proxy/http3/QPACK.cc @@ -25,7 +25,7 @@ #include "proxy/hdrs/XPACK.h" #include "proxy/http3/QPACK.h" #include "tscore/ink_defs.h" -#include "tscore/ink_memcpy_tolower.h" +#include "tscore/ink_ascii_tolower.h" #include "tscore/ink_memory.h" #define QPACKDebug(fmt, ...) Dbg(dbg_ctl_qpack, "[%s] " fmt, this->_qc->cids().data(), ##__VA_ARGS__) @@ -370,7 +370,7 @@ QPACK::_encode_header(const MIMEField &field, uint16_t base_index, IOBufferBlock { auto name{field.name_get()}; char *lowered_name = this->_arena.str_store(name.data(), name.length()); - ts::memcpy_tolower(lowered_name, lowered_name, name.length()); + ts::ascii::tolower_inplace(lowered_name, name.length()); auto value{field.value_get()}; // TODO Set never_index flag on/off according to encoding headers diff --git a/src/tscore/CMakeLists.txt b/src/tscore/CMakeLists.txt index 6c87ba2b0bb..ceb770d7ecc 100644 --- a/src/tscore/CMakeLists.txt +++ b/src/tscore/CMakeLists.txt @@ -158,8 +158,8 @@ if(BUILD_TESTING) unit_tests/test_Throttler.cc unit_tests/test_Tokenizer.cc unit_tests/test_arena.cc + unit_tests/test_ink_ascii_tolower.cc unit_tests/test_ink_inet.cc - unit_tests/test_ink_memcpy_tolower.cc unit_tests/test_ink_memory.cc unit_tests/test_ink_string.cc unit_tests/test_layout.cc diff --git a/src/tscore/unit_tests/test_ink_memcpy_tolower.cc b/src/tscore/unit_tests/test_ink_ascii_tolower.cc similarity index 76% rename from src/tscore/unit_tests/test_ink_memcpy_tolower.cc rename to src/tscore/unit_tests/test_ink_ascii_tolower.cc index 1a33b13c681..5f2cb8df3c9 100644 --- a/src/tscore/unit_tests/test_ink_memcpy_tolower.cc +++ b/src/tscore/unit_tests/test_ink_ascii_tolower.cc @@ -1,6 +1,6 @@ /** @file - Unit tests for ts::memcpy_tolower. + Unit tests for ts::ascii::tolower_copy and ts::ascii::tolower_inplace. Runs as part of the standard test_tscore binary so the helper's SIMD and scalar paths are exercised by ctest in every build, not just when @@ -27,7 +27,7 @@ #include -#include "tscore/ink_memcpy_tolower.h" +#include "tscore/ink_ascii_tolower.h" #include "tscore/ParseRules.h" #include @@ -60,10 +60,10 @@ make_mixed_case_ascii(std::size_t n, std::uint64_t seed) } // Byte-at-a-time reference, equivalent to the prior static-inline -// memcpy_tolower in URL.cc. Anything ts::memcpy_tolower produces must match -// this for every input we test. +// memcpy_tolower in URL.cc. Anything ts::ascii::tolower_copy produces must +// match this for every input we test. void -memcpy_tolower_reference(char *d, const char *s, std::size_t n) noexcept +tolower_reference(char *d, const char *s, std::size_t n) noexcept { while (n--) { *d = ParseRules::ink_tolower(*s); @@ -74,7 +74,7 @@ memcpy_tolower_reference(char *d, const char *s, std::size_t n) noexcept } // namespace -TEST_CASE("ts::memcpy_tolower matches scalar reference", "[ts_memcpy_tolower]") +TEST_CASE("ts::ascii::tolower_copy matches scalar reference", "[ts_ascii_tolower]") { // Bracket every SIMD body width (16/32/64) with both equal-to and // offset-from-multiple lengths so the cascade transitions and the @@ -84,15 +84,15 @@ TEST_CASE("ts::memcpy_tolower matches scalar reference", "[ts_memcpy_tolower]") std::vector expected(sz); std::vector actual(sz); - memcpy_tolower_reference(expected.data(), input.data(), sz); - ts::memcpy_tolower(actual.data(), input.data(), sz); + tolower_reference(expected.data(), input.data(), sz); + ts::ascii::tolower_copy(actual.data(), input.data(), sz); CAPTURE(sz); REQUIRE(actual == expected); } } -TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[ts_memcpy_tolower]") +TEST_CASE("ts::ascii::tolower_copy preserves non-ASCII bytes", "[ts_ascii_tolower]") { // Every byte value 0..255 should round-trip unchanged unless it is in // 'A'..'Z', in which case it should map to 'a'..'z'. Guards against any @@ -102,7 +102,7 @@ TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[ts_memcpy_tolower]") input[i] = static_cast(i); } std::array output; - ts::memcpy_tolower(output.data(), reinterpret_cast(input.data()), input.size()); + ts::ascii::tolower_copy(output.data(), reinterpret_cast(input.data()), input.size()); for (std::size_t i = 0; i < 256; ++i) { auto in = static_cast(i); @@ -113,19 +113,18 @@ TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[ts_memcpy_tolower]") } } -TEST_CASE("ts::memcpy_tolower supports in-place (dst == src)", "[ts_memcpy_tolower]") +TEST_CASE("ts::ascii::tolower_inplace matches tolower_copy", "[ts_ascii_tolower]") { - // In-place use must match what an out-of-place call would have produced. - // Run across the same boundary sizes as the basic correctness case so the - // SIMD bodies and the AVX-512BW masked load/store are all exercised - // in-place. + // The inplace form must produce the same result as a non-overlapping copy. + // Exercise the same boundary sizes so the SIMD bodies and the AVX-512BW + // masked load/store are all exercised in-place. for (std::size_t sz : std::array{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 63, 64, 65, 257}) { auto input = make_mixed_case_ascii(sz, 0xBADF00D + sz); std::vector expected(sz); std::vector in_place(input); - memcpy_tolower_reference(expected.data(), input.data(), sz); - ts::memcpy_tolower(in_place.data(), in_place.data(), sz); + tolower_reference(expected.data(), input.data(), sz); + ts::ascii::tolower_inplace(in_place.data(), sz); CAPTURE(sz); REQUIRE(in_place == expected); diff --git a/tools/benchmark/CMakeLists.txt b/tools/benchmark/CMakeLists.txt index 84fc111925e..c08e9a94d12 100644 --- a/tools/benchmark/CMakeLists.txt +++ b/tools/benchmark/CMakeLists.txt @@ -36,8 +36,8 @@ target_link_libraries(benchmark_SharedMutex PRIVATE Catch2::Catch2 ts::tscore li add_executable(benchmark_Random benchmark_Random.cc) target_link_libraries(benchmark_Random PRIVATE Catch2::Catch2WithMain ts::tscore) -add_executable(benchmark_memcpy_tolower benchmark_memcpy_tolower.cc) -target_link_libraries(benchmark_memcpy_tolower PRIVATE Catch2::Catch2WithMain ts::tscore) +add_executable(benchmark_ascii_tolower benchmark_ascii_tolower.cc) +target_link_libraries(benchmark_ascii_tolower PRIVATE Catch2::Catch2WithMain ts::tscore) add_executable(benchmark_HostDB benchmark_HostDB.cc) target_link_libraries( diff --git a/tools/benchmark/benchmark_memcpy_tolower.cc b/tools/benchmark/benchmark_ascii_tolower.cc similarity index 85% rename from tools/benchmark/benchmark_memcpy_tolower.cc rename to tools/benchmark/benchmark_ascii_tolower.cc index 079c0fdcb2f..4c799713c99 100644 --- a/tools/benchmark/benchmark_memcpy_tolower.cc +++ b/tools/benchmark/benchmark_ascii_tolower.cc @@ -1,7 +1,7 @@ /** @file - Micro benchmark for ts::memcpy_tolower against a byte-at-a-time scalar - loop equivalent to the prior URL.cc::memcpy_tolower definition. + Micro benchmark for ts::ascii::tolower_copy against a byte-at-a-time + scalar loop equivalent to the prior URL.cc::memcpy_tolower definition. @section license License @@ -28,7 +28,7 @@ #include #include -#include "tscore/ink_memcpy_tolower.h" +#include "tscore/ink_ascii_tolower.h" #include "tscore/ParseRules.h" #include @@ -74,7 +74,7 @@ make_mixed_case_ascii(std::size_t n, std::uint64_t seed = 0xABCDEFULL) // Mirror of the prior static inline memcpy_tolower() from URL.cc, kept here // as the baseline the SIMD path is expected to beat. inline void -memcpy_tolower_scalar(char *d, const char *s, std::size_t n) noexcept +tolower_scalar(char *d, const char *s, std::size_t n) noexcept { while (n--) { *d = ParseRules::ink_tolower(*s); @@ -89,7 +89,7 @@ TEST_CASE("active SIMD configuration", "[tolower][config]") { // Print the compile-time ISA path so the benchmark output makes the // selected configuration obvious. - std::cout << "ts::memcpy_tolower compiled with: "; + std::cout << "ts::ascii::tolower_copy compiled with: "; #if defined(__AVX512BW__) std::cout << "AVX-512BW (64B body + masked tail, gated at n>=64) + AVX2 + SSE2 cascade"; #elif defined(__AVX2__) @@ -105,7 +105,7 @@ TEST_CASE("active SIMD configuration", "[tolower][config]") SUCCEED(); } -TEST_CASE("memcpy_tolower throughput", "[bench][tolower]") +TEST_CASE("tolower throughput", "[bench][tolower]") { for (std::size_t sz : kSizes) { auto input = make_mixed_case_ascii(sz); @@ -117,18 +117,18 @@ TEST_CASE("memcpy_tolower throughput", "[bench][tolower]") // optimizing compiler can shrink or DCE the inline body's stores past // the first element we observed. - std::string scalar_name = "scalar " + std::to_string(sz) + "B"; + std::string scalar_name = "scalar " + std::to_string(sz) + "B"; BENCHMARK(scalar_name.c_str()) { - memcpy_tolower_scalar(output_scalar.data(), input.data(), sz); + tolower_scalar(output_scalar.data(), input.data(), sz); Catch::Benchmark::keep_memory(output_scalar.data()); return output_scalar[0]; }; - std::string simd_name = "ts::mct " + std::to_string(sz) + "B"; + std::string simd_name = "ts::atc " + std::to_string(sz) + "B"; BENCHMARK(simd_name.c_str()) { - ts::memcpy_tolower(output_simd.data(), input.data(), sz); + ts::ascii::tolower_copy(output_simd.data(), input.data(), sz); Catch::Benchmark::keep_memory(output_simd.data()); return output_simd[0]; }; From fb9b195c33903945c7b7bd3fb5f37b60a51c9d96 Mon Sep 17 00:00:00 2001 From: Phong Nguyen Date: Thu, 21 May 2026 21:17:31 +0000 Subject: [PATCH 4/4] proxy: migrate HPACK / UrlRewrite tolower loops and add behavioral tests Migrate two more byte-at-a-time ASCII tolower loops to ts::ascii::tolower_copy. Both call sites use a separate destination buffer, so the copy form is the right fit: - hpack_encode_header_block(): lower-cases each MIMEField name before encoding to match the HTTP/2 lowercase-header-name requirement. - UrlRewrite::_mappingLookup(): lower-cases the incoming request host into a stack buffer before the table lookup, so the lookup is case-insensitive against the lower-cased keys built at config-load time. The previous code used libc tolower(int) on signed char values, which is technically UB for bytes >= 0x80; the new call avoids that. The existing unit tests in test_URL, test_HpackIndexingTable, and test_RemapRules executed the tolower paths but only with inputs that were already lower-case, so they would have missed a "skip the lowercasing" regression. Add focused behavioral coverage: - test_URL.cc: four extra get_hash_test_cases that hash a request with uppercase/mixed-case scheme or host and require an equal hash to the lower-case form. Includes a 49-byte uppercase host that crosses both the 16- and 32-byte SIMD bodies. - test_RemapRules.cc: a new SCENARIO that builds a UrlRewrite from a map for a lower-case host and requires that uppercase, mixed-case, and long-uppercase request hosts all match. - test_HpackIndexingTable.cc: a new TEST_CASE that encodes a long mixed-case field name with hpack_encode_header_block and requires the encoded byte stream to be identical to encoding the same field with an already-lower-case name. QPACK already exercises the in-place path through its Encoding test and the helper's own ts::ascii::tolower_inplace unit test covers in-place semantics exhaustively; an additional focused QPACK test would need the external .qif fixture infrastructure, which is out of scope here. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/proxy/hdrs/unit_tests/test_URL.cc | 33 +++++++++++ src/proxy/http/remap/UrlRewrite.cc | 7 +-- .../http/remap/unit-tests/test_RemapRules.cc | 55 +++++++++++++++++++ src/proxy/http2/HPACK.cc | 5 +- .../unit_tests/test_HpackIndexingTable.cc | 32 +++++++++++ 5 files changed, 125 insertions(+), 7 deletions(-) diff --git a/src/proxy/hdrs/unit_tests/test_URL.cc b/src/proxy/hdrs/unit_tests/test_URL.cc index dc5ff4ade74..8dda4e9501a 100644 --- a/src/proxy/hdrs/unit_tests/test_URL.cc +++ b/src/proxy/hdrs/unit_tests/test_URL.cc @@ -659,6 +659,39 @@ std::vector get_hash_test_cases = { !IGNORE_QUERY, HAS_EQUAL_HASH, }, + { + // Verifies the scheme/host SIMD-tolower path in url_CryptoHash_get_fast: + // an uppercase host with a long enough prefix to hit the 16-byte SIMD + // body should hash identically to its lowercased form. + "Uppercase host: equal hashes", + "http://ONE.EXAMPLE.COM/a/path?name=value", + "http://one.example.com/a/path?name=value", + !IGNORE_QUERY, + HAS_EQUAL_HASH, + }, + { + "Mixed-case host: equal hashes", + "http://One.Example.Com/a/path?name=value", + "http://one.example.com/a/path?name=value", + !IGNORE_QUERY, + HAS_EQUAL_HASH, + }, + { + "Uppercase scheme: equal hashes", + "HTTP://one.example.com/a/path?name=value", + "http://one.example.com/a/path?name=value", + !IGNORE_QUERY, + HAS_EQUAL_HASH, + }, + { + // Long uppercase host crosses 16- and 32-byte SIMD body boundaries so + // the wider paths (when compiled in) are exercised by this fixture. + "Long uppercase host: equal hashes", + "http://A-VERY-LONG-HOST-NAME-FOR-SIMD.EXAMPLE.COM/a/path", + "http://a-very-long-host-name-for-simd.example.com/a/path", + !IGNORE_QUERY, + HAS_EQUAL_HASH, + }, }; /** Return the hash related to a URI. diff --git a/src/proxy/http/remap/UrlRewrite.cc b/src/proxy/http/remap/UrlRewrite.cc index fbda217462b..d2af3b343e9 100644 --- a/src/proxy/http/remap/UrlRewrite.cc +++ b/src/proxy/http/remap/UrlRewrite.cc @@ -22,6 +22,8 @@ */ +#include "tscore/ink_ascii_tolower.h" + #include "proxy/http/remap/UrlRewrite.h" #include "proxy/http/remap/RemapYamlConfig.h" #include "iocore/eventsystem/ConfigProcessor.h" @@ -931,10 +933,7 @@ UrlRewrite::_mappingLookup(MappingsStore &mappings, URL *request_url, int reques return false; } - // lowercase - for (int i = 0; i < request_host_len; ++i) { - request_host_lower[i] = tolower(request_host[i]); - } + ts::ascii::tolower_copy(request_host_lower, request_host, request_host_len); request_host_lower[request_host_len] = 0; bool retval = false; diff --git a/src/proxy/http/remap/unit-tests/test_RemapRules.cc b/src/proxy/http/remap/unit-tests/test_RemapRules.cc index 012e8b55bd7..6f2dfc32668 100644 --- a/src/proxy/http/remap/unit-tests/test_RemapRules.cc +++ b/src/proxy/http/remap/unit-tests/test_RemapRules.cc @@ -225,3 +225,58 @@ map_with_recv_port http://front.example.com \ } } } + +SCENARIO("UrlRewrite host lookup is case-insensitive", "[proxy][remap]") +{ + // _mappingLookup lower-cases the request host before consulting the hash + // table; these scenarios exercise that path with inputs that would not + // match in a strict byte-compare. Sized to cross the 16-byte SSE2 body + // for hosts that get a real SIMD pass. + GIVEN("A forward map with a lowercase source host") + { + auto urlrw = std::make_unique(); + std::string config = R"RMCFG( +map http://www.example.com http://origin.example.com + )RMCFG"; + + auto cpath = write_test_remap(config, "case_insensitive"); + int rc = urlrw->BuildTable(cpath.c_str()); + REQUIRE(rc == TS_SUCCESS); + REQUIRE(urlrw->rule_count() == 1); + + EasyURL url("http://www.example.com"); + UrlMappingContainer urlmap; + + THEN("uppercase request host matches the lowercase rule") + { + const char *host = "WWW.EXAMPLE.COM"; + REQUIRE(urlrw->forwardMappingLookup(&url.url, 80, host, strlen(host), urlmap)); + } + THEN("mixed-case request host matches the lowercase rule") + { + const char *host = "Www.Example.Com"; + REQUIRE(urlrw->forwardMappingLookup(&url.url, 80, host, strlen(host), urlmap)); + } + } + + GIVEN("A forward map with a long host that exercises the 16-byte SIMD body") + { + auto urlrw = std::make_unique(); + std::string config = R"RMCFG( +map http://a-very-long-host-name-for-simd.example.com http://origin.example.com + )RMCFG"; + + auto cpath = write_test_remap(config, "case_insensitive_long"); + int rc = urlrw->BuildTable(cpath.c_str()); + REQUIRE(rc == TS_SUCCESS); + + EasyURL url("http://a-very-long-host-name-for-simd.example.com"); + UrlMappingContainer urlmap; + + THEN("an all-uppercase 49-char host (covers >=32 SIMD bytes) matches") + { + const char *host = "A-VERY-LONG-HOST-NAME-FOR-SIMD.EXAMPLE.COM"; + REQUIRE(urlrw->forwardMappingLookup(&url.url, 80, host, strlen(host), urlmap)); + } + } +} diff --git a/src/proxy/http2/HPACK.cc b/src/proxy/http2/HPACK.cc index 7e4fd974f57..34ff607ced2 100644 --- a/src/proxy/http2/HPACK.cc +++ b/src/proxy/http2/HPACK.cc @@ -23,6 +23,7 @@ #include "proxy/http2/HPACK.h" +#include "tscore/ink_ascii_tolower.h" #include "tsutil/LocalBuffer.h" #include "swoc/TextView.h" @@ -789,9 +790,7 @@ hpack_encode_header_block(HpackIndexingTable &indexing_table, uint8_t *out_buf, int name_len = original_name.size(); ts::LocalBuffer local_buffer(name_len); char *lower_name = local_buffer.data(); - for (int i = 0; i < name_len; i++) { - lower_name[i] = ParseRules::ink_tolower(original_name[i]); - } + ts::ascii::tolower_copy(lower_name, original_name.data(), name_len); std::string_view name{lower_name, static_cast(name_len)}; std::string_view value = field.value_get(); diff --git a/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc b/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc index ad373211fb8..7692931b1af 100644 --- a/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc +++ b/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc @@ -24,6 +24,7 @@ limitations under the License. */ +#include #include #include @@ -531,3 +532,34 @@ TEST_CASE("HPACK high level APIs", "[hpack]") } } } + +// Validates that hpack_encode_header_block() lower-cases mixed-case field +// names per RFC 7540 ยง 8.1.2 before emitting them. The lower-case step is the +// path that goes through ts::ascii::tolower_copy; if a regression broke the +// lowercasing, the byte-for-byte comparison below would fail. +TEST_CASE("HPACK encode lower-cases mixed-case field names", "[hpack]") +{ + uint8_t buf_mixed[BUFSIZE_FOR_REGRESSION_TEST]; + uint8_t buf_lower[BUFSIZE_FOR_REGRESSION_TEST]; + HpackIndexingTable table_mixed(MAX_TABLE_SIZE); + HpackIndexingTable table_lower(MAX_TABLE_SIZE); + + // Use a name long enough to exercise the 16-byte SSE2 body when present. + auto encode_one = [](HpackIndexingTable &table, uint8_t *buf, const char *name, const char *value) -> int64_t { + std::unique_ptr headers(new HTTPHdr, destroy_http_hdr); + headers->create(HTTPType::REQUEST); + MIMEField *field = mime_field_create(headers->m_heap, headers->m_http->m_fields_impl); + field->name_set(headers->m_heap, headers->m_http->m_fields_impl, std::string_view{name}); + field->value_set(headers->m_heap, headers->m_http->m_fields_impl, std::string_view{value}); + mime_hdr_field_attach(headers->m_http->m_fields_impl, field, 1, nullptr); + std::memset(buf, 0, BUFSIZE_FOR_REGRESSION_TEST); + return hpack_encode_header_block(table, buf, BUFSIZE_FOR_REGRESSION_TEST, headers.get()); + }; + + int64_t mixed_len = encode_one(table_mixed, buf_mixed, "Long-Custom-Header-Name", "abc"); + int64_t lower_len = encode_one(table_lower, buf_lower, "long-custom-header-name", "abc"); + + REQUIRE(mixed_len > 0); + REQUIRE(mixed_len == lower_len); + REQUIRE(std::memcmp(buf_mixed, buf_lower, lower_len) == 0); +}