From 32016aad8fc97ec578fc120738b0c0b64212fbb1 Mon Sep 17 00:00:00 2001
From: Phong Nguyen <phongn@gmail.com>
Date: Thu, 14 May 2026 19:52:33 +0000
Subject: [PATCH 1/4] tscore: SIMD ts::memcpy_tolower; use in URL.cc cache-key
 fast path

The bulk ASCII tolower loop used to canonicalize the scheme and host
portions of a URL before hashing into the cache key runs at ~1.5 GB/s
scalar (one byte and one ParseRules table lookup per iteration). The
work is trivially data-parallel and there is no per-byte branching, so
a SIMD kernel that lowercases a whole register at once gives a
straightforward speedup once the input is long enough to amortize the
vector setup.

Add a header-only helper ts::memcpy_tolower under
include/tscore/ink_memcpy_tolower.h with a compile-time-selected
cascade of SIMD bodies: 64-byte AVX-512BW, 32-byte AVX2, 16-byte SSE2
on x86_64, plus 16-byte NEON on ARMv8. Wider bodies fall through to
narrower drain loops, so the worst-case scalar tail is always <16
bytes. Selection is purely compile-time; runtime ifunc dispatch is
left for a follow-up.

The AVX-512BW body uses _mm512_mask_add_epi8 to fuse the conditional
"+0x20 where upper" into a single op, and a masked load/store handles
1..63 leftover bytes in a single SIMD pass (inspired by Tony Finch's
copytolower64.c, https://dotat.at/cgi/git/vectolower.git/). The whole
AVX-512BW block is gated at n >= 64 because the masked load/store has
~7 ns of fixed setup that loses to the narrower paths for short
inputs; below 64 bytes we fall through to the AVX2 + SSE2 cascade.

Semantics match the existing ParseRules::ink_tolower table exactly:
bytes in 'A'..'Z' map to 'a'..'z', all others (including 0x80..0xFF)
pass through unchanged.

Replace the static inline memcpy_tolower in src/proxy/hdrs/URL.cc with
this helper. Baseline x86_64 builds use the 16-byte SSE2 path; builds
that opt into a wider -march (x86-64-v3 = AVX2, x86-64-v4 = AVX-512BW)
get the wider bodies automatically. Sub-16-byte inputs (e.g. short
HTTP schemes like "http") use the scalar tail and see no perf change.

Measured throughput on a 2.0 GHz Ice Lake Xeon Gold 6338, mean ns:

  size   scalar   SSE2     AVX2     AVX-512BW
  ----   ------   ----     ----     ---------
  16 B   10.4     2.15     1.75     1.98
  32 B   15.4     2.90     2.24     2.31
  64 B   28.0     4.43     2.85     2.61
  256 B  113      13.87    7.57     6.20
  1024 B 425      50.47    24.23    17.49

Speedup vs scalar at 1024 B: SSE2 8.4x, AVX2 17.5x, AVX-512BW 24.3x.

A new microbenchmark under tools/benchmark covers correctness across
sizes 0..257 (bracketing each SIMD body size) plus an exhaustive byte-
value sweep that guards against any future widening of the case-fold
range, alongside scalar-vs-SIMD throughput numbers and a config-print
case that emits the selected ISA path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 include/tscore/ink_memcpy_tolower.h         | 163 +++++++++++++++++++
 src/proxy/hdrs/URL.cc                       |  15 +-
 tools/benchmark/CMakeLists.txt              |   3 +
 tools/benchmark/benchmark_memcpy_tolower.cc | 167 ++++++++++++++++++++
 4 files changed, 336 insertions(+), 12 deletions(-)
 create mode 100644 include/tscore/ink_memcpy_tolower.h
 create mode 100644 tools/benchmark/benchmark_memcpy_tolower.cc

diff --git a/include/tscore/ink_memcpy_tolower.h b/include/tscore/ink_memcpy_tolower.h
new file mode 100644
index 00000000000..3c93559ae5e
--- /dev/null
+++ b/include/tscore/ink_memcpy_tolower.h
@@ -0,0 +1,163 @@
+/** @file
+
+  SIMD-accelerated bulk ASCII tolower copy.
+
+  Used on the URL canonicalization fast path for cache-key digests
+  (src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast). The scalar loop is
+  the bottleneck for hosts and schemes long enough to vectorize; for
+  shorter inputs the scalar tail handles them with no SIMD overhead.
+
+  Semantics match a byte-at-a-time loop using ParseRules::ink_tolower():
+
+    - Bytes in 'A'..'Z' (0x41..0x5A) have bit 5 set, mapping them to
+      'a'..'z'. All other bytes (including 0x80..0xFF) pass through
+      unchanged. There is no UTF-8 case folding.
+
+    - The destination is written byte-for-byte; src and dst must point
+      to non-overlapping regions of size at least @n bytes.
+
+  Implementation note: the bodies are stacked widest-first and each
+  drains its block size before falling through to the next. A build
+  with AVX-512BW gets the 64-byte body as the main loop, then at most
+  one 32-byte AVX2 iteration and one 16-byte SSE2 iteration to drain
+  the remainder before the scalar tail handles 0-15 bytes. Builds
+  without the wider ISAs simply skip those blocks. Selection is purely
+  compile-time; no runtime dispatch.
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__AVX512BW__) || defined(__AVX2__) || defined(__SSE2__)
+#include <immintrin.h>
+#elif defined(__ARM_NEON) || defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+
+namespace ts
+{
+
+inline void
+memcpy_tolower(char *dst, const char *src, std::size_t n) noexcept
+{
+#if defined(__AVX512BW__)
+  // AVX-512BW: 64 bytes per iteration with two key optimizations over the
+  // narrower paths:
+  //   - _mm512_mask_add_epi8 fuses the "+0x20 where upper" into a single
+  //     op (no separate maskz_set1 + or).
+  //   - A masked load/store handles the 1..63-byte tail in a single SIMD
+  //     pass, so we don't need to cascade to AVX2/SSE2 to drain the
+  //     remainder.
+  //
+  // The masked tail does carry ~7 ns of fixed setup cost, which loses to
+  // the cascade on short inputs. Gating the whole block on n >= 64 means
+  // tiny inputs fall through to the AVX2/SSE2 path below, where they keep
+  // the speedup that path already provides.
+  //
+  // Inspired by Tony Finch's copytolower64.c
+  // (https://dotat.at/cgi/git/vectolower.git/).
+  if (n >= 64) {
+    const __m512i A_vec = _mm512_set1_epi8('A');
+    const __m512i Z_vec = _mm512_set1_epi8('Z');
+    const __m512i delta = _mm512_set1_epi8('a' - 'A');
+    do {
+      __m512i   bytes    = _mm512_loadu_epi8(src);
+      __mmask64 is_upper = _mm512_cmpge_epi8_mask(bytes, A_vec) & _mm512_cmple_epi8_mask(bytes, Z_vec);
+      _mm512_storeu_epi8(dst, _mm512_mask_add_epi8(bytes, is_upper, bytes, delta));
+      src += 64;
+      dst += 64;
+      n   -= 64;
+    } while (n >= 64);
+    if (n != 0) {
+      auto      len_mask = static_cast<__mmask64>((~0ULL) >> (64 - n));
+      __m512i   bytes    = _mm512_maskz_loadu_epi8(len_mask, src);
+      __mmask64 is_upper = _mm512_cmpge_epi8_mask(bytes, A_vec) & _mm512_cmple_epi8_mask(bytes, Z_vec);
+      _mm512_mask_storeu_epi8(dst, len_mask, _mm512_mask_add_epi8(bytes, is_upper, bytes, delta));
+    }
+    return;
+  }
+#endif
+
+#if defined(__AVX2__)
+  // 32 bytes per iteration. Same compare-and-OR pattern as SSE2.
+  {
+    const __m256i a_minus_one = _mm256_set1_epi8('A' - 1);
+    const __m256i z_plus_one  = _mm256_set1_epi8('Z' + 1);
+    const __m256i bit5        = _mm256_set1_epi8(0x20);
+    while (n >= 32) {
+      __m256i bytes = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+      __m256i ge_A  = _mm256_cmpgt_epi8(bytes, a_minus_one);
+      __m256i le_Z  = _mm256_cmpgt_epi8(z_plus_one, bytes);
+      __m256i mask  = _mm256_and_si256(_mm256_and_si256(ge_A, le_Z), bit5);
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), _mm256_or_si256(bytes, mask));
+      src += 32;
+      dst += 32;
+      n   -= 32;
+    }
+  }
+#endif
+
+#if defined(__SSE2__)
+  // 16 bytes per iteration. Signed compare works for ASCII A-Z because all
+  // letters live below 0x80; high bytes (0x80..0xFF) compare as negative
+  // and correctly miss the [A,Z] range so they pass through unchanged.
+  {
+    const __m128i a_minus_one = _mm_set1_epi8('A' - 1);
+    const __m128i z_plus_one  = _mm_set1_epi8('Z' + 1);
+    const __m128i bit5        = _mm_set1_epi8(0x20);
+    while (n >= 16) {
+      __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+      __m128i ge_A  = _mm_cmpgt_epi8(bytes, a_minus_one);
+      __m128i le_Z  = _mm_cmpgt_epi8(z_plus_one, bytes);
+      __m128i mask  = _mm_and_si128(_mm_and_si128(ge_A, le_Z), bit5);
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_or_si128(bytes, mask));
+      src += 16;
+      dst += 16;
+      n   -= 16;
+    }
+  }
+#elif defined(__ARM_NEON) || defined(__aarch64__)
+  // 16 bytes per iteration; unsigned compare available natively.
+  {
+    const uint8x16_t a_minus_one = vdupq_n_u8('A' - 1);
+    const uint8x16_t z_plus_one  = vdupq_n_u8('Z' + 1);
+    const uint8x16_t bit5        = vdupq_n_u8(0x20);
+    while (n >= 16) {
+      uint8x16_t bytes = vld1q_u8(reinterpret_cast<const uint8_t *>(src));
+      uint8x16_t ge_A  = vcgtq_u8(bytes, a_minus_one);
+      uint8x16_t le_Z  = vcltq_u8(bytes, z_plus_one);
+      uint8x16_t mask  = vandq_u8(vandq_u8(ge_A, le_Z), bit5);
+      vst1q_u8(reinterpret_cast<uint8_t *>(dst), vorrq_u8(bytes, mask));
+      src += 16;
+      dst += 16;
+      n   -= 16;
+    }
+  }
+#endif
+
+  while (n--) {
+    auto c = static_cast<unsigned char>(*src++);
+    *dst++ = (c >= 'A' && c <= 'Z') ? static_cast<char>(c | 0x20) : static_cast<char>(c);
+  }
+}
+
+} // namespace ts
diff --git a/src/proxy/hdrs/URL.cc b/src/proxy/hdrs/URL.cc
index 68d84b5d481..c8eff4dd69f 100644
--- a/src/proxy/hdrs/URL.cc
+++ b/src/proxy/hdrs/URL.cc
@@ -25,6 +25,7 @@
 #include <new>
 #include "tscore/ink_platform.h"
 #include "tscore/ink_memory.h"
+#include "tscore/ink_memcpy_tolower.h"
 #include "proxy/hdrs/URL.h"
 #include "proxy/hdrs/MIME.h"
 #include "proxy/hdrs/HTTP.h"
@@ -1684,16 +1685,6 @@ url_describe(HdrHeapObjImpl *raw, bool /* recurse ATS_UNUSED */)
  *                                                                     *
  ***********************************************************************/
 
-static inline void
-memcpy_tolower(char *d, const char *s, int n)
-{
-  while (n--) {
-    *d = ParseRules::ink_tolower(*s);
-    s++;
-    d++;
-  }
-}
-
 // fast path for CryptoHash, HTTP, no user/password/params/query,
 // no buffer overflow, no unescaping needed
 
@@ -1704,7 +1695,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash
   char *p;
 
   p = buffer;
-  memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme);
+  ts::memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme);
   p    += url->m_len_scheme;
   *p++  = ':';
   *p++  = '/';
@@ -1713,7 +1704,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash
   *p++ = ':';
   // no password
   *p++ = '@';
-  memcpy_tolower(p, url->m_ptr_host, url->m_len_host);
+  ts::memcpy_tolower(p, url->m_ptr_host, url->m_len_host);
   p    += url->m_len_host;
   *p++  = '/';
   memcpy(p, url->m_ptr_path, url->m_len_path);
diff --git a/tools/benchmark/CMakeLists.txt b/tools/benchmark/CMakeLists.txt
index 49f25fad1c1..84fc111925e 100644
--- a/tools/benchmark/CMakeLists.txt
+++ b/tools/benchmark/CMakeLists.txt
@@ -36,6 +36,9 @@ target_link_libraries(benchmark_SharedMutex PRIVATE Catch2::Catch2 ts::tscore li
 add_executable(benchmark_Random benchmark_Random.cc)
 target_link_libraries(benchmark_Random PRIVATE Catch2::Catch2WithMain ts::tscore)
 
+add_executable(benchmark_memcpy_tolower benchmark_memcpy_tolower.cc)
+target_link_libraries(benchmark_memcpy_tolower PRIVATE Catch2::Catch2WithMain ts::tscore)
+
 add_executable(benchmark_HostDB benchmark_HostDB.cc)
 target_link_libraries(
   benchmark_HostDB
diff --git a/tools/benchmark/benchmark_memcpy_tolower.cc b/tools/benchmark/benchmark_memcpy_tolower.cc
new file mode 100644
index 00000000000..e2b45b48b5d
--- /dev/null
+++ b/tools/benchmark/benchmark_memcpy_tolower.cc
@@ -0,0 +1,167 @@
+/** @file
+
+  Micro benchmark for ts::memcpy_tolower against a byte-at-a-time scalar
+  loop equivalent to the prior URL.cc::memcpy_tolower definition.
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#define CATCH_CONFIG_ENABLE_BENCHMARKING
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/benchmark/catch_benchmark.hpp>
+
+#include "tscore/ink_memcpy_tolower.h"
+#include "tscore/ParseRules.h"
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <random>
+#include <string>
+#include <vector>
+
+namespace
+{
+
+// Sizes chosen to mirror the URL.cc hot path:
+//   4-8B   - common HTTP scheme strings ("http", "https")
+//   16-32B - typical host names
+//   64-256B - long host names / cache-key segments
+//   1024B  - stress the inner loop
+constexpr std::array<std::size_t, 8> kSizes{4, 8, 16, 24, 32, 64, 256, 1024};
+
+// Same character distribution we expect from URL/host input: ASCII letters
+// (mixed case), digits, and the small set of non-alpha bytes that legitimately
+// appear in URLs.
+std::vector<char>
+make_mixed_case_ascii(std::size_t n, std::uint64_t seed = 0xABCDEFULL)
+{
+  std::mt19937_64   rng(seed);
+  std::vector<char> v(n);
+  for (std::size_t i = 0; i < n; ++i) {
+    auto r = static_cast<unsigned>(rng() & 0x3FU);
+    if (r < 26U) {
+      v[i] = static_cast<char>('A' + r);
+    } else if (r < 52U) {
+      v[i] = static_cast<char>('a' + (r - 26U));
+    } else {
+      static constexpr char kNonAlpha[] = "0123456789-_./:";
+      v[i]                              = kNonAlpha[r % (sizeof(kNonAlpha) - 1U)];
+    }
+  }
+  return v;
+}
+
+// Mirror of the prior static inline memcpy_tolower() from URL.cc, kept here
+// as the baseline the SIMD path is expected to beat.
+inline void
+memcpy_tolower_scalar(char *d, const char *s, std::size_t n) noexcept
+{
+  while (n--) {
+    *d = ParseRules::ink_tolower(*s);
+    ++s;
+    ++d;
+  }
+}
+
+} // namespace
+
+TEST_CASE("active SIMD configuration", "[tolower][config]")
+{
+  // Print the compile-time ISA path so the benchmark output makes the
+  // selected configuration obvious. Cascades stack: AVX-512BW builds also
+  // emit the AVX2 and SSE2 drain loops; AVX2 builds emit the SSE2 drain.
+  std::cout << "ts::memcpy_tolower compiled with: ";
+#if defined(__AVX512BW__)
+  std::cout << "AVX-512BW (64B body + masked tail, gated at n>=64) + AVX2 + SSE2 cascade";
+#elif defined(__AVX2__)
+  std::cout << "AVX2 (32B body) + SSE2 (16B drain)";
+#elif defined(__SSE2__)
+  std::cout << "SSE2 (16B body)";
+#elif defined(__ARM_NEON) || defined(__aarch64__)
+  std::cout << "NEON (16B body)";
+#else
+  std::cout << "scalar only";
+#endif
+  std::cout << '\n';
+  SUCCEED();
+}
+
+TEST_CASE("ts::memcpy_tolower matches scalar reference", "[tolower][correctness]")
+{
+  // Cover sizes that bracket the 16-byte SIMD body: smaller-than, equal-to,
+  // a couple of multiples, and several offsets between multiples.
+  for (std::size_t sz : std::array<std::size_t, 12>{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 64, 257}) {
+    auto              input = make_mixed_case_ascii(sz, 0xC0FFEE + sz);
+    std::vector<char> expected(sz);
+    std::vector<char> actual(sz);
+
+    memcpy_tolower_scalar(expected.data(), input.data(), sz);
+    ts::memcpy_tolower(actual.data(), input.data(), sz);
+
+    CAPTURE(sz);
+    REQUIRE(actual == expected);
+  }
+}
+
+TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[tolower][correctness]")
+{
+  // Every byte value from 0..255 should round-trip unchanged unless it is in
+  // 'A'..'Z', in which case it should map to 'a'..'z'. This catches anyone
+  // who later tries to "speed things up" by widening the range to Latin-1.
+  std::array<unsigned char, 256> input;
+  for (std::size_t i = 0; i < 256; ++i) {
+    input[i] = static_cast<unsigned char>(i);
+  }
+  std::array<char, 256> output;
+  ts::memcpy_tolower(output.data(), reinterpret_cast<const char *>(input.data()), input.size());
+
+  for (std::size_t i = 0; i < 256; ++i) {
+    auto in  = static_cast<unsigned char>(i);
+    auto out = static_cast<unsigned char>(output[i]);
+    auto exp = (in >= 'A' && in <= 'Z') ? static_cast<unsigned char>(in | 0x20) : in;
+    CAPTURE(i);
+    REQUIRE(out == exp);
+  }
+}
+
+TEST_CASE("memcpy_tolower throughput", "[bench][tolower]")
+{
+  for (std::size_t sz : kSizes) {
+    auto              input = make_mixed_case_ascii(sz);
+    std::vector<char> output_scalar(sz);
+    std::vector<char> output_simd(sz);
+
+    std::string scalar_name = "scalar  " + std::to_string(sz) + "B";
+    BENCHMARK(scalar_name.c_str())
+    {
+      memcpy_tolower_scalar(output_scalar.data(), input.data(), sz);
+      return output_scalar[0];
+    };
+
+    std::string simd_name = "ts::mct " + std::to_string(sz) + "B";
+    BENCHMARK(simd_name.c_str())
+    {
+      ts::memcpy_tolower(output_simd.data(), input.data(), sz);
+      return output_simd[0];
+    };
+  }
+}

From faeb167c8a55026b8e91e483d08770d64de31ba9 Mon Sep 17 00:00:00 2001
From: Phong Nguyen <phongn@gmail.com>
Date: Thu, 21 May 2026 20:18:47 +0000
Subject: [PATCH 2/4] tscore/QPACK: address #13167 review feedback

- Move ts::memcpy_tolower correctness coverage out of the
  ENABLE_BENCHMARKS-gated benchmark and into a new
  src/tscore/unit_tests/test_ink_memcpy_tolower.cc so ctest exercises
  the scalar and SIMD paths in every build. Covers boundary sizes
  bracketing each SIMD body width, the exhaustive 0..255 byte-value
  sweep, and the in-place (dst == src) form (Copilot).

- Fix the implementation-note comment on ts::memcpy_tolower to
  describe the actual AVX-512BW control flow (gated main loop +
  masked-tail load/store + early return), and document that in-place
  (dst == src) is supported on every path (Copilot).

- Add a Catch::Benchmark::keep_memory barrier in
  benchmark_memcpy_tolower so the compiler can no longer DCE the
  inlined stores past the first observed byte (Copilot).

- Migrate the in-place tolower loop in
  src/proxy/http3/QPACK.cc::_encode_header to ts::memcpy_tolower,
  demonstrating the in-place contract (bryancall).

- Add Tony Finch's copytolower64.c attribution to NOTICE
  (masaori335).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 NOTICE                                        |   7 +
 include/tscore/ink_memcpy_tolower.h           |  31 ++--
 src/proxy/http3/QPACK.cc                      |   5 +-
 src/tscore/CMakeLists.txt                     |   1 +
 .../unit_tests/test_ink_memcpy_tolower.cc     | 133 ++++++++++++++++++
 tools/benchmark/benchmark_memcpy_tolower.cc   |  49 ++-----
 6 files changed, 173 insertions(+), 53 deletions(-)
 create mode 100644 src/tscore/unit_tests/test_ink_memcpy_tolower.cc

diff --git a/NOTICE b/NOTICE
index 31787fecbf0..ed57a6c6cad 100644
--- a/NOTICE
+++ b/NOTICE
@@ -118,3 +118,10 @@ LS-HPACK provides functionality to encode and decode HTTP headers using
 HPACK compression mechanism specified in RFC 7541.
 Copyright (c) 2018 - 2023 LiteSpeed Technologies Inc, (MIT License)
 https://github.com/litespeedtech/ls-hpack.git
+
+~~
+
+include/tscore/ink_memcpy_tolower.h AVX-512BW kernel design (fused
+mask_add and masked-tail load/store) is adapted from Tony Finch's
+copytolower64.c (0BSD OR MIT-0).
+https://dotat.at/cgi/git/vectolower.git/
diff --git a/include/tscore/ink_memcpy_tolower.h b/include/tscore/ink_memcpy_tolower.h
index 3c93559ae5e..578ef3606b5 100644
--- a/include/tscore/ink_memcpy_tolower.h
+++ b/include/tscore/ink_memcpy_tolower.h
@@ -13,16 +13,27 @@
       'a'..'z'. All other bytes (including 0x80..0xFF) pass through
       unchanged. There is no UTF-8 case folding.
 
-    - The destination is written byte-for-byte; src and dst must point
-      to non-overlapping regions of size at least @n bytes.
-
-  Implementation note: the bodies are stacked widest-first and each
-  drains its block size before falling through to the next. A build
-  with AVX-512BW gets the 64-byte body as the main loop, then at most
-  one 32-byte AVX2 iteration and one 16-byte SSE2 iteration to drain
-  the remainder before the scalar tail handles 0-15 bytes. Builds
-  without the wider ISAs simply skip those blocks. Selection is purely
-  compile-time; no runtime dispatch.
+    - In-place use (dst == src) is supported: every SIMD body loads a
+      full block into a register before storing back, and the AVX-512BW
+      masked tail uses masked-load/masked-store at the same offset.
+      Partial overlap where dst != src is not supported.
+
+  Implementation note: selection is purely compile-time; no runtime
+  dispatch. The bodies are stacked widest-first.
+
+    - AVX-512BW builds: when n >= 64, a 64-byte main loop handles the
+      bulk and a single masked load/store finishes any 1..63-byte tail,
+      then we return. When n < 64, we fall through to the AVX2 + SSE2
+      cascade below so tiny inputs avoid the masked tail's fixed setup
+      cost.
+
+    - AVX2 builds: a 32-byte main loop drains to a 16-byte SSE2 step
+      and then to a scalar tail of 0..15 bytes.
+
+    - SSE2 / NEON builds: a single 16-byte main loop drains to a
+      scalar tail.
+
+    - Other targets: scalar only.
 
   @section license License
 
diff --git a/src/proxy/http3/QPACK.cc b/src/proxy/http3/QPACK.cc
index dfdd2d278b3..e5145ecf18d 100644
--- a/src/proxy/http3/QPACK.cc
+++ b/src/proxy/http3/QPACK.cc
@@ -25,6 +25,7 @@
 #include "proxy/hdrs/XPACK.h"
 #include "proxy/http3/QPACK.h"
 #include "tscore/ink_defs.h"
+#include "tscore/ink_memcpy_tolower.h"
 #include "tscore/ink_memory.h"
 
 #define QPACKDebug(fmt, ...)   Dbg(dbg_ctl_qpack, "[%s] " fmt, this->_qc->cids().data(), ##__VA_ARGS__)
@@ -369,9 +370,7 @@ QPACK::_encode_header(const MIMEField &field, uint16_t base_index, IOBufferBlock
 {
   auto  name{field.name_get()};
   char *lowered_name = this->_arena.str_store(name.data(), name.length());
-  for (size_t i = 0; i < name.length(); i++) {
-    lowered_name[i] = ParseRules::ink_tolower(lowered_name[i]);
-  }
+  ts::memcpy_tolower(lowered_name, lowered_name, name.length());
   auto value{field.value_get()};
 
   // TODO Set never_index flag on/off according to encoding headers
diff --git a/src/tscore/CMakeLists.txt b/src/tscore/CMakeLists.txt
index 7790adc87dd..6c87ba2b0bb 100644
--- a/src/tscore/CMakeLists.txt
+++ b/src/tscore/CMakeLists.txt
@@ -159,6 +159,7 @@ if(BUILD_TESTING)
     unit_tests/test_Tokenizer.cc
     unit_tests/test_arena.cc
     unit_tests/test_ink_inet.cc
+    unit_tests/test_ink_memcpy_tolower.cc
     unit_tests/test_ink_memory.cc
     unit_tests/test_ink_string.cc
     unit_tests/test_layout.cc
diff --git a/src/tscore/unit_tests/test_ink_memcpy_tolower.cc b/src/tscore/unit_tests/test_ink_memcpy_tolower.cc
new file mode 100644
index 00000000000..1a33b13c681
--- /dev/null
+++ b/src/tscore/unit_tests/test_ink_memcpy_tolower.cc
@@ -0,0 +1,133 @@
+/** @file
+
+  Unit tests for ts::memcpy_tolower.
+
+  Runs as part of the standard test_tscore binary so the helper's SIMD
+  and scalar paths are exercised by ctest in every build, not just when
+  ENABLE_BENCHMARKS is set.
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "tscore/ink_memcpy_tolower.h"
+#include "tscore/ParseRules.h"
+
+#include <array>
+#include <cstdint>
+#include <random>
+#include <vector>
+
+namespace
+{
+
+// Same mixed-case ASCII distribution we use in the benchmark, so the unit
+// tests exercise inputs that look like real URL/header bytes.
+std::vector<char>
+make_mixed_case_ascii(std::size_t n, std::uint64_t seed)
+{
+  std::mt19937_64   rng(seed);
+  std::vector<char> v(n);
+  for (std::size_t i = 0; i < n; ++i) {
+    auto r = static_cast<unsigned>(rng() & 0x3FU);
+    if (r < 26U) {
+      v[i] = static_cast<char>('A' + r);
+    } else if (r < 52U) {
+      v[i] = static_cast<char>('a' + (r - 26U));
+    } else {
+      static constexpr char kNonAlpha[] = "0123456789-_./:";
+      v[i]                              = kNonAlpha[r % (sizeof(kNonAlpha) - 1U)];
+    }
+  }
+  return v;
+}
+
+// Byte-at-a-time reference, equivalent to the prior static-inline
+// memcpy_tolower in URL.cc. Anything ts::memcpy_tolower produces must match
+// this for every input we test.
+void
+memcpy_tolower_reference(char *d, const char *s, std::size_t n) noexcept
+{
+  while (n--) {
+    *d = ParseRules::ink_tolower(*s);
+    ++s;
+    ++d;
+  }
+}
+
+} // namespace
+
+TEST_CASE("ts::memcpy_tolower matches scalar reference", "[ts_memcpy_tolower]")
+{
+  // Bracket every SIMD body width (16/32/64) with both equal-to and
+  // offset-from-multiple lengths so the cascade transitions and the
+  // AVX-512BW masked tail are all exercised.
+  for (std::size_t sz : std::array<std::size_t, 14>{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 63, 64, 65, 257}) {
+    auto              input = make_mixed_case_ascii(sz, 0xC0FFEE + sz);
+    std::vector<char> expected(sz);
+    std::vector<char> actual(sz);
+
+    memcpy_tolower_reference(expected.data(), input.data(), sz);
+    ts::memcpy_tolower(actual.data(), input.data(), sz);
+
+    CAPTURE(sz);
+    REQUIRE(actual == expected);
+  }
+}
+
+TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[ts_memcpy_tolower]")
+{
+  // Every byte value 0..255 should round-trip unchanged unless it is in
+  // 'A'..'Z', in which case it should map to 'a'..'z'. Guards against any
+  // future "speed-up" that widens the case-fold range past ASCII.
+  std::array<unsigned char, 256> input;
+  for (std::size_t i = 0; i < 256; ++i) {
+    input[i] = static_cast<unsigned char>(i);
+  }
+  std::array<char, 256> output;
+  ts::memcpy_tolower(output.data(), reinterpret_cast<const char *>(input.data()), input.size());
+
+  for (std::size_t i = 0; i < 256; ++i) {
+    auto in  = static_cast<unsigned char>(i);
+    auto out = static_cast<unsigned char>(output[i]);
+    auto exp = (in >= 'A' && in <= 'Z') ? static_cast<unsigned char>(in | 0x20) : in;
+    CAPTURE(i);
+    REQUIRE(out == exp);
+  }
+}
+
+TEST_CASE("ts::memcpy_tolower supports in-place (dst == src)", "[ts_memcpy_tolower]")
+{
+  // In-place use must match what an out-of-place call would have produced.
+  // Run across the same boundary sizes as the basic correctness case so the
+  // SIMD bodies and the AVX-512BW masked load/store are all exercised
+  // in-place.
+  for (std::size_t sz : std::array<std::size_t, 14>{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 63, 64, 65, 257}) {
+    auto              input = make_mixed_case_ascii(sz, 0xBADF00D + sz);
+    std::vector<char> expected(sz);
+    std::vector<char> in_place(input);
+
+    memcpy_tolower_reference(expected.data(), input.data(), sz);
+    ts::memcpy_tolower(in_place.data(), in_place.data(), sz);
+
+    CAPTURE(sz);
+    REQUIRE(in_place == expected);
+  }
+}
diff --git a/tools/benchmark/benchmark_memcpy_tolower.cc b/tools/benchmark/benchmark_memcpy_tolower.cc
index e2b45b48b5d..079c0fdcb2f 100644
--- a/tools/benchmark/benchmark_memcpy_tolower.cc
+++ b/tools/benchmark/benchmark_memcpy_tolower.cc
@@ -26,6 +26,7 @@
 
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/benchmark/catch_benchmark.hpp>
+#include <catch2/benchmark/catch_optimizer.hpp>
 
 #include "tscore/ink_memcpy_tolower.h"
 #include "tscore/ParseRules.h"
@@ -87,8 +88,7 @@ memcpy_tolower_scalar(char *d, const char *s, std::size_t n) noexcept
 TEST_CASE("active SIMD configuration", "[tolower][config]")
 {
   // Print the compile-time ISA path so the benchmark output makes the
-  // selected configuration obvious. Cascades stack: AVX-512BW builds also
-  // emit the AVX2 and SSE2 drain loops; AVX2 builds emit the SSE2 drain.
+  // selected configuration obvious.
   std::cout << "ts::memcpy_tolower compiled with: ";
 #if defined(__AVX512BW__)
   std::cout << "AVX-512BW (64B body + masked tail, gated at n>=64) + AVX2 + SSE2 cascade";
@@ -105,44 +105,6 @@ TEST_CASE("active SIMD configuration", "[tolower][config]")
   SUCCEED();
 }
 
-TEST_CASE("ts::memcpy_tolower matches scalar reference", "[tolower][correctness]")
-{
-  // Cover sizes that bracket the 16-byte SIMD body: smaller-than, equal-to,
-  // a couple of multiples, and several offsets between multiples.
-  for (std::size_t sz : std::array<std::size_t, 12>{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 64, 257}) {
-    auto              input = make_mixed_case_ascii(sz, 0xC0FFEE + sz);
-    std::vector<char> expected(sz);
-    std::vector<char> actual(sz);
-
-    memcpy_tolower_scalar(expected.data(), input.data(), sz);
-    ts::memcpy_tolower(actual.data(), input.data(), sz);
-
-    CAPTURE(sz);
-    REQUIRE(actual == expected);
-  }
-}
-
-TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[tolower][correctness]")
-{
-  // Every byte value from 0..255 should round-trip unchanged unless it is in
-  // 'A'..'Z', in which case it should map to 'a'..'z'. This catches anyone
-  // who later tries to "speed things up" by widening the range to Latin-1.
-  std::array<unsigned char, 256> input;
-  for (std::size_t i = 0; i < 256; ++i) {
-    input[i] = static_cast<unsigned char>(i);
-  }
-  std::array<char, 256> output;
-  ts::memcpy_tolower(output.data(), reinterpret_cast<const char *>(input.data()), input.size());
-
-  for (std::size_t i = 0; i < 256; ++i) {
-    auto in  = static_cast<unsigned char>(i);
-    auto out = static_cast<unsigned char>(output[i]);
-    auto exp = (in >= 'A' && in <= 'Z') ? static_cast<unsigned char>(in | 0x20) : in;
-    CAPTURE(i);
-    REQUIRE(out == exp);
-  }
-}
-
 TEST_CASE("memcpy_tolower throughput", "[bench][tolower]")
 {
   for (std::size_t sz : kSizes) {
@@ -150,10 +112,16 @@ TEST_CASE("memcpy_tolower throughput", "[bench][tolower]")
     std::vector<char> output_scalar(sz);
     std::vector<char> output_simd(sz);
 
+    // Catch::Benchmark::keep_memory clobbers the buffer in the compiler's
+    // model, forcing it to materialize every byte we wrote. Without this an
+    // optimizing compiler can shrink or DCE the inline body's stores past
+    // the first element we observed.
+
     std::string scalar_name = "scalar  " + std::to_string(sz) + "B";
     BENCHMARK(scalar_name.c_str())
     {
       memcpy_tolower_scalar(output_scalar.data(), input.data(), sz);
+      Catch::Benchmark::keep_memory(output_scalar.data());
       return output_scalar[0];
     };
 
@@ -161,6 +129,7 @@ TEST_CASE("memcpy_tolower throughput", "[bench][tolower]")
     BENCHMARK(simd_name.c_str())
     {
       ts::memcpy_tolower(output_simd.data(), input.data(), sz);
+      Catch::Benchmark::keep_memory(output_simd.data());
       return output_simd[0];
     };
   }

From df8e07bb90e9ed6cfe7564cbdd71b89a6afa029f Mon Sep 17 00:00:00 2001
From: Phong Nguyen <phongn@gmail.com>
Date: Thu, 21 May 2026 20:59:37 +0000
Subject: [PATCH 3/4] tscore: rename memcpy_tolower to
 ts::ascii::tolower_{copy,inplace}

memcpy_tolower carried two warts: the "memcpy" prefix implied
non-overlapping by convention with libc memcpy (we explicitly support
the in-place case), and the unqualified name didn't surface the
ASCII-only semantics. Rename the helper to ts::ascii::tolower_copy
and add a thin ts::ascii::tolower_inplace(buf, n) wrapper so call
sites that operate on a single buffer read naturally instead of
passing the same pointer twice.

Rename the header to include/tscore/ink_ascii_tolower.h, the unit
test to src/tscore/unit_tests/test_ink_ascii_tolower.cc, and the
benchmark to tools/benchmark/benchmark_ascii_tolower.cc to match.
Update the two existing call sites (URL.cc fast-path scheme/host and
QPACK::_encode_header in-place name lowercasing) accordingly. No
behavior change: the helper bodies are unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 NOTICE                                        |  2 +-
 ...k_memcpy_tolower.h => ink_ascii_tolower.h} | 37 ++++++++++++-------
 src/proxy/hdrs/URL.cc                         |  6 +--
 src/proxy/http3/QPACK.cc                      |  4 +-
 src/tscore/CMakeLists.txt                     |  2 +-
 ...y_tolower.cc => test_ink_ascii_tolower.cc} | 33 ++++++++---------
 tools/benchmark/CMakeLists.txt                |  4 +-
 ..._tolower.cc => benchmark_ascii_tolower.cc} | 20 +++++-----
 8 files changed, 59 insertions(+), 49 deletions(-)
 rename include/tscore/{ink_memcpy_tolower.h => ink_ascii_tolower.h} (84%)
 rename src/tscore/unit_tests/{test_ink_memcpy_tolower.cc => test_ink_ascii_tolower.cc} (76%)
 rename tools/benchmark/{benchmark_memcpy_tolower.cc => benchmark_ascii_tolower.cc} (85%)

diff --git a/NOTICE b/NOTICE
index ed57a6c6cad..c50ea163f76 100644
--- a/NOTICE
+++ b/NOTICE
@@ -121,7 +121,7 @@ https://github.com/litespeedtech/ls-hpack.git
 
 ~~
 
-include/tscore/ink_memcpy_tolower.h AVX-512BW kernel design (fused
+include/tscore/ink_ascii_tolower.h AVX-512BW kernel design (fused
 mask_add and masked-tail load/store) is adapted from Tony Finch's
 copytolower64.c (0BSD OR MIT-0).
 https://dotat.at/cgi/git/vectolower.git/
diff --git a/include/tscore/ink_memcpy_tolower.h b/include/tscore/ink_ascii_tolower.h
similarity index 84%
rename from include/tscore/ink_memcpy_tolower.h
rename to include/tscore/ink_ascii_tolower.h
index 578ef3606b5..d98aafb48dc 100644
--- a/include/tscore/ink_memcpy_tolower.h
+++ b/include/tscore/ink_ascii_tolower.h
@@ -3,9 +3,11 @@
   SIMD-accelerated bulk ASCII tolower copy.
 
   Used on the URL canonicalization fast path for cache-key digests
-  (src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast). The scalar loop is
-  the bottleneck for hosts and schemes long enough to vectorize; for
-  shorter inputs the scalar tail handles them with no SIMD overhead.
+  (src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast) and any other place
+  that needs to fold ASCII to lowercase over a small-to-moderate
+  buffer. The scalar byte-at-a-time loop is the bottleneck for hosts
+  and schemes long enough to vectorize; for shorter inputs the scalar
+  tail handles them with no SIMD overhead.
 
   Semantics match a byte-at-a-time loop using ParseRules::ink_tolower():
 
@@ -13,13 +15,14 @@
       'a'..'z'. All other bytes (including 0x80..0xFF) pass through
       unchanged. There is no UTF-8 case folding.
 
-    - In-place use (dst == src) is supported: every SIMD body loads a
-      full block into a register before storing back, and the AVX-512BW
-      masked tail uses masked-load/masked-store at the same offset.
-      Partial overlap where dst != src is not supported.
+    - In-place use (dst == src) is supported on every path. Each SIMD
+      body loads a full block into a register before storing back at
+      the same offset, and the AVX-512BW masked tail does masked-load
+      / masked-store at the same offset. Partial overlap where
+      dst != src but the ranges intersect is not supported.
 
   Implementation note: selection is purely compile-time; no runtime
-  dispatch. The bodies are stacked widest-first.
+  dispatch. Bodies are stacked widest-first.
 
     - AVX-512BW builds: when n >= 64, a 64-byte main loop handles the
       bulk and a single masked load/store finishes any 1..63-byte tail,
@@ -64,11 +67,11 @@
 #include <arm_neon.h>
 #endif
 
-namespace ts
+namespace ts::ascii
 {
 
 inline void
-memcpy_tolower(char *dst, const char *src, std::size_t n) noexcept
+tolower_copy(char *dst, const char *src, std::size_t n) noexcept
 {
 #if defined(__AVX512BW__)
   // AVX-512BW: 64 bytes per iteration with two key optimizations over the
@@ -84,8 +87,7 @@ memcpy_tolower(char *dst, const char *src, std::size_t n) noexcept
   // tiny inputs fall through to the AVX2/SSE2 path below, where they keep
   // the speedup that path already provides.
   //
-  // Inspired by Tony Finch's copytolower64.c
-  // (https://dotat.at/cgi/git/vectolower.git/).
+  // Adapted from Tony Finch's copytolower64.c (see NOTICE).
   if (n >= 64) {
     const __m512i A_vec = _mm512_set1_epi8('A');
     const __m512i Z_vec = _mm512_set1_epi8('Z');
@@ -171,4 +173,13 @@ memcpy_tolower(char *dst, const char *src, std::size_t n) noexcept
   }
 }
 
-} // namespace ts
+// Thin sugar over tolower_copy for the in-place case. Makes call sites
+// like ts::ascii::tolower_inplace(buf, n) read naturally instead of
+// ts::ascii::tolower_copy(buf, buf, n).
+inline void
+tolower_inplace(char *buf, std::size_t n) noexcept
+{
+  tolower_copy(buf, buf, n);
+}
+
+} // namespace ts::ascii
diff --git a/src/proxy/hdrs/URL.cc b/src/proxy/hdrs/URL.cc
index c8eff4dd69f..1c9eb4170b2 100644
--- a/src/proxy/hdrs/URL.cc
+++ b/src/proxy/hdrs/URL.cc
@@ -25,7 +25,7 @@
 #include <new>
 #include "tscore/ink_platform.h"
 #include "tscore/ink_memory.h"
-#include "tscore/ink_memcpy_tolower.h"
+#include "tscore/ink_ascii_tolower.h"
 #include "proxy/hdrs/URL.h"
 #include "proxy/hdrs/MIME.h"
 #include "proxy/hdrs/HTTP.h"
@@ -1695,7 +1695,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash
   char *p;
 
   p = buffer;
-  ts::memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme);
+  ts::ascii::tolower_copy(p, url->m_ptr_scheme, url->m_len_scheme);
   p    += url->m_len_scheme;
   *p++  = ':';
   *p++  = '/';
@@ -1704,7 +1704,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash
   *p++ = ':';
   // no password
   *p++ = '@';
-  ts::memcpy_tolower(p, url->m_ptr_host, url->m_len_host);
+  ts::ascii::tolower_copy(p, url->m_ptr_host, url->m_len_host);
   p    += url->m_len_host;
   *p++  = '/';
   memcpy(p, url->m_ptr_path, url->m_len_path);
diff --git a/src/proxy/http3/QPACK.cc b/src/proxy/http3/QPACK.cc
index e5145ecf18d..92ac2b024f6 100644
--- a/src/proxy/http3/QPACK.cc
+++ b/src/proxy/http3/QPACK.cc
@@ -25,7 +25,7 @@
 #include "proxy/hdrs/XPACK.h"
 #include "proxy/http3/QPACK.h"
 #include "tscore/ink_defs.h"
-#include "tscore/ink_memcpy_tolower.h"
+#include "tscore/ink_ascii_tolower.h"
 #include "tscore/ink_memory.h"
 
 #define QPACKDebug(fmt, ...)   Dbg(dbg_ctl_qpack, "[%s] " fmt, this->_qc->cids().data(), ##__VA_ARGS__)
@@ -370,7 +370,7 @@ QPACK::_encode_header(const MIMEField &field, uint16_t base_index, IOBufferBlock
 {
   auto  name{field.name_get()};
   char *lowered_name = this->_arena.str_store(name.data(), name.length());
-  ts::memcpy_tolower(lowered_name, lowered_name, name.length());
+  ts::ascii::tolower_inplace(lowered_name, name.length());
   auto value{field.value_get()};
 
   // TODO Set never_index flag on/off according to encoding headers
diff --git a/src/tscore/CMakeLists.txt b/src/tscore/CMakeLists.txt
index 6c87ba2b0bb..ceb770d7ecc 100644
--- a/src/tscore/CMakeLists.txt
+++ b/src/tscore/CMakeLists.txt
@@ -158,8 +158,8 @@ if(BUILD_TESTING)
     unit_tests/test_Throttler.cc
     unit_tests/test_Tokenizer.cc
     unit_tests/test_arena.cc
+    unit_tests/test_ink_ascii_tolower.cc
     unit_tests/test_ink_inet.cc
-    unit_tests/test_ink_memcpy_tolower.cc
     unit_tests/test_ink_memory.cc
     unit_tests/test_ink_string.cc
     unit_tests/test_layout.cc
diff --git a/src/tscore/unit_tests/test_ink_memcpy_tolower.cc b/src/tscore/unit_tests/test_ink_ascii_tolower.cc
similarity index 76%
rename from src/tscore/unit_tests/test_ink_memcpy_tolower.cc
rename to src/tscore/unit_tests/test_ink_ascii_tolower.cc
index 1a33b13c681..5f2cb8df3c9 100644
--- a/src/tscore/unit_tests/test_ink_memcpy_tolower.cc
+++ b/src/tscore/unit_tests/test_ink_ascii_tolower.cc
@@ -1,6 +1,6 @@
 /** @file
 
-  Unit tests for ts::memcpy_tolower.
+  Unit tests for ts::ascii::tolower_copy and ts::ascii::tolower_inplace.
 
   Runs as part of the standard test_tscore binary so the helper's SIMD
   and scalar paths are exercised by ctest in every build, not just when
@@ -27,7 +27,7 @@
 
 #include <catch2/catch_test_macros.hpp>
 
-#include "tscore/ink_memcpy_tolower.h"
+#include "tscore/ink_ascii_tolower.h"
 #include "tscore/ParseRules.h"
 
 #include <array>
@@ -60,10 +60,10 @@ make_mixed_case_ascii(std::size_t n, std::uint64_t seed)
 }
 
 // Byte-at-a-time reference, equivalent to the prior static-inline
-// memcpy_tolower in URL.cc. Anything ts::memcpy_tolower produces must match
-// this for every input we test.
+// memcpy_tolower in URL.cc. Anything ts::ascii::tolower_copy produces must
+// match this for every input we test.
 void
-memcpy_tolower_reference(char *d, const char *s, std::size_t n) noexcept
+tolower_reference(char *d, const char *s, std::size_t n) noexcept
 {
   while (n--) {
     *d = ParseRules::ink_tolower(*s);
@@ -74,7 +74,7 @@ memcpy_tolower_reference(char *d, const char *s, std::size_t n) noexcept
 
 } // namespace
 
-TEST_CASE("ts::memcpy_tolower matches scalar reference", "[ts_memcpy_tolower]")
+TEST_CASE("ts::ascii::tolower_copy matches scalar reference", "[ts_ascii_tolower]")
 {
   // Bracket every SIMD body width (16/32/64) with both equal-to and
   // offset-from-multiple lengths so the cascade transitions and the
@@ -84,15 +84,15 @@ TEST_CASE("ts::memcpy_tolower matches scalar reference", "[ts_memcpy_tolower]")
     std::vector<char> expected(sz);
     std::vector<char> actual(sz);
 
-    memcpy_tolower_reference(expected.data(), input.data(), sz);
-    ts::memcpy_tolower(actual.data(), input.data(), sz);
+    tolower_reference(expected.data(), input.data(), sz);
+    ts::ascii::tolower_copy(actual.data(), input.data(), sz);
 
     CAPTURE(sz);
     REQUIRE(actual == expected);
   }
 }
 
-TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[ts_memcpy_tolower]")
+TEST_CASE("ts::ascii::tolower_copy preserves non-ASCII bytes", "[ts_ascii_tolower]")
 {
   // Every byte value 0..255 should round-trip unchanged unless it is in
   // 'A'..'Z', in which case it should map to 'a'..'z'. Guards against any
@@ -102,7 +102,7 @@ TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[ts_memcpy_tolower]")
     input[i] = static_cast<unsigned char>(i);
   }
   std::array<char, 256> output;
-  ts::memcpy_tolower(output.data(), reinterpret_cast<const char *>(input.data()), input.size());
+  ts::ascii::tolower_copy(output.data(), reinterpret_cast<const char *>(input.data()), input.size());
 
   for (std::size_t i = 0; i < 256; ++i) {
     auto in  = static_cast<unsigned char>(i);
@@ -113,19 +113,18 @@ TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[ts_memcpy_tolower]")
   }
 }
 
-TEST_CASE("ts::memcpy_tolower supports in-place (dst == src)", "[ts_memcpy_tolower]")
+TEST_CASE("ts::ascii::tolower_inplace matches tolower_copy", "[ts_ascii_tolower]")
 {
-  // In-place use must match what an out-of-place call would have produced.
-  // Run across the same boundary sizes as the basic correctness case so the
-  // SIMD bodies and the AVX-512BW masked load/store are all exercised
-  // in-place.
+  // The inplace form must produce the same result as a non-overlapping copy.
+  // Exercise the same boundary sizes so the SIMD bodies and the AVX-512BW
+  // masked load/store are all exercised in-place.
   for (std::size_t sz : std::array<std::size_t, 14>{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 63, 64, 65, 257}) {
     auto              input = make_mixed_case_ascii(sz, 0xBADF00D + sz);
     std::vector<char> expected(sz);
     std::vector<char> in_place(input);
 
-    memcpy_tolower_reference(expected.data(), input.data(), sz);
-    ts::memcpy_tolower(in_place.data(), in_place.data(), sz);
+    tolower_reference(expected.data(), input.data(), sz);
+    ts::ascii::tolower_inplace(in_place.data(), sz);
 
     CAPTURE(sz);
     REQUIRE(in_place == expected);
diff --git a/tools/benchmark/CMakeLists.txt b/tools/benchmark/CMakeLists.txt
index 84fc111925e..c08e9a94d12 100644
--- a/tools/benchmark/CMakeLists.txt
+++ b/tools/benchmark/CMakeLists.txt
@@ -36,8 +36,8 @@ target_link_libraries(benchmark_SharedMutex PRIVATE Catch2::Catch2 ts::tscore li
 add_executable(benchmark_Random benchmark_Random.cc)
 target_link_libraries(benchmark_Random PRIVATE Catch2::Catch2WithMain ts::tscore)
 
-add_executable(benchmark_memcpy_tolower benchmark_memcpy_tolower.cc)
-target_link_libraries(benchmark_memcpy_tolower PRIVATE Catch2::Catch2WithMain ts::tscore)
+add_executable(benchmark_ascii_tolower benchmark_ascii_tolower.cc)
+target_link_libraries(benchmark_ascii_tolower PRIVATE Catch2::Catch2WithMain ts::tscore)
 
 add_executable(benchmark_HostDB benchmark_HostDB.cc)
 target_link_libraries(
diff --git a/tools/benchmark/benchmark_memcpy_tolower.cc b/tools/benchmark/benchmark_ascii_tolower.cc
similarity index 85%
rename from tools/benchmark/benchmark_memcpy_tolower.cc
rename to tools/benchmark/benchmark_ascii_tolower.cc
index 079c0fdcb2f..4c799713c99 100644
--- a/tools/benchmark/benchmark_memcpy_tolower.cc
+++ b/tools/benchmark/benchmark_ascii_tolower.cc
@@ -1,7 +1,7 @@
 /** @file
 
-  Micro benchmark for ts::memcpy_tolower against a byte-at-a-time scalar
-  loop equivalent to the prior URL.cc::memcpy_tolower definition.
+  Micro benchmark for ts::ascii::tolower_copy against a byte-at-a-time
+  scalar loop equivalent to the prior URL.cc::memcpy_tolower definition.
 
   @section license License
 
@@ -28,7 +28,7 @@
 #include <catch2/benchmark/catch_benchmark.hpp>
 #include <catch2/benchmark/catch_optimizer.hpp>
 
-#include "tscore/ink_memcpy_tolower.h"
+#include "tscore/ink_ascii_tolower.h"
 #include "tscore/ParseRules.h"
 
 #include <array>
@@ -74,7 +74,7 @@ make_mixed_case_ascii(std::size_t n, std::uint64_t seed = 0xABCDEFULL)
 // Mirror of the prior static inline memcpy_tolower() from URL.cc, kept here
 // as the baseline the SIMD path is expected to beat.
 inline void
-memcpy_tolower_scalar(char *d, const char *s, std::size_t n) noexcept
+tolower_scalar(char *d, const char *s, std::size_t n) noexcept
 {
   while (n--) {
     *d = ParseRules::ink_tolower(*s);
@@ -89,7 +89,7 @@ TEST_CASE("active SIMD configuration", "[tolower][config]")
 {
   // Print the compile-time ISA path so the benchmark output makes the
   // selected configuration obvious.
-  std::cout << "ts::memcpy_tolower compiled with: ";
+  std::cout << "ts::ascii::tolower_copy compiled with: ";
 #if defined(__AVX512BW__)
   std::cout << "AVX-512BW (64B body + masked tail, gated at n>=64) + AVX2 + SSE2 cascade";
 #elif defined(__AVX2__)
@@ -105,7 +105,7 @@ TEST_CASE("active SIMD configuration", "[tolower][config]")
   SUCCEED();
 }
 
-TEST_CASE("memcpy_tolower throughput", "[bench][tolower]")
+TEST_CASE("tolower throughput", "[bench][tolower]")
 {
   for (std::size_t sz : kSizes) {
     auto              input = make_mixed_case_ascii(sz);
@@ -117,18 +117,18 @@ TEST_CASE("memcpy_tolower throughput", "[bench][tolower]")
     // optimizing compiler can shrink or DCE the inline body's stores past
     // the first element we observed.
 
-    std::string scalar_name = "scalar  " + std::to_string(sz) + "B";
+    std::string scalar_name = "scalar   " + std::to_string(sz) + "B";
     BENCHMARK(scalar_name.c_str())
     {
-      memcpy_tolower_scalar(output_scalar.data(), input.data(), sz);
+      tolower_scalar(output_scalar.data(), input.data(), sz);
       Catch::Benchmark::keep_memory(output_scalar.data());
       return output_scalar[0];
     };
 
-    std::string simd_name = "ts::mct " + std::to_string(sz) + "B";
+    std::string simd_name = "ts::atc  " + std::to_string(sz) + "B";
     BENCHMARK(simd_name.c_str())
     {
-      ts::memcpy_tolower(output_simd.data(), input.data(), sz);
+      ts::ascii::tolower_copy(output_simd.data(), input.data(), sz);
       Catch::Benchmark::keep_memory(output_simd.data());
       return output_simd[0];
     };

From fb9b195c33903945c7b7bd3fb5f37b60a51c9d96 Mon Sep 17 00:00:00 2001
From: Phong Nguyen <phongn@gmail.com>
Date: Thu, 21 May 2026 21:17:31 +0000
Subject: [PATCH 4/4] proxy: migrate HPACK / UrlRewrite tolower loops and add
 behavioral tests

Migrate two more byte-at-a-time ASCII tolower loops to
ts::ascii::tolower_copy. Both call sites use a separate destination
buffer, so the copy form is the right fit:

- hpack_encode_header_block(): lower-cases each MIMEField name before
  encoding to match the HTTP/2 lowercase-header-name requirement.

- UrlRewrite::_mappingLookup(): lower-cases the incoming request host
  into a stack buffer before the table lookup, so the lookup is
  case-insensitive against the lower-cased keys built at config-load
  time. The previous code used libc tolower(int) on signed char values,
  which is technically UB for bytes >= 0x80; the new call avoids that.

The existing unit tests in test_URL, test_HpackIndexingTable, and
test_RemapRules executed the tolower paths but only with inputs that
were already lower-case, so they would have missed a "skip the
lowercasing" regression. Add focused behavioral coverage:

- test_URL.cc: four extra get_hash_test_cases that hash a request with
  uppercase/mixed-case scheme or host and require an equal hash to the
  lower-case form. Includes a 49-byte uppercase host that crosses both
  the 16- and 32-byte SIMD bodies.

- test_RemapRules.cc: a new SCENARIO that builds a UrlRewrite from a
  map for a lower-case host and requires that uppercase, mixed-case,
  and long-uppercase request hosts all match.

- test_HpackIndexingTable.cc: a new TEST_CASE that encodes a long
  mixed-case field name with hpack_encode_header_block and requires
  the encoded byte stream to be identical to encoding the same field
  with an already-lower-case name.

QPACK already exercises the in-place path through its Encoding test
and the helper's own ts::ascii::tolower_inplace unit test covers
in-place semantics exhaustively; an additional focused QPACK test
would need the external .qif fixture infrastructure, which is out of
scope here.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/proxy/hdrs/unit_tests/test_URL.cc         | 33 +++++++++++
 src/proxy/http/remap/UrlRewrite.cc            |  7 +--
 .../http/remap/unit-tests/test_RemapRules.cc  | 55 +++++++++++++++++++
 src/proxy/http2/HPACK.cc                      |  5 +-
 .../unit_tests/test_HpackIndexingTable.cc     | 32 +++++++++++
 5 files changed, 125 insertions(+), 7 deletions(-)

diff --git a/src/proxy/hdrs/unit_tests/test_URL.cc b/src/proxy/hdrs/unit_tests/test_URL.cc
index dc5ff4ade74..8dda4e9501a 100644
--- a/src/proxy/hdrs/unit_tests/test_URL.cc
+++ b/src/proxy/hdrs/unit_tests/test_URL.cc
@@ -659,6 +659,39 @@ std::vector<get_hash_test_case> get_hash_test_cases = {
     !IGNORE_QUERY,
     HAS_EQUAL_HASH,
   },
+  {
+    // Verifies the scheme/host SIMD-tolower path in url_CryptoHash_get_fast:
+    // an uppercase host with a long enough prefix to hit the 16-byte SIMD
+    // body should hash identically to its lowercased form.
+    "Uppercase host: equal hashes",
+    "http://ONE.EXAMPLE.COM/a/path?name=value",
+    "http://one.example.com/a/path?name=value",
+    !IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Mixed-case host: equal hashes",
+    "http://One.Example.Com/a/path?name=value",
+    "http://one.example.com/a/path?name=value",
+    !IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Uppercase scheme: equal hashes",
+    "HTTP://one.example.com/a/path?name=value",
+    "http://one.example.com/a/path?name=value",
+    !IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
+  {
+    // Long uppercase host crosses 16- and 32-byte SIMD body boundaries so
+    // the wider paths (when compiled in) are exercised by this fixture.
+    "Long uppercase host: equal hashes",
+    "http://A-VERY-LONG-HOST-NAME-FOR-SIMD.EXAMPLE.COM/a/path",
+    "http://a-very-long-host-name-for-simd.example.com/a/path",
+    !IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
 };
 
 /** Return the hash related to a URI.
diff --git a/src/proxy/http/remap/UrlRewrite.cc b/src/proxy/http/remap/UrlRewrite.cc
index fbda217462b..d2af3b343e9 100644
--- a/src/proxy/http/remap/UrlRewrite.cc
+++ b/src/proxy/http/remap/UrlRewrite.cc
@@ -22,6 +22,8 @@
 
  */
 
+#include "tscore/ink_ascii_tolower.h"
+
 #include "proxy/http/remap/UrlRewrite.h"
 #include "proxy/http/remap/RemapYamlConfig.h"
 #include "iocore/eventsystem/ConfigProcessor.h"
@@ -931,10 +933,7 @@ UrlRewrite::_mappingLookup(MappingsStore &mappings, URL *request_url, int reques
     return false;
   }
 
-  // lowercase
-  for (int i = 0; i < request_host_len; ++i) {
-    request_host_lower[i] = tolower(request_host[i]);
-  }
+  ts::ascii::tolower_copy(request_host_lower, request_host, request_host_len);
   request_host_lower[request_host_len] = 0;
 
   bool         retval       = false;
diff --git a/src/proxy/http/remap/unit-tests/test_RemapRules.cc b/src/proxy/http/remap/unit-tests/test_RemapRules.cc
index 012e8b55bd7..6f2dfc32668 100644
--- a/src/proxy/http/remap/unit-tests/test_RemapRules.cc
+++ b/src/proxy/http/remap/unit-tests/test_RemapRules.cc
@@ -225,3 +225,58 @@ map_with_recv_port http://front.example.com \
     }
   }
 }
+
+SCENARIO("UrlRewrite host lookup is case-insensitive", "[proxy][remap]")
+{
+  // _mappingLookup lower-cases the request host before consulting the hash
+  // table; these scenarios exercise that path with inputs that would not
+  // match in a strict byte-compare. Sized to cross the 16-byte SSE2 body
+  // for hosts that get a real SIMD pass.
+  GIVEN("A forward map with a lowercase source host")
+  {
+    auto        urlrw  = std::make_unique<UrlRewrite>();
+    std::string config = R"RMCFG(
+map http://www.example.com http://origin.example.com
+  )RMCFG";
+
+    auto cpath = write_test_remap(config, "case_insensitive");
+    int  rc    = urlrw->BuildTable(cpath.c_str());
+    REQUIRE(rc == TS_SUCCESS);
+    REQUIRE(urlrw->rule_count() == 1);
+
+    EasyURL             url("http://www.example.com");
+    UrlMappingContainer urlmap;
+
+    THEN("uppercase request host matches the lowercase rule")
+    {
+      const char *host = "WWW.EXAMPLE.COM";
+      REQUIRE(urlrw->forwardMappingLookup(&url.url, 80, host, strlen(host), urlmap));
+    }
+    THEN("mixed-case request host matches the lowercase rule")
+    {
+      const char *host = "Www.Example.Com";
+      REQUIRE(urlrw->forwardMappingLookup(&url.url, 80, host, strlen(host), urlmap));
+    }
+  }
+
+  GIVEN("A forward map with a long host that exercises the 16-byte SIMD body")
+  {
+    auto        urlrw  = std::make_unique<UrlRewrite>();
+    std::string config = R"RMCFG(
+map http://a-very-long-host-name-for-simd.example.com http://origin.example.com
+  )RMCFG";
+
+    auto cpath = write_test_remap(config, "case_insensitive_long");
+    int  rc    = urlrw->BuildTable(cpath.c_str());
+    REQUIRE(rc == TS_SUCCESS);
+
+    EasyURL             url("http://a-very-long-host-name-for-simd.example.com");
+    UrlMappingContainer urlmap;
+
+    THEN("an all-uppercase 49-char host (covers >=32 SIMD bytes) matches")
+    {
+      const char *host = "A-VERY-LONG-HOST-NAME-FOR-SIMD.EXAMPLE.COM";
+      REQUIRE(urlrw->forwardMappingLookup(&url.url, 80, host, strlen(host), urlmap));
+    }
+  }
+}
diff --git a/src/proxy/http2/HPACK.cc b/src/proxy/http2/HPACK.cc
index 7e4fd974f57..34ff607ced2 100644
--- a/src/proxy/http2/HPACK.cc
+++ b/src/proxy/http2/HPACK.cc
@@ -23,6 +23,7 @@
 
 #include "proxy/http2/HPACK.h"
 
+#include "tscore/ink_ascii_tolower.h"
 #include "tsutil/LocalBuffer.h"
 #include "swoc/TextView.h"
 
@@ -789,9 +790,7 @@ hpack_encode_header_block(HpackIndexingTable &indexing_table, uint8_t *out_buf,
     int                   name_len      = original_name.size();
     ts::LocalBuffer<char> local_buffer(name_len);
     char                 *lower_name = local_buffer.data();
-    for (int i = 0; i < name_len; i++) {
-      lower_name[i] = ParseRules::ink_tolower(original_name[i]);
-    }
+    ts::ascii::tolower_copy(lower_name, original_name.data(), name_len);
 
     std::string_view name{lower_name, static_cast<size_t>(name_len)};
     std::string_view value = field.value_get();
diff --git a/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc b/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc
index ad373211fb8..7692931b1af 100644
--- a/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc
+++ b/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc
@@ -24,6 +24,7 @@
     limitations under the License.
  */
 
+#include <cstring>
 #include <memory>
 #include <string_view>
 
@@ -531,3 +532,34 @@ TEST_CASE("HPACK high level APIs", "[hpack]")
     }
   }
 }
+
+// Validates that hpack_encode_header_block() lower-cases mixed-case field
+// names per RFC 7540 § 8.1.2 before emitting them. The lower-case step is the
+// path that goes through ts::ascii::tolower_copy; if a regression broke the
+// lowercasing, the byte-for-byte comparison below would fail.
+TEST_CASE("HPACK encode lower-cases mixed-case field names", "[hpack]")
+{
+  uint8_t            buf_mixed[BUFSIZE_FOR_REGRESSION_TEST];
+  uint8_t            buf_lower[BUFSIZE_FOR_REGRESSION_TEST];
+  HpackIndexingTable table_mixed(MAX_TABLE_SIZE);
+  HpackIndexingTable table_lower(MAX_TABLE_SIZE);
+
+  // Use a name long enough to exercise the 16-byte SSE2 body when present.
+  auto encode_one = [](HpackIndexingTable &table, uint8_t *buf, const char *name, const char *value) -> int64_t {
+    std::unique_ptr<HTTPHdr, void (*)(HTTPHdr *)> headers(new HTTPHdr, destroy_http_hdr);
+    headers->create(HTTPType::REQUEST);
+    MIMEField *field = mime_field_create(headers->m_heap, headers->m_http->m_fields_impl);
+    field->name_set(headers->m_heap, headers->m_http->m_fields_impl, std::string_view{name});
+    field->value_set(headers->m_heap, headers->m_http->m_fields_impl, std::string_view{value});
+    mime_hdr_field_attach(headers->m_http->m_fields_impl, field, 1, nullptr);
+    std::memset(buf, 0, BUFSIZE_FOR_REGRESSION_TEST);
+    return hpack_encode_header_block(table, buf, BUFSIZE_FOR_REGRESSION_TEST, headers.get());
+  };
+
+  int64_t mixed_len = encode_one(table_mixed, buf_mixed, "Long-Custom-Header-Name", "abc");
+  int64_t lower_len = encode_one(table_lower, buf_lower, "long-custom-header-name", "abc");
+
+  REQUIRE(mixed_len > 0);
+  REQUIRE(mixed_len == lower_len);
+  REQUIRE(std::memcmp(buf_mixed, buf_lower, lower_len) == 0);
+}