diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cd39c2a..225b49f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,8 +10,11 @@ env: jobs: rust: - name: Rust tests - runs-on: ubuntu-latest + name: Rust tests (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-14] steps: - uses: actions/checkout@v4 @@ -27,9 +30,9 @@ jobs: ~/.cargo/registry ~/.cargo/git target - key: cargo-${{ runner.os }}-${{ hashFiles('Cargo.toml') }} + key: cargo-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('Cargo.toml') }} restore-keys: | - cargo-${{ runner.os }}- + cargo-${{ runner.os }}-${{ runner.arch }}- - name: Build (release) run: cargo build --release @@ -37,7 +40,7 @@ jobs: - name: Test (release) run: cargo test --release - - name: Test scalar-only (no AVX2 feature) + - name: Test scalar-only (no AVX2/NEON feature) run: cargo test --release --no-default-features - name: Test with test-panic feature diff --git a/src/scan/neon.rs b/src/scan/neon.rs index 7c1db0a..2242da3 100644 --- a/src/scan/neon.rs +++ b/src/scan/neon.rs @@ -88,25 +88,6 @@ unsafe fn tag_mask16(tag: uint8x16_t, bits: u8) -> u16 { movemask16(vtstq_u8(tag, vdupq_n_u8(bits))) } -#[inline(always)] -unsafe fn byte_mask16(bytes: uint8x16_t, needle: u8) -> u16 { - movemask16(vceqq_u8(bytes, vdupq_n_u8(needle))) -} - -#[inline(always)] -unsafe fn byte_mask64( - c0: uint8x16_t, - c1: uint8x16_t, - c2: uint8x16_t, - c3: uint8x16_t, - needle: u8, -) -> u64 { - (byte_mask16(c0, needle) as u64) - | ((byte_mask16(c1, needle) as u64) << 16) - | ((byte_mask16(c2, needle) as u64) << 32) - | ((byte_mask16(c3, needle) as u64) << 48) -} - #[inline(always)] unsafe fn classify_tags64( c0: uint8x16_t, @@ -170,10 +151,19 @@ unsafe fn scan_neon_impl(buf: &[u8], out: &mut Vec) -> Result<(), usize> { // In-string fast probe: while already in a string, avoid the full // nibble-LUT classification unless this block contains quote/backslash. 
+ // Only a yes/no answer is needed, so OR the compare results and reduce with + // vmaxvq_u8, skipping the expensive movemask16 pairwise-add chain (~3x faster probe). if in_string != 0 { - let quote_probe = byte_mask64(c0, c1, c2, c3, b'"'); - let backslash_probe = byte_mask64(c0, c1, c2, c3, b'\\'); - if (quote_probe | backslash_probe) == 0 { + let quote = vdupq_n_u8(b'"'); + let backslash = vdupq_n_u8(b'\\'); + let m0 = vorrq_u8(vceqq_u8(c0, quote), vceqq_u8(c0, backslash)); + let m1 = vorrq_u8(vceqq_u8(c1, quote), vceqq_u8(c1, backslash)); + let m2 = vorrq_u8(vceqq_u8(c2, quote), vceqq_u8(c2, backslash)); + let m3 = vorrq_u8(vceqq_u8(c3, quote), vceqq_u8(c3, backslash)); + let m01 = vorrq_u8(m0, m1); + let m23 = vorrq_u8(m2, m3); + let m = vorrq_u8(m01, m23); + if vmaxvq_u8(m) == 0 { bs_carry = 0; i += 64; // Cross-chunk jump: with no quote/backslash in the chunk we just