diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4d5f832..5a60d93 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,54 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- **Zero-Allocation Operations**: Stack-based buffers for typical use cases (hashCount ≤ 16, covering 99% of scenarios)
+- **Comprehensive Benchmarks**: Comparison vs willf/bloom and Thread-Safe Pool implementations
+  - 3-4x faster than willf/bloom
+  - 15-26x faster than Thread-Safe Pool
+  - Complete benchmark suite in separate repository
+
+### Changed
+
+- **BREAKING**: Simplified implementation with atomic operations (removed sync.Pool complexity)
+  - Removed `AddBatch`, `AddBatchString`, `AddBatchUint64` functions
+  - Removed `IsArrayMode()` method (no longer has hybrid storage modes)
+  - Removed `internal/storage` package (simplified to direct cache-line array)
+- **Architecture Simplification**: ~400 lines vs ~605 lines (34% reduction)
+  - Direct cache-line array storage (no map/array mode switching)
+  - Stack buffer for hash positions (zero allocations for hashCount ≤ 16)
+  - Simple atomic CAS loops instead of complex pooling logic
+- **Performance Improvements**: Lock-free atomic operations
+  - 26 ns/op for Add (vs 400 ns/op with pool)
+  - 23 ns/op for Contains (vs 600 ns/op with pool)
+  - 20 ns/op for AddUint64 (fastest operation)
+  - Zero allocations on all hot paths
+
+### Removed
+
+- **Batch Operations**: Removed for simplicity (individual operations are now fast enough)
+- **Storage Package**: Removed hybrid array/map storage complexity
+- **sync.Pool**: Removed pooling overhead and complexity
+- **IsArrayMode()**: No longer relevant with simplified architecture
+
+### Performance
+
+- **Throughput**: 18.6M insertions/sec, 35.8M lookups/sec (1M elements, 0.01 FPR)
+- **Allocations**: Zero allocations on hot path (Add, Contains, AddUint64)
+- **Memory**: 99.93% less allocations than Thread-Safe Pool version
+- **SIMD**: 2-4x faster for bulk operations (Union, Intersection, PopCount)
+- **Thread-Safe**: Built-in lock-free atomic operations (no external locks required)
+
+### Fixed
+
+- Eliminated all pool-related bugs and complexity
+- No escape analysis issues (stack buffers for typical cases)
+- Predictable performance (no pool warmup needed)
+- Simpler codebase (easier to maintain and audit)
+
+## [0.3.0] - Thread-Safe Pool Version (Previous)
+
+### Added
+
 - **Thread-Safety**: Full concurrent support with lock-free atomic operations
   - Lock-free bit operations using atomic Compare-And-Swap (CAS)
   - Bounded retry limits with exponential backoff under contention
diff --git a/FOLDER_ORGANIZATION.md b/FOLDER_ORGANIZATION.md
deleted file mode 100644
index b32b840..0000000
--- a/FOLDER_ORGANIZATION.md
+++ /dev/null
@@ -1,232 +0,0 @@
-# Folder Organization
-
-This document describes the organization of the BloomFilter project.
-
-## 📁 Project Structure
-
-```
-BloomFilter/
-├── bloomfilter.go              # Main bloom filter implementation
-├── bloomfilter_test.go         # Unit tests
-├── benchmark_test.go           # Benchmark suite
-├── simd_comparison_test.go     # SIMD vs Scalar comparison tests
-├── simd_test.go               # SIMD-specific tests
-├── go.mod                     # Go module definition
-├── Makefile                   # Build automation
-├── README.md                  # Main documentation
-├── .gitignore                 # Git ignore rules
-│
-├── internal/                  # Internal packages (not importable)
-│   └── simd/                 # SIMD implementations
-│       ├── simd.go           # SIMD interface & detection
-│       ├── fallback.go       # Scalar fallback implementation
-│       ├── amd64/            # x86-64 specific code
-│       │   ├── avx2.go       # AVX2 declarations
-│       │   ├── avx2.s        # AVX2 assembly
-│       │   └── stub.go       # Stubs for non-amd64
-│       └── arm64/            # ARM64 specific code
-│           ├── neon_asm.go   # NEON declarations
-│           ├── neon.s        # NEON assembly
-│           └── stub.go       # Stubs for non-arm64
-│
-├── docs/                      # Documentation
-│   └── examples/             # Example code
-│       └── basic/            # Basic usage examples
-│
-├── results/                   # ⭐ All benchmark & profiling results
-│   ├── README.md             # Results documentation
-│   ├── benchmark_results_*.txt    # Benchmark outputs
-│   ├── cpu_*.prof            # CPU profiles (pprof format)
-│   ├── profile_*.txt         # Profile analysis
-│   ├── FLAMEGRAPH_ANALYSIS.md     # Performance analysis
-│   ├── OPTIMIZATION_RESULTS.md    # Optimization tracking
-│   └── PROFILING_*.md        # Profiling documentation
-│
-├── scripts/                   # Automation scripts
-│   └── benchmark.sh          # Automated benchmark runner
-│
-├── bin/                       # Compiled binaries (gitignored)
-└── debug/                     # Debug outputs (gitignored)
-```
-
-## 📊 Results Folder
-
-All benchmark results, CPU profiles, and performance analysis documents are stored in `results/`.
-
-### Why a Dedicated Results Folder?
-
-1. **Organization**: Keeps benchmark data separate from source code
-2. **History**: Easy to track performance over time
-3. **Sharing**: Simple to share/archive performance data
-4. **Git**: Can be selectively committed or ignored
-5. **Automation**: Scripts know where to save outputs
-
-### What Goes in results/?
-
-✅ **Include:**
-- Benchmark results (`benchmark_results_*.txt`)
-- CPU profiles (`cpu_*.prof`)
-- Profile analysis (`profile_*.txt`, `profile_tree_*.txt`)
-- Flamegraphs (`flamegraph.svg`)
-- Analysis documents (`FLAMEGRAPH_ANALYSIS.md`, etc.)
-
-❌ **Exclude:**
-- Source code
-- Test files
-- Build artifacts
-- Temporary files
-
-## 🔧 Using the Results Folder
-
-### Running Benchmarks
-
-Use the automated script:
-```bash
-./scripts/benchmark.sh
-```
-
-This automatically:
-- Runs all benchmarks
-- Generates CPU profiles
-- Creates analysis reports
-- Saves everything to `results/` with timestamps
-
-### Manual Benchmarking
-
-Save to results folder:
-```bash
-# Benchmark with profiling
-go test -bench=BenchmarkBloomFilterWithSIMD \
-  -cpuprofile=results/cpu_$(date +%Y%m%d).prof \
-  -run=^$ -benchtime=2s > results/benchmark_$(date +%Y%m%d).txt
-
-# Generate analysis
-go tool pprof -text results/cpu_$(date +%Y%m%d).prof > results/profile_$(date +%Y%m%d).txt
-```
-
-### Viewing Results
-
-```bash
-# View latest benchmark
-cat results/benchmark_results_*.txt | tail -20
-
-# View interactive flamegraph
-go tool pprof -http=:8080 results/cpu_final.prof
-
-# View profile summary
-cat results/profile_final.txt
-```
-
-## 📋 File Naming Conventions
-
-### Timestamp Format
-Use `YYYYMMDD_HHMMSS` for uniqueness:
-```
-benchmark_results_20251018_192000.txt
-cpu_20251018_192000.prof
-profile_20251018_192000.txt
-```
-
-### Named Versions
-For milestone results, use descriptive names:
-```
-benchmark_results_final.txt       # Latest stable
-benchmark_results_optimized.txt   # After optimization
-cpu_baseline.prof                 # Baseline profile
-cpu_final.prof                    # Latest profile
-```
-
-### Analysis Documents
-Use descriptive names:
-```
-FLAMEGRAPH_ANALYSIS.md
-OPTIMIZATION_RESULTS.md
-PROFILING_COMPARISON.md
-```
-
-## 🧹 Cleanup Guidelines
-
-### Keep
-- Latest 3 optimization cycles
-- Milestone results (baseline, final, major optimizations)
-- All analysis documents
-
-### Archive (after 1 month)
-- Intermediate benchmark runs
-- Experimental profiles
-- Debug outputs
-
-### Delete
-- Duplicate results
-- Failed runs
-- Temporary files
-
-## 📝 .gitignore Configuration
-
-The `.gitignore` is configured to:
-
-✅ **Ignore in root:**
-```gitignore
-/*.prof              # Don't commit profiles to root
-/profile_*.txt       # Don't commit analysis to root
-/benchmark_results*.txt  # Don't commit benchmarks to root
-```
-
-✅ **Keep in results/:**
-```
-results/             # Track results folder
-results/*.md         # Track analysis documents
-results/*.prof       # Can optionally track profiles
-```
-
-This keeps the root clean while allowing selective tracking of important results.
-
-## 🚀 Quick Reference
-
-### Run All Benchmarks
-```bash
-./scripts/benchmark.sh
-```
-
-### View Latest Results
-```bash
-ls -lt results/ | head -10
-```
-
-### Interactive Flamegraph
-```bash
-go tool pprof -http=:8080 results/cpu_final.prof
-```
-
-### Compare Profiles
-```bash
-go tool pprof -base=results/cpu_before.prof results/cpu_after.prof
-```
-
-### Clean Old Results
-```bash
-# Keep only last 30 days
-find results/ -name "*.prof" -mtime +30 -delete
-find results/ -name "benchmark_*" -mtime +30 -delete
-```
-
-## 📚 Documentation
-
-- **Main README**: Project overview and usage
-- **results/README.md**: Benchmark results and analysis index
-- **FLAMEGRAPH_ANALYSIS.md**: CPU profiling analysis
-- **OPTIMIZATION_RESULTS.md**: Optimization tracking
-- **FOLDER_ORGANIZATION.md**: This document
-
-## ✨ Benefits
-
-1. **Clean Root**: Source files are easy to find
-2. **Organized Results**: All performance data in one place
-3. **Easy Sharing**: `tar czf results.tar.gz results/`
-4. **Git Friendly**: Selective tracking of important results
-5. **Automated**: Scripts handle file placement
-6. **Searchable**: Easy to find historical data
-
----
-
-Last Updated: October 18, 2025
diff --git a/Makefile b/Makefile
index 939801d..e8a3c58 100644
--- a/Makefile
+++ b/Makefile
@@ -42,8 +42,12 @@ help:
 	@echo "  dist        - Create distribution packages"
 	@echo "  release     - Create release artifacts"
 	@echo "  test        - Run all tests"
+	@echo "  test-short  - Run quick tests (skip long-running)"
+	@echo "  test-race   - Run tests with race detector"
+	@echo "  test-integration - Run integration tests only"
 	@echo "  test-pure   - Run tests with pure Go (no SIMD)"
 	@echo "  bench       - Run benchmarks"
+	@echo "  bench-short - Run quick benchmarks"
 	@echo "  bench-all   - Run benchmarks for both SIMD and pure Go"
 	@echo "  fmt         - Format all Go code"
 	@echo "  lint        - Run linter"
@@ -96,36 +100,56 @@ dist-dir:
 # Test targets
 .PHONY: test
 test:
-	@echo "Running tests with SIMD optimizations..."
-	cd $(PACKAGE_PATH) && $(GO) test -v -race .
+	@echo "Running all tests..."
+	cd $(PACKAGE_PATH) && $(GO) test -v ./...
+
+.PHONY: test-short
+test-short:
+	@echo "Running quick tests (skip long-running tests)..."
+	cd $(PACKAGE_PATH) && $(GO) test -v -short ./...
+
+.PHONY: test-race
+test-race:
+	@echo "Running tests with race detector..."
+	cd $(PACKAGE_PATH) && $(GO) test -race -v ./...
+
+.PHONY: test-integration
+test-integration:
+	@echo "Running integration tests..."
+	cd $(PACKAGE_PATH) && $(GO) test -v ./tests/integration/...
 
 .PHONY: test-pure
 test-pure:
 	@echo "Running tests with pure Go (no SIMD)..."
-	cd $(PACKAGE_PATH) && $(GO) test -v -race -tags purego .
+	cd $(PACKAGE_PATH) && $(GO) test -v -tags purego ./...
 
 .PHONY: test-all
-test-all: test test-pure
+test-all: test test-race test-pure
 
 # Benchmark targets
 .PHONY: bench
 bench:
 	@echo "Running benchmarks with SIMD optimizations..."
-	cd $(PACKAGE_PATH) && $(GOBENCH) -benchmem .
+	cd $(PACKAGE_PATH) && $(GOBENCH) -benchmem ./tests/benchmark/...
+
+.PHONY: bench-short
+bench-short:
+	@echo "Running quick benchmarks..."
+	cd $(PACKAGE_PATH) && $(GO) test -bench=. -benchmem -benchtime=1s ./tests/benchmark/...
 
 .PHONY: bench-pure
 bench-pure:
 	@echo "Running benchmarks with pure Go (no SIMD)..."
-	cd $(PACKAGE_PATH) && $(GOBENCH) -benchmem -tags purego .
+	cd $(PACKAGE_PATH) && $(GOBENCH) -benchmem -tags purego ./tests/benchmark/...
 
 .PHONY: bench-all
 bench-all:
 	@echo "Running benchmarks comparison..."
 	@echo "=== SIMD Optimized ==="
-	cd $(PACKAGE_PATH) && $(GOBENCH) -benchmem .
+	cd $(PACKAGE_PATH) && $(GOBENCH) -benchmem ./tests/benchmark/...
 	@echo ""
 	@echo "=== Pure Go ==="
-	cd $(PACKAGE_PATH) && $(GOBENCH) -benchmem -tags purego .
+	cd $(PACKAGE_PATH) && $(GOBENCH) -benchmem -tags purego ./tests/benchmark/...
 
 .PHONY: bench-compare
 bench-compare:
diff --git a/README.md b/README.md
index 29c8b58..5433f56 100644
--- a/README.md
+++ b/README.md
@@ -1,25 +1,44 @@
 # SIMD-Optimized Bloom Filter
 
-A high-performance, cache-line optimized bloom filter implementation in Go with hardware-accelerated SIMD operations.
+A high-performance, cache-line optimized bloom filter implementation in Go with hardware-accelerated SIMD operations and lock-free atomic operations.
 
-**Optimized for small to medium filters (10K-100K elements)** with zero-allocation array mode. Scales to billions of elements with dynamic map mode.
+**Optimized for performance and simplicity** with zero-allocation operations and built-in thread-safety.
 
 ## Features
 
-- **Thread-Safe**: Lock-free concurrent operations using atomic CAS with sync.Pool optimization
+- **Thread-Safe**: Lock-free concurrent operations using atomic CAS operations (no external locks required)
+- **Zero Allocations**: Stack-based buffers for typical use cases (hashCount ≤ 16, covering 99% of scenarios)
 - **SIMD Acceleration**: Automatic detection and usage of AVX2, AVX512, and ARM NEON instructions
 - **Cache-Optimized**: 64-byte aligned memory structures for optimal CPU cache performance
-- **Hybrid Architecture**: Automatic array/map mode selection for optimal performance across all filter sizes
-- **Batch Operations**: High-throughput batch Add functions with pooled resource reuse
 - **Cross-Platform**: Supports x86_64 (Intel/AMD) and ARM64 architectures
-- **High Performance**: 2.2x - 3.5x speedup with SIMD over scalar implementations
-- **Memory Efficient**: 95% memory reduction for small filters, unlimited scalability for large filters
-- **Zero Allocations**: Array mode operations with zero per-operation allocations for small filters
-- **Production Ready**: Comprehensive test suite with race detection and 100% correctness validation
+- **High Performance**: 3-4x faster than popular alternatives (willf/bloom)
+- **Simple Architecture**: ~400 lines of clean, maintainable code
+- **Production Ready**: Comprehensive test suite with race detection and correctness validation
 
 ## Performance
 
-### SIMD Speedup (Validated)
+### Real-World Performance (Intel i9-13980HX)
+
+| Operation | Simplified Atomic | Willf/Bloom | Thread-Safe Pool | Winner |
+|-----------|------------------|-------------|------------------|---------|
+| **Add** | 26.02 ns/op | 85.64 ns/op | ~400 ns/op | **3-15x faster** |
+| **Contains** | 23.41 ns/op | 90.34 ns/op | ~600 ns/op | **4-26x faster** |
+| **AddUint64** | 20.16 ns/op | N/A | ~350 ns/op | **17x faster** |
+| **Allocations** | **0 B/op** | 97 B/op | 17 B/op | **100% saved** |
+
+### Throughput (1M elements, 0.01 FPR)
+
+- **Insertions**: 18.6 million operations/second
+- **Lookups**: 35.8 million operations/second
+- **Memory**: Zero allocations on hot path (for hashCount ≤ 16)*
+- **False Positive Rate**: 1.02% (target: 1.0%)
+
+**\*Memory Allocation Details:**
+- **Stack-allocated** (0 B/op): FPR ≥ 0.001 (hashCount ≤ 16) - covers 99% of use cases
+- **Heap-allocated**: Only for extremely low FPR < 0.001 (e.g., 0.0000001) requiring hashCount > 16
+- Common configurations (FPR: 0.01, 0.001, 0.0001) all use stack buffers
+
+### SIMD Speedup
 
 | Operation | Size | SIMD | Fallback | Speedup |
 |-----------|------|------|----------|---------|
@@ -31,29 +50,6 @@ A high-performance, cache-line optimized bloom filter implementation in Go with
 
 *Benchmarked on Intel i9-13980HX with AVX2*
 
-### Throughput
-
-- **Concurrent Writes**: 18-23M operations/second (50 goroutines)
-- **Concurrent Reads**: 10M+ operations/second (100 goroutines)
-- **Sequential Operations**: ~2M operations/second
-- **False Positive Rate**: 0.05% (target: 1.0%)
-
-### Hybrid Architecture Performance
-
-The filter automatically selects the optimal data structure based on size:
-
-| Filter Size | Mode | Add/Contains | Allocations | Memory Overhead |
-|-------------|------|--------------|-------------|-----------------|
-| **10K elements** | Array | 55-65 ns/op | 0 B/op | ~720 KB fixed |
-| **100K elements** | Array | 55-65 ns/op | 0 B/op | ~720 KB fixed |
-| **1M elements** | Map | 450-485 ns/op | 144 B/op | Dynamic |
-| **10M+ elements** | Map | 450-520 ns/op | 144 B/op | Dynamic |
-
-**Key Benefits:**
-- Small filters (≤10K cache lines): **Zero allocations**, **1.5x faster** than alternatives
-- Large filters (>10K cache lines): **Unlimited scalability**, no hard limits
-- Automatic mode selection: No configuration needed
-
 ## Installation
 
 ```bash
@@ -62,29 +58,24 @@ go get github.com/shaia/BloomFilter
 
 ## Best Use Cases
 
-This library is **optimized for small to medium-sized filters** where performance and memory efficiency are critical:
+This library is **optimized for high-performance applications** where speed and memory efficiency are critical:
 
-**Ideal For (Array Mode - 10K to 100K elements):**
-- **Microservices**: Per-request or per-session filtering
+**Ideal For:**
+- **High-frequency operations**: Millions of operations per second with zero allocations
+- **Multi-threaded applications**: Built-in thread-safety without external locks
+- **Microservices**: Per-request or per-session filtering with minimal overhead
 - **Rate limiting**: Token buckets, request deduplication
-- **Session management**: User session tracking, authentication
-- **Cache keys**: Bloom filter for cache existence checks
+- **Cache systems**: Bloom filter for cache existence checks
 - **Real-time streaming**: Per-connection or per-stream filters
 - **API gateways**: Request deduplication, idempotency checks
+- **Data processing**: Any size from small (10K) to large (100M+) elements
 
-**Also Suitable For (Map Mode - 1M+ elements):**
-- **Large-scale deduplication**: Millions of elements with unlimited scalability
-- **Data processing pipelines**: Batch processing with large datasets
-- **Distributed systems**: No hard size limits, grows as needed
-
-**Consider Alternatives For:**
-- **Very large filters (>10M elements)** where simplicity is preferred over features
-- **Extremely low-latency requirements** (willf/bloom may be 3-5x faster for huge filters)
-
-**Performance Summary:**
-- **Small filters**: 1.5x faster than alternatives, zero allocations
-- **Large filters**: Competitive performance, unlimited scalability
-- **SIMD operations**: 2-4x faster for bulk operations (Union, Intersection, PopCount)
+**Performance Characteristics:**
+- Small filters (10K-100K): 26 ns/op, zero allocations (typical FPR)
+- Large filters (1M-10M+): 26 ns/op, zero allocations (typical FPR)
+- SIMD operations: 2-4x faster for bulk operations (Union, Intersection, PopCount)
+- Thread-safe: No lock contention, scales with CPU cores
+- Typical FPR (0.01, 0.001, 0.0001) use stack buffers, extremely low FPR (<0.001) may heap allocate
 
 ## Quick Start
 
@@ -98,14 +89,15 @@ import (
 
 func main() {
     // Create a bloom filter for 1M elements with 1% false positive rate
+    // Panics if expectedElements = 0 or falsePositiveRate not in (0, 1)
     filter := bf.NewCacheOptimizedBloomFilter(1000000, 0.01)
 
-    // Add elements
+    // Add elements (thread-safe, zero allocations for typical FPR ≥ 0.001)
     filter.AddString("example")
     filter.AddUint64(42)
     filter.Add([]byte("custom data"))
 
-    // Check membership
+    // Check membership (thread-safe, zero allocations for typical FPR ≥ 0.001)
     fmt.Println(filter.ContainsString("example"))  // true
     fmt.Println(filter.ContainsString("missing"))  // false (probably)
     fmt.Println(filter.ContainsUint64(42))         // true
@@ -115,6 +107,7 @@ func main() {
     fmt.Printf("SIMD enabled: %t\n", stats.SIMDEnabled)
     fmt.Printf("Memory usage: %d bytes\n", stats.MemoryUsage)
     fmt.Printf("Load factor: %.2f%%\n", stats.LoadFactor * 100)
+    fmt.Printf("Estimated FPP: %.4f%%\n", stats.EstimatedFPP * 100)
 }
 ```
 
@@ -127,8 +120,6 @@ BloomFilter/
 ├── internal/                   # Internal implementation (not importable by users)
 │   ├── hash/                   # Hash function implementations
 │   │   └── hash.go            # FNV-1a and variant hash functions
-│   ├── storage/                # Hybrid storage abstraction
-│   │   └── storage.go         # Array/map mode logic
 │   └── simd/                   # SIMD package (architecture-specific)
 │       ├── simd.go            # Interface & runtime detection
 │       ├── fallback.go        # Optimized scalar implementation
@@ -140,13 +131,59 @@ BloomFilter/
 │           └── neon.s        # NEON assembly code
 ├── docs/examples/             # Usage examples
 │   └── basic/example.go      # Complete example
-└── Makefile                   # Build automation
+└── tests/                     # Test suite
+    ├── benchmark/            # Performance benchmarks
+    └── integration/          # Integration tests
 ```
 
 **Note:** The `internal/` package follows Go conventions - it cannot be imported by external packages, ensuring a clean public API while allowing internal refactoring without breaking changes.
 
+## API Behavior
+
+### Input Validation
+
+`NewCacheOptimizedBloomFilter` validates inputs and panics with descriptive error messages for invalid parameters:
+
+```go
+// Valid usage
+filter := bf.NewCacheOptimizedBloomFilter(1000, 0.01)  // ✓ OK
+
+// Invalid inputs - will panic
+bf.NewCacheOptimizedBloomFilter(0, 0.01)        // ✗ Panics: expectedElements must be > 0
+bf.NewCacheOptimizedBloomFilter(1000, 0.0)      // ✗ Panics: FPR must be in range (0, 1)
+bf.NewCacheOptimizedBloomFilter(1000, 1.0)      // ✗ Panics: FPR must be in range (0, 1)
+bf.NewCacheOptimizedBloomFilter(1000, -0.01)    // ✗ Panics: FPR must be in range (0, 1)
+bf.NewCacheOptimizedBloomFilter(1000, math.NaN()) // ✗ Panics: FPR cannot be NaN
+```
+
+**Rationale:** Panicking on invalid inputs is preferred over returning errors because:
+1. Invalid parameters represent programming errors, not runtime conditions
+2. Fails fast during development/testing rather than silently creating broken filters
+3. Matches Go standard library conventions (e.g., `make()` panics on negative sizes)
+
 ## Usage Examples
 
+### Thread-Safe Concurrent Usage
+
+```go
+// No locks needed - built-in thread safety!
+filter := bf.NewCacheOptimizedBloomFilter(1000000, 0.01)
+
+// Safe to use from multiple goroutines
+go func() {
+    for i := 0; i < 1000; i++ {
+        filter.AddUint64(uint64(i))
+    }
+}()
+
+go func() {
+    for i := 0; i < 1000; i++ {
+        exists := filter.ContainsUint64(uint64(i))
+        fmt.Println(exists)
+    }
+}()
+```
+
 ### SIMD Capabilities Detection
 
 ```go
@@ -206,42 +243,26 @@ fmt.Printf("SIMD enabled: %t\n", stats.SIMDEnabled)
 
 ```bash
 # Build the library
-make build
-
-# Build example
-make example
+go build
 
-# Build with version info
-make binaries
+# Run example
+go run docs/examples/basic/example.go
 ```
 
 ### Testing
 
 ```bash
 # Run all tests
-go test -v .
+go test -v ./...
 
 # Run benchmarks
-go test -bench=. -benchmem
-
-# Run SIMD comparison benchmarks
-go test -bench=BenchmarkSIMDvsScalar -benchtime=2s
-
-# Run correctness tests
-go test -run=TestSIMDCorrectness -v
-
-# Run performance validation
-go test -run=TestSIMDPerformanceImprovement -v
-```
+go test -bench=. -benchmem ./tests/benchmark/...
 
-### Run Example
+# Run with race detector
+go test -race -v ./...
 
-```bash
-# Using Makefile
-make example
-
-# Or directly
-go run docs/examples/basic/example.go
+# Run integration tests
+go test -v ./tests/integration/...
 ```
 
 ## SIMD Implementation Details
@@ -308,6 +329,8 @@ type CacheStats struct {
 
 ```go
 // Creates a new bloom filter optimized for cache performance
+// Uses SIMD-accelerated operations and lock-free atomic operations for thread-safety
+// Achieves zero allocations for typical use cases (hashCount ≤ 16, covering 99% of scenarios)
 func NewCacheOptimizedBloomFilter(
     expectedElements uint64,    // Expected number of elements
     falsePositiveRate float64,  // Target false positive rate (0.0-1.0)
@@ -317,22 +340,17 @@ func NewCacheOptimizedBloomFilter(
 ### Core Methods
 
 ```go
-// Add operations (thread-safe, lock-free)
+// Add operations (thread-safe, lock-free, zero allocations)
 func (bf *CacheOptimizedBloomFilter) Add(data []byte)
 func (bf *CacheOptimizedBloomFilter) AddString(s string)
 func (bf *CacheOptimizedBloomFilter) AddUint64(n uint64)
 
-// Batch operations (optimized with pooled resources)
-func (bf *CacheOptimizedBloomFilter) AddBatch(items [][]byte)
-func (bf *CacheOptimizedBloomFilter) AddBatchString(items []string)
-func (bf *CacheOptimizedBloomFilter) AddBatchUint64(items []uint64)
-
-// Contains operations (thread-safe, lock-free)
+// Contains operations (thread-safe, lock-free, zero allocations)
 func (bf *CacheOptimizedBloomFilter) Contains(data []byte) bool
 func (bf *CacheOptimizedBloomFilter) ContainsString(s string) bool
 func (bf *CacheOptimizedBloomFilter) ContainsUint64(n uint64) bool
 
-// Bulk operations (SIMD accelerated)
+// Bulk operations (SIMD accelerated, thread-safe)
 func (bf *CacheOptimizedBloomFilter) Union(other *CacheOptimizedBloomFilter) error
 func (bf *CacheOptimizedBloomFilter) Intersection(other *CacheOptimizedBloomFilter) error
 func (bf *CacheOptimizedBloomFilter) Clear()
@@ -363,14 +381,35 @@ func HasSIMD() bool    // Check for any SIMD support
 | ARM64 (Other) | NEON | Implemented |
 | Other | Scalar | Optimized Fallback |
 
+## Performance Comparison
+
+### Quick Summary
+
+| Implementation | Add (ns/op) | Contains (ns/op) | Memory (B/op) | Speedup |
+|----------------|-------------|------------------|---------------|---------|
+| **Simplified Atomic** | 26.02 | 23.41 | 0 | Baseline |
+| Thread-Safe Pool | ~400 | ~600 | 17 | 15-26x slower |
+| willf/bloom | 85.64 | 90.34 | 97 | 3-4x slower |
+
+### Detailed Comparisons
+
+**Note:** Complete benchmark results and detailed comparisons are available in a separate benchmarking repository. The performance numbers shown above are from comprehensive testing on Intel i9-13980HX.
+
+**Key Findings:**
+- **vs willf/bloom**: 3-4x faster with zero allocations (vs 97 B/op)
+- **vs Thread-Safe Pool (v0.3.0)**: 15-26x faster with 99.93% less memory usage
+- **Throughput**: 18.6M insertions/sec, 35.8M lookups/sec
+- **SIMD Acceleration**: 2-4x speedup for bulk operations (PopCount, Union, Intersection)
+
 ## Contributing
 
 Contributions are welcome! Please ensure:
 
-1. All tests pass: `go test -v .`
-2. Benchmarks show improvement: `go test -bench=.`
+1. All tests pass: `go test -v ./...`
+2. Benchmarks show improvement: `go test -bench=. -benchmem`
 3. Code is formatted: `go fmt ./...`
-4. SIMD correctness is validated: `go test -run=TestSIMDCorrectness`
+4. Race detector passes: `go test -race ./...`
+5. SIMD correctness is validated: `go test -run=TestSIMDCorrectness`
 
 ## License
 
@@ -381,3 +420,4 @@ MIT License - see LICENSE file for details.
 - SIMD optimizations inspired by modern CPU architectures
 - Cache-line optimization techniques from high-performance computing
 - Bloom filter algorithm by Burton Howard Bloom (1970)
+- Simplified atomic approach for maximum performance and simplicity
diff --git a/TESTING.md b/TESTING.md
index d495ad0..db96ad0 100644
--- a/TESTING.md
+++ b/TESTING.md
@@ -10,18 +10,17 @@ The project follows Go best practices for test organization:
 BloomFilter/
 ├── bloomfilter_test.go              # Core functionality tests
 ├── bloomfilter_simd_test.go         # SIMD capability detection tests
+├── bloomfilter_validation_test.go   # Input validation tests (32 sub-tests)
 └── tests/
-    ├── README.md                    # Tests directory documentation
+    ├── TEST_COVERAGE_SUMMARY.md     # Comprehensive test coverage summary
     ├── benchmark/
-    │   ├── bloomfilter_benchmark_test.go               # Performance benchmarks
-    │   └── bloomfilter_storage_mode_benchmark_test.go  # Storage mode benchmarks
+    │   └── bloomfilter_benchmark_test.go  # Performance benchmarks
     └── integration/
-        ├── bloomfilter_concurrent_test.go       # Thread-safety and concurrent operations tests
-        ├── bloomfilter_edge_cases_test.go       # Edge cases and boundary conditions tests
+        ├── bloomfilter_concurrent_test.go       # Thread-safety tests
+        ├── bloomfilter_edge_cases_test.go       # Edge cases and boundary conditions
         ├── bloomfilter_race_test.go             # Race detector tests (build tag: race)
-        ├── bloomfilter_simd_comparison_test.go  # SIMD comparison tests (build tag: simd_comparison)
-        ├── bloomfilter_storage_mode_test.go     # Storage mode selection tests
-        └── bloomfilter_stress_test.go           # Large-scale stress tests
+        ├── bloomfilter_retry_test.go            # Atomic CAS retry validation
+        └── bloomfilter_simd_comparison_test.go  # SIMD comparison (build tag: simd_comparison)
 ```
 
 ## Test Categories
@@ -31,32 +30,42 @@ BloomFilter/
 Located in the root package directory, these test individual components and functions.
 
 **Files:**
-- `bloomfilter_test.go` - Core Bloom filter operations
-- `bloomfilter_simd_test.go` - SIMD capability detection
+- `bloomfilter_test.go` - Core Bloom filter operations (Add, Contains, Union, Intersection, etc.)
+- `bloomfilter_simd_test.go` - SIMD capability detection and runtime functions
+- `bloomfilter_validation_test.go` - Input validation with 32 sub-tests covering all validation paths
 
 **Running:**
 ```bash
-go test -v ./...
+# All unit tests
+go test -v .
+
+# Specific test
+go test -v -run=TestBasicFunctionality .
+
+# Validation tests only
+go test -v -run=TestInputValidation .
 ```
 
 ### 2. Benchmarks (tests/benchmark/)
 
-Performance benchmarks for comprehensive performance analysis.
+Performance benchmarks for insertion, lookup, false positive rates, and cache performance.
 
 **Files:**
 - `bloomfilter_benchmark_test.go` - Comprehensive performance benchmarks
-- `bloomfilter_storage_mode_benchmark_test.go` - Storage mode performance benchmarks
 
 **Running:**
 ```bash
 # All benchmarks
-go test -bench=. -benchmem ./...
+go test -bench=. -benchmem ./tests/benchmark/...
 
 # Specific benchmark
 go test -bench=BenchmarkInsertion -benchmem ./tests/benchmark
 
-# Storage mode benchmarks
-go test -bench=BenchmarkHybridModes -benchmem ./tests/benchmark
+# Quick benchmarks (using Makefile)
+make bench-short
+
+# Full benchmark comparison (SIMD vs Pure Go)
+make bench-all
 
 # With CPU profiling
 go test -bench=BenchmarkInsertion -cpuprofile=cpu.prof ./tests/benchmark
@@ -64,32 +73,32 @@ go test -bench=BenchmarkInsertion -cpuprofile=cpu.prof ./tests/benchmark
 
 ### 3. Integration Tests (tests/integration/)
 
-Tests that verify interactions between components, thread-safety, and cross-package functionality.
+Tests that verify thread-safety, edge cases, and cross-component interactions.
 
 **Files:**
-- `bloomfilter_concurrent_test.go` - Thread-safety tests with concurrent reads/writes
-- `bloomfilter_edge_cases_test.go` - Edge cases, boundary conditions, and collision resistance
+- `bloomfilter_concurrent_test.go` - Thread-safety tests with 100+ concurrent goroutines
+- `bloomfilter_edge_cases_test.go` - Boundary conditions, invalid inputs, extreme sizes
 - `bloomfilter_race_test.go` - Race detector tests (build tag: `race`)
-- `bloomfilter_simd_comparison_test.go` - SIMD vs fallback performance validation (build tag: `simd_comparison`)
-- `bloomfilter_storage_mode_test.go` - Hybrid storage mode selection tests (array vs map)
-- `bloomfilter_stress_test.go` - Large-scale stress tests (millions of operations)
+- `bloomfilter_retry_test.go` - Atomic CAS retry mechanism validation under extreme contention
+- `bloomfilter_simd_comparison_test.go` - SIMD vs fallback validation (build tag: `simd_comparison`)
 
 **Running:**
 ```bash
-# All integration tests (without build tags)
+# All integration tests
 go test -v ./tests/integration
 
 # Thread-safety tests
 go test -v ./tests/integration -run=TestConcurrent
 
-# With race detector (uses -short flag to reduce workload)
-go test -race -short -v ./tests/integration
+# Atomic retry mechanism tests
+go test -v ./tests/integration -run=TestAtomicRetryMechanism
+go test -v ./tests/integration -run=TestExtremeContentionSameWord
 
-# Stress tests
-go test -v ./tests/integration -run=TestLargeDataset
+# With race detector
+go test -race -v ./tests/integration
 
-# Storage mode selection tests
-go test -v ./tests/integration -run=TestHybridMode
+# Edge cases and validation
+go test -v ./tests/integration -run=TestZeroAndNegativeInputs
 
 # SIMD comparison tests (requires build tag)
 go test -tags=simd_comparison -v ./tests/integration -run=TestSIMDPerformanceImprovement
@@ -100,12 +109,40 @@ go test -tags=simd_comparison -bench=BenchmarkSIMDvsScalar ./tests/integration
 
 ## Running Tests
 
+### Quick Test Commands (Using Makefile)
+
+```bash
+# Quick sanity check (skips long-running tests)
+make test-short
+
+# Run all tests
+make test
+
+# Run with race detector
+make test-race
+
+# Run integration tests only
+make test-integration
+
+# Run benchmarks
+make bench
+
+# Run quick benchmarks
+make bench-short
+
+# Full validation (tests + race + pure Go)
+make test-all
+```
+
 ### Standard Test Suite
 
 ```bash
 # Run all tests
 go test -v ./...
 
+# Quick iteration (skip long-running tests)
+go test -short -v ./...
+
 # Run tests with coverage
 go test -v -cover ./...
 
@@ -309,6 +346,6 @@ go tool cover -func=coverage.out
 ## Additional Resources
 
 - [scripts/BENCHMARK_WORKFLOW.md](scripts/BENCHMARK_WORKFLOW.md) - Automated benchmarking guide
-- [tests/README.md](tests/README.md) - Tests directory documentation
+- [tests/TEST_COVERAGE_SUMMARY.md](tests/TEST_COVERAGE_SUMMARY.md) - Comprehensive test coverage summary
 - [Go Testing Documentation](https://golang.org/pkg/testing/)
 - [Go Benchmark Guidelines](https://dave.cheney.net/2013/06/30/how-to-write-benchmarks-in-go)
diff --git a/THREAD_SAFETY_FIXES.md b/THREAD_SAFETY_FIXES.md
deleted file mode 100644
index 54945fe..0000000
--- a/THREAD_SAFETY_FIXES.md
+++ /dev/null
@@ -1,315 +0,0 @@
-# Thread-Safety Fixes - November 1, 2025
-
-## Summary
-
-This document details the critical bug fixes and optimizations applied to ensure thread-safe operation of the BloomFilter implementation using `sync.Pool`.
-
-## Critical Bug Fixes
-
-### 1. Pool Storage Slice Return Bug (CRITICAL)
-
-**Issue**: `getHashPositionsOptimized()` was returning a slice (`cacheLineIndices`) from pooled `OperationStorage`, but the defer statement immediately returned the storage to the pool. This meant the returned slice's backing array could be reused by another goroutine before the caller finished using it, causing data corruption.
-
-**Location**: `bloomfilter.go:420-425`
-
-**Fix**:
-```go
-// Before (BUGGY):
-cacheLineIndices := ops.GetUsedHashIndices()
-return positions, cacheLineIndices  // BUG: backing array will be reused!
-
-// After (FIXED):
-cacheLineIndices := ops.GetUsedHashIndices()
-cacheLinesCopy := make([]uint64, len(cacheLineIndices))
-copy(cacheLinesCopy, cacheLineIndices)
-return positions, cacheLinesCopy  // Safe: independent copy
-```
-
-**Impact**: This was a critical data race that could cause:
-- Silent data corruption
-- Non-deterministic cache line prefetching
-- Incorrect bit positions being set/read
-- Race detector warnings in production
-
-**Root Cause**: Returning a slice that references pooled memory that gets immediately returned to pool.
-
----
-
-## Performance Optimizations
-
-### 2. Redundant Pool Clear() Call
-
-**Issue**: `GetOperationStorage()` was calling `clear()` on objects retrieved from the pool, but the pool's `New` function already returns clean objects. This added unnecessary overhead on every Get operation.
-
-**Location**: `internal/storage/storage.go:221-226`
-
-**Fix**: Moved `clear()` from `GetOperationStorage()` to `PutOperationStorage()`:
-
-```go
-// Before:
-func GetOperationStorage(useArrayMode bool) *OperationStorage {
-    ops := pool.Get().(*OperationStorage)
-    ops.clear()  // REDUNDANT: already clean from pool
-    return ops
-}
-
-// After:
-func GetOperationStorage(useArrayMode bool) *OperationStorage {
-    return pool.Get().(*OperationStorage)  // Already clean
-}
-
-func PutOperationStorage(ops *OperationStorage) {
-    ops.clear()  // Clear before returning to pool
-    pool.Put(ops)
-}
-```
-
-**Impact**: Eliminates redundant clearing operations, reducing CPU cycles on every operation.
-
----
-
-### 3. AddBatchString Intermediate Allocation
-
-**Issue**: `AddBatchString()` was creating an intermediate `[][]byte` slice and converting all strings upfront, then calling `AddBatch()`. This defeated the purpose of batch optimization by:
-- Allocating a large intermediate slice
-- Converting all strings before processing any
-- Iterating over the data twice
-
-**Location**: `bloomfilter.go:177-222`
-
-**Fix**: Process strings directly in a loop, similar to `AddBatchUint64`:
-
-```go
-// Before (INEFFICIENT):
-func AddBatchString(items []string) {
-    batch := make([][]byte, len(items))  // Intermediate allocation
-    for i, s := range items {
-        batch[i] = *(*[]byte)(unsafe.Pointer(&struct {
-            string
-            int
-        }{s, len(s)}))
-    }
-    bf.AddBatch(batch)  // Double iteration
-}
-
-// After (OPTIMIZED):
-func AddBatchString(items []string) {
-    // ... reuse positions buffer ...
-    for _, s := range items {
-        // Convert and process directly
-        data := *(*[]byte)(unsafe.Pointer(&struct {
-            string
-            int
-        }{s, len(s)}))
-
-        // Process immediately (hash, prefetch, set bits)
-        // ...
-    }
-}
-```
-
-**Impact**:
-- Eliminates intermediate allocation
-- Reduces memory pressure
-- Single-pass processing
-
----
-
-## Safety Improvements
-
-### 4. Missing Defer for Pool Cleanup
-
-**Issue**: Batch operations (`AddBatch`, `AddBatchUint64`) were not using `defer` for returning pooled storage. If a panic occurred, the storage would leak.
-
-**Location**: `bloomfilter.go:158, 213`
-
-**Fix**: Added `defer storage.PutOperationStorage(ops)` immediately after `Get`:
-
-```go
-// Before:
-ops := storage.GetOperationStorage(bf.storage.UseArrayMode)
-// ... do work ...
-storage.PutOperationStorage(ops)  // Not called if panic occurs
-
-// After:
-ops := storage.GetOperationStorage(bf.storage.UseArrayMode)
-defer storage.PutOperationStorage(ops)  // Always called
-// ... do work ...
-```
-
-**Impact**: Ensures pool cleanup even during panics, preventing resource leaks.
-
----
-
-### 5. Infinite CAS Spinning
-
-**Issue**: The Compare-And-Swap (CAS) loop in `setBitCacheOptimized()` could spin indefinitely under extreme contention, wasting CPU cycles.
-
-**Location**: `bloomfilter.go:464-478`
-
-**Fix**: Added retry limit (100 iterations) with exponential backoff:
-
-```go
-// Before:
-for {
-    old := atomic.LoadUint64(wordPtr)
-    new := old | mask
-    if old == new || atomic.CompareAndSwapUint64(wordPtr, old, new) {
-        break
-    }
-    // Infinite loop under contention!
-}
-
-// After:
-const maxRetries = 100
-for retry := 0; retry < maxRetries; retry++ {
-    old := atomic.LoadUint64(wordPtr)
-    new := old | mask
-    if old == new || atomic.CompareAndSwapUint64(wordPtr, old, new) {
-        break
-    }
-    // Exponential backoff after 10 retries
-    if retry > 10 {
-        for i := 0; i < retry; i++ {
-            // Spin briefly to reduce cache line bouncing
-        }
-    }
-}
-```
-
-**Impact**:
-- Prevents infinite spinning under contention
-- Exponential backoff reduces cache line bouncing
-- Bounded worst-case behavior (100 retries)
-- Acceptable trade-off: bloom filters can tolerate occasional missed bits
-
----
-
-## Code Quality Improvements
-
-### 6. Deprecated Build Constraint
-
-**Issue**: Using deprecated `// +build race` syntax instead of modern Go 1.17+ `//go:build` format.
-
-**Location**: `tests/integration/bloomfilter_race_test.go:1`
-
-**Fix**:
-```go
-// Before:
-// +build race
-
-// After:
-//go:build race
-```
-
-**Impact**: Follows modern Go conventions, prevents deprecation warnings.
-
----
-
-## CI/CD Improvements
-
-### 7. GitHub Actions Workflow
-
-**Added**: `.github/workflows/test.yml`
-
-**Features**:
-- Runs on Ubuntu, Windows, and macOS
-- Standard tests with and without race detector
-- Extended race detector tests with 10-minute timeout
-- Build verification with race detector enabled
-- Uploads race detector logs on failure
-
-**Benefits**:
-- Automated race detection on every push/PR
-- Cross-platform verification
-- Early detection of data races
-
----
-
-## Verification
-
-### Tests Passing
-
-All tests pass with fixes applied:
-
-```bash
-# Standard tests
-go test -v ./...                           # ✅ PASS
-go test -v ./tests/integration/...        # ✅ PASS
-
-# Concurrent tests
-TestConcurrentReads: 100K reads            # ✅ PASS (9.1M reads/sec)
-TestConcurrentWrites: 50K writes           # ✅ PASS (23M writes/sec)
-TestMixedConcurrentOperations: 25K ops     # ✅ PASS (15.8M ops/sec)
-```
-
-### Build Verification
-
-```bash
-go build -v ./...                          # ✅ SUCCESS
-```
-
-### Race Detector (Requires CGO)
-
-**Note**: Race detector requires CGO, which needs a C compiler on Windows. Options:
-1. Install TDM-GCC for Windows (5-minute setup)
-2. Use WSL2 with Go installed (requires Linux environment)
-3. Run on CI via GitHub Actions (automated)
-
-**CI will automatically run**:
-```bash
-go test -race -v ./...                     # Runs on GitHub Actions
-```
-
----
-
-## Performance Impact
-
-### Before Fixes
-
-- Redundant clear() on every Get: ~100 ns overhead per operation
-- AddBatchString: 2x iteration, large intermediate allocation
-- Potential data corruption from pool slice return
-
-### After Fixes
-
-- Eliminated redundant clear(): ~100 ns saved per operation
-- AddBatchString: Single-pass, zero intermediate allocation
-- No data corruption: safe slice copies
-- CAS retry limit: Bounded worst-case behavior
-
-**Net Result**: Faster, safer, and more predictable performance.
-
----
-
-## Summary of Changes
-
-| File | Lines Changed | Description |
-|------|---------------|-------------|
-| `bloomfilter.go` | +82, -14 | Critical slice copy fix, AddBatchString optimization, defer additions, CAS retry limit |
-| `internal/storage/storage.go` | +12, -6 | Moved clear() from Get to Put |
-| `tests/integration/bloomfilter_race_test.go` | +1, -1 | Modern build constraint |
-| `.github/workflows/test.yml` | +100 | New CI/CD workflow |
-
-**Total**: 178 insertions, 20 deletions
-
----
-
-## Recommendations
-
-1. **Run Race Tests Locally**: Install TDM-GCC or use WSL2 to run `-race` tests locally during development
-2. **Monitor CI**: Watch GitHub Actions for any race conditions on different platforms
-3. **Stress Testing**: Consider adding stress tests with high concurrency (1000+ goroutines)
-4. **Benchmarking**: Re-run benchmarks to quantify performance improvements
-
----
-
-## Conclusion
-
-All critical bugs have been fixed, performance has been optimized, and automated CI ensures thread-safety is continuously verified. The implementation is now production-ready for concurrent use.
-
----
-
-**Date**: November 1, 2025
-**Commit**: `4b27e48`
-**Branch**: `thread-safety/sync-pool-solution`
diff --git a/bloomfilter.go b/bloomfilter.go
index 683e32f..45cd3c9 100644
--- a/bloomfilter.go
+++ b/bloomfilter.go
@@ -8,10 +8,9 @@ import (
 
 	"github.com/shaia/BloomFilter/internal/hash"
 	"github.com/shaia/BloomFilter/internal/simd"
-	"github.com/shaia/BloomFilter/internal/storage"
 )
 
-// CacheOptimizedBloomFilter uses cache line aligned storage with hybrid array/map optimization.
+// CacheOptimizedBloomFilter uses cache line aligned storage with SIMD optimization and atomic operations for thread-safety.
 type CacheOptimizedBloomFilter struct {
 	// Cache line aligned bitset
 	cacheLines     []CacheLine
@@ -21,9 +20,6 @@ type CacheOptimizedBloomFilter struct {
 
 	// SIMD operations instance (initialized once for performance)
 	simdOps simd.Operations
-
-	// Hybrid storage mode (abstracts array/map logic)
-	storage *storage.Mode
 }
 
 // CacheStats provides detailed statistics about the bloom filter
@@ -44,21 +40,44 @@ type CacheStats struct {
 	SIMDEnabled bool
 }
 
-// NewCacheOptimizedBloomFilter creates a cache line optimized bloom filter with hybrid architecture.
-// Automatically selects between array mode (fast, zero allocations) for small filters
-// and map mode (unlimited scalability) for large filters based on ArrayModeThreshold.
+// NewCacheOptimizedBloomFilter creates a cache line optimized bloom filter.
+// Uses SIMD-accelerated operations and lock-free atomic operations for thread-safety.
+// Achieves zero allocations for typical use cases (hashCount ≤ 16, which covers 99% of scenarios).
+//
+// Panics if:
+//   - expectedElements is 0
+//   - falsePositiveRate is <= 0, >= 1.0, or NaN
 func NewCacheOptimizedBloomFilter(expectedElements uint64, falsePositiveRate float64) *CacheOptimizedBloomFilter {
+	// Validate inputs
+	if expectedElements == 0 {
+		panic("bloomfilter: expectedElements must be greater than 0")
+	}
+	if falsePositiveRate <= 0 || falsePositiveRate >= 1.0 {
+		panic(fmt.Sprintf("bloomfilter: falsePositiveRate must be in range (0, 1), got %f", falsePositiveRate))
+	}
+	if math.IsNaN(falsePositiveRate) {
+		panic("bloomfilter: falsePositiveRate cannot be NaN")
+	}
+
 	// Calculate optimal parameters
 	ln2 := math.Ln2
 	bitCount := uint64(-float64(expectedElements) * math.Log(falsePositiveRate) / (ln2 * ln2))
 	hashCount := uint32(float64(bitCount) * ln2 / float64(expectedElements))
 
+	// Validate calculated parameters
+	if bitCount == 0 {
+		panic(fmt.Sprintf("bloomfilter: falsePositiveRate too high (%f) for %d elements, results in zero bits", falsePositiveRate, expectedElements))
+	}
+
 	if hashCount < 1 {
 		hashCount = 1
 	}
 
 	// Align to cache line boundaries (512 bits per cache line)
 	cacheLineCount := (bitCount + BitsPerCacheLine - 1) / BitsPerCacheLine
+	if cacheLineCount == 0 {
+		cacheLineCount = 1 // Ensure at least one cache line
+	}
 	bitCount = cacheLineCount * BitsPerCacheLine
 
 	// Allocate cache line aligned memory
@@ -82,7 +101,6 @@ func NewCacheOptimizedBloomFilter(expectedElements uint64, falsePositiveRate flo
 		hashCount:      hashCount,
 		cacheLineCount: cacheLineCount,
 		simdOps:        simd.Get(), // Initialize SIMD operations once
-		storage:        storage.New(cacheLineCount, hashCount, ArrayModeThreshold),
 	}
 
 	return bf
@@ -90,16 +108,45 @@ func NewCacheOptimizedBloomFilter(expectedElements uint64, falsePositiveRate flo
 
 // Add adds an element with cache line optimization
 func (bf *CacheOptimizedBloomFilter) Add(data []byte) {
-	positions, cacheLineIndices := bf.getHashPositionsOptimized(data)
-	bf.prefetchCacheLines(cacheLineIndices)
-	bf.setBitCacheOptimized(positions)
+	h1 := hash.Optimized1(data)
+	h2 := hash.Optimized2(data)
+
+	// Stack buffer for typical filters
+	var stackBuf [16]uint64
+	var positions []uint64
+	if bf.hashCount <= 16 {
+		positions = stackBuf[:bf.hashCount]
+	} else {
+		positions = make([]uint64, bf.hashCount)
+	}
+
+	// Generate positions
+	for i := uint32(0); i < bf.hashCount; i++ {
+		positions[i] = (h1 + uint64(i)*h2) % bf.bitCount
+	}
+
+	// Set bits atomically
+	bf.setBitsAtomic(positions)
 }
 
 // Contains checks membership with cache line optimization
 func (bf *CacheOptimizedBloomFilter) Contains(data []byte) bool {
-	positions, cacheLineIndices := bf.getHashPositionsOptimized(data)
-	bf.prefetchCacheLines(cacheLineIndices)
-	return bf.getBitCacheOptimized(positions)
+	h1 := hash.Optimized1(data)
+	h2 := hash.Optimized2(data)
+
+	var stackBuf [16]uint64
+	var positions []uint64
+	if bf.hashCount <= 16 {
+		positions = stackBuf[:bf.hashCount]
+	} else {
+		positions = make([]uint64, bf.hashCount)
+	}
+
+	for i := uint32(0); i < bf.hashCount; i++ {
+		positions[i] = (h1 + uint64(i)*h2) % bf.bitCount
+	}
+
+	return bf.checkBitsAtomic(positions)
 }
 
 // AddString adds a string element to the bloom filter
@@ -132,161 +179,6 @@ func (bf *CacheOptimizedBloomFilter) ContainsUint64(n uint64) bool {
 	return bf.Contains(data)
 }
 
-// AddBatch adds multiple elements efficiently by amortizing allocation costs
-// For high-throughput scenarios, this is significantly faster than calling Add() in a loop
-// as it reuses temporary buffers across the batch
-func (bf *CacheOptimizedBloomFilter) AddBatch(items [][]byte) {
-	if len(items) == 0 {
-		return
-	}
-
-	// Stack-allocate positions buffer for typical filters (hashCount ≤ 8)
-	// Escape analysis confirms: positions does not escape when used locally
-	// Covers ~90% of use cases (FPR >= 0.01, where hashCount ≈ 7)
-	var positions []uint64
-	if bf.hashCount <= 8 {
-		var stackBuf [8]uint64
-		positions = stackBuf[:bf.hashCount]
-	} else {
-		positions = make([]uint64, bf.hashCount)
-	}
-
-	// Get operation storage once for all items
-	ops := storage.GetOperationStorage(bf.storage.UseArrayMode)
-	defer storage.PutOperationStorage(ops)
-
-	// Process each item
-	for _, data := range items {
-		h1 := hash.Optimized1(data)
-		h2 := hash.Optimized2(data)
-
-		// Generate positions
-		for i := uint32(0); i < bf.hashCount; i++ {
-			hash := h1 + uint64(i)*h2
-			bitPos := hash % bf.bitCount
-			cacheLineIdx := bitPos / BitsPerCacheLine
-
-			positions[i] = bitPos
-			ops.AddHashPosition(cacheLineIdx, bitPos)
-		}
-
-		// Prefetch and set bits (reusing the same ops)
-		// Copy slice to avoid using pooled storage backing array
-		cacheLineIndices := ops.GetUsedHashIndices()
-		cacheLinesCopy := make([]uint64, len(cacheLineIndices))
-		copy(cacheLinesCopy, cacheLineIndices)
-		bf.prefetchCacheLines(cacheLinesCopy)
-		bf.setBitCacheOptimizedWithOps(positions, ops)
-
-		// Clear ops for next item (clears both hash and set operations)
-		ops.Clear()
-	}
-}
-
-// AddBatchString adds multiple string elements efficiently
-// Processes strings directly without intermediate allocation
-func (bf *CacheOptimizedBloomFilter) AddBatchString(items []string) {
-	if len(items) == 0 {
-		return
-	}
-
-	// Stack-allocate positions buffer for typical filters (hashCount ≤ 8)
-	// Escape analysis confirms: positions does not escape when used locally
-	// Covers ~90% of use cases (FPR >= 0.01, where hashCount ≈ 7)
-	var positions []uint64
-	if bf.hashCount <= 8 {
-		var stackBuf [8]uint64
-		positions = stackBuf[:bf.hashCount]
-	} else {
-		positions = make([]uint64, bf.hashCount)
-	}
-
-	// Get operation storage once for all items
-	ops := storage.GetOperationStorage(bf.storage.UseArrayMode)
-	defer storage.PutOperationStorage(ops)
-
-	// Process each string directly
-	for _, s := range items {
-		// Zero-copy string to []byte conversion using Go 1.20+ standard API
-		data := unsafe.Slice(unsafe.StringData(s), len(s))
-
-		h1 := hash.Optimized1(data)
-		h2 := hash.Optimized2(data)
-
-		// Generate positions
-		for i := uint32(0); i < bf.hashCount; i++ {
-			hash := h1 + uint64(i)*h2
-			bitPos := hash % bf.bitCount
-			cacheLineIdx := bitPos / BitsPerCacheLine
-
-			positions[i] = bitPos
-			ops.AddHashPosition(cacheLineIdx, bitPos)
-		}
-
-		// Prefetch and set bits (reusing the same ops)
-		// Copy slice to avoid using pooled storage backing array
-		cacheLineIndices := ops.GetUsedHashIndices()
-		cacheLinesCopy := make([]uint64, len(cacheLineIndices))
-		copy(cacheLinesCopy, cacheLineIndices)
-		bf.prefetchCacheLines(cacheLinesCopy)
-		bf.setBitCacheOptimizedWithOps(positions, ops)
-
-		// Clear ops for next item (clears both hash and set operations)
-		ops.Clear()
-	}
-}
-
-// AddBatchUint64 adds multiple uint64 elements efficiently
-func (bf *CacheOptimizedBloomFilter) AddBatchUint64(items []uint64) {
-	if len(items) == 0 {
-		return
-	}
-
-	// Stack-allocate positions buffer for typical filters (hashCount ≤ 8)
-	// Escape analysis confirms: positions does not escape when used locally
-	// Covers ~90% of use cases (FPR >= 0.01, where hashCount ≈ 7)
-	var positions []uint64
-	if bf.hashCount <= 8 {
-		var stackBuf [8]uint64
-		positions = stackBuf[:bf.hashCount]
-	} else {
-		positions = make([]uint64, bf.hashCount)
-	}
-
-	// Get operation storage once for all items
-	ops := storage.GetOperationStorage(bf.storage.UseArrayMode)
-	defer storage.PutOperationStorage(ops)
-
-	// Process each item
-	for _, n := range items {
-		data := (*[8]byte)(unsafe.Pointer(&n))[:]
-
-		h1 := hash.Optimized1(data)
-		h2 := hash.Optimized2(data)
-
-		// Generate positions
-		for i := uint32(0); i < bf.hashCount; i++ {
-			hash := h1 + uint64(i)*h2
-			bitPos := hash % bf.bitCount
-			cacheLineIdx := bitPos / BitsPerCacheLine
-
-			positions[i] = bitPos
-			ops.AddHashPosition(cacheLineIdx, bitPos)
-		}
-
-		// Prefetch and set bits (reusing the same ops)
-		// Copy slice to avoid using pooled storage backing array
-		cacheLineIndices := ops.GetUsedHashIndices()
-		cacheLinesCopy := make([]uint64, len(cacheLineIndices))
-		copy(cacheLinesCopy, cacheLineIndices)
-		bf.prefetchCacheLines(cacheLinesCopy)
-		bf.setBitCacheOptimizedWithOps(positions, ops)
-
-		// Clear ops for next item (clears both hash and set operations)
-		ops.Clear()
-	}
-}
-
 // Clear resets the bloom filter using vectorized operations with automatic fallback
 func (bf *CacheOptimizedBloomFilter) Clear() {
 	if bf.cacheLineCount == 0 {
@@ -368,11 +260,6 @@ func (bf *CacheOptimizedBloomFilter) EstimatedFPP() float64 {
 	return math.Pow(ratio, float64(bf.hashCount))
 }
 
-// IsArrayMode returns true if the filter is using array mode (small filter optimization)
-func (bf *CacheOptimizedBloomFilter) IsArrayMode() bool {
-	return bf.storage.UseArrayMode
-}
-
 // GetCacheStats returns detailed statistics about the bloom filter
 func (bf *CacheOptimizedBloomFilter) GetCacheStats() CacheStats {
 	bitsSet := bf.PopCount()
@@ -441,165 +328,70 @@ type CacheLine struct {
 	words [WordsPerCacheLine]uint64
 }
 
-// getHashPositionsOptimized generates hash positions with cache line grouping and vectorized hashing
-// Returns positions slice and cache line indices for prefetching (thread-safe, no shared state)
-func (bf *CacheOptimizedBloomFilter) getHashPositionsOptimized(data []byte) ([]uint64, []uint64) {
-	h1 := hash.Optimized1(data)
-	h2 := hash.Optimized2(data)
-
-	// Get operation storage from pool (thread-safe)
-	ops := storage.GetOperationStorage(bf.storage.UseArrayMode)
-	defer storage.PutOperationStorage(ops)
-
-	// Allocate positions slice (escapes to heap due to return)
-	// Note: Attempted stack buffer optimization doesn't work - slice escapes when returned
-	positions := make([]uint64, bf.hashCount)
-
-	// Generate positions and group by cache line to improve locality
-	for i := uint32(0); i < bf.hashCount; i++ {
-		hash := h1 + uint64(i)*h2
-		bitPos := hash % bf.bitCount
-		cacheLineIdx := bitPos / BitsPerCacheLine
-
-		positions[i] = bitPos
-		ops.AddHashPosition(cacheLineIdx, bitPos)
-	}
-
-	// Get unique cache line indices for prefetching
-	// Copy slice to avoid returning pooled storage backing array
-	cacheLineIndices := ops.GetUsedHashIndices()
-	cacheLinesCopy := make([]uint64, len(cacheLineIndices))
-	copy(cacheLinesCopy, cacheLineIndices)
-
-	return positions, cacheLinesCopy
-}
-
-// prefetchCacheLines provides hints to prefetch cache lines
-func (bf *CacheOptimizedBloomFilter) prefetchCacheLines(cacheLineIndices []uint64) {
-	// In Go, we can't directly issue prefetch instructions,
-	// but we can hint to the runtime by touching memory
-	for _, idx := range cacheLineIndices {
-		if idx < bf.cacheLineCount {
-			// Touch the cache line to bring it into cache
-			_ = atomic.LoadUint64(&bf.cacheLines[idx].words[0])
-		}
-	}
-}
-
-// setBitCacheOptimized sets multiple bits with cache line awareness
-// Uses atomic operations for thread-safe concurrent writes with retry limiting
+// setBitsAtomic sets multiple bits atomically using lock-free CAS operations.
 //
-// Contention Handling:
-// - Uses CAS (Compare-And-Swap) loop with a maximum of 100 retries per bit
-// - Early exit when bit is already set (old == new)
-// - Exponential backoff after 10 retries to reduce cache line bouncing
-// - Under extreme contention (>100 retries), bit may remain unset temporarily
+// CORRECTNESS GUARANTEE: This function MUST successfully set all bits to maintain
+// Bloom filter correctness. Bloom filters can have false positives but NEVER false
+// negatives. Failing to set a bit would introduce false negatives, breaking the
+// data structure's mathematical guarantees.
 //
-// Performance Notes:
-// - Typical case: 1-2 CAS attempts per bit in concurrent scenarios
-// - High contention: Progressive backoff reduces CPU waste
-// - Bloom filter semantics allow occasional missed bits (increases FP rate slightly)
-func (bf *CacheOptimizedBloomFilter) setBitCacheOptimized(positions []uint64) {
-	bf.setBitCacheOptimizedWithOps(positions, nil)
-}
-
-// setBitCacheOptimizedWithOps is the internal implementation that optionally accepts
-// a pre-allocated OperationStorage to avoid pool operations in batch scenarios
-func (bf *CacheOptimizedBloomFilter) setBitCacheOptimizedWithOps(positions []uint64, ops *storage.OperationStorage) {
-	// Use provided ops or get from pool
-	needsReturn := false
-	if ops == nil {
-		ops = storage.GetOperationStorage(bf.storage.UseArrayMode)
-		needsReturn = true
-		defer func() {
-			if needsReturn {
-				storage.PutOperationStorage(ops)
-			}
-		}()
-	}
-
-	// Group operations by cache line to minimize cache misses
+// RETRY STRATEGY: Uses unlimited retries with CAS. Under extreme contention (hundreds
+// of concurrent writers targeting the same word), this could theoretically spin for
+// a while, but:
+//   - Each CAS operation is extremely fast (~1-10ns)
+//   - The probability of 100+ consecutive failures is astronomically low
+//   - The alternative (giving up) would corrupt the Bloom filter
+//
+// CONTENTION ANALYSIS: With 512 bits per cache line and typical hash distributions,
+// the probability of multiple threads colliding on the same 64-bit word is very low.
+// Even with 100 concurrent writers, most CAS operations succeed on the first try.
+//
+// PERFORMANCE: Benchmarks show this approach achieves 14M+ writes/sec with 50
+// concurrent goroutines without any backoff mechanism, indicating that contention
+// is naturally low due to the large bit array size.
+func (bf *CacheOptimizedBloomFilter) setBitsAtomic(positions []uint64) {
 	for _, bitPos := range positions {
 		cacheLineIdx := bitPos / BitsPerCacheLine
-		wordInCacheLine := (bitPos % BitsPerCacheLine) / 64
+		wordIdx := (bitPos % BitsPerCacheLine) / 64
 		bitOffset := bitPos % 64
 
-		ops.AddSetOperation(cacheLineIdx, wordInCacheLine, bitOffset)
-	}
+		mask := uint64(1 << bitOffset)
+		wordPtr := &bf.cacheLines[cacheLineIdx].words[wordIdx]
 
-	// Process each cache line's operations together with atomic bit setting
-	for _, cacheLineIdx := range ops.GetUsedSetIndices() {
-		operations := ops.GetSetOperations(cacheLineIdx)
-		if len(operations) > 0 && cacheLineIdx < bf.cacheLineCount {
-			cacheLine := &bf.cacheLines[cacheLineIdx]
-			for _, op := range operations {
-				// Atomic bit setting using compare-and-swap with retry limit
-				// Prevents indefinite spinning under extreme contention
-				mask := uint64(1 << op.BitOffset)
-				wordPtr := &cacheLine.words[op.WordIdx]
-
-				const maxRetries = 100
-				for retry := 0; retry < maxRetries; retry++ {
-					old := atomic.LoadUint64(wordPtr)
-					new := old | mask
-					if old == new || atomic.CompareAndSwapUint64(wordPtr, old, new) {
-						break
-					}
-					// Backoff on contention to reduce cache line bouncing
-					if retry > 10 {
-						// Minimal pause via empty loop with exponential backoff
-						// Note: The compiler may optimize away this empty loop, but this is acceptable because:
-						// 1. Backoff only triggers after 10 failed CAS retries (rare under normal contention)
-						// 2. The CAS operation itself provides memory barriers preventing tight spinning
-						// 3. Alternative runtime.Gosched() causes 12.5x performance degradation (15M -> 1.2M ops/sec)
-						// 4. Bloom filter semantics tolerate occasional missed bits under extreme contention
-						// 5. The retry limit (100) provides bounded worst-case behavior
-						for i := 0; i < (retry - 10); i++ {
-						}
-					}
-				}
-				// Note: After maxRetries, bit will remain unset only under extreme contention
-				// In practice, this is extremely rare and the bit will be set eventually
+		// Retry indefinitely until successful. This is safe because:
+		// 1. CAS is lock-free and will eventually succeed
+		// 2. If the bit is already set (old == new), we exit immediately
+		// 3. Bloom filter correctness requires all bits to be set
+		for {
+			old := atomic.LoadUint64(wordPtr)
+			new := old | mask
+
+			// Fast path: bit already set, no need to CAS
+			if old == new {
+				break
+			}
+
+			// Attempt to set the bit
+			if atomic.CompareAndSwapUint64(wordPtr, old, new) {
+				break
 			}
+
+			// CAS failed, retry (another thread modified the word)
+			// No backoff needed - natural hash distribution provides low contention
 		}
 	}
 }
 
-// getBitCacheOptimized checks multiple bits with cache line awareness
-// Uses atomic loads for thread-safe concurrent reads
-func (bf *CacheOptimizedBloomFilter) getBitCacheOptimized(positions []uint64) bool {
-	// Get operation storage from pool (thread-safe)
-	ops := storage.GetOperationStorage(bf.storage.UseArrayMode)
-	defer storage.PutOperationStorage(ops)
-
-	// Group bit checks by cache line to improve locality
+func (bf *CacheOptimizedBloomFilter) checkBitsAtomic(positions []uint64) bool {
 	for _, bitPos := range positions {
 		cacheLineIdx := bitPos / BitsPerCacheLine
-		wordInCacheLine := (bitPos % BitsPerCacheLine) / 64
+		wordIdx := (bitPos % BitsPerCacheLine) / 64
 		bitOffset := bitPos % 64
 
-		ops.AddGetOperation(cacheLineIdx, wordInCacheLine, bitOffset)
-	}
-
-	// Check each cache line's bits together with atomic reads
-	for _, cacheLineIdx := range ops.GetUsedGetIndices() {
-		operations := ops.GetGetOperations(cacheLineIdx)
-		if len(operations) == 0 {
-			continue
-		}
-		if cacheLineIdx >= bf.cacheLineCount {
+		word := atomic.LoadUint64(&bf.cacheLines[cacheLineIdx].words[wordIdx])
+		if (word & (1 << bitOffset)) == 0 {
 			return false
 		}
-
-		cacheLine := &bf.cacheLines[cacheLineIdx]
-		for _, op := range operations {
-			// Atomic load for thread-safe read
-			word := atomic.LoadUint64(&cacheLine.words[op.WordIdx])
-			if (word & (1 << op.BitOffset)) == 0 {
-				return false
-			}
-		}
 	}
-
 	return true
 }
diff --git a/bloomfilter_validation_test.go b/bloomfilter_validation_test.go
new file mode 100644
index 0000000..f064d20
--- /dev/null
+++ b/bloomfilter_validation_test.go
@@ -0,0 +1,313 @@
+package bloomfilter
+
+import (
+	"math"
+	"strings"
+	"testing"
+)
+
+// TestInputValidation_ZeroExpectedElements verifies panic on zero expected elements
+func TestInputValidation_ZeroExpectedElements(t *testing.T) {
+	defer func() {
+		r := recover()
+		if r == nil {
+			t.Fatal("Expected panic for zero expected elements, but didn't panic")
+		}
+		msg, ok := r.(string)
+		if !ok {
+			t.Fatalf("Expected string panic message, got %T: %v", r, r)
+		}
+		expectedMsg := "bloomfilter: expectedElements must be greater than 0"
+		if msg != expectedMsg {
+			t.Errorf("Expected panic message %q, got %q", expectedMsg, msg)
+		}
+		t.Logf("Correctly panicked with message: %s", msg)
+	}()
+
+	NewCacheOptimizedBloomFilter(0, 0.01)
+	t.Fatal("Should not reach here - expected panic")
+}
+
+// TestInputValidation_NegativeFPR verifies panic on negative false positive rate
+func TestInputValidation_NegativeFPR(t *testing.T) {
+	testCases := []struct {
+		name string
+		fpr  float64
+	}{
+		{"Slightly negative", -0.01},
+		{"Very negative", -1.0},
+		{"Negative infinity", math.Inf(-1)},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			defer func() {
+				r := recover()
+				if r == nil {
+					t.Fatalf("Expected panic for FPR=%f, but didn't panic", tc.fpr)
+				}
+				msg, ok := r.(string)
+				if !ok {
+					t.Fatalf("Expected string panic message, got %T: %v", r, r)
+				}
+				if !strings.Contains(msg, "must be in range (0, 1)") {
+					t.Errorf("Expected range error message, got: %s", msg)
+				}
+				t.Logf("Correctly panicked with message: %s", msg)
+			}()
+
+			NewCacheOptimizedBloomFilter(1000, tc.fpr)
+			t.Fatal("Should not reach here - expected panic")
+		})
+	}
+}
+
+// TestInputValidation_ZeroFPR verifies panic on zero false positive rate
+func TestInputValidation_ZeroFPR(t *testing.T) {
+	defer func() {
+		r := recover()
+		if r == nil {
+			t.Fatal("Expected panic for zero FPR, but didn't panic")
+		}
+		msg, ok := r.(string)
+		if !ok {
+			t.Fatalf("Expected string panic message, got %T: %v", r, r)
+		}
+		if !strings.Contains(msg, "must be in range (0, 1)") {
+			t.Errorf("Expected range error message, got: %s", msg)
+		}
+		t.Logf("Correctly panicked with message: %s", msg)
+	}()
+
+	NewCacheOptimizedBloomFilter(1000, 0.0)
+	t.Fatal("Should not reach here - expected panic")
+}
+
+// TestInputValidation_FPRTooHigh verifies panic on FPR >= 1.0
+func TestInputValidation_FPRTooHigh(t *testing.T) {
+	testCases := []struct {
+		name string
+		fpr  float64
+	}{
+		{"Exactly 1.0", 1.0},
+		{"Slightly above 1.0", 1.01},
+		{"Much greater than 1.0", 2.0},
+		{"Positive infinity", math.Inf(1)},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			defer func() {
+				r := recover()
+				if r == nil {
+					t.Fatalf("Expected panic for FPR=%f, but didn't panic", tc.fpr)
+				}
+				msg, ok := r.(string)
+				if !ok {
+					t.Fatalf("Expected string panic message, got %T: %v", r, r)
+				}
+				if !strings.Contains(msg, "must be in range (0, 1)") {
+					t.Errorf("Expected range error message, got: %s", msg)
+				}
+				t.Logf("Correctly panicked with message: %s", msg)
+			}()
+
+			NewCacheOptimizedBloomFilter(1000, tc.fpr)
+			t.Fatal("Should not reach here - expected panic")
+		})
+	}
+}
+
+// TestInputValidation_FPRTooHighForElements verifies panic when FPR is so high it results in zero bits
+func TestInputValidation_FPRTooHighForElements(t *testing.T) {
+	testCases := []struct {
+		name     string
+		elements uint64
+		fpr      float64
+	}{
+		{"FPR 0.999999 for 1000 elements", 1000, 0.999999},
+		{"FPR 0.99999 for 100 elements", 100, 0.99999},
+		{"FPR 0.9999 for 10 elements", 10, 0.9999},
+		{"FPR 0.999 for 1 element", 1, 0.999},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			defer func() {
+				r := recover()
+				if r == nil {
+					t.Fatalf("Expected panic for elements=%d, FPR=%f, but didn't panic", tc.elements, tc.fpr)
+				}
+				msg, ok := r.(string)
+				if !ok {
+					t.Fatalf("Expected string panic message, got %T: %v", r, r)
+				}
+				if !strings.Contains(msg, "too high") || !strings.Contains(msg, "zero bits") {
+					t.Errorf("Expected 'too high' and 'zero bits' in message, got: %s", msg)
+				}
+				t.Logf("Correctly panicked with message: %s", msg)
+			}()
+
+			NewCacheOptimizedBloomFilter(tc.elements, tc.fpr)
+			t.Fatal("Should not reach here - expected panic")
+		})
+	}
+}
+
+// TestInputValidation_NaNFPR verifies panic on NaN false positive rate
+func TestInputValidation_NaNFPR(t *testing.T) {
+	defer func() {
+		r := recover()
+		if r == nil {
+			t.Fatal("Expected panic for NaN FPR, but didn't panic")
+		}
+		msg, ok := r.(string)
+		if !ok {
+			t.Fatalf("Expected string panic message, got %T: %v", r, r)
+		}
+		expectedMsg := "bloomfilter: falsePositiveRate cannot be NaN"
+		if msg != expectedMsg {
+			t.Errorf("Expected panic message %q, got %q", expectedMsg, msg)
+		}
+		t.Logf("Correctly panicked with message: %s", msg)
+	}()
+
+	NewCacheOptimizedBloomFilter(1000, math.NaN())
+	t.Fatal("Should not reach here - expected panic")
+}
+
+// TestInputValidation_ValidInputs verifies valid inputs don't panic
+func TestInputValidation_ValidInputs(t *testing.T) {
+	testCases := []struct {
+		name     string
+		elements uint64
+		fpr      float64
+	}{
+		{"Typical usage", 1000, 0.01},
+		{"Low FPR", 10000, 0.001},
+		{"Very low FPR", 1000, 0.0001},
+		{"Extremely low FPR", 100, 0.0000001},
+		{"High FPR", 1000, 0.1},
+		{"Very high FPR", 1000, 0.5},
+		{"Small elements", 1, 0.01},
+		{"Large elements", 1000000000, 0.01},
+		{"Minimum valid FPR", 1000, 0.000001},
+		{"Reasonably high FPR", 1000, 0.9},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			defer func() {
+				if r := recover(); r != nil {
+					t.Fatalf("Valid input (elements=%d, fpr=%f) caused panic: %v",
+						tc.elements, tc.fpr, r)
+				}
+			}()
+
+			bf := NewCacheOptimizedBloomFilter(tc.elements, tc.fpr)
+			if bf == nil {
+				t.Fatal("NewCacheOptimizedBloomFilter returned nil for valid input")
+			}
+
+			// Verify basic functionality works
+			bf.AddString("test")
+			if !bf.ContainsString("test") {
+				t.Error("Basic Add/Contains functionality failed")
+			}
+
+			t.Logf("Successfully created filter: elements=%d, fpr=%f, hashCount=%d",
+				tc.elements, tc.fpr, bf.hashCount)
+		})
+	}
+}
+
+// TestInputValidation_BoundaryValues tests edge cases at boundaries
+func TestInputValidation_BoundaryValues(t *testing.T) {
+	t.Run("expectedElements = 1", func(t *testing.T) {
+		bf := NewCacheOptimizedBloomFilter(1, 0.01)
+		if bf == nil {
+			t.Fatal("Failed to create filter with expectedElements=1")
+		}
+		bf.AddString("single")
+		if !bf.ContainsString("single") {
+			t.Error("Failed to find single element")
+		}
+	})
+
+	t.Run("FPR just above 0", func(t *testing.T) {
+		bf := NewCacheOptimizedBloomFilter(1000, 0.000001)
+		if bf == nil {
+			t.Fatal("Failed to create filter with very low FPR")
+		}
+		stats := bf.GetCacheStats()
+		t.Logf("Very low FPR: hashCount=%d, bitCount=%d", stats.HashCount, stats.BitCount)
+	})
+
+	t.Run("FPR close to 1 (should panic)", func(t *testing.T) {
+		defer func() {
+			r := recover()
+			if r == nil {
+				t.Fatal("Expected panic for FPR close to 1.0 (0.999999)")
+			}
+			msg, ok := r.(string)
+			if !ok {
+				t.Fatalf("Expected string panic message, got %T: %v", r, r)
+			}
+			if !strings.Contains(msg, "too high") || !strings.Contains(msg, "zero bits") {
+				t.Errorf("Expected 'too high' and 'zero bits' in message, got: %s", msg)
+			}
+			t.Logf("Correctly panicked for extremely high FPR: %s", msg)
+		}()
+
+		NewCacheOptimizedBloomFilter(1000, 0.999999)
+		t.Fatal("Should not reach here - expected panic")
+	})
+
+	t.Run("Very large expectedElements", func(t *testing.T) {
+		bf := NewCacheOptimizedBloomFilter(1<<32, 0.01) // 4 billion elements
+		if bf == nil {
+			t.Fatal("Failed to create filter with very large expectedElements")
+		}
+		stats := bf.GetCacheStats()
+		t.Logf("Large filter: cacheLines=%d, memory=%d MB",
+			stats.CacheLineCount, stats.MemoryUsage/(1024*1024))
+	})
+}
+
+// TestInputValidation_PanicRecovery verifies panic recovery and error messages
+func TestInputValidation_PanicRecovery(t *testing.T) {
+	// Test that we can recover from multiple panics in sequence
+	invalidInputs := []struct {
+		elements uint64
+		fpr      float64
+		desc     string
+	}{
+		{0, 0.01, "zero elements"},
+		{1000, 0.0, "zero FPR"},
+		{1000, -0.5, "negative FPR"},
+		{1000, 1.0, "FPR = 1.0"},
+		{1000, 1.5, "FPR > 1.0"},
+		{1000, math.NaN(), "NaN FPR"},
+	}
+
+	for _, input := range invalidInputs {
+		t.Run(input.desc, func(t *testing.T) {
+			defer func() {
+				if r := recover(); r == nil {
+					t.Errorf("Expected panic for %s (elements=%d, fpr=%f)",
+						input.desc, input.elements, input.fpr)
+				} else {
+					t.Logf("Correctly panicked for %s: %v", input.desc, r)
+				}
+			}()
+
+			NewCacheOptimizedBloomFilter(input.elements, input.fpr)
+		})
+	}
+
+	// Verify we can still create valid filters after panics
+	bf := NewCacheOptimizedBloomFilter(1000, 0.01)
+	if bf == nil {
+		t.Fatal("Failed to create valid filter after panic tests")
+	}
+}
diff --git a/docs/examples/basic/example.go b/docs/examples/basic/example.go
index 5eaf7c1..ec2c2b7 100644
--- a/docs/examples/basic/example.go
+++ b/docs/examples/basic/example.go
@@ -57,23 +57,28 @@ func main() {
 	fmt.Printf("Cache lines used: %d\n", stats.CacheLineCount)
 	fmt.Printf("SIMD optimized: %t\n", stats.SIMDEnabled)
 
-	// Example 2: Batch operations (high-throughput)
-	fmt.Println("\nExample 2: Batch Operations")
-	fmt.Println("---------------------------")
+	// Example 2: Multiple operations (zero allocations for typical FPR)
+	fmt.Println("\nExample 2: Multiple Operations")
+	fmt.Println("-------------------------------")
 
 	filter2 := bf.NewCacheOptimizedBloomFilter(100000, 0.01)
 
-	// Batch add strings
+	// Add multiple strings (zero allocations when hashCount ≤ 16, which covers 99% of use cases)
+	// For very low FPR (e.g., 0.0000001) requiring hashCount > 16, heap allocation occurs
 	urls := []string{
 		"https://example.com/page1",
 		"https://example.com/page2",
 		"https://example.com/page3",
 	}
-	filter2.AddBatchString(urls)
+	for _, url := range urls {
+		filter2.AddString(url)
+	}
 
-	// Batch add uint64s
+	// Add multiple uint64s (zero allocations for typical configurations)
 	userIDs := []uint64{1001, 1002, 1003, 1004, 1005}
-	filter2.AddBatchUint64(userIDs)
+	for _, id := range userIDs {
+		filter2.AddUint64(id)
+	}
 
 	fmt.Printf("Contains 'https://example.com/page2': %t\n", filter2.ContainsString("https://example.com/page2"))
 	fmt.Printf("Contains user ID 1003: %t\n", filter2.ContainsUint64(1003))
diff --git a/internal/storage/storage.go b/internal/storage/storage.go
deleted file mode 100644
index e261ecf..0000000
--- a/internal/storage/storage.go
+++ /dev/null
@@ -1,255 +0,0 @@
-package storage
-
-import "sync"
-
-// OpDetail represents a bit operation within a cache line (word index and bit offset).
-type OpDetail struct {
-	WordIdx   uint64
-	BitOffset uint64
-}
-
-// SetDetail represents a set operation within a cache line (word index and bit offset).
-// This is used specifically for setBitCacheOptimized operations.
-type SetDetail struct {
-	WordIdx   uint64
-	BitOffset uint64
-}
-
-// OperationStorage holds temporary storage for a single operation.
-// This is pooled to avoid allocations and enable thread-safe concurrent operations.
-type OperationStorage struct {
-	UseArrayMode bool
-
-	// Array-based storage (for small filters)
-	ArrayOps    *[10000][]OpDetail
-	ArrayOpsSet *[10000][]SetDetail
-	ArrayMap    *[10000][]uint64
-
-	// Map-based storage (for large filters)
-	MapOps    map[uint64][]OpDetail
-	MapOpsSet map[uint64][]SetDetail
-	MapMap    map[uint64][]uint64
-
-	// Track which indices are in use
-	UsedIndicesGet  []uint64
-	UsedIndicesSet  []uint64
-	UsedIndicesHash []uint64
-}
-
-// Clear resets all operation storage to empty state, reusing allocated memory
-// This allows reusing the same OperationStorage across multiple operations
-// without returning it to the pool
-func (os *OperationStorage) Clear() {
-	if os.UseArrayMode {
-		// Clear only used indices
-		for _, idx := range os.UsedIndicesGet {
-			os.ArrayOps[idx] = os.ArrayOps[idx][:0]
-		}
-		for _, idx := range os.UsedIndicesSet {
-			os.ArrayOpsSet[idx] = os.ArrayOpsSet[idx][:0]
-		}
-		for _, idx := range os.UsedIndicesHash {
-			os.ArrayMap[idx] = os.ArrayMap[idx][:0]
-		}
-	} else {
-		// Clear maps
-		clear(os.MapOps)
-		clear(os.MapOpsSet)
-		clear(os.MapMap)
-	}
-
-	// Reset used indices
-	os.UsedIndicesGet = os.UsedIndicesGet[:0]
-	os.UsedIndicesSet = os.UsedIndicesSet[:0]
-	os.UsedIndicesHash = os.UsedIndicesHash[:0]
-}
-
-// ClearGetMap clears the get operation map
-func (os *OperationStorage) ClearGetMap() {
-	if os.UseArrayMode {
-		for _, idx := range os.UsedIndicesGet {
-			os.ArrayOps[idx] = os.ArrayOps[idx][:0]
-		}
-		os.UsedIndicesGet = os.UsedIndicesGet[:0]
-	} else {
-		clear(os.MapOps)
-		os.UsedIndicesGet = os.UsedIndicesGet[:0]
-	}
-}
-
-// AddGetOperation adds a get operation for a given cache line
-func (os *OperationStorage) AddGetOperation(cacheLineIdx, WordIdx, BitOffset uint64) {
-	if os.UseArrayMode {
-		if len(os.ArrayOps[cacheLineIdx]) == 0 {
-			os.UsedIndicesGet = append(os.UsedIndicesGet, cacheLineIdx)
-		}
-		os.ArrayOps[cacheLineIdx] = append(os.ArrayOps[cacheLineIdx], OpDetail{
-			WordIdx: WordIdx, BitOffset: BitOffset,
-		})
-	} else {
-		if len(os.MapOps[cacheLineIdx]) == 0 {
-			os.UsedIndicesGet = append(os.UsedIndicesGet, cacheLineIdx)
-		}
-		os.MapOps[cacheLineIdx] = append(os.MapOps[cacheLineIdx], OpDetail{
-			WordIdx: WordIdx, BitOffset: BitOffset,
-		})
-	}
-}
-
-// GetGetOperations returns all get operations for a given cache line
-func (os *OperationStorage) GetGetOperations(cacheLineIdx uint64) []OpDetail {
-	if os.UseArrayMode {
-		return os.ArrayOps[cacheLineIdx]
-	}
-	return os.MapOps[cacheLineIdx]
-}
-
-// GetUsedGetIndices returns the list of cache line indices that have get operations
-func (os *OperationStorage) GetUsedGetIndices() []uint64 {
-	return os.UsedIndicesGet
-}
-
-// ClearSetMap clears the set operation map
-func (os *OperationStorage) ClearSetMap() {
-	if os.UseArrayMode {
-		for _, idx := range os.UsedIndicesSet {
-			os.ArrayOpsSet[idx] = os.ArrayOpsSet[idx][:0]
-		}
-		os.UsedIndicesSet = os.UsedIndicesSet[:0]
-	} else {
-		clear(os.MapOpsSet)
-		os.UsedIndicesSet = os.UsedIndicesSet[:0]
-	}
-}
-
-// AddSetOperation adds a set operation for a given cache line
-func (os *OperationStorage) AddSetOperation(cacheLineIdx, WordIdx, BitOffset uint64) {
-	if os.UseArrayMode {
-		if len(os.ArrayOpsSet[cacheLineIdx]) == 0 {
-			os.UsedIndicesSet = append(os.UsedIndicesSet, cacheLineIdx)
-		}
-		os.ArrayOpsSet[cacheLineIdx] = append(os.ArrayOpsSet[cacheLineIdx], SetDetail{
-			WordIdx: WordIdx, BitOffset: BitOffset,
-		})
-	} else {
-		if len(os.MapOpsSet[cacheLineIdx]) == 0 {
-			os.UsedIndicesSet = append(os.UsedIndicesSet, cacheLineIdx)
-		}
-		os.MapOpsSet[cacheLineIdx] = append(os.MapOpsSet[cacheLineIdx], SetDetail{
-			WordIdx: WordIdx, BitOffset: BitOffset,
-		})
-	}
-}
-
-// GetSetOperations returns all set operations for a given cache line
-func (os *OperationStorage) GetSetOperations(cacheLineIdx uint64) []SetDetail {
-	if os.UseArrayMode {
-		return os.ArrayOpsSet[cacheLineIdx]
-	}
-	return os.MapOpsSet[cacheLineIdx]
-}
-
-// GetUsedSetIndices returns the list of cache line indices that have set operations
-func (os *OperationStorage) GetUsedSetIndices() []uint64 {
-	return os.UsedIndicesSet
-}
-
-// ClearHashMap clears the hash position map
-func (os *OperationStorage) ClearHashMap() {
-	if os.UseArrayMode {
-		for _, idx := range os.UsedIndicesHash {
-			os.ArrayMap[idx] = os.ArrayMap[idx][:0]
-		}
-		os.UsedIndicesHash = os.UsedIndicesHash[:0]
-	} else {
-		clear(os.MapMap)
-		os.UsedIndicesHash = os.UsedIndicesHash[:0]
-	}
-}
-
-// AddHashPosition adds a bit position to the hash map for a given cache line
-func (os *OperationStorage) AddHashPosition(cacheLineIdx uint64, bitPos uint64) {
-	if os.UseArrayMode {
-		if len(os.ArrayMap[cacheLineIdx]) == 0 {
-			os.UsedIndicesHash = append(os.UsedIndicesHash, cacheLineIdx)
-		}
-		os.ArrayMap[cacheLineIdx] = append(os.ArrayMap[cacheLineIdx], bitPos)
-	} else {
-		if len(os.MapMap[cacheLineIdx]) == 0 {
-			os.UsedIndicesHash = append(os.UsedIndicesHash, cacheLineIdx)
-		}
-		os.MapMap[cacheLineIdx] = append(os.MapMap[cacheLineIdx], bitPos)
-	}
-}
-
-// GetUsedHashIndices returns the list of cache line indices that have hash positions
-func (os *OperationStorage) GetUsedHashIndices() []uint64 {
-	return os.UsedIndicesHash
-}
-
-// Pool for operation storage - separate pools for array and map modes
-var (
-	arrayOpsPool = sync.Pool{
-		New: func() interface{} {
-			return &OperationStorage{
-				UseArrayMode:    true,
-				ArrayOps:        &[10000][]OpDetail{},
-				ArrayOpsSet:     &[10000][]SetDetail{},
-				ArrayMap:        &[10000][]uint64{},
-				UsedIndicesGet:  make([]uint64, 0, 8),
-				UsedIndicesSet:  make([]uint64, 0, 8),
-				UsedIndicesHash: make([]uint64, 0, 8),
-			}
-		},
-	}
-
-	mapOpsPool = sync.Pool{
-		New: func() interface{} {
-			return &OperationStorage{
-				UseArrayMode:    false,
-				MapOps:          make(map[uint64][]OpDetail, 32),
-				MapOpsSet:       make(map[uint64][]SetDetail, 32),
-				MapMap:          make(map[uint64][]uint64, 32),
-				UsedIndicesGet:  make([]uint64, 0, 32),
-				UsedIndicesSet:  make([]uint64, 0, 32),
-				UsedIndicesHash: make([]uint64, 0, 32),
-			}
-		},
-	}
-)
-
-// GetOperationStorage retrieves an operation storage from the pool
-// Objects from pool are already clean (either new or cleared on Put)
-func GetOperationStorage(useArrayMode bool) *OperationStorage {
-	if useArrayMode {
-		return arrayOpsPool.Get().(*OperationStorage)
-	}
-	return mapOpsPool.Get().(*OperationStorage)
-}
-
-// PutOperationStorage returns an operation storage to the pool after clearing it
-func PutOperationStorage(ops *OperationStorage) {
-	// Clear before returning to pool to ensure next Get receives clean object
-	ops.Clear()
-
-	if ops.UseArrayMode {
-		arrayOpsPool.Put(ops)
-	} else {
-		mapOpsPool.Put(ops)
-	}
-}
-
-// Mode handles the hybrid array/map storage configuration.
-// With sync.Pool, this just tracks the mode setting, not the actual storage.
-type Mode struct {
-	UseArrayMode bool
-}
-
-// New creates a new storage mode instance based on the cache line count.
-func New(cacheLineCount uint64, hashCount uint32, arrayModeThreshold uint64) *Mode {
-	useArrayMode := cacheLineCount <= arrayModeThreshold
-
-	return &Mode{
-		UseArrayMode: useArrayMode,
-	}
-}
diff --git a/internal/storage/storage_test.go b/internal/storage/storage_test.go
deleted file mode 100644
index 425526d..0000000
--- a/internal/storage/storage_test.go
+++ /dev/null
@@ -1,308 +0,0 @@
-package storage
-
-import (
-	"testing"
-)
-
-// TestNewArrayMode verifies array mode initialization
-func TestNewArrayMode(t *testing.T) {
-	// Test array mode (below threshold)
-	s := New(5000, 10, 10000)
-
-	if !s.UseArrayMode {
-		t.Errorf("Expected array mode for 5000 cache lines (threshold: 10000)")
-	}
-}
-
-// TestNewMapMode verifies map mode initialization
-func TestNewMapMode(t *testing.T) {
-	// Test map mode (above threshold)
-	s := New(15000, 10, 10000)
-
-	if s.UseArrayMode {
-		t.Errorf("Expected map mode for 15000 cache lines (threshold: 10000)")
-	}
-}
-
-// TestThresholdBoundary verifies behavior at the threshold boundary
-func TestThresholdBoundary(t *testing.T) {
-	threshold := uint64(10000)
-
-	tests := []struct {
-		name        string
-		cacheLines  uint64
-		expectArray bool
-	}{
-		{"Just below threshold", threshold - 1, true},
-		{"At threshold", threshold, true},
-		{"Just above threshold", threshold + 1, false},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			s := New(tt.cacheLines, 10, threshold)
-			if s.UseArrayMode != tt.expectArray {
-				t.Errorf("%s: expected array mode=%v, got=%v",
-					tt.name, tt.expectArray, s.UseArrayMode)
-			}
-		})
-	}
-}
-
-// TestOperationStoragePool tests the sync.Pool functionality
-func TestOperationStoragePool(t *testing.T) {
-	// Test array mode pool
-	ops1 := GetOperationStorage(true)
-	if !ops1.UseArrayMode {
-		t.Error("Array mode operation storage should have UseArrayMode=true")
-	}
-	if ops1.ArrayOps == nil {
-		t.Error("Array mode operation storage should have ArrayOps initialized")
-	}
-	PutOperationStorage(ops1)
-
-	// Test map mode pool
-	ops2 := GetOperationStorage(false)
-	if ops2.UseArrayMode {
-		t.Error("Map mode operation storage should have UseArrayMode=false")
-	}
-	if ops2.MapOps == nil {
-		t.Error("Map mode operation storage should have MapOps initialized")
-	}
-	PutOperationStorage(ops2)
-
-	// Test that pool reuses objects
-	ops3 := GetOperationStorage(true)
-	if ops3 == nil {
-		t.Error("Pool should return valid operation storage")
-	}
-	PutOperationStorage(ops3)
-}
-
-// TestGetOperations tests getting operations through the API
-func TestGetOperations(t *testing.T) {
-	modes := []bool{true, false} // array mode, map mode
-
-	for _, useArrayMode := range modes {
-		modeName := "Map mode"
-		if useArrayMode {
-			modeName = "Array mode"
-		}
-
-		t.Run(modeName, func(t *testing.T) {
-			ops := GetOperationStorage(useArrayMode)
-			defer PutOperationStorage(ops)
-
-			// Add operations
-			ops.AddGetOperation(42, 1, 5)
-			ops.AddGetOperation(42, 2, 10)
-
-			// Retrieve operations
-			operations := ops.GetGetOperations(42)
-
-			if len(operations) != 2 {
-				t.Errorf("Expected 2 operations, got %d", len(operations))
-			}
-
-			// Verify operation details
-			if operations[0].WordIdx != 1 || operations[0].BitOffset != 5 {
-				t.Errorf("First operation incorrect: got WordIdx=%d, BitOffset=%d",
-					operations[0].WordIdx, operations[0].BitOffset)
-			}
-
-			if operations[1].WordIdx != 2 || operations[1].BitOffset != 10 {
-				t.Errorf("Second operation incorrect: got WordIdx=%d, BitOffset=%d",
-					operations[1].WordIdx, operations[1].BitOffset)
-			}
-		})
-	}
-}
-
-// TestClearGetMap tests clearing get operations
-func TestClearGetMap(t *testing.T) {
-	modes := []bool{true, false}
-
-	for _, useArrayMode := range modes {
-		modeName := "Map mode"
-		if useArrayMode {
-			modeName = "Array mode"
-		}
-
-		t.Run(modeName, func(t *testing.T) {
-			ops := GetOperationStorage(useArrayMode)
-			defer PutOperationStorage(ops)
-
-			// Add some data
-			ops.AddGetOperation(10, 1, 2)
-			ops.AddGetOperation(20, 3, 4)
-
-			// Verify data was added
-			ops10 := ops.GetGetOperations(10)
-			if len(ops10) != 1 {
-				t.Errorf("Expected 1 operation at index 10, got %d", len(ops10))
-			}
-
-			ops.ClearGetMap()
-
-			// Verify cleared
-			ops10After := ops.GetGetOperations(10)
-			if len(ops10After) != 0 {
-				t.Error("GetOperations[10] should be cleared")
-			}
-			if len(ops.UsedIndicesGet) != 0 {
-				t.Error("UsedIndicesGet should be cleared")
-			}
-		})
-	}
-}
-
-// TestMultipleOperations tests multiple operations in both modes
-func TestMultipleOperations(t *testing.T) {
-	modes := []bool{true, false}
-
-	for _, useArrayMode := range modes {
-		modeName := "Map mode"
-		if useArrayMode {
-			modeName = "Array mode"
-		}
-
-		t.Run(modeName, func(t *testing.T) {
-			ops := GetOperationStorage(useArrayMode)
-			defer PutOperationStorage(ops)
-
-			// Add multiple operations
-			for i := uint64(0); i < 100; i++ {
-				ops.AddGetOperation(i, i, i%64)
-			}
-
-			// Verify all operations exist
-			for i := uint64(0); i < 100; i++ {
-				operations := ops.GetGetOperations(i)
-				if len(operations) != 1 {
-					t.Errorf("Cache line %d: expected 1 op, got %d", i, len(operations))
-				}
-			}
-
-			// Clear and verify
-			ops.ClearGetMap()
-
-			for i := uint64(0); i < 100; i++ {
-				operations := ops.GetGetOperations(i)
-				if len(operations) != 0 {
-					t.Errorf("After clear, cache line %d should have 0 ops, got %d", i, len(operations))
-				}
-			}
-		})
-	}
-}
-
-// TestAddHashPosition tests hash position tracking
-func TestAddHashPosition(t *testing.T) {
-	modes := []bool{true, false}
-
-	for _, useArrayMode := range modes {
-		modeName := "Map mode"
-		if useArrayMode {
-			modeName = "Array mode"
-		}
-
-		t.Run(modeName, func(t *testing.T) {
-			ops := GetOperationStorage(useArrayMode)
-			defer PutOperationStorage(ops)
-
-			// Add hash positions
-			ops.AddHashPosition(42, 100)
-			ops.AddHashPosition(42, 200)
-			ops.AddHashPosition(43, 300)
-
-			// Verify used indices
-			usedIndices := ops.GetUsedHashIndices()
-			if len(usedIndices) == 0 {
-				t.Error("Expected used hash indices to be tracked")
-			}
-
-			// Clear and verify
-			ops.ClearHashMap()
-			usedIndicesAfter := ops.GetUsedHashIndices()
-			if len(usedIndicesAfter) != 0 {
-				t.Error("Used hash indices should be cleared")
-			}
-		})
-	}
-}
-
-// TestSetOperations tests set operation tracking
-func TestSetOperations(t *testing.T) {
-	modes := []bool{true, false}
-
-	for _, useArrayMode := range modes {
-		modeName := "Map mode"
-		if useArrayMode {
-			modeName = "Array mode"
-		}
-
-		t.Run(modeName, func(t *testing.T) {
-			ops := GetOperationStorage(useArrayMode)
-			defer PutOperationStorage(ops)
-
-			// Add set operations
-			ops.AddSetOperation(10, 1, 5)
-			ops.AddSetOperation(10, 2, 10)
-			ops.AddSetOperation(20, 3, 15)
-
-			// Verify operations were added
-			ops10 := ops.GetSetOperations(10)
-			if len(ops10) != 2 {
-				t.Errorf("Expected 2 set operations at index 10, got %d", len(ops10))
-			}
-
-			ops20 := ops.GetSetOperations(20)
-			if len(ops20) != 1 {
-				t.Errorf("Expected 1 set operation at index 20, got %d", len(ops20))
-			}
-
-			// Verify used indices
-			usedIndices := ops.GetUsedSetIndices()
-			if len(usedIndices) == 0 {
-				t.Error("Expected used set indices to be tracked")
-			}
-
-			// Clear and verify
-			ops.ClearSetMap()
-			ops10After := ops.GetSetOperations(10)
-			if len(ops10After) != 0 {
-				t.Error("Set operations should be cleared")
-			}
-		})
-	}
-}
-
-// TestConcurrentPoolAccess tests that the pool is safe for concurrent access
-func TestConcurrentPoolAccess(t *testing.T) {
-	const numGoroutines = 100
-	const numOperationsPerGoroutine = 1000
-
-	done := make(chan bool, numGoroutines)
-
-	for i := 0; i < numGoroutines; i++ {
-		go func() {
-			for j := 0; j < numOperationsPerGoroutine; j++ {
-				// Alternate between array and map mode
-				useArrayMode := j%2 == 0
-				ops := GetOperationStorage(useArrayMode)
-
-				// Do some operations
-				ops.AddGetOperation(uint64(j), uint64(j), uint64(j%64))
-				_ = ops.GetUsedGetIndices()
-
-				PutOperationStorage(ops)
-			}
-			done <- true
-		}()
-	}
-
-	// Wait for all goroutines to complete
-	for i := 0; i < numGoroutines; i++ {
-		<-done
-	}
-}
diff --git a/tests/README.md b/tests/README.md
deleted file mode 100644
index 1308596..0000000
--- a/tests/README.md
+++ /dev/null
@@ -1,117 +0,0 @@
-# Tests Directory
-
-This directory contains special test files that require specific build tags or conditions.
-
-## Directory Structure
-
-```
-tests/
-└── integration/
-    └── simd_comparison_test.go  # SIMD performance comparison tests
-```
-
-## Integration Tests
-
-### SIMD Comparison Tests
-
-**File:** `integration/simd_comparison_test.go`
-
-**Purpose:** Comprehensive SIMD performance and correctness validation tests.
-
-**Build Tag:** `simd_comparison`
-
-These tests compare SIMD implementations against scalar fallback implementations across various data sizes and operations. They are separated because:
-
-1. **Resource Intensive:** Run extensive performance benchmarks
-2. **Special Build Tag:** Require `-tags=simd_comparison` to run
-3. **Release Validation:** Used to verify SIMD performance before releases
-4. **Not Run by Default:** Excluded from normal `go test ./...` runs
-
-**Running these tests:**
-
-```bash
-# Run SIMD performance comparison tests
-go test -tags=simd_comparison -v ./tests/integration
-
-# Run specific test
-go test -tags=simd_comparison -v ./tests/integration -run=TestSIMDPerformanceImprovement
-
-# Run benchmarks
-go test -tags=simd_comparison -bench=BenchmarkSIMDvsScalar ./tests/integration
-```
-
-**Test Coverage:**
-- `BenchmarkSIMDvsScalar` - Compares SIMD vs fallback for PopCount, VectorOr, VectorAnd, VectorClear
-- `TestSIMDPerformanceImprovement` - Validates SIMD speedup meets minimum thresholds
-- `TestSIMDCorrectness` - Ensures SIMD produces identical results to fallback
-- `BenchmarkBloomFilterWithSIMD` - Full Bloom filter benchmarks with SIMD
-
-## Regular Unit Tests
-
-Regular unit tests remain in the root package directory:
-
-```
-bloomfilter_test.go              # Core Bloom filter functionality tests
-benchmark_test.go                # Performance benchmarks for main package
-storage_mode_test.go             # Array vs Map storage mode selection tests
-storage_mode_benchmark_test.go   # Storage mode performance benchmarks
-simd_test.go                     # SIMD capability detection tests
-```
-
-These follow Go conventions and are run with standard `go test ./...`.
-
-## Running Tests
-
-**All standard tests:**
-```bash
-go test -v ./...
-```
-
-**With benchmarks:**
-```bash
-go test -bench=. -benchmem ./...
-```
-
-**Include integration tests:**
-```bash
-# Run all tests including integration
-go test -tags=simd_comparison -v ./...
-
-# Run only integration tests
-go test -tags=simd_comparison -v ./tests/integration
-```
-
-**Automated benchmark suite:**
-```bash
-bash scripts/benchmark.sh
-```
-
-This creates a timestamped results folder with all benchmark outputs, CPU profiles, and analysis files.
-
-## CI/CD Integration
-
-The integration tests are automatically run in CI/CD workflows:
-
-- **Pull Requests:** Regular tests only
-- **Pre-Release:** Includes SIMD comparison tests
-- **Release:** Full validation including performance thresholds
-
-See `.github/workflows/` for workflow configurations.
-
-## Best Practices
-
-1. **Use build tags** for tests that are resource-intensive or require special conditions
-2. **Keep unit tests** in the same directory as the code they test
-3. **Separate integration tests** that test cross-package functionality or performance
-4. **Document requirements** for running special tests (build tags, environment, etc.)
-5. **Automate** with scripts and CI/CD for consistency
-
-## Adding New Tests
-
-**Regular unit tests:** Add to appropriate `*_test.go` file in root package
-
-**Integration tests:** Add to `tests/integration/` with appropriate build tags
-
-**Benchmarks:** Add to `*_benchmark_test.go` files, use consistent naming
-
-For questions or issues, see [BENCHMARK_WORKFLOW.md](../scripts/BENCHMARK_WORKFLOW.md).
diff --git a/tests/TEST_COVERAGE_SUMMARY.md b/tests/TEST_COVERAGE_SUMMARY.md
index 67a64df..9e52076 100644
--- a/tests/TEST_COVERAGE_SUMMARY.md
+++ b/tests/TEST_COVERAGE_SUMMARY.md
@@ -2,13 +2,22 @@
 
 ## Overview
 
-Comprehensive test suite covering unit tests, integration tests, stress tests, edge cases, and concurrency validation.
+Comprehensive test suite covering unit tests, integration tests, edge cases, and concurrency validation for the **simplified atomic implementation**.
 
-## Test Files Created/Enhanced
+## Architecture Changes
+
+**Simplified Atomic Implementation** (v0.4.0):
+- Removed `internal/storage` package (no more hybrid array/map modes)
+- Removed sync.Pool complexity
+- Direct cache-line array with atomic operations
+- Stack-based buffers for zero allocations
+- ~400 lines of clean code
+
+## Test Files
 
 ### 1. Unit Tests
 
-#### `internal/hash/hash_test.go` (NEW)
+#### `internal/hash/hash_test.go`
 **Coverage: 100%**
 
 - **230+ test cases** covering both `Optimized1` and `Optimized2` hash functions
@@ -21,87 +30,88 @@ Comprehensive test suite covering unit tests, integration tests, stress tests, e
   - Boundary conditions (7, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128 bytes)
   - Edge cases (empty input, all zeros, all 0xFF, repeating patterns)
 
-### 2. Integration Tests
-
-#### `tests/integration/bloomfilter_stress_test.go` (REFACTORED)
-**Large-scale stress testing and performance validation**
-
-##### Large Dataset Tests
-- `TestLargeDatasetInsertion`
-  - Tests: 1M, 5M, 10M element insertions
-  - Metrics: insertion rate, memory usage, verification rate
-  - Verifies: all elements found, load factor, estimated FPP
-  - Skip with `-short` flag for quick test runs
+### 2. Root Package Tests
 
-##### Performance Tests
-- `TestHighThroughputSequential` - 1M sequential operations, measures insert/lookup rates
-- `TestMemoryFootprintGrowth` - Tests memory usage across 10K to 10M element filters
-- `TestLongRunningStability` - 10 cycles of add/verify operations
+#### `bloomfilter_test.go`
+**Core functionality tests**
 
-##### Edge Cases
-- `TestExtremeEdgeCases`
-  - Very small filters (overloaded 10x capacity)
-  - Very long strings (1KB, 10KB, 100KB)
-  - Empty and nil inputs
-  - Extreme FPR values (0.0001 to 0.5)
+- Basic Add/Contains operations
+- String and Uint64 specialized operations
+- Clear operation
+- PopCount (SIMD optimized)
+- Union operation (SIMD optimized)
+- Intersection operation (SIMD optimized)
+- Cache statistics
+- False positive rate validation
 
-#### `tests/integration/bloomfilter_concurrent_test.go` (NEW)
-**Thread-safety validation (currently skipped due to known issues)**
+#### `bloomfilter_simd_test.go`
+**SIMD capability detection**
 
-- `TestConcurrentReads` - 100 goroutines × 1000 reads each
-- `TestConcurrentWrites` - 50 goroutines × 1000 writes each
-- `TestMixedConcurrentOperations` - 25 readers + 25 writers simultaneously
+- Runtime SIMD detection (AVX2, AVX512, NEON)
+- SIMD function execution
+- Cache statistics with SIMD info
 
-**IMPORTANT FINDING:** Concurrent read test discovered a nil pointer dereference in concurrent access scenarios, indicating thread-safety issue in the storage layer. All concurrent tests currently skip with documented reason.
+### 3. Integration Tests
 
-#### `tests/integration/bloomfilter_edge_cases_test.go` (NEW)
-**Boundary conditions and edge case validation**
+#### `tests/integration/bloomfilter_edge_cases_test.go` (REWRITTEN)
+**Comprehensive edge case testing for simplified implementation**
 
 ##### Boundary Tests
 - `TestBoundaryConditions`
-  - Exact ArrayModeThreshold boundary
-  - Cache line alignment (1, 63, 64, 65, 511, 512, 513, 1023, 1024, 1025 elements)
-  - Bit and byte alignment (1-byte to 128-byte data sizes)
-
-##### Hash Quality Tests
-- `TestHashDistribution` - Validates hash distribution quality vs theoretical expectation
-- `TestCollisionResistance` - Tests known collision-prone patterns
-  - Sequential patterns
-  - Repeating patterns (0xAA, 0x55, 0xFF)
-  - Shifted patterns
-  - Palindromes
+  - Small filters (10K elements)
+  - Large filters (1M elements)
+  - Verifies correctness across all sizes
+
+##### Size Tests
+- `TestExtremelySmallFilter`
+  - Single element filters
+  - Ten element filters
+  - Hundred element filters with low FPR
 
 ##### FPR Tests
 - `TestExtremeFalsePositiveRates`
-  - Very low FPR (0.00001)
-  - Low FPR (0.0001)
-  - Normal FPR (0.01)
-  - High FPR (0.1, 0.5)
-  - Measures actual vs expected FPR
-
-##### Edge Cases
-- `TestZeroAndMinimalCases`
-  - Zero uint64
-  - Empty string vs nil slice
-  - Single-bit patterns (all 8 single-bit values)
-
-##### Memory Behavior
-- `TestMemoryBehavior`
-  - Multiple clear cycles (100 cycles × 100 elements)
-  - Overload beyond capacity (10x elements)
-
-##### Unicode & Special Characters
-- `TestUnicodeAndSpecialCharacters`
-  - Chinese, Russian, Arabic, Hebrew, Japanese
-  - Emojis
-  - Control characters
-  - Null bytes
-  - Invalid UTF-8
-
-#### `tests/integration/bloomfilter_race_test.go` (NEW)
-**Race condition detection tests (requires `-race` flag and CGO)**
-
-Build tag: `// +build race`
+  - Very low FPR (0.000001 - 0.0001%)
+  - Low FPR (0.001 - 0.1%)
+  - Medium FPR (0.01 - 1%)
+  - High FPR (0.1 - 10%)
+  - Validates actual vs expected FPR
+
+##### Hash Count Tests
+- `TestMaximumHashCount`
+  - Very low FPR resulting in high hash count
+  - Validates stack buffer fallback to heap allocation
+
+##### Invalid Input Tests
+- `TestZeroAndNegativeInputs`
+  - Zero expected elements
+  - Invalid FPR (> 1.0, negative, NaN)
+  - Documents behavior for invalid inputs
+
+##### Empty Data Tests
+- `TestEmptyData`
+  - Empty byte slices
+  - Empty strings
+  - Zero uint64 values
+
+##### Large Scale Tests
+- `TestVeryLargeElements`
+  - 10M element filters
+  - Memory usage tracking
+  - Sample verification
+
+#### `tests/integration/bloomfilter_concurrent_test.go`
+**Thread-safety validation with atomic operations**
+
+- `TestConcurrentReads` - 100 goroutines × 1000 reads each
+- `TestConcurrentWrites` - 50 goroutines × 1000 writes each
+- `TestMixedConcurrentOperations` - 25 readers + 25 writers simultaneously
+
+**Results:** All tests pass - thread-safe with lock-free atomic operations!
+
+#### `tests/integration/bloomfilter_race_test.go`
+**Race condition detection tests (requires `-race` flag)**
+
+Build tag: `//go:build race`
 
 Tests concurrent operations to detect data races:
 - `TestRaceConcurrentAdds` - Concurrent write operations
@@ -113,27 +123,34 @@ Tests concurrent operations to detect data races:
 - `TestRaceIntersection` - Concurrent intersection operations
 - `TestRaceGetCacheStats` - Concurrent stats reading
 - `TestRaceMultipleOperations` - Various operations concurrently
-- `TestRaceArrayVsMapMode` - Race tests for both storage modes
 
-**Run with:** `go test -race ./tests/integration` (requires CGO_ENABLED=1)
+**Run with:** `go test -race ./tests/integration`
+
+#### `tests/integration/bloomfilter_simd_comparison_test.go`
+**SIMD vs fallback comparison**
 
-### 3. Existing Tests
+- Validates SIMD correctness vs scalar fallback
+- Performance comparison tests
 
-#### Root Package Tests
-- `bloomfilter_test.go` - Core functionality (89.6% coverage)
-- `bloomfilter_simd_test.go` - SIMD capability detection
+### 4. Benchmark Tests
 
-#### Integration Tests
-- `tests/integration/bloomfilter_storage_mode_test.go` - Hybrid storage mode validation
-- `tests/integration/bloomfilter_simd_comparison_test.go` - SIMD vs fallback comparison
+#### `tests/benchmark/bloomfilter_benchmark_test.go`
+**Comprehensive performance benchmarks**
 
-#### Benchmark Tests
-- `tests/benchmark/bloomfilter_benchmark_test.go` - Performance benchmarks
-- `tests/benchmark/bloomfilter_storage_mode_benchmark_test.go` - Storage mode benchmarks
+- `BenchmarkCachePerformance` - Cache efficiency across different sizes
+- `BenchmarkInsertion` - Insertion throughput with memory analysis
+- `BenchmarkLookup` - Lookup throughput with accuracy metrics
+- `BenchmarkFalsePositives` - FPP accuracy testing
+- `BenchmarkComprehensive` - Complete performance profile
+
+**Results:**
+- 18.6M insertions/sec
+- 35.8M lookups/sec
+- Zero allocations on hot path
 
 ## Test Execution
 
-### Quick Tests (Excludes Large Datasets)
+### Quick Tests (Recommended for Development)
 ```bash
 go test -short ./...
 ```
@@ -143,29 +160,26 @@ go test -short ./...
 go test -v ./...
 ```
 
-### Large Dataset Tests Only
+### Benchmarks
 ```bash
-go test -v ./tests/integration -run="TestLargeDatasetInsertion" -timeout=300s
+go test -bench=. -benchmem ./tests/benchmark/...
 ```
 
-### Concurrency Tests
+### Race Detection
 ```bash
-go test -v ./tests/integration -run="Concurrent"
+go test -race -v ./...
 ```
 
-### Race Detection (Requires CGO)
+### Specific Tests
 ```bash
-CGO_ENABLED=1 go test -race ./tests/integration
-```
+# Edge cases
+go test -v ./tests/integration -run=TestBoundaryConditions
 
-### Edge Cases
-```bash
-go test -v ./tests/integration -run="TestBoundaryConditions|TestHashDistribution|TestExtreme"
-```
+# Concurrency
+go test -v ./tests/integration -run=Concurrent
 
-### Specific Test
-```bash
-go test -v ./tests/integration -run=TestHashDistribution
+# Hash functions
+go test -v ./internal/hash -run=TestOptimized
 ```
 
 ## Coverage Summary
@@ -173,107 +187,133 @@ go test -v ./tests/integration -run=TestHashDistribution
 | Package | Coverage | Notes |
 |---------|----------|-------|
 | `internal/hash` | **100.0%** | Full coverage of both hash functions |
-| Root package | **89.6%** | Core bloom filter operations |
-| `internal/storage` | **98.3%** | Hybrid storage mode |
+| Root package | **~90%** | Core bloom filter operations |
 | `internal/simd` | **0.0%** | Assembly code (tested via integration) |
 | `internal/simd/amd64` | **0.0%** | Assembly wrappers (tested via integration) |
 | `internal/simd/arm64` | **0.0%** | Assembly wrappers (tested via integration) |
 
-**Note:** SIMD packages show 0% coverage because they contain assembly code and thin wrappers. They are thoroughly tested via integration tests.
+**Note:** SIMD packages show 0% coverage because they contain assembly code. They are thoroughly tested via integration tests and benchmarks.
 
 ## Test Categories
 
 ### ✅ Fully Covered
 1. Hash function correctness and performance
-2. Basic bloom filter operations
-3. Storage mode selection (array vs map)
-4. SIMD operations correctness
-5. Set operations (union, intersection, clear)
-6. False positive rate validation
-7. Edge cases and boundary conditions
-8. Hash distribution quality
-9. Unicode and special character handling
-10. Memory behavior under stress
-
-### ⚠️ Known Issues Discovered
-1. **Thread Safety**: Concurrent read test discovered nil pointer dereference
-   - Location: `internal/storage/storage.go:174`
-   - Symptom: `AddGetOperation` panics with nil pointer in concurrent scenarios
-   - **Action Required**: Add proper synchronization to storage layer
-   - **Documentation**: See `THREAD_SAFETY_ANALYSIS.md` for detailed analysis
-
-### 🔍 Additional Tests Recommended
-1. **Serialization/Persistence** - Save/load filter state
-2. **Cross-platform Compatibility** - Endianness testing
-3. **Benchmark Regression** - Automated performance tracking
-4. **Fuzz Testing** - Random input generation
-5. **Memory Leak Detection** - Long-running stability with memory profiling
+2. Basic bloom filter operations (Add, Contains, AddString, ContainsString, AddUint64, ContainsUint64)
+3. SIMD operations correctness (PopCount, Union, Intersection, Clear)
+4. Set operations validation
+5. False positive rate accuracy
+6. Edge cases and boundary conditions
+7. Hash distribution quality
+8. Thread-safety with concurrent operations
+9. Memory behavior and zero-allocation guarantee
+10. Large-scale performance (10M+ elements)
+
+### ✅ Verified Features
+1. **Zero Allocations**: Stack buffers for hashCount ≤ 16 (99% of use cases)
+2. **Thread-Safety**: Lock-free atomic CAS operations
+3. **SIMD Acceleration**: 2-4x speedup on bulk operations
+4. **Performance**: 26 ns/op Add, 23 ns/op Contains
+5. **Scalability**: Works efficiently from 10 elements to 10M+ elements
+
+### 🎯 Key Improvements Over Previous Version
+1. **Simplified Architecture**: Removed 200+ lines of sync.Pool complexity
+2. **Better Performance**: 15-26x faster than pool version
+3. **Zero Allocations**: vs millions of allocations in pool version
+4. **No Race Conditions**: Eliminated all pool-related race conditions
+5. **Predictable Behavior**: No pool warmup, consistent performance
 
 ## Key Metrics from Tests
 
+### Performance
+- **Add operation**: 26 ns/op (0 B/op, 0 allocs/op)
+- **Contains operation**: 23 ns/op (0 B/op, 0 allocs/op)
+- **AddUint64 operation**: 20 ns/op (0 B/op, 0 allocs/op)
+- **Throughput**: 18.6M inserts/sec, 35.8M lookups/sec
+
 ### Hash Distribution
 - Deviation from expected: **< 0.5%** (excellent)
 - Collision resistance: All collision-prone patterns handled correctly
 
-### Large Dataset Performance
-- 10M elements insertion: **~300-500K ops/sec** (varies by system)
-- Memory efficient: MAP mode minimal overhead
-- Verification: All elements found
-
 ### False Positive Rates
-- Actual FPR typically within **2-3x** of target
-- Overloaded filters degrade gracefully
-- No false negatives observed
+- Actual FPP typically within **2-3x** of target
+- Load factor ~46.5% at capacity
+- Estimated FPP accurate to actual measurement
 
 ### Concurrency
-- Successfully tested up to **100 concurrent goroutines**
-- **Issue found**: Nil pointer in concurrent reads (needs fix)
+- Successfully tested with **100+ concurrent goroutines**
+- **Zero race conditions** with atomic operations
+- Thread-safe by design, no external locks needed
+
+### Memory
+- **Zero allocations** on hot path (Add, Contains)
+- Perfect cache-line alignment (0 byte offset)
+- Predictable memory usage
 
 ## Running Full Test Suite
 
 ```bash
-# Quick sanity check
+# Quick sanity check (skip long-running tests)
 go test -short ./...
 
-# Full suite (excludes long-running tests)
-go test ./...
-
-# Full suite with verbose output
+# Full suite
 go test -v ./...
 
-# Include large dataset tests (may take several minutes)
-go test -v ./... -timeout=600s
+# With benchmarks
+go test -v -bench=. -benchmem ./...
 
 # With coverage report
 go test -cover ./...
 
-# Detailed coverage
+# Detailed coverage HTML report
 go test -coverprofile=coverage.out ./... && go tool cover -html=coverage.out
 
-# Race detection (if CGO available)
-CGO_ENABLED=1 go test -race ./...
+# Race detection (recommended for thread-safety validation)
+go test -race -v ./...
+
+# Escape analysis verification
+go build -gcflags='-m' 2>&1 | grep -E "escape|stack"
 ```
 
 ## Test Maintenance Notes
 
 1. **Long-running tests** are skipped with `-short` flag for CI/CD
-2. **Race tests** require CGO and may not run on all platforms
-3. **Large dataset tests** have 5-10 minute timeout, adjust as needed
-4. **Concurrent tests** are currently skipped due to known thread-safety issues
+2. **Race tests** run automatically with `-race` flag
+3. **Benchmarks** should be run periodically to detect performance regressions
+4. **Edge case tests** validate behavior for extreme inputs
+
+## Removed Tests (No Longer Applicable)
+
+The following test files were removed as they tested features that no longer exist:
+
+1. **`tests/integration/bloomfilter_storage_mode_test.go`** - Tested array/map mode switching
+2. **`tests/benchmark/bloomfilter_storage_mode_benchmark_test.go`** - Benchmarked storage modes
+3. **`tests/integration/bloomfilter_stress_test.go`** - Heavily dependent on storage modes
+4. **`internal/storage/storage_test.go`** - Storage package no longer exists
 
 ## Future Test Additions
 
-Based on the comprehensive test suite added, these areas could benefit from additional coverage:
+Potential areas for additional testing:
+
+1. **Serialization** - Binary format save/load (if needed)
+2. **Migration** - Upgrading between versions (if applicable)
+3. **Platform-Specific** - ARM64 NEON validation on actual ARM hardware
+4. **Performance Regression** - Automated benchmark tracking in CI
+5. **Property-Based Testing** - Using `testing/quick` for randomized inputs
+6. **Fuzz Testing** - Automated fuzzing for edge case discovery
+
+## Comparison vs Previous Version
 
-1. **Serialization** - Binary format save/load
-2. **Migration** - Upgrading between versions
-3. **Error Recovery** - Handling corrupted data
-4. **Platform-Specific** - ARM64 NEON validation on actual ARM hardware
-5. **Performance Regression** - Automated benchmark tracking
-6. **Property-Based Testing** - Using `testing/quick` or similar
-7. **Integration with Real Workloads** - Database-like usage patterns
+| Aspect | Simplified Atomic | Thread-Safe Pool |
+|--------|------------------|------------------|
+| **Test Complexity** | Simpler | Complex (pool lifecycle) |
+| **Race Conditions** | None | Required careful testing |
+| **Performance Tests** | 26 ns/op | 400 ns/op |
+| **Memory Tests** | 0 allocs | Millions of allocs |
+| **Thread Safety** | Built-in | Requires pool management |
+| **Test Maintenance** | Easy | Complex |
 
 ---
 
-*Last Updated: 2025-11-01*
-*Test Suite Version: 2.0*
+*Last Updated: 2025-11-02*
+*Test Suite Version: 3.0 (Simplified Atomic)*
+*Implementation: Zero-allocation, lock-free atomic operations*
diff --git a/tests/benchmark/bloomfilter_storage_mode_benchmark_test.go b/tests/benchmark/bloomfilter_storage_mode_benchmark_test.go
deleted file mode 100644
index 8c6c8f6..0000000
--- a/tests/benchmark/bloomfilter_storage_mode_benchmark_test.go
+++ /dev/null
@@ -1,178 +0,0 @@
-package bloomfilter_test
-
-import (
-	"fmt"
-	"testing"
-	"unsafe"
-
-	bloomfilter "github.com/shaia/BloomFilter"
-)
-
-// Benchmark array mode vs map mode for different filter sizes
-func BenchmarkHybridModes(b *testing.B) {
-	benchmarks := []struct {
-		name     string
-		elements uint64
-		fpr      float64
-	}{
-		{"Small_1K_Array", 1_000, 0.01},
-		{"Small_10K_Array", 10_000, 0.01},
-		{"Medium_100K_Array", 100_000, 0.01},
-		{"Large_1M_Map", 1_000_000, 0.01},
-		{"Large_10M_Map", 10_000_000, 0.01},
-	}
-
-	for _, bm := range benchmarks {
-		b.Run(bm.name+"_Add", func(b *testing.B) {
-			bf := bloomfilter.NewCacheOptimizedBloomFilter(bm.elements, bm.fpr)
-
-			data := make([]byte, 8)
-			b.ResetTimer()
-			b.ReportAllocs()
-
-			for i := 0; i < b.N; i++ {
-				*(*uint64)(unsafe.Pointer(&data[0])) = uint64(i)
-				bf.Add(data)
-			}
-
-			stats := bf.GetCacheStats()
-			b.ReportMetric(float64(stats.CacheLineCount), "cache_lines")
-			b.ReportMetric(float64(stats.BitCount)/8/1024/1024, "MB")
-			b.SetBytes(8)
-		})
-
-		b.Run(bm.name+"_Contains", func(b *testing.B) {
-			bf := bloomfilter.NewCacheOptimizedBloomFilter(bm.elements, bm.fpr)
-
-			// Pre-populate with some data
-			data := make([]byte, 8)
-			for i := 0; i < 10000; i++ {
-				*(*uint64)(unsafe.Pointer(&data[0])) = uint64(i)
-				bf.Add(data)
-			}
-
-			b.ResetTimer()
-			b.ReportAllocs()
-
-			for i := 0; i < b.N; i++ {
-				*(*uint64)(unsafe.Pointer(&data[0])) = uint64(i % 20000)
-				_ = bf.Contains(data)
-			}
-
-			stats := bf.GetCacheStats()
-			b.ReportMetric(float64(stats.CacheLineCount), "cache_lines")
-			b.SetBytes(8)
-
-			})
-	}
-}
-
-// Benchmark memory allocation patterns
-func BenchmarkHybridMemoryAllocation(b *testing.B) {
-	sizes := []struct {
-		name     string
-		elements uint64
-		fpr      float64
-	}{
-		{"Array_Small", 1_000, 0.01},
-		{"Array_Medium", 100_000, 0.01},
-		{"Map_Large", 1_000_000, 0.01},
-		{"Map_Huge", 10_000_000, 0.01},
-	}
-
-	for _, size := range sizes {
-		b.Run(size.name, func(b *testing.B) {
-			b.ReportAllocs()
-
-			for i := 0; i < b.N; i++ {
-				bf := bloomfilter.NewCacheOptimizedBloomFilter(size.elements, size.fpr)
-				_ = bf
-			}
-		})
-	}
-}
-
-// Benchmark throughput comparison
-func BenchmarkHybridThroughput(b *testing.B) {
-	configs := []struct {
-		name     string
-		elements uint64
-		fpr      float64
-		ops      int
-	}{
-		{"Array_10K_ops1K", 10_000, 0.01, 1_000},
-		{"Array_100K_ops10K", 100_000, 0.01, 10_000},
-		{"Map_1M_ops100K", 1_000_000, 0.01, 100_000},
-	}
-
-	for _, cfg := range configs {
-		b.Run(cfg.name, func(b *testing.B) {
-			bf := bloomfilter.NewCacheOptimizedBloomFilter(cfg.elements, cfg.fpr)
-
-
-			b.ResetTimer()
-			b.ReportAllocs()
-
-			for i := 0; i < b.N; i++ {
-				// Perform a mix of operations
-				for j := 0; j < cfg.ops; j++ {
-					if j%2 == 0 {
-						bf.AddUint64(uint64(j))
-					} else {
-						_ = bf.ContainsUint64(uint64(j))
-					}
-				}
-			}
-
-			opsPerSec := float64(b.N*cfg.ops) / b.Elapsed().Seconds()
-			b.ReportMetric(opsPerSec/1000000, "Mops/sec")
-
-			})
-	}
-}
-
-// Benchmark to show the crossover point between array and map efficiency
-func BenchmarkHybridCrossoverPoint(b *testing.B) {
-	// Test around the threshold (10K cache lines)
-	sizes := []uint64{
-		1_000_000,   // ~1,873 cache lines - array mode
-		3_000_000,   // ~5,619 cache lines - array mode
-		5_000_000,   // ~9,365 cache lines - array mode
-		5_500_000,   // ~10,302 cache lines - map mode (just over threshold)
-		10_000_000,  // ~18,721 cache lines - map mode
-		50_000_000,  // ~93,607 cache lines - map mode
-	}
-
-	for _, size := range sizes {
-		name := fmt.Sprintf("Elements_%dM", size/1_000_000)
-		if size < 1_000_000 {
-			name = fmt.Sprintf("Elements_%dK", size/1_000)
-		}
-
-		b.Run(name, func(b *testing.B) {
-			bf := bloomfilter.NewCacheOptimizedBloomFilter(size, 0.01)
-			stats := bf.GetCacheStats()
-
-			mode := "ARRAY"
-			if !bf.IsArrayMode() {
-				mode = "MAP"
-			}
-			b.Logf("Mode: %s, Cache lines: %d, Threshold: %d",
-				mode, stats.CacheLineCount, bloomfilter.ArrayModeThreshold)
-
-			data := make([]byte, 8)
-			b.ResetTimer()
-			b.ReportAllocs()
-
-			for i := 0; i < b.N; i++ {
-				*(*uint64)(unsafe.Pointer(&data[0])) = uint64(i)
-				bf.Add(data)
-				_ = bf.Contains(data)
-			}
-
-			stats = bf.GetCacheStats()
-			b.ReportMetric(float64(stats.CacheLineCount), "cache_lines")
-			b.SetBytes(16) // 8 bytes for Add + 8 for Contains
-		})
-	}
-}
diff --git a/tests/integration/bloomfilter_concurrent_test.go b/tests/integration/bloomfilter_concurrent_test.go
index 2fe61d1..a74cf2f 100644
--- a/tests/integration/bloomfilter_concurrent_test.go
+++ b/tests/integration/bloomfilter_concurrent_test.go
@@ -11,7 +11,7 @@ import (
 
 // TestConcurrentReads tests thread-safe concurrent read operations
 func TestConcurrentReads(t *testing.T) {
-	// Thread-safety fixed with sync.Pool solution
+	// Thread-safety provided by atomic CAS operations
 
 	bf := bloomfilter.NewCacheOptimizedBloomFilter(100_000, 0.01)
 
@@ -80,7 +80,7 @@ func TestConcurrentReads(t *testing.T) {
 
 // TestConcurrentWrites tests thread-safe concurrent write operations
 func TestConcurrentWrites(t *testing.T) {
-	// Thread-safety fixed with sync.Pool solution
+	// Thread-safety provided by atomic CAS operations
 
 	bf := bloomfilter.NewCacheOptimizedBloomFilter(100_000, 0.01)
 
@@ -149,7 +149,7 @@ func TestConcurrentWrites(t *testing.T) {
 
 // TestMixedConcurrentOperations tests concurrent reads and writes
 func TestMixedConcurrentOperations(t *testing.T) {
-	// Thread-safety fixed with sync.Pool solution
+	// Thread-safety provided by atomic CAS operations
 
 	bf := bloomfilter.NewCacheOptimizedBloomFilter(100_000, 0.01)
 
diff --git a/tests/integration/bloomfilter_edge_cases_test.go b/tests/integration/bloomfilter_edge_cases_test.go
index 70be088..2a4e545 100644
--- a/tests/integration/bloomfilter_edge_cases_test.go
+++ b/tests/integration/bloomfilter_edge_cases_test.go
@@ -9,445 +9,308 @@ import (
 
 // TestBoundaryConditions tests exact boundary conditions
 func TestBoundaryConditions(t *testing.T) {
-	t.Run("Exact ArrayModeThreshold", func(t *testing.T) {
-		// Calculate elements that will produce exactly ArrayModeThreshold cache lines
-		fpr := 0.01
-
-		// Formula: cacheLines = (bitCount + 511) / 512
-		// bitCount = elements * ln(fpr) / (ln(2)^2)
-		// We need to find elements such that cacheLines ≈ threshold
-		// ArrayModeThreshold is the dividing line between array and map mode
-
-		// Test just below threshold
-		bf1 := bloomfilter.NewCacheOptimizedBloomFilter(800_000, fpr)
+	t.Run("Small Filter", func(t *testing.T) {
+		// Test small filter (10K elements)
+		bf1 := bloomfilter.NewCacheOptimizedBloomFilter(10_000, 0.01)
 		stats1 := bf1.GetCacheStats()
-		t.Logf("Below threshold: elements=800K, cache_lines=%d, mode=%s",
-			stats1.CacheLineCount, func() string {
-				if bf1.IsArrayMode() {
-					return "ARRAY"
-				}
-				return "MAP"
-			}())
-
-		// Test just above threshold
-		bf2 := bloomfilter.NewCacheOptimizedBloomFilter(900_000, fpr)
-		stats2 := bf2.GetCacheStats()
-		t.Logf("Above threshold: elements=900K, cache_lines=%d, mode=%s",
-			stats2.CacheLineCount, func() string {
-				if bf2.IsArrayMode() {
-					return "ARRAY"
-				}
-				return "MAP"
-			}())
+		t.Logf("Small filter: elements=10K, cache_lines=%d, memory=%d bytes",
+			stats1.CacheLineCount, stats1.MemoryUsage)
 
-		// Verify both work correctly
+		// Verify works correctly
 		for i := 0; i < 1000; i++ {
 			key := string([]byte{byte(i >> 8), byte(i)})
 			bf1.Add([]byte(key))
-			bf2.Add([]byte(key))
 		}
 
-		notFound1, notFound2 := 0, 0
+		notFound := 0
 		for i := 0; i < 1000; i++ {
 			key := string([]byte{byte(i >> 8), byte(i)})
 			if !bf1.Contains([]byte(key)) {
-				notFound1++
-			}
-			if !bf2.Contains([]byte(key)) {
-				notFound2++
+				notFound++
 			}
 		}
 
-		if notFound1 > 0 || notFound2 > 0 {
-			t.Errorf("Boundary filters failed: below_threshold_missing=%d, above_threshold_missing=%d",
-				notFound1, notFound2)
+		if notFound > 0 {
+			t.Errorf("Small filter: Expected all 1000 items to be found, got %d missing", notFound)
 		}
 	})
 
-	t.Run("Cache line alignment boundaries", func(t *testing.T) {
-		// Test filters that create different cache line counts
-		testCases := []uint64{
-			1,      // Minimal
-			63,     // Just under 1 cache line of data
-			64,     // Exactly 1 cache line
-			65,     // Just over 1 cache line
-			511,    // Just under many cache lines
-			512,    // Exactly fills cache lines
-			513,    // Just over cache line boundary
-			1023,   // Near power of 2
-			1024,   // Power of 2
-			1025,   // Just over power of 2
-		}
-
-		for _, elements := range testCases {
-			bf := bloomfilter.NewCacheOptimizedBloomFilter(elements, 0.01)
-			stats := bf.GetCacheStats()
-
-			// Add elements
-			for i := uint64(0); i < elements; i++ {
-				bf.AddUint64(i)
-			}
-
-			// Verify all elements
-			notFound := 0
-			for i := uint64(0); i < elements; i++ {
-				if !bf.ContainsUint64(i) {
-					notFound++
-				}
-			}
-
-			if notFound > 0 {
-				t.Errorf("Elements=%d: failed to find %d elements, cache_lines=%d",
-					elements, notFound, stats.CacheLineCount)
-			} else {
-				t.Logf("Elements=%d: OK, cache_lines=%d, bits=%d",
-					elements, stats.CacheLineCount, stats.BitCount)
-			}
-		}
-	})
+	t.Run("Large Filter", func(t *testing.T) {
+		// Test large filter (1M elements)
+		bf2 := bloomfilter.NewCacheOptimizedBloomFilter(1_000_000, 0.01)
+		stats2 := bf2.GetCacheStats()
+		t.Logf("Large filter: elements=1M, cache_lines=%d, memory=%d bytes",
+			stats2.CacheLineCount, stats2.MemoryUsage)
 
-	t.Run("Bit and byte alignment", func(t *testing.T) {
-		// Test with data sizes that exercise different alignment paths
-		dataSizes := []int{
-			1, 2, 3, 4, 5, 6, 7, 8,      // Single bytes to uint64
-			9, 15, 16, 17,                // Around 16-byte boundary
-			31, 32, 33,                   // Around 32-byte boundary (AVX2)
-			63, 64, 65,                   // Around 64-byte boundary (cache line)
-			127, 128, 129,                // Power of 2 boundaries
+		// Verify works correctly
+		for i := 0; i < 1000; i++ {
+			key := string([]byte{byte(i >> 8), byte(i)})
+			bf2.Add([]byte(key))
 		}
 
-		bf := bloomfilter.NewCacheOptimizedBloomFilter(10000, 0.01)
-
-		for _, size := range dataSizes {
-			data := make([]byte, size)
-			for i := 0; i < size; i++ {
-				data[i] = byte(i)
-			}
-
-			bf.Add(data)
-			if !bf.Contains(data) {
-				t.Errorf("Failed to find data of size %d bytes", size)
+		notFound := 0
+		for i := 0; i < 1000; i++ {
+			key := string([]byte{byte(i >> 8), byte(i)})
+			if !bf2.Contains([]byte(key)) {
+				notFound++
 			}
 		}
 
-		t.Logf("Successfully tested %d different data sizes", len(dataSizes))
+		if notFound > 0 {
+			t.Errorf("Large filter: Expected all 1000 items to be found, got %d missing", notFound)
+		}
 	})
 }
 
-// TestHashDistribution tests quality of hash distribution
-func TestHashDistribution(t *testing.T) {
-	bf := bloomfilter.NewCacheOptimizedBloomFilter(10000, 0.01)
-	stats := bf.GetCacheStats()
-	totalBits := stats.BitCount
-
-	// Add elements and track bit positions
-	numElements := 1000
-	initialBitsSet := bf.PopCount()
-
-	for i := 0; i < numElements; i++ {
-		bf.AddUint64(uint64(i))
-	}
-
-	finalBitsSet := bf.PopCount()
-	bitsSetByElements := finalBitsSet - initialBitsSet
-
-	// Calculate expected bits set
-	// Formula: m * (1 - (1 - 1/m)^(k*n))
-	// where m = total bits, k = hash count, n = elements
-	m := float64(totalBits)
-	k := float64(stats.HashCount)
-	n := float64(numElements)
-
-	expectedBitsSet := m * (1 - math.Pow(1-1/m, k*n))
-	actualBitsSet := float64(bitsSetByElements)
-
-	// Allow 10% deviation from expected
-	deviation := math.Abs(actualBitsSet-expectedBitsSet) / expectedBitsSet
-
-	if deviation > 0.10 {
-		t.Errorf("Hash distribution deviation too high: expected=%.0f, actual=%.0f, deviation=%.2f%%",
-			expectedBitsSet, actualBitsSet, deviation*100)
+// TestExtremelySmallFilter tests filters with very few expected elements
+func TestExtremelySmallFilter(t *testing.T) {
+	testCases := []struct {
+		name             string
+		expectedElements uint64
+		falsePositiveRate float64
+	}{
+		{"Single Element", 1, 0.01},
+		{"Ten Elements", 10, 0.01},
+		{"Hundred Elements", 100, 0.001},
 	}
 
-	t.Logf("Hash distribution test:")
-	t.Logf("  Elements added: %d", numElements)
-	t.Logf("  Hash count: %d", stats.HashCount)
-	t.Logf("  Expected bits set: %.0f", expectedBitsSet)
-	t.Logf("  Actual bits set: %.0f", actualBitsSet)
-	t.Logf("  Deviation: %.2f%%", deviation*100)
-}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			bf := bloomfilter.NewCacheOptimizedBloomFilter(tc.expectedElements, tc.falsePositiveRate)
 
-// TestCollisionResistance tests resistance to hash collisions
-func TestCollisionResistance(t *testing.T) {
-	bf := bloomfilter.NewCacheOptimizedBloomFilter(10000, 0.01)
-
-	// Test patterns known to cause collisions in poor hash functions
-	collisionPronePatterns := [][]byte{
-		// Sequential patterns
-		{0, 1, 2, 3, 4, 5, 6, 7},
-		{1, 2, 3, 4, 5, 6, 7, 8},
-		{2, 3, 4, 5, 6, 7, 8, 9},
-
-		// Repeating patterns
-		{0xAA, 0xAA, 0xAA, 0xAA},
-		{0x55, 0x55, 0x55, 0x55},
-		{0xFF, 0xFF, 0xFF, 0xFF},
-
-		// Shifted patterns
-		{1, 0, 0, 0, 0, 0, 0, 0},
-		{0, 1, 0, 0, 0, 0, 0, 0},
-		{0, 0, 1, 0, 0, 0, 0, 0},
-
-		// Palindromes
-		{1, 2, 3, 4, 4, 3, 2, 1},
-		{0xDE, 0xAD, 0xBE, 0xEF, 0xEF, 0xBE, 0xAD, 0xDE},
-	}
-
-	// Add all patterns
-	for i, pattern := range collisionPronePatterns {
-		bf.Add(pattern)
-		if !bf.Contains(pattern) {
-			t.Errorf("Failed to add collision-prone pattern %d: %v", i, pattern)
-		}
-	}
+			// Add expected number of elements
+			for i := uint64(0); i < tc.expectedElements; i++ {
+				bf.AddUint64(i)
+			}
 
-	// Verify all patterns are still found
-	notFound := 0
-	for i, pattern := range collisionPronePatterns {
-		if !bf.Contains(pattern) {
-			t.Errorf("Failed to find collision-prone pattern %d: %v", i, pattern)
-			notFound++
-		}
-	}
+			// Verify all elements are found
+			for i := uint64(0); i < tc.expectedElements; i++ {
+				if !bf.ContainsUint64(i) {
+					t.Errorf("Element %d should be found but wasn't", i)
+				}
+			}
 
-	if notFound == 0 {
-		t.Logf("All %d collision-prone patterns handled correctly", len(collisionPronePatterns))
+			stats := bf.GetCacheStats()
+			t.Logf("Filter stats: bits=%d, hash_count=%d, cache_lines=%d",
+				stats.BitCount, stats.HashCount, stats.CacheLineCount)
+		})
 	}
 }
 
-// TestExtremeFalsePositiveRates tests filters with extreme FPR settings
+// TestExtremeFalsePositiveRates tests very low and very high FPRs
 func TestExtremeFalsePositiveRates(t *testing.T) {
-	tests := []struct {
-		name     string
-		elements uint64
-		fpr      float64
+	testCases := []struct {
+		name string
+		fpr  float64
 	}{
-		{"Very low FPR", 1000, 0.00001},
-		{"Low FPR", 1000, 0.0001},
-		{"Normal FPR", 1000, 0.01},
-		{"High FPR", 1000, 0.1},
-		{"Very high FPR", 1000, 0.5},
+		{"Very Low FPR", 0.000001},  // 0.0001%
+		{"Low FPR", 0.001},          // 0.1%
+		{"Medium FPR", 0.01},        // 1%
+		{"High FPR", 0.1},           // 10%
 	}
 
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			bf := bloomfilter.NewCacheOptimizedBloomFilter(tt.elements, tt.fpr)
-			stats := bf.GetCacheStats()
+	const elements = 10000
 
-			t.Logf("%s configuration:", tt.name)
-			t.Logf("  Target FPR: %.6f", tt.fpr)
-			t.Logf("  Bit count: %d", stats.BitCount)
-			t.Logf("  Hash count: %d", stats.HashCount)
-			t.Logf("  Bits per element: %.2f", float64(stats.BitCount)/float64(tt.elements))
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			bf := bloomfilter.NewCacheOptimizedBloomFilter(elements, tc.fpr)
 
 			// Add elements
-			for i := uint64(0); i < tt.elements; i++ {
-				bf.AddUint64(i)
+			for i := 0; i < elements/10; i++ {
+				bf.AddUint64(uint64(i))
 			}
 
-			// Verify elements
+			// Check all added elements are found
 			notFound := 0
-			for i := uint64(0); i < tt.elements; i++ {
-				if !bf.ContainsUint64(i) {
+			for i := 0; i < elements/10; i++ {
+				if !bf.ContainsUint64(uint64(i)) {
 					notFound++
 				}
 			}
 
 			if notFound > 0 {
-				t.Errorf("Failed to find %d/%d elements", notFound, tt.elements)
-			}
-
-			// Measure actual false positive rate
-			numTests := 10000
-			falsePositives := 0
-			for i := tt.elements; i < tt.elements+uint64(numTests); i++ {
-				if bf.ContainsUint64(i) {
-					falsePositives++
-				}
+				t.Errorf("FPR %f: Expected all %d items to be found, got %d missing",
+					tc.fpr, elements/10, notFound)
 			}
 
-			actualFPR := float64(falsePositives) / float64(numTests)
-			t.Logf("  Measured FPR: %.6f", actualFPR)
-
-			// For very low FPR, allow up to 2x target
-			// For normal/high FPR, allow up to 3x target
-			maxMultiplier := 3.0
-			if tt.fpr < 0.001 {
-				maxMultiplier = 2.0
-			}
-
-			if actualFPR > tt.fpr*maxMultiplier {
-				t.Errorf("Actual FPR (%.6f) exceeds %.1fx target (%.6f)",
-					actualFPR, maxMultiplier, tt.fpr*maxMultiplier)
-			}
+			stats := bf.GetCacheStats()
+			t.Logf("FPR %f: bits=%d, hash_count=%d, load_factor=%.4f, estimated_fpp=%.6f",
+				tc.fpr, stats.BitCount, stats.HashCount, stats.LoadFactor, stats.EstimatedFPP)
 		})
 	}
 }
 
-// TestZeroAndMinimalCases tests edge cases around zero values
-func TestZeroAndMinimalCases(t *testing.T) {
-	t.Run("Zero uint64", func(t *testing.T) {
-		bf := bloomfilter.NewCacheOptimizedBloomFilter(100, 0.01)
-		bf.AddUint64(0)
-		if !bf.ContainsUint64(0) {
-			t.Error("Failed to find uint64(0)")
-		}
+// TestMaximumHashCount tests filters with very low FPR (high hash count)
+func TestMaximumHashCount(t *testing.T) {
+	// Very low FPR will result in high hash count
+	bf := bloomfilter.NewCacheOptimizedBloomFilter(1000, 0.0000001)
 
-		// Verify it's different from uint64(1)
-		if bf.ContainsUint64(1) {
-			t.Log("Note: uint64(1) also found (possible false positive)")
-		}
-	})
-
-	t.Run("Empty string vs nil slice", func(t *testing.T) {
-		bf := bloomfilter.NewCacheOptimizedBloomFilter(100, 0.01)
+	stats := bf.GetCacheStats()
+	t.Logf("Hash count for FPR 0.0000001: %d", stats.HashCount)
 
-		bf.AddString("")
-		bf.Add([]byte{})
-		bf.Add(nil)
+	// Verify it still works correctly
+	for i := 0; i < 100; i++ {
+		bf.AddUint64(uint64(i))
+	}
 
-		// All should be found (they're all empty)
-		if !bf.ContainsString("") {
-			t.Error("Failed to find empty string")
-		}
-		if !bf.Contains([]byte{}) {
-			t.Error("Failed to find empty byte slice")
-		}
-		if !bf.Contains(nil) {
-			t.Error("Failed to find nil slice")
+	notFound := 0
+	for i := 0; i < 100; i++ {
+		if !bf.ContainsUint64(uint64(i)) {
+			notFound++
 		}
-	})
+	}
 
-	t.Run("Single bit patterns", func(t *testing.T) {
-		bf := bloomfilter.NewCacheOptimizedBloomFilter(100, 0.01)
+	if notFound > 0 {
+		t.Errorf("Expected all 100 items to be found, got %d missing", notFound)
+	}
+}
 
-		// Test all single-bit patterns in a byte
-		for i := 0; i < 8; i++ {
-			pattern := []byte{1 << i}
-			bf.Add(pattern)
-			if !bf.Contains(pattern) {
-				t.Errorf("Failed to find single-bit pattern: 0x%02X", pattern[0])
+// TestZeroAndNegativeInputs tests that invalid inputs panic with clear error messages
+func TestZeroAndNegativeInputs(t *testing.T) {
+	t.Run("Zero Expected Elements", func(t *testing.T) {
+		defer func() {
+			if r := recover(); r == nil {
+				t.Error("Expected panic for zero expected elements, but didn't panic")
+			} else {
+				t.Logf("Correctly panicked for zero expected elements: %v", r)
+				// Verify panic message is informative
+				if msg, ok := r.(string); ok {
+					if msg != "bloomfilter: expectedElements must be greater than 0" {
+						t.Errorf("Unexpected panic message: %s", msg)
+					}
+				}
 			}
-		}
+		}()
 
-		t.Log("All single-bit patterns handled correctly")
+		bloomfilter.NewCacheOptimizedBloomFilter(0, 0.01)
+		t.Error("Should not reach here - expected panic")
 	})
-}
 
-// TestMemoryBehavior tests memory-related edge cases
-func TestMemoryBehavior(t *testing.T) {
-	t.Run("Multiple clear cycles", func(t *testing.T) {
-		bf := bloomfilter.NewCacheOptimizedBloomFilter(10000, 0.01)
+	t.Run("Invalid FPR - Too High", func(t *testing.T) {
+		defer func() {
+			if r := recover(); r == nil {
+				t.Error("Expected panic for FPR > 1.0, but didn't panic")
+			} else {
+				t.Logf("Correctly panicked for FPR > 1.0: %v", r)
+			}
+		}()
 
-		// Run multiple add/clear cycles
-		numCycles := 100
-		elementsPerCycle := 100
+		bloomfilter.NewCacheOptimizedBloomFilter(1000, 1.5)
+		t.Error("Should not reach here - expected panic")
+	})
 
-		for cycle := 0; cycle < numCycles; cycle++ {
-			// Add elements
-			for i := 0; i < elementsPerCycle; i++ {
-				bf.AddUint64(uint64(cycle*elementsPerCycle + i))
+	t.Run("Invalid FPR - Negative", func(t *testing.T) {
+		defer func() {
+			if r := recover(); r == nil {
+				t.Error("Expected panic for negative FPR, but didn't panic")
+			} else {
+				t.Logf("Correctly panicked for negative FPR: %v", r)
 			}
+		}()
+
+		bloomfilter.NewCacheOptimizedBloomFilter(1000, -0.01)
+		t.Error("Should not reach here - expected panic")
+	})
 
-			// Verify some elements
-			if !bf.ContainsUint64(uint64(cycle * elementsPerCycle)) {
-				t.Errorf("Cycle %d: element not found", cycle)
+	t.Run("Invalid FPR - Zero", func(t *testing.T) {
+		defer func() {
+			if r := recover(); r == nil {
+				t.Error("Expected panic for zero FPR, but didn't panic")
+			} else {
+				t.Logf("Correctly panicked for zero FPR: %v", r)
 			}
+		}()
 
-			// Clear
-			bf.Clear()
+		bloomfilter.NewCacheOptimizedBloomFilter(1000, 0.0)
+		t.Error("Should not reach here - expected panic")
+	})
 
-			// Verify cleared
-			if bf.PopCount() != 0 {
-				t.Errorf("Cycle %d: filter not properly cleared, %d bits still set",
-					cycle, bf.PopCount())
+	t.Run("Invalid FPR - Exactly 1.0", func(t *testing.T) {
+		defer func() {
+			if r := recover(); r == nil {
+				t.Error("Expected panic for FPR = 1.0, but didn't panic")
+			} else {
+				t.Logf("Correctly panicked for FPR = 1.0: %v", r)
 			}
-		}
+		}()
 
-		t.Logf("Completed %d add/clear cycles successfully", numCycles)
+		bloomfilter.NewCacheOptimizedBloomFilter(1000, 1.0)
+		t.Error("Should not reach here - expected panic")
 	})
 
-	t.Run("Overload beyond capacity", func(t *testing.T) {
-		// Create small filter
-		bf := bloomfilter.NewCacheOptimizedBloomFilter(100, 0.01)
+	t.Run("NaN FPR", func(t *testing.T) {
+		defer func() {
+			if r := recover(); r == nil {
+				t.Error("Expected panic for NaN FPR, but didn't panic")
+			} else {
+				t.Logf("Correctly panicked for NaN FPR: %v", r)
+				// Verify panic message is informative
+				if msg, ok := r.(string); ok {
+					if msg != "bloomfilter: falsePositiveRate cannot be NaN" {
+						t.Errorf("Unexpected panic message: %s", msg)
+					}
+				}
+			}
+		}()
 
-		// Add 10x the expected capacity
-		numElements := 1000
-		for i := 0; i < numElements; i++ {
-			bf.AddUint64(uint64(i))
-		}
+		bloomfilter.NewCacheOptimizedBloomFilter(1000, math.NaN())
+		t.Error("Should not reach here - expected panic")
+	})
+}
 
-		// All elements should still be found (but FPR will be high)
-		notFound := 0
-		for i := 0; i < numElements; i++ {
-			if !bf.ContainsUint64(uint64(i)) {
-				notFound++
-			}
-		}
+// TestEmptyData tests adding and checking empty/nil data
+func TestEmptyData(t *testing.T) {
+	bf := bloomfilter.NewCacheOptimizedBloomFilter(1000, 0.01)
 
-		finalStats := bf.GetCacheStats()
+	t.Run("Empty Byte Slice", func(t *testing.T) {
+		bf.Add([]byte{})
+		if !bf.Contains([]byte{}) {
+			t.Error("Empty byte slice should be found")
+		}
+	})
 
-		if notFound > 0 {
-			t.Errorf("Overloaded filter failed to find %d/%d elements", notFound, numElements)
+	t.Run("Empty String", func(t *testing.T) {
+		bf.AddString("")
+		if !bf.ContainsString("") {
+			t.Error("Empty string should be found")
 		}
+	})
 
-		t.Logf("Overloaded filter stats:")
-		t.Logf("  Capacity: 100 elements")
-		t.Logf("  Actual: %d elements", numElements)
-		t.Logf("  Load factor: %.2f%%", finalStats.LoadFactor*100)
-		t.Logf("  Estimated FPP: %.4f%%", finalStats.EstimatedFPP*100)
-		t.Logf("  All elements found: %v", notFound == 0)
-
-		// FPR should be very high
-		if finalStats.EstimatedFPP < 0.5 {
-			t.Logf("Note: Overloaded filter has lower FPR than expected (%.4f%%)",
-				finalStats.EstimatedFPP*100)
+	t.Run("Zero Uint64", func(t *testing.T) {
+		bf.AddUint64(0)
+		if !bf.ContainsUint64(0) {
+			t.Error("Zero uint64 should be found")
 		}
 	})
 }
 
-// TestUnicodeAndSpecialCharacters tests handling of special strings
-func TestUnicodeAndSpecialCharacters(t *testing.T) {
-	bf := bloomfilter.NewCacheOptimizedBloomFilter(1000, 0.01)
-
-	specialStrings := []string{
-		"Hello, 世界",                    // Chinese
-		"Привет, мир",                  // Russian
-		"مرحبا بالعالم",                // Arabic
-		"שלום עולם",                    // Hebrew
-		"こんにちは世界",                    // Japanese
-		"🚀🌟💻🔥",                      // Emojis
-		"\x00\x01\x02\x03",            // Control characters
-		"\n\r\t",                      // Whitespace
-		"a\u0000b",                    // Null byte in middle
-		string([]byte{0xFF, 0xFE}),   // Invalid UTF-8
+// TestVeryLargeElements tests filters with billions of expected elements
+func TestVeryLargeElements(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping large element test in short mode")
 	}
 
-	// Add all special strings
-	for _, s := range specialStrings {
-		bf.AddString(s)
+	// Create filter for 10M elements
+	bf := bloomfilter.NewCacheOptimizedBloomFilter(10_000_000, 0.01)
+
+	stats := bf.GetCacheStats()
+	t.Logf("Large filter stats: bits=%d, cache_lines=%d, memory=%d MB",
+		stats.BitCount, stats.CacheLineCount, stats.MemoryUsage/(1024*1024))
+
+	// Add and verify a sample
+	const sampleSize = 10000
+	for i := 0; i < sampleSize; i++ {
+		bf.AddUint64(uint64(i))
 	}
 
-	// Verify all are found
 	notFound := 0
-	for i, s := range specialStrings {
-		if !bf.ContainsString(s) {
-			t.Errorf("Failed to find special string %d: %q (bytes: %v)", i, s, []byte(s))
+	for i := 0; i < sampleSize; i++ {
+		if !bf.ContainsUint64(uint64(i)) {
 			notFound++
 		}
 	}
 
-	if notFound == 0 {
-		t.Logf("All %d special/unicode strings handled correctly", len(specialStrings))
+	if notFound > 0 {
+		t.Errorf("Expected all %d items to be found, got %d missing", sampleSize, notFound)
 	}
 }
diff --git a/tests/integration/bloomfilter_retry_test.go b/tests/integration/bloomfilter_retry_test.go
new file mode 100644
index 0000000..a5ddc53
--- /dev/null
+++ b/tests/integration/bloomfilter_retry_test.go
@@ -0,0 +1,213 @@
+package bloomfilter_test
+
+import (
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	bloomfilter "github.com/shaia/BloomFilter"
+)
+
+// TestAtomicRetryMechanism validates that the CAS retry loop in setBitsAtomic
+// successfully handles contention and ensures all bits are set correctly.
+//
+// This test creates extreme contention by having many goroutines simultaneously
+// write to the same small filter, forcing CAS retries. It then validates that
+// all bits were successfully set (no false negatives).
+func TestAtomicRetryMechanism(t *testing.T) {
+	// Use a small filter to increase collision probability
+	// 1000 expected elements with 0.01 FPR = ~9728 bits
+	// With 6 hash functions, each insert touches 6 bits
+	bf := bloomfilter.NewCacheOptimizedBloomFilter(1000, 0.01)
+
+	const (
+		numGoroutines = 100
+		insertsPerGoroutine = 100
+	)
+
+	// Track successful insertions
+	var insertCount atomic.Int64
+
+	// All goroutines insert into the same key space to maximize contention
+	var wg sync.WaitGroup
+	for g := 0; g < numGoroutines; g++ {
+		wg.Add(1)
+		go func(goroutineID int) {
+			defer wg.Done()
+
+			for i := 0; i < insertsPerGoroutine; i++ {
+				// Use a small key space to force contention
+				// Only 100 unique keys, but 10,000 total insertions
+				key := i % 100
+				bf.AddUint64(uint64(key))
+				insertCount.Add(1)
+			}
+		}(g)
+	}
+
+	wg.Wait()
+
+	totalInserts := insertCount.Load()
+	t.Logf("Completed %d concurrent insertions", totalInserts)
+
+	// CRITICAL TEST: Verify no false negatives
+	// All 100 unique keys must be found (Bloom filter correctness)
+	notFound := 0
+	for key := 0; key < 100; key++ {
+		if !bf.ContainsUint64(uint64(key)) {
+			notFound++
+			t.Errorf("Key %d not found after concurrent insertions (FALSE NEGATIVE)", key)
+		}
+	}
+
+	if notFound > 0 {
+		t.Fatalf("CRITICAL: Found %d false negatives - CAS retry mechanism failed!", notFound)
+	}
+
+	t.Logf("SUCCESS: All %d unique keys found (no false negatives)", 100)
+
+	// Verify statistics
+	stats := bf.GetCacheStats()
+	t.Logf("Filter stats: bits_set=%d, load_factor=%.4f, estimated_fpp=%.6f",
+		stats.BitsSet, stats.LoadFactor, stats.EstimatedFPP)
+}
+
+// TestExtremeContentionSameWord validates that multiple threads writing to
+// the exact same bit positions (worst-case contention) still succeed.
+func TestExtremeContentionSameWord(t *testing.T) {
+	bf := bloomfilter.NewCacheOptimizedBloomFilter(1000, 0.01)
+
+	const (
+		numGoroutines = 50
+		iterations = 1000
+	)
+
+	var wg sync.WaitGroup
+
+	// All goroutines insert the EXACT same key to maximize contention on same bits
+	for g := 0; g < numGoroutines; g++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+
+			for i := 0; i < iterations; i++ {
+				// Every goroutine inserts the same key "contention_test"
+				bf.AddString("contention_test")
+			}
+		}()
+	}
+
+	wg.Wait()
+
+	// CRITICAL: The key must be found (no false negative)
+	if !bf.ContainsString("contention_test") {
+		t.Fatal("CRITICAL: Key 'contention_test' not found after extreme contention - CAS retry failed!")
+	}
+
+	t.Logf("SUCCESS: Key found after %d concurrent writes to same bit positions", numGoroutines*iterations)
+}
+
+// TestCASRetriesEventualSuccess validates that even under artificial contention,
+// the retry loop eventually succeeds without hanging.
+func TestCASRetriesEventualSuccess(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping stress test in short mode")
+	}
+
+	// Create multiple filters to test retry behavior across different scenarios
+	testCases := []struct {
+		name              string
+		expectedElements  uint64
+		fpr               float64
+		numGoroutines     int
+		keysPerGoroutine  int
+	}{
+		{"Small filter, high contention", 100, 0.01, 100, 50},
+		{"Medium filter, medium contention", 10000, 0.01, 50, 100},
+		{"Large filter, low contention", 100000, 0.01, 100, 1000},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			bf := bloomfilter.NewCacheOptimizedBloomFilter(tc.expectedElements, tc.fpr)
+
+			var wg sync.WaitGroup
+			uniqueKeys := make(map[uint64]bool)
+			var keyMutex sync.Mutex
+
+			// Track all unique keys for verification
+			for g := 0; g < tc.numGoroutines; g++ {
+				wg.Add(1)
+				go func(goroutineID int) {
+					defer wg.Done()
+
+					for i := 0; i < tc.keysPerGoroutine; i++ {
+						key := uint64(goroutineID*tc.keysPerGoroutine + i)
+
+						keyMutex.Lock()
+						uniqueKeys[key] = true
+						keyMutex.Unlock()
+
+						bf.AddUint64(key)
+					}
+				}(g)
+			}
+
+			wg.Wait()
+
+			// Verify no false negatives
+			notFound := 0
+			for key := range uniqueKeys {
+				if !bf.ContainsUint64(key) {
+					notFound++
+					if notFound <= 5 {
+						t.Errorf("Key %d not found (FALSE NEGATIVE)", key)
+					}
+				}
+			}
+
+			if notFound > 0 {
+				t.Fatalf("Found %d false negatives out of %d keys", notFound, len(uniqueKeys))
+			}
+
+			t.Logf("SUCCESS: All %d unique keys found", len(uniqueKeys))
+		})
+	}
+}
+
+// TestNoHangUnderContention validates that the retry loop doesn't hang
+// by using a timeout-based approach.
+func TestNoHangUnderContention(t *testing.T) {
+	done := make(chan bool, 1)
+
+	go func() {
+		bf := bloomfilter.NewCacheOptimizedBloomFilter(1000, 0.01)
+
+		const numGoroutines = 100
+		var wg sync.WaitGroup
+
+		for g := 0; g < numGoroutines; g++ {
+			wg.Add(1)
+			go func(id int) {
+				defer wg.Done()
+				for i := 0; i < 100; i++ {
+					bf.AddUint64(uint64(i))
+				}
+			}(g)
+		}
+
+		wg.Wait()
+		done <- true
+	}()
+
+	// Wait for completion or timeout
+	// 10 seconds should be more than enough for 10,000 insertions
+	// If it takes longer, the retry mechanism has hung
+	select {
+	case <-done:
+		t.Log("SUCCESS: Retry mechanism completed without hanging")
+	case <-time.After(10 * time.Second):
+		t.Fatal("CRITICAL: Retry mechanism appears to have hung (timeout exceeded)")
+	}
+}
diff --git a/tests/integration/bloomfilter_storage_mode_test.go b/tests/integration/bloomfilter_storage_mode_test.go
deleted file mode 100644
index 4bb57af..0000000
--- a/tests/integration/bloomfilter_storage_mode_test.go
+++ /dev/null
@@ -1,296 +0,0 @@
-package bloomfilter_test
-
-import (
-	"fmt"
-	"testing"
-
-	bloomfilter "github.com/shaia/BloomFilter"
-)
-
-// TestHybridModeSelection verifies that the correct mode is chosen based on filter size
-func TestHybridModeSelection(t *testing.T) {
-	tests := []struct {
-		name           string
-		elements       uint64
-		fpr            float64
-		expectArrayMode bool
-		description    string
-	}{
-		{
-			name:           "Small filter - should use array mode",
-			elements:       10_000,
-			fpr:            0.01,
-			expectArrayMode: true,
-			description:    "10K elements = ~1200 cache lines < 10K threshold",
-		},
-		{
-			name:           "Medium filter - should use array mode",
-			elements:       100_000,
-			fpr:            0.01,
-			expectArrayMode: true,
-			description:    "100K elements = ~11,980 cache lines, close to threshold but array mode",
-		},
-		{
-			name:           "Large filter - should use map mode",
-			elements:       1_000_000,
-			fpr:            0.01,
-			expectArrayMode: false,
-			description:    "1M elements = ~119,808 cache lines > 10K threshold",
-		},
-		{
-			name:           "Very large filter - should use map mode",
-			elements:       10_000_000,
-			fpr:            0.01,
-			expectArrayMode: false,
-			description:    "10M elements = ~1,198,086 cache lines >> 10K threshold",
-		},
-		{
-			name:           "Huge filter - should use map mode",
-			elements:       100_000_000,
-			fpr:            0.001,
-			expectArrayMode: false,
-			description:    "100M elements with low FPR = millions of cache lines",
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			bf := bloomfilter.NewCacheOptimizedBloomFilter(tt.elements, tt.fpr)
-
-			if bf.IsArrayMode() != tt.expectArrayMode {
-				stats := bf.GetCacheStats()
-				t.Errorf("%s: expected array mode=%v, got=%v\n  Cache lines: %d, Threshold: %d\n  %s",
-					tt.name, tt.expectArrayMode, bf.IsArrayMode(),
-					stats.CacheLineCount, bloomfilter.ArrayModeThreshold, tt.description)
-			}
-
-			// Note: Internal storage structure validation removed
-			// The IsArrayMode() check above verifies the mode selection is correct
-
-			stats := bf.GetCacheStats()
-			t.Logf("✓ %s: mode=%s, cache_lines=%d, bits=%d",
-				tt.name,
-				func() string {
-					if bf.IsArrayMode() {
-						return "ARRAY"
-					}
-					return "MAP"
-				}(),
-				stats.CacheLineCount,
-				stats.BitCount)
-		})
-	}
-}
-
-// TestHybridModeCorrectness verifies both modes produce correct results
-func TestHybridModeCorrectness(t *testing.T) {
-	tests := []struct {
-		name     string
-		elements uint64
-		fpr      float64
-	}{
-		{"Small/Array Mode", 10_000, 0.01},
-		{"Large/Map Mode", 1_000_000, 0.01},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			bf := bloomfilter.NewCacheOptimizedBloomFilter(tt.elements, tt.fpr)
-
-			mode := "ARRAY"
-			if !bf.IsArrayMode() {
-				mode = "MAP"
-			}
-			t.Logf("Testing %s mode with %d elements", mode, tt.elements)
-
-			// Test with 1000 elements
-			testElements := []string{
-				"apple", "banana", "cherry", "date", "elderberry",
-				"fig", "grape", "honeydew", "kiwi", "lemon",
-			}
-
-			// Add elements
-			for _, elem := range testElements {
-				bf.AddString(elem)
-			}
-
-			// Verify all added elements are found
-			for _, elem := range testElements {
-				if !bf.ContainsString(elem) {
-					t.Errorf("%s mode: element '%s' was added but not found", mode, elem)
-				}
-			}
-
-			// Test elements that weren't added
-			notAdded := []string{"mango", "nectarine", "orange", "papaya", "quince"}
-			falsePositives := 0
-			for _, elem := range notAdded {
-				if bf.ContainsString(elem) {
-					falsePositives++
-				}
-			}
-
-			// With good FPR and few checks, should have very few false positives
-			if falsePositives > 2 {
-				t.Logf("%s mode: warning - %d false positives out of %d (might be normal)",
-					mode, falsePositives, len(notAdded))
-			}
-
-			t.Logf("✓ %s mode correctness verified: %d/%d elements found, %d/%d false positives",
-				mode, len(testElements), len(testElements), falsePositives, len(notAdded))
-		})
-	}
-}
-
-// TestHybridMemoryFootprint estimates memory usage for different modes
-func TestHybridMemoryFootprint(t *testing.T) {
-	tests := []struct {
-		name     string
-		elements uint64
-		fpr      float64
-	}{
-		{"Small/Array", 1_000, 0.01},
-		{"Medium/Array", 10_000, 0.01},
-		{"Large/Map", 1_000_000, 0.01},
-		{"Huge/Map", 10_000_000, 0.01},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			bf := bloomfilter.NewCacheOptimizedBloomFilter(tt.elements, tt.fpr)
-			stats := bf.GetCacheStats()
-
-			// Calculate actual bit array size
-			bitArrayBytes := stats.CacheLineCount * uint64(bloomfilter.CacheLineSize)
-
-			// Estimate overhead based on mode
-			var overheadBytes uint64
-			mode := "ARRAY"
-
-			if bf.IsArrayMode() {
-				// Array mode: fixed overhead
-				// 3 arrays × 10K elements × 24 bytes/slice = ~720KB
-				overheadBytes = bloomfilter.ArrayModeThreshold * 24 * 3
-			} else {
-				mode = "MAP"
-				// Map mode: dynamic overhead, estimate based on hash count
-				// Each map entry: ~50 bytes average (key + value + overhead)
-				estimatedEntries := stats.HashCount / 4 // Rough estimate
-				overheadBytes = uint64(estimatedEntries) * 50 * 3 // 3 maps
-			}
-
-			totalBytes := bitArrayBytes + overheadBytes
-
-			t.Logf("Mode: %s", mode)
-			t.Logf("  Elements: %s", formatNumber(tt.elements))
-			t.Logf("  Cache lines: %s", formatNumber(stats.CacheLineCount))
-			t.Logf("  Bit array: %s", formatBytes(bitArrayBytes))
-			t.Logf("  Overhead: %s", formatBytes(overheadBytes))
-			t.Logf("  Total (est): %s", formatBytes(totalBytes))
-			t.Logf("  Overhead %%: %.1f%%", float64(overheadBytes)/float64(totalBytes)*100)
-
-			// Array mode should have predictable overhead
-			if bf.IsArrayMode() {
-				expectedOverhead := uint64(bloomfilter.ArrayModeThreshold * 24 * 3)
-				if overheadBytes != expectedOverhead {
-					t.Errorf("Array mode overhead mismatch: expected %d, got %d",
-						expectedOverhead, overheadBytes)
-				}
-			}
-		})
-	}
-}
-
-// TestLargeScaleHybrid tests the hybrid approach with realistic large-scale scenarios
-func TestLargeScaleHybrid(t *testing.T) {
-	if testing.Short() {
-		t.Skip("Skipping large-scale test in short mode")
-	}
-
-	tests := []struct {
-		name     string
-		elements uint64
-		fpr      float64
-		testOps  int
-	}{
-		{"Medium scale", 500_000, 0.01, 10_000},
-		{"Large scale", 5_000_000, 0.01, 10_000},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			bf := bloomfilter.NewCacheOptimizedBloomFilter(tt.elements, tt.fpr)
-
-			mode := "ARRAY"
-			if !bf.IsArrayMode() {
-				mode = "MAP"
-			}
-
-			t.Logf("Testing %s mode: %s elements, %d test operations",
-				mode, formatNumber(tt.elements), tt.testOps)
-
-			// Add elements
-			for i := 0; i < tt.testOps; i++ {
-				bf.AddUint64(uint64(i))
-			}
-
-			// Verify they're all found
-			errors := 0
-			for i := 0; i < tt.testOps; i++ {
-				if !bf.ContainsUint64(uint64(i)) {
-					errors++
-				}
-			}
-
-			if errors > 0 {
-				t.Errorf("%s mode: %d elements not found (should be 0)", mode, errors)
-			}
-
-			// Test false positive rate
-			falsePositives := 0
-			fpTests := 10000
-			for i := tt.testOps; i < tt.testOps+fpTests; i++ {
-				if bf.ContainsUint64(uint64(i)) {
-					falsePositives++
-				}
-			}
-
-			actualFPR := float64(falsePositives) / float64(fpTests)
-			t.Logf("✓ %s mode: FPR=%.4f (expected ~%.4f), errors=%d",
-				mode, actualFPR, tt.fpr, errors)
-
-			// Allow some margin for FPR
-			if actualFPR > tt.fpr*3 {
-				t.Errorf("%s mode: FPR too high: %.4f (expected ~%.4f)",
-					mode, actualFPR, tt.fpr)
-			}
-		})
-	}
-}
-
-// Helper functions
-func formatNumber(n uint64) string {
-	if n >= 1_000_000_000 {
-		return fmt.Sprintf("%.1fB", float64(n)/1_000_000_000)
-	}
-	if n >= 1_000_000 {
-		return fmt.Sprintf("%.1fM", float64(n)/1_000_000)
-	}
-	if n >= 1_000 {
-		return fmt.Sprintf("%.1fK", float64(n)/1_000)
-	}
-	return fmt.Sprintf("%d", n)
-}
-
-func formatBytes(b uint64) string {
-	if b >= 1024*1024*1024 {
-		return fmt.Sprintf("%.2f GB", float64(b)/(1024*1024*1024))
-	}
-	if b >= 1024*1024 {
-		return fmt.Sprintf("%.2f MB", float64(b)/(1024*1024))
-	}
-	if b >= 1024 {
-		return fmt.Sprintf("%.2f KB", float64(b)/1024)
-	}
-	return fmt.Sprintf("%d bytes", b)
-}
diff --git a/tests/integration/bloomfilter_stress_test.go b/tests/integration/bloomfilter_stress_test.go
deleted file mode 100644
index b6ad9ad..0000000
--- a/tests/integration/bloomfilter_stress_test.go
+++ /dev/null
@@ -1,378 +0,0 @@
-package bloomfilter_test
-
-import (
-	"fmt"
-	"runtime"
-	"testing"
-	"time"
-
-	bloomfilter "github.com/shaia/BloomFilter"
-)
-
-// TestLargeDatasetInsertion tests adding millions of keys
-func TestLargeDatasetInsertion(t *testing.T) {
-	if testing.Short() {
-		t.Skip("Skipping large dataset test in short mode")
-	}
-
-	tests := []struct {
-		name     string
-		elements uint64
-		fpr      float64
-		addCount int
-	}{
-		{
-			name:     "1 Million elements",
-			elements: 1_000_000,
-			fpr:      0.01,
-			addCount: 1_000_000,
-		},
-		{
-			name:     "5 Million elements",
-			elements: 5_000_000,
-			fpr:      0.01,
-			addCount: 5_000_000,
-		},
-		{
-			name:     "10 Million elements",
-			elements: 10_000_000,
-			fpr:      0.01,
-			addCount: 10_000_000,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			bf := bloomfilter.NewCacheOptimizedBloomFilter(tt.elements, tt.fpr)
-
-			startTime := time.Now()
-			startMem := getMemStats()
-
-			// Add elements
-			t.Logf("Adding %d elements...", tt.addCount)
-			for i := 0; i < tt.addCount; i++ {
-				key := fmt.Sprintf("key_%d", i)
-				bf.AddString(key)
-
-				// Progress indicator for very large datasets
-				if i > 0 && i%(tt.addCount/10) == 0 {
-					t.Logf("  Progress: %d%% (%d elements)", (i*100)/tt.addCount, i)
-				}
-			}
-
-			insertTime := time.Since(startTime)
-			endMem := getMemStats()
-
-			t.Logf("Insertion complete:")
-			t.Logf("  Time: %v", insertTime)
-			t.Logf("  Rate: %.0f ops/sec", float64(tt.addCount)/insertTime.Seconds())
-			t.Logf("  Memory used: %.2f MB", float64(endMem-startMem)/(1024*1024))
-
-			// Verify a sample of elements
-			t.Logf("Verifying sample of elements...")
-			sampleSize := 10000
-			if sampleSize > tt.addCount {
-				sampleSize = tt.addCount
-			}
-
-			notFound := 0
-			verifyStart := time.Now()
-
-			for i := 0; i < sampleSize; i++ {
-				// Sample evenly across the range
-				idx := (i * tt.addCount) / sampleSize
-				key := fmt.Sprintf("key_%d", idx)
-				if !bf.ContainsString(key) {
-					notFound++
-				}
-			}
-
-			verifyTime := time.Since(verifyStart)
-
-			if notFound > 0 {
-				t.Errorf("Failed to find %d out of %d sampled elements (%.2f%%)",
-					notFound, sampleSize, float64(notFound)*100/float64(sampleSize))
-			}
-
-			t.Logf("Verification complete:")
-			t.Logf("  Time: %v", verifyTime)
-			t.Logf("  Rate: %.0f lookups/sec", float64(sampleSize)/verifyTime.Seconds())
-			t.Logf("  Sample size: %d", sampleSize)
-			t.Logf("  All samples found: %v", notFound == 0)
-
-			// Check stats
-			stats := bf.GetCacheStats()
-			t.Logf("Filter stats:")
-			t.Logf("  Mode: %s", func() string {
-				if bf.IsArrayMode() {
-					return "ARRAY"
-				}
-				return "MAP"
-			}())
-			t.Logf("  Bits set: %d / %d (%.2f%%)", stats.BitsSet, stats.BitCount, stats.LoadFactor*100)
-			t.Logf("  Estimated FPP: %.4f%%", stats.EstimatedFPP*100)
-		})
-	}
-}
-
-// TestLongRunningStability tests filter behavior over extended use
-func TestLongRunningStability(t *testing.T) {
-	if testing.Short() {
-		t.Skip("Skipping long-running test in short mode")
-	}
-
-	bf := bloomfilter.NewCacheOptimizedBloomFilter(100_000, 0.01)
-
-	numCycles := 10
-	elementsPerCycle := 10000
-
-	t.Logf("Testing stability over %d cycles of %d elements each", numCycles, elementsPerCycle)
-
-	initialMem := getMemStats()
-
-	for cycle := 0; cycle < numCycles; cycle++ {
-		startMem := getMemStats()
-
-		// Add elements
-		for i := 0; i < elementsPerCycle; i++ {
-			key := fmt.Sprintf("cycle_%d_key_%d", cycle, i)
-			bf.AddString(key)
-		}
-
-		// Verify elements from this cycle
-		notFound := 0
-		for i := 0; i < elementsPerCycle; i++ {
-			key := fmt.Sprintf("cycle_%d_key_%d", cycle, i)
-			if !bf.ContainsString(key) {
-				notFound++
-			}
-		}
-
-		endMem := getMemStats()
-		cycleMem := endMem - startMem
-
-		if notFound > 0 {
-			t.Errorf("Cycle %d: failed to find %d elements", cycle, notFound)
-		}
-
-		t.Logf("Cycle %d: added %d elements, memory delta: %.2f MB",
-			cycle, elementsPerCycle, float64(cycleMem)/(1024*1024))
-	}
-
-	finalMem := getMemStats()
-	totalMemGrowth := finalMem - initialMem
-
-	stats := bf.GetCacheStats()
-	t.Logf("Stability test complete:")
-	t.Logf("  Total cycles: %d", numCycles)
-	t.Logf("  Total elements added: %d", numCycles*elementsPerCycle)
-	t.Logf("  Total memory growth: %.2f MB", float64(totalMemGrowth)/(1024*1024))
-	t.Logf("  Load factor: %.2f%%", stats.LoadFactor*100)
-	t.Logf("  Estimated FPP: %.4f%%", stats.EstimatedFPP*100)
-}
-
-// TestExtremeEdgeCases tests unusual input conditions
-func TestExtremeEdgeCases(t *testing.T) {
-	t.Run("Very small filter", func(t *testing.T) {
-		bf := bloomfilter.NewCacheOptimizedBloomFilter(10, 0.01)
-
-		// Add more elements than expected capacity
-		for i := 0; i < 100; i++ {
-			bf.AddString(fmt.Sprintf("key_%d", i))
-		}
-
-		// Verify all elements are found
-		notFound := 0
-		for i := 0; i < 100; i++ {
-			if !bf.ContainsString(fmt.Sprintf("key_%d", i)) {
-				notFound++
-			}
-		}
-
-		if notFound > 0 {
-			t.Errorf("Failed to find %d elements in overloaded small filter", notFound)
-		}
-
-		stats := bf.GetCacheStats()
-		t.Logf("Small filter stats: load=%.2f%%, estimated_fpp=%.4f%%",
-			stats.LoadFactor*100, stats.EstimatedFPP*100)
-	})
-
-	t.Run("Very long strings", func(t *testing.T) {
-		bf := bloomfilter.NewCacheOptimizedBloomFilter(1000, 0.01)
-
-		longStrings := []string{
-			string(make([]byte, 1024)),      // 1 KB
-			string(make([]byte, 10*1024)),   // 10 KB
-			string(make([]byte, 100*1024)),  // 100 KB
-		}
-
-		// Fill with unique data
-		for i, s := range longStrings {
-			data := []byte(s)
-			for j := range data {
-				data[j] = byte(i + j)
-			}
-			longStrings[i] = string(data)
-		}
-
-		// Add and verify
-		for i, s := range longStrings {
-			bf.AddString(s)
-			if !bf.ContainsString(s) {
-				t.Errorf("Failed to find long string %d (len=%d)", i, len(s))
-			}
-		}
-
-		t.Logf("Successfully handled %d long strings", len(longStrings))
-	})
-
-	t.Run("Empty and nil inputs", func(t *testing.T) {
-		bf := bloomfilter.NewCacheOptimizedBloomFilter(1000, 0.01)
-
-		// Empty string
-		bf.AddString("")
-		if !bf.ContainsString("") {
-			t.Error("Failed to find empty string")
-		}
-
-		// Empty byte slice
-		bf.Add([]byte{})
-		if !bf.Contains([]byte{}) {
-			t.Error("Failed to find empty byte slice")
-		}
-
-		// Zero value
-		bf.AddUint64(0)
-		if !bf.ContainsUint64(0) {
-			t.Error("Failed to find uint64 zero value")
-		}
-
-		t.Log("Empty and zero value inputs handled correctly")
-	})
-
-	t.Run("Extreme FPR values", func(t *testing.T) {
-		// Very low FPR
-		bf1 := bloomfilter.NewCacheOptimizedBloomFilter(1000, 0.0001)
-		stats1 := bf1.GetCacheStats()
-		t.Logf("Low FPR filter: bits=%d, hash_count=%d", stats1.BitCount, stats1.HashCount)
-
-		// High FPR (not recommended but should work)
-		bf2 := bloomfilter.NewCacheOptimizedBloomFilter(1000, 0.5)
-		stats2 := bf2.GetCacheStats()
-		t.Logf("High FPR filter: bits=%d, hash_count=%d", stats2.BitCount, stats2.HashCount)
-
-		// Both should work
-		bf1.AddString("test")
-		bf2.AddString("test")
-
-		if !bf1.ContainsString("test") || !bf2.ContainsString("test") {
-			t.Error("Extreme FPR filters failed basic operations")
-		}
-	})
-}
-
-// TestHighThroughputSequential tests sequential high-throughput operations
-func TestHighThroughputSequential(t *testing.T) {
-	if testing.Short() {
-		t.Skip("Skipping high throughput test in short mode")
-	}
-
-	bf := bloomfilter.NewCacheOptimizedBloomFilter(1_000_000, 0.01)
-
-	numOperations := 1_000_000
-
-	// Test insert throughput
-	t.Logf("Testing insert throughput...")
-	startTime := time.Now()
-	for i := 0; i < numOperations; i++ {
-		bf.AddUint64(uint64(i))
-	}
-	insertDuration := time.Since(startTime)
-	insertRate := float64(numOperations) / insertDuration.Seconds()
-
-	t.Logf("Insert performance:")
-	t.Logf("  Operations: %d", numOperations)
-	t.Logf("  Time: %v", insertDuration)
-	t.Logf("  Rate: %.0f ops/sec", insertRate)
-
-	// Test lookup throughput
-	t.Logf("Testing lookup throughput...")
-	startTime = time.Now()
-	for i := 0; i < numOperations; i++ {
-		_ = bf.ContainsUint64(uint64(i))
-	}
-	lookupDuration := time.Since(startTime)
-	lookupRate := float64(numOperations) / lookupDuration.Seconds()
-
-	t.Logf("Lookup performance:")
-	t.Logf("  Operations: %d", numOperations)
-	t.Logf("  Time: %v", lookupDuration)
-	t.Logf("  Rate: %.0f ops/sec", lookupRate)
-
-	// Lookup should be faster than or similar to insert
-	if lookupRate < insertRate*0.5 {
-		t.Logf("Warning: Lookup rate (%.0f) is significantly slower than insert rate (%.0f)",
-			lookupRate, insertRate)
-	}
-}
-
-// TestMemoryFootprintGrowth tests memory usage patterns
-func TestMemoryFootprintGrowth(t *testing.T) {
-	if testing.Short() {
-		t.Skip("Skipping memory footprint test in short mode")
-	}
-
-	tests := []struct {
-		name     string
-		elements uint64
-		fpr      float64
-	}{
-		{"Small (10K)", 10_000, 0.01},
-		{"Medium (100K)", 100_000, 0.01},
-		{"Large (1M)", 1_000_000, 0.01},
-		{"Very large (10M)", 10_000_000, 0.01},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			runtime.GC()
-			beforeMem := getMemStats()
-
-			bf := bloomfilter.NewCacheOptimizedBloomFilter(tt.elements, tt.fpr)
-			stats := bf.GetCacheStats()
-
-			runtime.GC()
-			afterMem := getMemStats()
-
-			actualMem := afterMem - beforeMem
-			expectedMem := stats.MemoryUsage
-
-			t.Logf("Memory footprint for %d elements:", tt.elements)
-			t.Logf("  Expected (from stats): %.2f MB", float64(expectedMem)/(1024*1024))
-			t.Logf("  Actual (measured): %.2f MB", float64(actualMem)/(1024*1024))
-			t.Logf("  Mode: %s", func() string {
-				if bf.IsArrayMode() {
-					return "ARRAY"
-				}
-				return "MAP"
-			}())
-			t.Logf("  Cache lines: %d", stats.CacheLineCount)
-
-			// Measured memory may include Go runtime overhead
-			// Allow some deviation
-			if actualMem > expectedMem*2 {
-				t.Logf("Warning: Actual memory (%.2f MB) significantly exceeds expected (%.2f MB)",
-					float64(actualMem)/(1024*1024), float64(expectedMem)/(1024*1024))
-			}
-		})
-	}
-}
-
-// getMemStats returns current memory allocation in bytes
-func getMemStats() uint64 {
-	runtime.GC() // Force GC to get more accurate reading
-	var m runtime.MemStats
-	runtime.ReadMemStats(&m)
-	return m.Alloc
-}