diff --git a/.vscode/launch.json b/.vscode/launch.json index 2decad3c..e9505d8c 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -26,7 +26,7 @@ "mode": "debug", "program": "${workspaceFolder}/flashring/cmd/flashringtest", "env": { - "PLAN": "readthrough-batched" + "PLAN": "readthrough" } } diff --git a/flashring/cmd/flashringtest/plan_badger.go b/flashring/cmd/flashringtest/plan_badger.go index 4ba266d4..1e06f8fa 100644 --- a/flashring/cmd/flashringtest/plan_badger.go +++ b/flashring/cmd/flashringtest/plan_badger.go @@ -10,7 +10,7 @@ import ( "strings" "sync" - cachepkg "github.com/Meesho/BharatMLStack/flashring/internal/cache" + cachepkg "github.com/Meesho/BharatMLStack/flashring/pkg/cache" "github.com/rs/zerolog" "github.com/rs/zerolog/log" ) diff --git a/flashring/cmd/flashringtest/plan_freecache.go b/flashring/cmd/flashringtest/plan_freecache.go index 0fe6a297..be46daf9 100644 --- a/flashring/cmd/flashringtest/plan_freecache.go +++ b/flashring/cmd/flashringtest/plan_freecache.go @@ -11,7 +11,7 @@ import ( "strings" "sync" - cachepkg "github.com/Meesho/BharatMLStack/flashring/internal/cache" + cachepkg "github.com/Meesho/BharatMLStack/flashring/pkg/cache" "github.com/rs/zerolog" "github.com/rs/zerolog/log" ) diff --git a/flashring/cmd/flashringtest/plan_lockless.go b/flashring/cmd/flashringtest/plan_lockless.go index e946c9af..ea7f8ede 100644 --- a/flashring/cmd/flashringtest/plan_lockless.go +++ b/flashring/cmd/flashringtest/plan_lockless.go @@ -13,7 +13,7 @@ import ( "sync" "time" - cachepkg "github.com/Meesho/BharatMLStack/flashring/internal/cache" + cachepkg "github.com/Meesho/BharatMLStack/flashring/pkg/cache" "github.com/rs/zerolog" "github.com/rs/zerolog/log" ) @@ -35,13 +35,13 @@ func planLockless() { cpuProfile string ) - flag.StringVar(&mountPoint, "mount", "/media/a0d00kc/trishul/", "data directory for shard files") - flag.IntVar(&numShards, "shards", 500, "number of shards") - flag.IntVar(&keysPerShard, "keys-per-shard", 10_00_00, "keys 
per shard") - flag.IntVar(&memtableMB, "memtable-mb", 16, "memtable size in MiB") + flag.StringVar(&mountPoint, "mount", "/mnt/disks/nvme", "data directory for shard files") + flag.IntVar(&numShards, "shards", 100, "number of shards") + flag.IntVar(&keysPerShard, "keys-per-shard", 3_00_000, "keys per shard") + flag.IntVar(&memtableMB, "memtable-mb", 2, "memtable size in MiB") flag.IntVar(&fileSizeMultiplier, "file-size-multiplier", 2, "file size in GiB per shard") - flag.IntVar(&readWorkers, "readers", 8, "number of read workers") - flag.IntVar(&writeWorkers, "writers", 8, "number of write workers") + flag.IntVar(&readWorkers, "readers", 16, "number of read workers") + flag.IntVar(&writeWorkers, "writers", 16, "number of write workers") flag.IntVar(&sampleSecs, "sample-secs", 30, "predictor sampling window in seconds") flag.Int64Var(&iterations, "iterations", 100_000_000, "number of iterations") flag.Float64Var(&aVal, "a", 0.4, "a value for the predictor") @@ -84,7 +84,7 @@ func planLockless() { } memtableSizeInBytes := int32(memtableMB) * 1024 * 1024 - fileSizeInBytes := int64(fileSizeMultiplier) * int64(memtableSizeInBytes) + fileSizeInBytes := int64(fileSizeMultiplier) * 1024 * 1024 * 1024 // fileSizeMultiplier in GiB cfg := cachepkg.WrapCacheConfig{ NumShards: numShards, @@ -95,21 +95,11 @@ func planLockless() { GridSearchEpsilon: 0.0001, SampleDuration: time.Duration(sampleSecs) * time.Second, - // Pass the metrics collector to record cache metrics - MetricsRecorder: InitMetricsCollector(), + //lockless mode for PutLL/GetLL + EnableLockless: true, } - // Set additional input parameters that the cache doesn't know about - metricsCollector.SetShards(numShards) - metricsCollector.SetKeysPerShard(keysPerShard) - metricsCollector.SetReadWorkers(readWorkers) - metricsCollector.SetWriteWorkers(writeWorkers) - metricsCollector.SetPlan("lockless") - - // Start background goroutine to wait for shutdown signal and export CSV - go RunmetricsWaitForShutdown() - - pc, err 
:= cachepkg.NewWrapCache(cfg, mountPoint, logStats) + pc, err := cachepkg.NewWrapCache(cfg, mountPoint) if err != nil { panic(err) } @@ -121,7 +111,7 @@ func planLockless() { missedKeyChanList[i] = make(chan int) } - totalKeys := keysPerShard * numShards + totalKeys := 30_000_000 str1kb := strings.Repeat("a", 1024) str1kb = "%d" + str1kb diff --git a/flashring/cmd/flashringtest/plan_random_gausian.go b/flashring/cmd/flashringtest/plan_random_gausian.go index 3fbaf849..f906e320 100644 --- a/flashring/cmd/flashringtest/plan_random_gausian.go +++ b/flashring/cmd/flashringtest/plan_random_gausian.go @@ -12,7 +12,7 @@ import ( "sync" "time" - cachepkg "github.com/Meesho/BharatMLStack/flashring/internal/cache" + cachepkg "github.com/Meesho/BharatMLStack/flashring/pkg/cache" "github.com/rs/zerolog" "github.com/rs/zerolog/log" ) @@ -83,7 +83,7 @@ func planRandomGaussian() { } memtableSizeInBytes := int32(memtableMB) * 1024 * 1024 - fileSizeInBytes := int64(fileSizeMultiplier) * int64(memtableSizeInBytes) + fileSizeInBytes := int64(fileSizeMultiplier) * 1024 * 1024 * 1024 // fileSizeMultiplier in GiB cfg := cachepkg.WrapCacheConfig{ NumShards: numShards, @@ -95,7 +95,7 @@ func planRandomGaussian() { SampleDuration: time.Duration(sampleSecs) * time.Second, } - pc, err := cachepkg.NewWrapCache(cfg, mountPoint, logStats) + pc, err := cachepkg.NewWrapCache(cfg, mountPoint) if err != nil { panic(err) } diff --git a/flashring/cmd/flashringtest/plan_readthrough_gausian.go b/flashring/cmd/flashringtest/plan_readthrough_gausian.go index 56c6da3d..a311d8f6 100644 --- a/flashring/cmd/flashringtest/plan_readthrough_gausian.go +++ b/flashring/cmd/flashringtest/plan_readthrough_gausian.go @@ -13,7 +13,7 @@ import ( "sync" "time" - cachepkg "github.com/Meesho/BharatMLStack/flashring/internal/cache" + cachepkg "github.com/Meesho/BharatMLStack/flashring/pkg/cache" "github.com/rs/zerolog" "github.com/rs/zerolog/log" ) @@ -24,7 +24,7 @@ func planReadthroughGaussian() { numShards int 
keysPerShard int memtableMB int - fileSizeMultiplier int + fileSizeMultiplier float64 readWorkers int writeWorkers int sampleSecs int @@ -35,13 +35,13 @@ func planReadthroughGaussian() { cpuProfile string ) - flag.StringVar(&mountPoint, "mount", "/media/a0d00kc/trishul/", "data directory for shard files") - flag.IntVar(&numShards, "shards", 500, "number of shards") - flag.IntVar(&keysPerShard, "keys-per-shard", 4_00_00, "keys per shard") - flag.IntVar(&memtableMB, "memtable-mb", 16, "memtable size in MiB") - flag.IntVar(&fileSizeMultiplier, "file-size-multiplier", 2, "file size in GiB per shard") - flag.IntVar(&readWorkers, "readers", 8, "number of read workers") - flag.IntVar(&writeWorkers, "writers", 8, "number of write workers") + flag.StringVar(&mountPoint, "mount", "/mnt/disks/nvme/", "data directory for shard files") + flag.IntVar(&numShards, "shards", 50, "number of shards") + flag.IntVar(&keysPerShard, "keys-per-shard", 6_00_000, "keys per shard") + flag.IntVar(&memtableMB, "memtable-mb", 2, "memtable size in MiB") + flag.Float64Var(&fileSizeMultiplier, "file-size-multiplier", 0.25, "file size in GiB per shard") + flag.IntVar(&readWorkers, "readers", 16, "number of read workers") + flag.IntVar(&writeWorkers, "writers", 16, "number of write workers") flag.IntVar(&sampleSecs, "sample-secs", 30, "predictor sampling window in seconds") flag.Int64Var(&iterations, "iterations", 100_000_000, "number of iterations") flag.Float64Var(&aVal, "a", 0.4, "a value for the predictor") @@ -84,7 +84,7 @@ func planReadthroughGaussian() { } memtableSizeInBytes := int32(memtableMB) * 1024 * 1024 - fileSizeInBytes := int64(fileSizeMultiplier) * int64(memtableSizeInBytes) + fileSizeInBytes := int64(float64(fileSizeMultiplier) * 1024 * 1024 * 1024) // fileSizeMultiplier in GiB cfg := cachepkg.WrapCacheConfig{ NumShards: numShards, @@ -94,22 +94,9 @@ func planReadthroughGaussian() { ReWriteScoreThreshold: 0.8, GridSearchEpsilon: 0.0001, SampleDuration: time.Duration(sampleSecs) * 
time.Second, - - // Pass the metrics collector to record cache metrics - MetricsRecorder: InitMetricsCollector(), } - // Set additional input parameters that the cache doesn't know about - metricsCollector.SetShards(numShards) - metricsCollector.SetKeysPerShard(keysPerShard) - metricsCollector.SetReadWorkers(readWorkers) - metricsCollector.SetWriteWorkers(writeWorkers) - metricsCollector.SetPlan("readthrough") - - // Start background goroutine to wait for shutdown signal and export CSV - go RunmetricsWaitForShutdown() - - pc, err := cachepkg.NewWrapCache(cfg, mountPoint, logStats) + pc, err := cachepkg.NewWrapCache(cfg, mountPoint) if err != nil { panic(err) } @@ -121,7 +108,7 @@ func planReadthroughGaussian() { missedKeyChanList[i] = make(chan int) } - totalKeys := keysPerShard * numShards + totalKeys := 30_000_000 str1kb := strings.Repeat("a", 1024) str1kb = "%d" + str1kb @@ -139,7 +126,7 @@ func planReadthroughGaussian() { key := fmt.Sprintf("key%d", k) val := []byte(fmt.Sprintf(str1kb, k)) if err := pc.Put(key, val, 60); err != nil { - panic(err) + log.Error().Err(err).Msgf("error putting key %s", key) } if k%5000000 == 0 { fmt.Printf("----------------------------------------------prepopulated %d keys\n", k) @@ -158,7 +145,7 @@ func planReadthroughGaussian() { key := fmt.Sprintf("key%d", mk) val := []byte(fmt.Sprintf(str1kb, mk)) if err := pc.Put(key, val, 60); err != nil { - panic(err) + log.Error().Err(err).Msgf("error putting key %s", key) } } }(w) @@ -183,13 +170,14 @@ func planReadthroughGaussian() { } if expired { - panic("key expired") + log.Error().Msgf("key %s expired", key) + // panic("key expired") } if found && string(val) != fmt.Sprintf(str1kb, randomval) { panic("value mismatch") } - if k%5000000 == 0 { + if k%50000 == 0 { fmt.Printf("----------------------------------------------read %d keys %d readerid\n", k, workerID) } } diff --git a/flashring/cmd/flashringtest/plan_readthrough_gausian_batched.go 
b/flashring/cmd/flashringtest/plan_readthrough_gausian_batched.go index fd33e06a..756e0d9b 100644 --- a/flashring/cmd/flashringtest/plan_readthrough_gausian_batched.go +++ b/flashring/cmd/flashringtest/plan_readthrough_gausian_batched.go @@ -13,7 +13,7 @@ import ( "sync" "time" - cachepkg "github.com/Meesho/BharatMLStack/flashring/internal/cache" + cachepkg "github.com/Meesho/BharatMLStack/flashring/pkg/cache" "github.com/rs/zerolog" "github.com/rs/zerolog/log" ) @@ -93,7 +93,7 @@ func planReadthroughGaussianBatched() { } memtableSizeInBytes := int32(memtableMB) * 1024 * 1024 - fileSizeInBytes := int64(fileSizeMultiplier) * int64(memtableSizeInBytes) + fileSizeInBytes := int64(fileSizeMultiplier) * 1024 * 1024 * 1024 // fileSizeMultiplier in GiB cfg := cachepkg.WrapCacheConfig{ NumShards: numShards, @@ -108,22 +108,9 @@ func planReadthroughGaussianBatched() { EnableBatching: enableBatching, BatchWindowMicros: batchWindowMicros, MaxBatchSize: maxBatchSize, - - // Pass the metrics collector to record cache metrics - MetricsRecorder: InitMetricsCollector(), } - // Set additional input parameters that the cache doesn't know about - metricsCollector.SetShards(numShards) - metricsCollector.SetKeysPerShard(keysPerShard) - metricsCollector.SetReadWorkers(readWorkers) - metricsCollector.SetWriteWorkers(writeWorkers) - metricsCollector.SetPlan("readthrough-batched") - - // Start background goroutine to wait for shutdown signal and export CSV - go RunmetricsWaitForShutdown() - - pc, err := cachepkg.NewWrapCache(cfg, mountPoint, logStats) + pc, err := cachepkg.NewWrapCache(cfg, mountPoint) if err != nil { panic(err) } diff --git a/flashring/cmd/flashringtest/runmetrics.go b/flashring/cmd/flashringtest/runmetrics.go deleted file mode 100644 index 5e1aabec..00000000 --- a/flashring/cmd/flashringtest/runmetrics.go +++ /dev/null @@ -1,515 +0,0 @@ -package main - -import ( - "bufio" - "encoding/csv" - "fmt" - "log" - "os" - "os/signal" - "runtime" - "strconv" - "strings" - "sync" - 
"syscall" - "time" -) - -// Define your parameter structure -type RunMetrics struct { - // Input Parameters - Shards int - KeysPerShard int - ReadWorkers int - WriteWorkers int - Plan string - - // Observation Parameters - RP99 time.Duration - RP50 time.Duration - RP25 time.Duration - WP99 time.Duration - WP50 time.Duration - WP25 time.Duration - RThroughput float64 - WThroughput float64 - HitRate float64 - CPUUsage float64 - MemoryUsage float64 -} - -// MetricChannels holds separate channels for each metric type -type MetricChannels struct { - RP99 chan time.Duration - RP50 chan time.Duration - RP25 chan time.Duration - WP99 chan time.Duration - WP50 chan time.Duration - WP25 chan time.Duration - RThroughput chan float64 - WThroughput chan float64 - HitRate chan float64 - CPUUsage chan float64 - MemoryUsage chan float64 -} - -// MetricAverager maintains running averages for a metric -type MetricAverager struct { - mu sync.RWMutex - sum float64 - count int64 - lastValue float64 -} - -func (ma *MetricAverager) Add(value float64) { - if value == 0 { - return // Ignore zero values - } - ma.mu.Lock() - defer ma.mu.Unlock() - ma.sum += value - ma.count++ - ma.lastValue = value -} - -func (ma *MetricAverager) AddDuration(value time.Duration) { - if value == 0 { - return // Ignore zero values - } - ma.mu.Lock() - defer ma.mu.Unlock() - ma.sum += float64(value) - ma.count++ -} - -func (ma *MetricAverager) Average() float64 { - ma.mu.RLock() - defer ma.mu.RUnlock() - if ma.count == 0 { - return 0 - } - return ma.sum / float64(ma.count) -} - -func (ma *MetricAverager) Latest() float64 { - ma.mu.RLock() - defer ma.mu.RUnlock() - return ma.lastValue -} - -func (ma *MetricAverager) Reset() { - ma.mu.Lock() - defer ma.mu.Unlock() - ma.sum = 0 - ma.count = 0 -} - -// MetricsCollector collects and averages all metrics -type MetricsCollector struct { - channels MetricChannels - averagers map[string]*MetricAverager - stopCh chan struct{} - wg sync.WaitGroup - - // Input parameters 
(set once) - Shards int - KeysPerShard int - ReadWorkers int - WriteWorkers int - Plan string -} - -// NewMetricsCollector creates a new metrics collector with channels -func NewMetricsCollector(bufferSize int) *MetricsCollector { - mc := &MetricsCollector{ - channels: MetricChannels{ - RP99: make(chan time.Duration, bufferSize), - RP50: make(chan time.Duration, bufferSize), - RP25: make(chan time.Duration, bufferSize), - WP99: make(chan time.Duration, bufferSize), - WP50: make(chan time.Duration, bufferSize), - WP25: make(chan time.Duration, bufferSize), - RThroughput: make(chan float64, bufferSize), - WThroughput: make(chan float64, bufferSize), - HitRate: make(chan float64, bufferSize), - CPUUsage: make(chan float64, bufferSize), - MemoryUsage: make(chan float64, bufferSize), - }, - averagers: make(map[string]*MetricAverager), - stopCh: make(chan struct{}), - } - - // Initialize averagers for each metric - metricNames := []string{"RThroughput", "RP99", "RP50", "RP25", "WThroughput", "WP99", "WP50", "WP25", "HitRate", "CPUUsage", "MemoryUsage"} - for _, name := range metricNames { - mc.averagers[name] = &MetricAverager{} - } - - return mc -} - -// Start begins collecting metrics from all channels -func (mc *MetricsCollector) Start() { - // Start a goroutine for each metric channel - mc.wg.Add(11) - - go mc.collectMetricDuration(mc.channels.RP99, "RP99") - go mc.collectMetricDuration(mc.channels.RP50, "RP50") - go mc.collectMetricDuration(mc.channels.RP25, "RP25") - go mc.collectMetricDuration(mc.channels.WP99, "WP99") - go mc.collectMetricDuration(mc.channels.WP50, "WP50") - go mc.collectMetricDuration(mc.channels.WP25, "WP25") - go mc.collectMetric(mc.channels.RThroughput, "RThroughput") - go mc.collectMetric(mc.channels.WThroughput, "WThroughput") - go mc.collectMetric(mc.channels.HitRate, "HitRate") - go mc.collectMetric(mc.channels.CPUUsage, "CPUUsage") - go mc.collectMetric(mc.channels.MemoryUsage, "MemoryUsage") -} - -func (mc *MetricsCollector) 
collectMetric(ch chan float64, name string) { - defer mc.wg.Done() - for { - select { - case <-mc.stopCh: - return - case value, ok := <-ch: - if !ok { - return - } - mc.averagers[name].Add(value) - } - } -} - -func (mc *MetricsCollector) collectMetricDuration(ch chan time.Duration, name string) { - defer mc.wg.Done() - for { - select { - case <-mc.stopCh: - return - case value, ok := <-ch: - if !ok { - return - } - mc.averagers[name].AddDuration(value) - } - } -} - -// RecordRP99 sends a value to the RP99 channel -func (mc *MetricsCollector) RecordRP99(value time.Duration) { - select { - case mc.channels.RP99 <- value: - default: // Don't block if channel is full - } -} - -// RecordRP50 sends a value to the RP50 channel -func (mc *MetricsCollector) RecordRP50(value time.Duration) { - select { - case mc.channels.RP50 <- value: - default: - } -} - -// RecordRP25 sends a value to the RP25 channel -func (mc *MetricsCollector) RecordRP25(value time.Duration) { - select { - case mc.channels.RP25 <- value: - default: - } -} - -// RecordWP99 sends a value to the WP99 channel -func (mc *MetricsCollector) RecordWP99(value time.Duration) { - select { - case mc.channels.WP99 <- value: - default: - } -} - -// RecordWP50 sends a value to the WP50 channel -func (mc *MetricsCollector) RecordWP50(value time.Duration) { - select { - case mc.channels.WP50 <- value: - default: - } -} - -// RecordWP25 sends a value to the WP25 channel -func (mc *MetricsCollector) RecordWP25(value time.Duration) { - select { - case mc.channels.WP25 <- value: - default: - } -} - -// RecordRThroughput sends a value to the RThroughput channel -func (mc *MetricsCollector) RecordRThroughput(value float64) { - select { - case mc.channels.RThroughput <- value: - default: - } -} - -// RecordWThroughput sends a value to the WThroughput channel -func (mc *MetricsCollector) RecordWThroughput(value float64) { - select { - case mc.channels.WThroughput <- value: - default: - } -} - -// RecordHitRate sends a value to 
the HitRate channel -func (mc *MetricsCollector) RecordHitRate(value float64) { - select { - case mc.channels.HitRate <- value: - default: - } -} - -// GetAveragedMetrics returns the current averaged metrics -func (mc *MetricsCollector) GetAveragedMetrics() RunMetrics { - return RunMetrics{ - Shards: mc.Shards, - KeysPerShard: mc.KeysPerShard, - ReadWorkers: mc.ReadWorkers, - WriteWorkers: mc.WriteWorkers, - Plan: mc.Plan, - RP99: time.Duration(mc.averagers["RP99"].Average()), - RP50: time.Duration(mc.averagers["RP50"].Average()), - RP25: time.Duration(mc.averagers["RP25"].Average()), - WP99: time.Duration(mc.averagers["WP99"].Average()), - WP50: time.Duration(mc.averagers["WP50"].Average()), - WP25: time.Duration(mc.averagers["WP25"].Average()), - RThroughput: mc.averagers["RThroughput"].Latest(), - WThroughput: mc.averagers["WThroughput"].Latest(), - HitRate: mc.averagers["HitRate"].Average(), - CPUUsage: mc.averagers["CPUUsage"].Average(), - MemoryUsage: mc.averagers["MemoryUsage"].Average(), - } -} - -// ResetAverages resets all averagers to start fresh -func (mc *MetricsCollector) ResetAverages() { - for _, avg := range mc.averagers { - avg.Reset() - } -} - -// Stop stops all collector goroutines -func (mc *MetricsCollector) Stop() { - close(mc.stopCh) - mc.wg.Wait() -} - -// SetShards sets the number of shards (input parameter) -func (mc *MetricsCollector) SetShards(value int) { - mc.Shards = value -} - -// SetKeysPerShard sets the keys per shard (input parameter) -func (mc *MetricsCollector) SetKeysPerShard(value int) { - mc.KeysPerShard = value -} - -// SetReadWorkers sets the number of read workers (input parameter) -func (mc *MetricsCollector) SetReadWorkers(value int) { - mc.ReadWorkers = value -} - -// SetWriteWorkers sets the number of write workers (input parameter) -func (mc *MetricsCollector) SetWriteWorkers(value int) { - mc.WriteWorkers = value -} - -// SetPlan sets the plan name (input parameter) -func (mc *MetricsCollector) SetPlan(value string) 
{ - mc.Plan = value -} - -// Global variable to hold runtime data -var currentMetrics RunMetrics -var metricsCollector *MetricsCollector - -// --- CSV Configuration --- -const CSVFileName = "performance_results.csv" - -// InitMetricsCollector creates and starts the metrics collector, returning it -// so it can be passed to other components (e.g., cache config) -func InitMetricsCollector() *MetricsCollector { - metricsCollector = NewMetricsCollector(100) - metricsCollector.Start() - return metricsCollector -} - -// RunmetricsWaitForShutdown waits for shutdown signal and logs final metrics to CSV -func RunmetricsWaitForShutdown() { - // --- Set up Signal Handling --- - stopChan := make(chan os.Signal, 1) - signal.Notify(stopChan, syscall.SIGINT, syscall.SIGTERM) - - fmt.Println("Program running. Press Ctrl+C to stop and log results to CSV...") - - // --- Wait for Stop Signal --- - <-stopChan - fmt.Println("\nTermination signal received. Stopping work and logging results...") - - // Stop the metrics collector - if metricsCollector != nil { - metricsCollector.Stop() - - // Get final averaged metrics - currentMetrics = metricsCollector.GetAveragedMetrics() - } - - // Get memory usage and CPU usage at this instant - currentMetrics.MemoryUsage = getMemoryUsageMB() - currentMetrics.CPUUsage = getCPUUsagePercent() - - // --- Log Data to CSV --- - if err := logResultsToCSV(); err != nil { - log.Fatalf("FATAL: Failed to log results to CSV: %v", err) - } - - fmt.Printf("Successfully logged results to %s.\n", CSVFileName) - - // Exit the program since we're running in a goroutine - os.Exit(0) -} - -// RunmetricsInit initializes metrics and waits for shutdown (convenience function) -func RunmetricsInit() { - InitMetricsCollector() - RunmetricsWaitForShutdown() -} - -func logResultsToCSV() error { - // 1. Check if the file exists to determine if we need a header row. 
- file, err := os.OpenFile(CSVFileName, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) - if err != nil { - return fmt.Errorf("failed to open CSV file: %w", err) - } - defer file.Close() - - writer := csv.NewWriter(file) - defer writer.Flush() // Crucial to ensure data is written to the file before exiting. - - // The list of all your column headers - header := []string{ - "SHARDS", "KEYS_PER_SHARD", "READ_WORKERS", "WRITE_WORKERS", "PLAN", - "R_THROUGHPUT", "R_P99", "R_P50", "R_P25", "W_THROUGHPUT", "W_P99", "W_P50", "W_P25", - "HIT_RATE", "CPU", "MEMORY", "TIME", - } - - // Determine if the file is new (or empty) and needs the header - fileInfo, _ := file.Stat() - if fileInfo.Size() == 0 { - if err := writer.Write(header); err != nil { - return fmt.Errorf("error writing CSV header: %w", err) - } - } - - // Convert your struct fields into a slice of strings for the CSV writer - dataRow := []string{ - // Input Parameters - strconv.Itoa(currentMetrics.Shards), - strconv.Itoa(currentMetrics.KeysPerShard), - strconv.Itoa(currentMetrics.ReadWorkers), // Convert int to string - strconv.Itoa(currentMetrics.WriteWorkers), - currentMetrics.Plan, - - // Observation Parameters (convert floats to strings) - fmt.Sprintf("%v", currentMetrics.RThroughput), - fmt.Sprintf("%v", currentMetrics.RP99), - fmt.Sprintf("%v", currentMetrics.RP50), - fmt.Sprintf("%v", currentMetrics.RP25), - - fmt.Sprintf("%v", currentMetrics.WThroughput), - fmt.Sprintf("%v", currentMetrics.WP99), - fmt.Sprintf("%v", currentMetrics.WP50), - fmt.Sprintf("%v", currentMetrics.WP25), - - fmt.Sprintf("%v", currentMetrics.HitRate), - fmt.Sprintf("%v", currentMetrics.CPUUsage), - fmt.Sprintf("%v", currentMetrics.MemoryUsage), - fmt.Sprintf("%v", time.Now().In(time.FixedZone("IST", 5*60*60+30*60)).Format("2006-01-02 15:04:05")), - } - - if err := writer.Write(dataRow); err != nil { - return fmt.Errorf("error writing CSV data row: %w", err) - } - - return nil -} - -// getMemoryUsageMB returns the current memory usage 
of this process in MB -func getMemoryUsageMB() float64 { - var m runtime.MemStats - runtime.ReadMemStats(&m) - // Alloc is bytes of allocated heap objects - return float64(m.Alloc) / 1024 / 1024 -} - -// getSystemMemoryUsageMB returns the total system memory used by this process in MB -func getSystemMemoryUsageMB() float64 { - var m runtime.MemStats - runtime.ReadMemStats(&m) - // Sys is the total bytes of memory obtained from the OS - return float64(m.Sys) / 1024 / 1024 -} - -// getCPUUsagePercent returns the CPU usage percentage for this process -// It measures CPU usage over a short interval -func getCPUUsagePercent() float64 { - // Read initial CPU stats - idle1, total1 := getCPUStats() - time.Sleep(100 * time.Millisecond) - // Read CPU stats again - idle2, total2 := getCPUStats() - - idleDelta := float64(idle2 - idle1) - totalDelta := float64(total2 - total1) - - if totalDelta == 0 { - return 0 - } - - cpuUsage := (1.0 - idleDelta/totalDelta) * 100.0 - return cpuUsage -} - -// getCPUStats reads /proc/stat and returns idle and total CPU time -func getCPUStats() (idle, total uint64) { - file, err := os.Open("/proc/stat") - if err != nil { - return 0, 0 - } - defer file.Close() - - scanner := bufio.NewScanner(file) - for scanner.Scan() { - line := scanner.Text() - if strings.HasPrefix(line, "cpu ") { - fields := strings.Fields(line) - if len(fields) < 5 { - return 0, 0 - } - // fields: cpu user nice system idle iowait irq softirq steal guest guest_nice - var values []uint64 - for _, field := range fields[1:] { - val, err := strconv.ParseUint(field, 10, 64) - if err != nil { - continue - } - values = append(values, val) - total += val - } - if len(values) >= 4 { - idle = values[3] // idle is the 4th value - } - break - } - } - return idle, total -} diff --git a/flashring/go.mod b/flashring/go.mod index f02d9663..206adab3 100644 --- a/flashring/go.mod +++ b/flashring/go.mod @@ -13,7 +13,23 @@ require ( ) require ( - github.com/dgraph-io/badger/v4 v4.9.0 // indirect 
+ github.com/Microsoft/go-winio v0.5.0 // indirect + github.com/fsnotify/fsnotify v1.9.0 // indirect + github.com/go-viper/mapstructure/v2 v2.4.0 // indirect + github.com/pelletier/go-toml/v2 v2.2.4 // indirect + github.com/sagikazarmark/locafero v0.11.0 // indirect + github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8 // indirect + github.com/spf13/afero v1.15.0 // indirect + github.com/spf13/cast v1.10.0 // indirect + github.com/spf13/pflag v1.0.10 // indirect + github.com/subosito/gotenv v1.6.0 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/text v0.28.0 // indirect +) + +require ( + github.com/DataDog/datadog-go/v5 v5.8.2 + github.com/dgraph-io/badger/v4 v4.9.0 github.com/dgraph-io/ristretto/v2 v2.2.0 // indirect github.com/dustin/go-humanize v1.0.1 // indirect github.com/go-logr/logr v1.4.3 // indirect @@ -23,6 +39,7 @@ require ( github.com/klauspost/cpuid/v2 v2.3.0 // indirect github.com/mattn/go-colorable v0.1.14 // indirect github.com/mattn/go-isatty v0.0.20 // indirect + github.com/spf13/viper v1.21.0 go.opentelemetry.io/auto/sdk v1.1.0 // indirect go.opentelemetry.io/otel v1.37.0 // indirect go.opentelemetry.io/otel/metric v1.37.0 // indirect diff --git a/flashring/go.sum b/flashring/go.sum index 6c22ab66..5d69f8d2 100644 --- a/flashring/go.sum +++ b/flashring/go.sum @@ -1,42 +1,92 @@ +github.com/DataDog/datadog-go/v5 v5.8.2 h1:9IEfH1Mw9AjWwhAMqCAkhbxjuJeMxm2ARX2VdgL+ols= +github.com/DataDog/datadog-go/v5 v5.8.2/go.mod h1:K9kcYBlxkcPP8tvvjZZKs/m1edNAUFzBbdpTUKfCsuw= +github.com/Microsoft/go-winio v0.5.0 h1:Elr9Wn+sGKPlkaBvwu4mTrxtmOp3F3yV9qhaHbXGjwU= +github.com/Microsoft/go-winio v0.5.0/go.mod h1:JPGBdM1cNvN/6ISo+n8V5iA4v8pBzdOpzfwIujj1a84= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 
github.com/coocood/freecache v1.2.4 h1:UdR6Yz/X1HW4fZOuH0Z94KwG851GWOSknua5VUbb/5M= github.com/coocood/freecache v1.2.4/go.mod h1:RBUWa/Cy+OHdfTGFEhEuE1pMCMX51Ncizj7rthiQ3vk= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgraph-io/badger/v4 v4.9.0 h1:tpqWb0NewSrCYqTvywbcXOhQdWcqephkVkbBmaaqHzc= github.com/dgraph-io/badger/v4 v4.9.0/go.mod h1:5/MEx97uzdPUHR4KtkNt8asfI2T4JiEiQlV7kWUo8c0= github.com/dgraph-io/ristretto/v2 v2.2.0 h1:bkY3XzJcXoMuELV8F+vS8kzNgicwQFAaGINAEJdWGOM= github.com/dgraph-io/ristretto/v2 v2.2.0/go.mod h1:RZrm63UmcBAaYWC1DotLYBmTvgkrs0+XhBd7Npn7/zI= +github.com/dgryski/go-farm v0.0.0-20240924180020-3414d57e47da h1:aIftn67I1fkbMa512G+w+Pxci9hJPB8oMnkcP3iZF38= +github.com/dgryski/go-farm v0.0.0-20240924180020-3414d57e47da/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= +github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= +github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= 
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-viper/mapstructure/v2 v2.4.0 h1:EBsztssimR/CONLSZZ04E8qAkxNYq4Qp9LvH92wZUgs= +github.com/go-viper/mapstructure/v2 v2.4.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/golang/mock v1.6.0/go.mod h1:p6yTPP+5HYm5mzsMV8JkE6ZKdX+/wYM6Hr+LicevLPs= github.com/google/flatbuffers v25.2.10+incompatible h1:F3vclr7C3HpB1k9mxCGRMXq6FdUalZ6H/pNX4FP1v0Q= github.com/google/flatbuffers v25.2.10+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= -github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4= -github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y= github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= -github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= github.com/mattn/go-colorable v0.1.14/go.mod 
h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= +github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0= github.com/rs/zerolog v1.34.0 h1:k43nTLIwcTVQAncfCw4KZ2VY6ukYoZaBPNOE8txlOeY= github.com/rs/zerolog v1.34.0/go.mod h1:bJsvje4Z08ROH4Nhs5iH600c3IkWhwp44iRc54W6wYQ= +github.com/sagikazarmark/locafero v0.11.0 h1:1iurJgmM9G3PA/I+wWYIOw/5SyBtxapeHDcg+AAIFXc= +github.com/sagikazarmark/locafero v0.11.0/go.mod h1:nVIGvgyzw595SUSUE6tvCp3YYTeHs15MvlmU87WwIik= +github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= +github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8 h1:+jumHNA0Wrelhe64i8F6HNlS8pkoyMv5sreGx2Ry5Rw= +github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8/go.mod h1:3n1Cwaq1E1/1lhQhtRK2ts/ZwZEhjcQeJQ1RuC6Q/8U= +github.com/spf13/afero v1.15.0 h1:b/YBCLWAJdFWJTN9cLhiXXcD7mzKn9Dm86dNnfyQw1I= +github.com/spf13/afero v1.15.0/go.mod 
h1:NC2ByUVxtQs4b3sIUphxK0NioZnmxgyCrfzeuq8lxMg= +github.com/spf13/cast v1.10.0 h1:h2x0u2shc1QuLHfxi+cTJvs30+ZAHOGRic8uyGTDWxY= +github.com/spf13/cast v1.10.0/go.mod h1:jNfB8QC9IA6ZuY2ZjDp0KtFO2LZZlg4S/7bzP6qqeHo= +github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= +github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/viper v1.21.0 h1:x5S+0EU27Lbphp4UKm1C+1oQO+rKx36vfCoaVebLFSU= +github.com/spf13/viper v1.21.0/go.mod h1:P0lhsswPGWD/1lZJ9ny3fYnVqxiegrlNrEmgLjbTCAY= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= +github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= +github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= @@ -49,14 +99,46 @@ go.opentelemetry.io/otel/metric v1.37.0 
h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/Wgbsd go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= -golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.1/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A= 
google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/flashring/internal/cache/badger.go b/flashring/internal/cache/badger.go deleted file mode 100644 index 7ff8c691..00000000 --- a/flashring/internal/cache/badger.go +++ /dev/null @@ -1,135 +0,0 @@ -package internal - -import ( - "sync/atomic" - "time" - - filecache "github.com/Meesho/BharatMLStack/flashring/internal/shard" - badger "github.com/dgraph-io/badger/v4" - "github.com/rs/zerolog/log" -) - -type Badger struct { - cache *badger.DB - stats *CacheStats -} - -func NewBadger(config WrapCacheConfig, logStats bool) (*Badger, error) { - options := badger.DefaultOptions(config.MountPoint) - options.MetricsEnabled = false - - // 1. PRIMARY CACHE (1GB) - // This caches the data blocks themselves. - options.BlockCacheSize = 1024 << 20 - - // 2. INDEX CACHE (512MB) - // This keeps the keys and the structure of the LSM tree in RAM. - // This is the most critical setting for read latency. - options.IndexCacheSize = 512 << 20 - - // 3. WRITE BUFFERS (Memtables) - // We use 3 tables of 64MB each. This allows Badger to handle - // write spikes without blocking. 
(~192MB total) - options.NumMemtables = 40 - options.MemTableSize = 1024 << 20 - - options.ValueThreshold = 1024 - options.SyncWrites = false - - cache, err := badger.Open(options) - if err != nil { - return nil, err - } - bc := &Badger{ - cache: cache, - stats: &CacheStats{ - Hits: atomic.Uint64{}, - TotalGets: atomic.Uint64{}, - TotalPuts: atomic.Uint64{}, - ReWrites: atomic.Uint64{}, - Expired: atomic.Uint64{}, - ShardWiseActiveEntries: atomic.Uint64{}, - LatencyTracker: filecache.NewLatencyTracker(), - }, - } - - if logStats { - go func() { - sleepDuration := 10 * time.Second - var prevTotalGets, prevTotalPuts uint64 - for { - time.Sleep(sleepDuration) - - totalGets := bc.stats.TotalGets.Load() - totalPuts := bc.stats.TotalPuts.Load() - getsPerSec := float64(totalGets-prevTotalGets) / sleepDuration.Seconds() - putsPerSec := float64(totalPuts-prevTotalPuts) / sleepDuration.Seconds() - - log.Info().Msgf("Shard %d HitRate: %v", 0, cache.BlockCacheMetrics().Hits()) - log.Info().Msgf("Shard %d Expired: %v", 0, cache.BlockCacheMetrics().Misses()) - log.Info().Msgf("Shard %d Total: %v", 0, cache.BlockCacheMetrics().KeysEvicted()) - log.Info().Msgf("Gets/sec: %v", getsPerSec) - log.Info().Msgf("Puts/sec: %v", putsPerSec) - - getP25, getP50, getP99 := bc.stats.LatencyTracker.GetLatencyPercentiles() - putP25, putP50, putP99 := bc.stats.LatencyTracker.PutLatencyPercentiles() - - log.Info().Msgf("Get Count: %v", totalGets) - log.Info().Msgf("Put Count: %v", totalPuts) - log.Info().Msgf("Get Latencies - P25: %v, P50: %v, P99: %v", getP25, getP50, getP99) - log.Info().Msgf("Put Latencies - P25: %v, P50: %v, P99: %v", putP25, putP50, putP99) - - prevTotalGets = totalGets - prevTotalPuts = totalPuts - } - }() - } - - return bc, nil -} - -func (b *Badger) Put(key string, value []byte, exptimeInMinutes uint16) error { - - start := time.Now() - defer func() { - b.stats.LatencyTracker.RecordPut(time.Since(start)) - }() - - b.stats.TotalPuts.Add(1) - err := b.cache.Update(func(txn 
*badger.Txn) error { - entry := badger.NewEntry([]byte(key), value).WithTTL(time.Duration(exptimeInMinutes) * time.Minute) - err := txn.SetEntry(entry) - return err - }) - return err -} - -func (b *Badger) Get(key string) ([]byte, bool, bool) { - - start := time.Now() - defer func() { - b.stats.LatencyTracker.RecordGet(time.Since(start)) - }() - - b.stats.TotalGets.Add(1) - - val := make([]byte, 0) - err := b.cache.View(func(txn *badger.Txn) error { - item, err := txn.Get([]byte(key)) - if err != nil { - return err - } - val, err = item.ValueCopy(val) - - if err != nil { - b.stats.Hits.Add(1) - } - - return err - }) - return val, err != badger.ErrKeyNotFound, false -} - -func (b *Badger) Close() error { - return b.cache.Close() -} diff --git a/flashring/internal/cache/cache.go b/flashring/internal/cache/cache.go deleted file mode 100644 index 74755251..00000000 --- a/flashring/internal/cache/cache.go +++ /dev/null @@ -1,457 +0,0 @@ -package internal - -import ( - "fmt" - "strconv" - "sync" - "sync/atomic" - "time" - - "github.com/Meesho/BharatMLStack/flashring/internal/maths" - filecache "github.com/Meesho/BharatMLStack/flashring/internal/shard" - "github.com/cespare/xxhash/v2" - "github.com/rs/zerolog/log" -) - -/* - Each shard can keep 67M keys - With Round = 1, expected collision (67M)^2/(2*2^62) = 4.87×10^-4 -*/ - -const ( - ROUNDS = 1 - KEYS_PER_SHARD = (1 << 26) - BLOCK_SIZE = 4096 -) - -var ( - ErrNumShardLessThan1 = fmt.Errorf("num shards must be greater than 0") - ErrKeysPerShardLessThan1 = fmt.Errorf("keys per shard must be greater than 0") - ErrKeysPerShardGreaterThan67M = fmt.Errorf("keys per shard must be less than 67M") - ErrMemtableSizeLessThan1 = fmt.Errorf("memtable size must be greater than 0") - ErrMemtableSizeGreaterThan1GB = fmt.Errorf("memtable size must be less than 1GB") - ErrMemtableSizeNotMultipleOf4KB = fmt.Errorf("memtable size must be a multiple of 4KB") - ErrFileSizeLessThan1 = fmt.Errorf("file size must be greater than 0") - 
ErrFileSizeNotMultipleOf4KB = fmt.Errorf("file size must be a multiple of 4KB") - Seed = xxhash.Sum64String(strconv.Itoa(int(time.Now().UnixNano()))) -) - -type WrapCache struct { - shards []*filecache.ShardCache - shardLocks []sync.RWMutex - predictor *maths.Predictor - stats []*CacheStats - metricsRecorder MetricsRecorder -} - -type CacheStats struct { - Hits atomic.Uint64 - TotalGets atomic.Uint64 - TotalPuts atomic.Uint64 - ReWrites atomic.Uint64 - Expired atomic.Uint64 - ShardWiseActiveEntries atomic.Uint64 - LatencyTracker *filecache.LatencyTracker - BatchTracker *filecache.BatchTracker -} - -// MetricsRecorder is an interface for recording metrics from the cache -// Implement this interface to receive metrics from the cache layer -type MetricsRecorder interface { - // Input parameters - SetShards(value int) - SetKeysPerShard(value int) - SetReadWorkers(value int) - SetWriteWorkers(value int) - SetPlan(value string) - - // Observation metrics - RecordRP99(value time.Duration) - RecordRP50(value time.Duration) - RecordRP25(value time.Duration) - RecordWP99(value time.Duration) - RecordWP50(value time.Duration) - RecordWP25(value time.Duration) - RecordRThroughput(value float64) - RecordWThroughput(value float64) - RecordHitRate(value float64) -} - -type WrapCacheConfig struct { - NumShards int - KeysPerShard int - FileSize int64 - MemtableSize int32 - ReWriteScoreThreshold float32 - GridSearchEpsilon float64 - SampleDuration time.Duration - - // Batching reads - EnableBatching bool - BatchWindowMicros int // in microseconds - MaxBatchSize int - - // Optional metrics recorder - MetricsRecorder MetricsRecorder - - //Badger - MountPoint string -} - -func NewWrapCache(config WrapCacheConfig, mountPoint string, logStats bool) (*WrapCache, error) { - if config.NumShards <= 0 { - return nil, ErrNumShardLessThan1 - } - if config.KeysPerShard <= 0 { - return nil, ErrKeysPerShardLessThan1 - } - if config.KeysPerShard > KEYS_PER_SHARD { - return nil, 
ErrKeysPerShardGreaterThan67M - } - if config.MemtableSize <= 0 { - return nil, ErrMemtableSizeLessThan1 - } - if config.MemtableSize > 1024*1024*1024 { - return nil, ErrMemtableSizeGreaterThan1GB - } - if config.MemtableSize%BLOCK_SIZE != 0 { - return nil, ErrMemtableSizeNotMultipleOf4KB - } - if config.FileSize <= 0 { - return nil, ErrFileSizeLessThan1 - } - if config.FileSize%BLOCK_SIZE != 0 { - return nil, ErrFileSizeNotMultipleOf4KB - } - weights := []maths.WeightTuple{ - { - WFreq: 0.1, - WLA: 0.1, - }, - { - WFreq: 0.45, - WLA: 0.1, - }, - { - WFreq: 0.9, - WLA: 0.1, - }, - { - WFreq: 0.1, - WLA: 0.45, - }, - { - WFreq: 0.45, - WLA: 0.45, - }, - { - WFreq: 0.9, - WLA: 0.45, - }, - { - WFreq: 0.1, - WLA: 0.9, - }, - { - WFreq: 0.45, - WLA: 0.9, - }, - { - WFreq: 0.9, - WLA: 0.9, - }, - } - MaxMemTableCount := config.FileSize / int64(config.MemtableSize) - predictor := maths.NewPredictor(maths.PredictorConfig{ - ReWriteScoreThreshold: config.ReWriteScoreThreshold, - Weights: weights, - SampleDuration: config.SampleDuration, - MaxMemTableCount: uint32(MaxMemTableCount), - GridSearchEpsilon: config.GridSearchEpsilon, - }) - - batchWindow := time.Duration(0) - if config.EnableBatching && config.BatchWindowMicros > 0 { - batchWindow = time.Duration(config.BatchWindowMicros) * time.Microsecond - } - shardLocks := make([]sync.RWMutex, config.NumShards) - shards := make([]*filecache.ShardCache, config.NumShards) - for i := 0; i < config.NumShards; i++ { - shards[i] = filecache.NewShardCache(filecache.ShardCacheConfig{ - MemtableSize: config.MemtableSize, - Rounds: ROUNDS, - RbInitial: config.KeysPerShard, - RbMax: config.KeysPerShard, - DeleteAmortizedStep: 10000, - MaxFileSize: int64(config.FileSize), - BlockSize: BLOCK_SIZE, - Directory: mountPoint, - Predictor: predictor, - - //batching reads - EnableBatching: config.EnableBatching, - BatchWindow: batchWindow, - MaxBatchSize: config.MaxBatchSize, - }, &shardLocks[i]) - } - - stats := make([]*CacheStats, 
config.NumShards) - for i := 0; i < config.NumShards; i++ { - stats[i] = &CacheStats{LatencyTracker: filecache.NewLatencyTracker(), BatchTracker: filecache.NewBatchTracker()} - } - wc := &WrapCache{ - shards: shards, - shardLocks: shardLocks, - predictor: predictor, - stats: stats, - metricsRecorder: config.MetricsRecorder, - } - if logStats { - - go func() { - sleepDuration := 10 * time.Second - // perShardPrevTotalGets := make([]uint64, config.NumShards) - // perShardPrevTotalPuts := make([]uint64, config.NumShards) - combinedPrevTotalGets := uint64(0) - combinedPrevTotalPuts := uint64(0) - for { - time.Sleep(sleepDuration) - - combinedTotalGets := uint64(0) - combinedTotalPuts := uint64(0) - combinedHits := uint64(0) - combinedReWrites := uint64(0) - combinedExpired := uint64(0) - combinedShardWiseActiveEntries := uint64(0) - for i := 0; i < config.NumShards; i++ { - combinedTotalGets += wc.stats[i].TotalGets.Load() - combinedTotalPuts += wc.stats[i].TotalPuts.Load() - combinedHits += wc.stats[i].Hits.Load() - combinedReWrites += wc.stats[i].ReWrites.Load() - combinedExpired += wc.stats[i].Expired.Load() - combinedShardWiseActiveEntries += wc.stats[i].ShardWiseActiveEntries.Load() - } - - combinedHitRate := float64(0) - if combinedTotalGets > 0 { - combinedHitRate = float64(combinedHits) / float64(combinedTotalGets) - } - - log.Info().Msgf("Combined HitRate: %v", combinedHitRate) - log.Info().Msgf("Combined ReWrites: %v", combinedReWrites) - log.Info().Msgf("Combined Expired: %v", combinedExpired) - log.Info().Msgf("Combined Total: %v", combinedTotalGets) - log.Info().Msgf("Combined Puts/sec: %v", float64(combinedTotalPuts-combinedPrevTotalPuts)/float64(sleepDuration.Seconds())) - log.Info().Msgf("Combined Gets/sec: %v", float64(combinedTotalGets-combinedPrevTotalGets)/float64(sleepDuration.Seconds())) - log.Info().Msgf("Combined ShardWiseActiveEntries: %v", combinedShardWiseActiveEntries) - - combinedGetP25, combinedGetP50, combinedGetP99 := 
wc.stats[0].LatencyTracker.GetLatencyPercentiles() - combinedPutP25, combinedPutP50, combinedPutP99 := wc.stats[0].LatencyTracker.PutLatencyPercentiles() - - log.Info().Msgf("Combined Get Count: %v", combinedTotalGets) - log.Info().Msgf("Combined Put Count: %v", combinedTotalPuts) - log.Info().Msgf("Combined Get Latencies - P25: %v, P50: %v, P99: %v", combinedGetP25, combinedGetP50, combinedGetP99) - log.Info().Msgf("Combined Put Latencies - P25: %v, P50: %v, P99: %v", combinedPutP25, combinedPutP50, combinedPutP99) - - combinedGetBatchP25, combinedGetBatchP50, combinedGetBatchP99 := wc.shards[0].Stats.BatchTracker.GetBatchSizePercentiles() - log.Info().Msgf("Combined Get Batch Sizes - P25: %v, P50: %v, P99: %v", combinedGetBatchP25, combinedGetBatchP50, combinedGetBatchP99) - - // Send metrics to the recorder if configured - if wc.metricsRecorder != nil { - rThroughput := float64(combinedTotalGets-combinedPrevTotalGets) / sleepDuration.Seconds() - wThroughput := float64(combinedTotalPuts-combinedPrevTotalPuts) / sleepDuration.Seconds() - - wc.metricsRecorder.RecordRP25(combinedGetP25) - wc.metricsRecorder.RecordRP50(combinedGetP50) - wc.metricsRecorder.RecordRP99(combinedGetP99) - wc.metricsRecorder.RecordWP25(combinedPutP25) - wc.metricsRecorder.RecordWP50(combinedPutP50) - wc.metricsRecorder.RecordWP99(combinedPutP99) - wc.metricsRecorder.RecordRThroughput(rThroughput) - wc.metricsRecorder.RecordWThroughput(wThroughput) - wc.metricsRecorder.RecordHitRate(combinedHitRate) - } - - combinedPrevTotalGets = combinedTotalGets - combinedPrevTotalPuts = combinedTotalPuts - - /* disabling per shard stats for now - for i := 0; i < config.NumShards; i++ { - log.Info().Msgf("Shard %d has %d active entries", i, wc.stats[i].ShardWiseActiveEntries.Load()) - total := wc.stats[i].TotalGets.Load() - hits := wc.stats[i].Hits.Load() - hitRate := float64(0) - if total > 0 { - hitRate = float64(hits) / float64(total) - } - log.Info().Msgf("Shard %d HitRate: %v", i, hitRate) - 
log.Info().Msgf("Shard %d ReWrites: %v", i, wc.stats[i].ReWrites.Load()) - log.Info().Msgf("Shard %d Expired: %v", i, wc.stats[i].Expired.Load()) - log.Info().Msgf("Shard %d Total: %v", i, total) - log.Info().Msgf("Gets/sec: %v", float64(total-perShardPrevTotalGets[i])/float64(sleepDuration.Seconds())) - log.Info().Msgf("Puts/sec: %v", float64(wc.stats[i].TotalPuts.Load()-perShardPrevTotalPuts[i])/float64(sleepDuration.Seconds())) - perShardPrevTotalGets[i] = total - perShardPrevTotalPuts[i] = wc.stats[i].TotalPuts.Load() - - getP25, getP50, getP99 := wc.stats[i].LatencyTracker.GetLatencyPercentiles() - putP25, putP50, putP99 := wc.stats[i].LatencyTracker.PutLatencyPercentiles() - - log.Info().Msgf("Get Count: %v", wc.stats[i].TotalGets.Load()) - log.Info().Msgf("Put Count: %v", wc.stats[i].TotalPuts.Load()) - log.Info().Msgf("Get Latencies - P25: %v, P50: %v, P99: %v", getP25, getP50, getP99) - log.Info().Msgf("Put Latencies - P25: %v, P50: %v, P99: %v", putP25, putP50, putP99) - - } - */ - log.Info().Msgf("GridSearchActive: %v", wc.predictor.GridSearchEstimator.IsGridSearchActive()) - } - }() - } - return wc, nil -} - -func (wc *WrapCache) PutLL(key string, value []byte, exptimeInMinutes uint16) error { - - h32 := wc.Hash(key) - shardIdx := h32 % uint32(len(wc.shards)) - start := time.Now() - - result := filecache.ErrorPool.Get().(chan error) - - wc.shards[shardIdx].WriteCh <- &filecache.WriteRequestV2{ - Key: key, - Value: value, - ExptimeInMinutes: exptimeInMinutes, - Result: result, - } - - if h32%100 < 10 { - wc.stats[shardIdx].ShardWiseActiveEntries.Store(uint64(wc.shards[shardIdx].GetRingBufferActiveEntries())) - } - - op := <-result - filecache.ErrorPool.Put(result) - wc.stats[shardIdx].TotalPuts.Add(1) - wc.stats[shardIdx].LatencyTracker.RecordPut(time.Since(start)) - return op -} - -func (wc *WrapCache) GetLL(key string) ([]byte, bool, bool) { - h32 := wc.Hash(key) - shardIdx := h32 % uint32(len(wc.shards)) - - start := time.Now() - - found, value, _, 
expired, needsSlowPath := wc.shards[shardIdx].GetFastPath(key) - - if !needsSlowPath { - if found && !expired { - wc.stats[shardIdx].Hits.Add(1) - } else if expired { - wc.stats[shardIdx].Expired.Add(1) - } - - wc.stats[shardIdx].TotalGets.Add(1) - wc.stats[shardIdx].LatencyTracker.RecordGet(time.Since(start)) - return value, found, expired - } - - result := filecache.ReadResultPool.Get().(chan filecache.ReadResultV2) - - req := filecache.ReadRequestPool.Get().(*filecache.ReadRequestV2) - req.Key = key - req.Result = result - - wc.shards[shardIdx].ReadCh <- req - op := <-result - - filecache.ReadResultPool.Put(result) - filecache.ReadRequestPool.Put(req) - - if op.Found && !op.Expired { - wc.stats[shardIdx].Hits.Add(1) - } - if op.Expired { - wc.stats[shardIdx].Expired.Add(1) - } - wc.stats[shardIdx].LatencyTracker.RecordGet(time.Since(start)) - wc.stats[shardIdx].TotalGets.Add(1) - - return op.Data, op.Found, op.Expired -} - -func (wc *WrapCache) Put(key string, value []byte, exptimeInMinutes uint16) error { - - h32 := wc.Hash(key) - shardIdx := h32 % uint32(len(wc.shards)) - - start := time.Now() - defer func() { - wc.stats[shardIdx].LatencyTracker.RecordPut(time.Since(start)) - }() - - wc.shardLocks[shardIdx].Lock() - defer wc.shardLocks[shardIdx].Unlock() - wc.putLocked(shardIdx, h32, key, value, exptimeInMinutes) - return nil -} - -func (wc *WrapCache) putLocked(shardIdx uint32, h32 uint32, key string, value []byte, exptimeInMinutes uint16) { - wc.shards[shardIdx].Put(key, value, exptimeInMinutes) - wc.stats[shardIdx].TotalPuts.Add(1) - if h32%100 < 10 { - wc.stats[shardIdx].ShardWiseActiveEntries.Store(uint64(wc.shards[shardIdx].GetRingBufferActiveEntries())) - } -} - -func (wc *WrapCache) Get(key string) ([]byte, bool, bool) { - h32 := wc.Hash(key) - shardIdx := h32 % uint32(len(wc.shards)) - - start := time.Now() - defer func() { - wc.stats[shardIdx].LatencyTracker.RecordGet(time.Since(start)) - }() - - var keyFound bool - var val []byte - var remainingTTL 
uint16 - var expired bool - var shouldReWrite bool - if wc.shards[shardIdx].BatchReader != nil { - reqChan := make(chan filecache.ReadResultV2, 1) - wc.shards[shardIdx].BatchReader.Requests <- &filecache.ReadRequestV2{ - Key: key, - Result: reqChan, - } - result := <-reqChan - - keyFound, val, remainingTTL, expired, shouldReWrite = result.Found, result.Data, result.TTL, result.Expired, result.ShouldRewrite - } else { - wc.shardLocks[shardIdx].RLock() - defer wc.shardLocks[shardIdx].RUnlock() - keyFound, val, remainingTTL, expired, shouldReWrite = wc.shards[shardIdx].Get(key) - } - - if keyFound && !expired { - wc.stats[shardIdx].Hits.Add(1) - } - if expired { - wc.stats[shardIdx].Expired.Add(1) - } - wc.stats[shardIdx].TotalGets.Add(1) - if shouldReWrite { - wc.stats[shardIdx].ReWrites.Add(1) - wc.putLocked(shardIdx, h32, key, val, remainingTTL) - } - wc.predictor.Observe(float64(wc.stats[shardIdx].Hits.Load()) / float64(wc.stats[shardIdx].TotalGets.Load())) - return val, keyFound, expired -} - -func (wc *WrapCache) Hash(key string) uint32 { - return uint32(xxhash.Sum64String(key) ^ Seed) -} - -func (wc *WrapCache) GetShardCache(shardIdx int) *filecache.ShardCache { - return wc.shards[shardIdx] -} diff --git a/flashring/internal/cache/freecache.go b/flashring/internal/cache/freecache.go deleted file mode 100644 index df0f0f75..00000000 --- a/flashring/internal/cache/freecache.go +++ /dev/null @@ -1,96 +0,0 @@ -package internal - -import ( - "runtime/debug" - "sync/atomic" - "time" - - filecache "github.com/Meesho/BharatMLStack/flashring/internal/shard" - "github.com/coocood/freecache" - "github.com/rs/zerolog/log" -) - -type Freecache struct { - cache *freecache.Cache - stats *CacheStats -} - -func NewFreecache(config WrapCacheConfig, logStats bool) (*Freecache, error) { - - cache := freecache.NewCache(int(config.FileSize)) - debug.SetGCPercent(20) - - fc := &Freecache{ - cache: cache, - stats: &CacheStats{ - Hits: atomic.Uint64{}, - TotalGets: atomic.Uint64{}, - 
TotalPuts: atomic.Uint64{}, - ReWrites: atomic.Uint64{}, - Expired: atomic.Uint64{}, - ShardWiseActiveEntries: atomic.Uint64{}, - LatencyTracker: filecache.NewLatencyTracker(), - }, - } - - if logStats { - go func() { - sleepDuration := 10 * time.Second - var prevTotalGets, prevTotalPuts uint64 - for { - time.Sleep(sleepDuration) - - totalGets := fc.stats.TotalGets.Load() - totalPuts := fc.stats.TotalPuts.Load() - getsPerSec := float64(totalGets-prevTotalGets) / sleepDuration.Seconds() - putsPerSec := float64(totalPuts-prevTotalPuts) / sleepDuration.Seconds() - - log.Info().Msgf("Shard %d HitRate: %v", 0, cache.HitRate()) - log.Info().Msgf("Shard %d Expired: %v", 0, cache.ExpiredCount()) - log.Info().Msgf("Shard %d Total: %v", 0, cache.EntryCount()) - log.Info().Msgf("Gets/sec: %v", getsPerSec) - log.Info().Msgf("Puts/sec: %v", putsPerSec) - - getP25, getP50, getP99 := fc.stats.LatencyTracker.GetLatencyPercentiles() - putP25, putP50, putP99 := fc.stats.LatencyTracker.PutLatencyPercentiles() - - log.Info().Msgf("Get Count: %v", totalGets) - log.Info().Msgf("Put Count: %v", totalPuts) - log.Info().Msgf("Get Latencies - P25: %v, P50: %v, P99: %v", getP25, getP50, getP99) - log.Info().Msgf("Put Latencies - P25: %v, P50: %v, P99: %v", putP25, putP50, putP99) - - prevTotalGets = totalGets - prevTotalPuts = totalPuts - } - }() - } - - return fc, nil - -} - -func (c *Freecache) Put(key string, value []byte, exptimeInMinutes uint16) error { - start := time.Now() - defer func() { - c.stats.LatencyTracker.RecordPut(time.Since(start)) - }() - - c.stats.TotalPuts.Add(1) - c.cache.Set([]byte(key), value, int(exptimeInMinutes)*60) - return nil -} - -func (c *Freecache) Get(key string) ([]byte, bool, bool) { - start := time.Now() - defer func() { - c.stats.LatencyTracker.RecordGet(time.Since(start)) - }() - - c.stats.TotalGets.Add(1) - val, err := c.cache.Get([]byte(key)) - if err != nil { - return nil, false, false - } - c.stats.Hits.Add(1) - return val, true, false -} diff --git 
a/flashring/internal/fs/aligned_page.go b/flashring/internal/fs/aligned_page.go index c499ae36..099ccd9d 100644 --- a/flashring/internal/fs/aligned_page.go +++ b/flashring/internal/fs/aligned_page.go @@ -4,8 +4,6 @@ package fs import ( - "runtime/pprof" - "golang.org/x/sys/unix" ) @@ -16,7 +14,7 @@ const ( MAP_ANON = unix.MAP_ANON ) -var mmapProf = pprof.NewProfile("mmap") // will show up in /debug/pprof/ +// var mmapProf = pprof.NewProfile("mmap") // will show up in /debug/pprof/ type AlignedPage struct { Buf []byte @@ -28,9 +26,9 @@ func NewAlignedPage(pageSize int) *AlignedPage { if err != nil { panic(err) } - if pageSize > 0 { - mmapProf.Add(&b[0], pageSize) // attribute sz bytes to this callsite - } + // if pageSize > 0 { + // mmapProf.Add(&b[0], pageSize) // attribute sz bytes to this callsite + // } return &AlignedPage{ Buf: b, mmap: b, @@ -38,9 +36,9 @@ func NewAlignedPage(pageSize int) *AlignedPage { } func Unmap(p *AlignedPage) error { - if len(p.mmap) > 0 { - mmapProf.Remove(&p.mmap[0]) // release from custom profile - } + // if len(p.mmap) > 0 { + // mmapProf.Remove(&p.mmap[0]) // release from custom profile + // } if p.mmap != nil { err := unix.Munmap(p.mmap) if err != nil { diff --git a/flashring/internal/fs/batch_iouring.go b/flashring/internal/fs/batch_iouring.go new file mode 100644 index 00000000..13c8267f --- /dev/null +++ b/flashring/internal/fs/batch_iouring.go @@ -0,0 +1,322 @@ +//go:build linux +// +build linux + +package fs + +import ( + "fmt" + "sync" + "sync/atomic" + "syscall" + "time" + + "github.com/Meesho/BharatMLStack/flashring/pkg/metrics" +) + +// batchReadResult holds the outcome of a single batched pread. +type batchReadResult struct { + N int + Err error +} + +// batchReadRequest is a pread submitted to the batch reader. 
+type batchReadRequest struct { + fd int + buf []byte + offset uint64 + done chan batchReadResult +} + +var batchReqPool = sync.Pool{ + New: func() interface{} { + return &batchReadRequest{ + done: make(chan batchReadResult, 1), + } + }, +} + +// BatchIoUringReader collects pread requests from multiple goroutines into a +// single channel and submits them as one io_uring batch. This amortizes the +// syscall overhead (1 io_uring_enter instead of N) and lets NVMe process +// multiple commands in parallel (queue depth > 1). +// +// Collection uses non-blocking channel drain: after receiving the first +// request, it drains whatever else is already queued (no timer). Under load +// this provides natural batching; under low load single requests go out +// with zero added latency. +// +// CQEs are dispatched individually as they complete (no head-of-line blocking). +type BatchIoUringReader struct { + ring *IoUring + reqCh chan *batchReadRequest + maxBatch int + window time.Duration // wait up to this for more requests before submit (0 = drain only) + closeCh chan struct{} + wg sync.WaitGroup +} + +// BatchIoUringConfig configures the batch reader. +type BatchIoUringConfig struct { + RingDepth uint32 // io_uring SQ/CQ size (default 256) + MaxBatch int // max requests per batch (capped to RingDepth) + Window time.Duration // wait up to this for requests to accumulate before submit (e.g. 500*time.Microsecond); 0 = drain only, no wait + QueueSize int // channel buffer size (default 1024) +} + +// NewBatchIoUringReader creates a batch reader with its own io_uring ring +// and starts the background collection goroutine. 
+func NewBatchIoUringReader(cfg BatchIoUringConfig) (*BatchIoUringReader, error) { + if cfg.RingDepth == 0 { + cfg.RingDepth = 256 + } + if cfg.MaxBatch == 0 || cfg.MaxBatch > int(cfg.RingDepth) { + cfg.MaxBatch = int(cfg.RingDepth) + } + if cfg.QueueSize == 0 { + cfg.QueueSize = 1024 + } + + ring, err := NewIoUring(cfg.RingDepth, 0) + if err != nil { + return nil, fmt.Errorf("batch io_uring init: %w", err) + } + + b := &BatchIoUringReader{ + ring: ring, + reqCh: make(chan *batchReadRequest, cfg.QueueSize), + maxBatch: cfg.MaxBatch, + window: cfg.Window, + closeCh: make(chan struct{}), + } + b.wg.Add(1) + go b.loop() + return b, nil +} + +// Submit sends a pread request into the batch channel and blocks until the +// io_uring completion is received. Thread-safe; called from many goroutines. +func (b *BatchIoUringReader) Submit(fd int, buf []byte, offset uint64) (int, error) { + if len(buf) == 0 { + return 0, nil + } + + var startTime time.Time + if metrics.Enabled() { + startTime = time.Now() + } + + req := batchReqPool.Get().(*batchReadRequest) + req.fd = fd + req.buf = buf + req.offset = offset + + b.reqCh <- req + + result := <-req.done + n, err := result.N, result.Err + if metrics.Enabled() { + metrics.Timing(metrics.KEY_PREAD_LATENCY, time.Since(startTime), []string{}) + } + + // Reset and return to pool + req.fd = 0 + req.buf = nil + req.offset = 0 + batchReqPool.Put(req) + + return n, err +} + +// Close shuts down the collection goroutine and releases the io_uring ring. +func (b *BatchIoUringReader) Close() { + close(b.closeCh) + b.wg.Wait() + b.ring.Close() +} + +// loop is the single background goroutine that collects and submits batches. +// +// Phase 1: block on first request (no timer ticking when idle). +// Phase 2: non-blocking drain of whatever else is already queued. +// Phase 3: submit the batch and dispatch CQEs as they complete. 
+func (b *BatchIoUringReader) loop() { + defer b.wg.Done() + + batch := make([]*batchReadRequest, 0, b.maxBatch) + + for { + // Phase 1: block until the first request arrives + select { + case req := <-b.reqCh: + batch = append(batch, req) + case <-b.closeCh: + return + } + + // Phase 2: drain with optional wait — if window > 0, wait up to window + // for more requests; otherwise non-blocking drain only. + var timer *time.Timer + if b.window > 0 { + timer = time.NewTimer(b.window) + } + drain: + for len(batch) < b.maxBatch { + if b.window > 0 { + select { + case req := <-b.reqCh: + batch = append(batch, req) + case <-timer.C: + break drain + case <-b.closeCh: + if timer != nil { + timer.Stop() + } + return + } + } else { + select { + case req := <-b.reqCh: + batch = append(batch, req) + default: + break drain + } + } + } + if timer != nil { + timer.Stop() + } + + // Phase 3: submit and dispatch + b.submitBatch(batch) + batch = batch[:0] + } +} + +// submitBatch prepares N SQEs, submits them (fire-and-forget), then dispatches +// each CQE individually as it completes. Fast reads are dispatched immediately +// without waiting for slow reads in the same batch (no head-of-line blocking). 
+func (b *BatchIoUringReader) submitBatch(batch []*batchReadRequest) { + if metrics.Enabled() { + metrics.Timing(metrics.KEY_IOURING_SIZE, time.Duration(len(batch))*time.Millisecond, []string{}) + } + n := len(batch) + if n == 0 { + return + } + + b.ring.mu.Lock() + + // Prepare SQEs + prepared := 0 + for i, req := range batch { + sqe := b.ring.getSqe() + if sqe == nil { + // SQ full -- error the rest + for j := i; j < n; j++ { + batch[j].done <- batchReadResult{ + Err: fmt.Errorf("io_uring: SQ full, batch=%d depth=%d", n, b.ring.sqEntries), + } + } + break + } + prepRead(sqe, req.fd, req.buf, req.offset) + sqe.UserData = uint64(i) // index for CQE matching + prepared++ + } + + if prepared == 0 { + b.ring.mu.Unlock() + return + } + + // Submit SQEs but do NOT wait for completions (waitNr=0). + // The kernel starts processing I/O immediately; we dispatch each CQE + // as it arrives below, so fast reads aren't blocked by slow ones. + _, err := b.ring.submit(0) + if err != nil { + b.ring.mu.Unlock() + for i := 0; i < prepared; i++ { + batch[i].done <- batchReadResult{Err: fmt.Errorf("io_uring_enter: %w", err)} + } + return + } + + // Dispatch CQEs one-by-one as they complete. + completed := 0 + for completed < prepared { + cqe, err := b.ring.waitCqe() + if err != nil { + // Catastrophic ring error -- unblock all unsatisfied callers. 
+ b.ring.mu.Unlock() + for i := 0; i < n; i++ { + select { + case batch[i].done <- batchReadResult{Err: fmt.Errorf("io_uring waitCqe: %w", err)}: + default: // already sent + } + } + return + } + + idx := int(cqe.UserData) + res := cqe.Res + b.ring.seenCqe() + completed++ + + if idx < 0 || idx >= prepared { + continue // unexpected UserData; skip + } + + if res < 0 { + batch[idx].done <- batchReadResult{ + Err: fmt.Errorf("io_uring pread errno %d (%s), fd=%d off=%d len=%d", + -res, syscall.Errno(-res), batch[idx].fd, batch[idx].offset, len(batch[idx].buf)), + } + } else { + batch[idx].done <- batchReadResult{N: int(res)} + } + } + + b.ring.mu.Unlock() +} + +// ParallelBatchIoUringReader distributes pread requests across N independent +// BatchIoUringReader instances (each with its own io_uring ring and goroutine) +// using round-robin. This removes the single-ring serialization bottleneck and +// lets NVMe service requests across multiple hardware queues in parallel. +type ParallelBatchIoUringReader struct { + readers []*BatchIoUringReader + next atomic.Uint64 +} + +// NewParallelBatchIoUringReader creates numRings independent batch readers. +// Each ring gets its own io_uring instance and background goroutine. +func NewParallelBatchIoUringReader(cfg BatchIoUringConfig, numRings int) (*ParallelBatchIoUringReader, error) { + if numRings <= 0 { + numRings = 1 + } + readers := make([]*BatchIoUringReader, numRings) + for i := 0; i < numRings; i++ { + r, err := NewBatchIoUringReader(cfg) + if err != nil { + for j := 0; j < i; j++ { + readers[j].Close() + } + return nil, fmt.Errorf("parallel batch reader ring %d: %w", i, err) + } + readers[i] = r + } + return &ParallelBatchIoUringReader{readers: readers}, nil +} + +// Submit routes the pread to the next ring via round-robin. Thread-safe. 
+func (p *ParallelBatchIoUringReader) Submit(fd int, buf []byte, offset uint64) (int, error) { + idx := p.next.Add(1) % uint64(len(p.readers)) + return p.readers[idx].Submit(fd, buf, offset) +} + +// Close shuts down all underlying batch readers. +func (p *ParallelBatchIoUringReader) Close() { + for _, r := range p.readers { + r.Close() + } +} diff --git a/flashring/internal/fs/fs.go b/flashring/internal/fs/fs.go index 186e524e..b69be0a4 100644 --- a/flashring/internal/fs/fs.go +++ b/flashring/internal/fs/fs.go @@ -32,6 +32,7 @@ var ( ErrFileSizeExceeded = errors.New("file size exceeded. Please punch hole") ErrFileOffsetOutOfRange = errors.New("file offset is out of range") ErrOffsetNotAligned = errors.New("offset is not aligned to block size") + ErrReadTimeout = errors.New("read timeout") ) type Stat struct { diff --git a/flashring/internal/fs/iouring.go b/flashring/internal/fs/iouring.go new file mode 100644 index 00000000..4b5b18b3 --- /dev/null +++ b/flashring/internal/fs/iouring.go @@ -0,0 +1,585 @@ +//go:build linux +// +build linux + +// Package fs provides a minimal io_uring implementation using raw syscalls. +// No external dependencies beyond golang.org/x/sys/unix are needed. +// Compatible with Go 1.24+ (no go:linkname usage). 
+package fs + +import ( + "fmt" + "sync" + "sync/atomic" + "syscall" + "time" + "unsafe" + + "github.com/Meesho/BharatMLStack/flashring/pkg/metrics" + "golang.org/x/sys/unix" +) + +// ----------------------------------------------------------------------- +// io_uring syscall numbers (amd64) +// ----------------------------------------------------------------------- + +const ( + sysIOUringSetup = 425 + sysIOUringEnter = 426 + sysIOUringRegister = 427 +) + +// ----------------------------------------------------------------------- +// io_uring constants +// ----------------------------------------------------------------------- + +const ( + // Setup flags + iouringSetupSQPoll = 1 << 1 + + // Enter flags + iouringEnterGetEvents = 1 << 0 + iouringEnterSQWakeup = 1 << 1 + + // SQ flags (read from kernel-shared memory) + iouringSQNeedWakeup = 1 << 0 + + // Opcodes + iouringOpNop = 0 + iouringOpRead = 22 + iouringOpWrite = 23 + + // offsets for mmap + iouringOffSQRing = 0 + iouringOffCQRing = 0x8000000 + iouringOffSQEs = 0x10000000 +) + +// ----------------------------------------------------------------------- +// io_uring kernel structures (must match kernel ABI exactly) +// ----------------------------------------------------------------------- + +// ioUringSqe is the 64-byte submission queue entry. +type ioUringSqe struct { + Opcode uint8 + Flags uint8 + IoPrio uint16 + Fd int32 + Off uint64 // union: off / addr2 + Addr uint64 // union: addr / splice_off_in + Len uint32 + OpFlags uint32 // union: rw_flags, etc. + UserData uint64 + BufIndex uint16 // union: buf_index / buf_group + _ uint16 // personality + _ int32 // splice_fd_in / file_index + _ uint64 // addr3 + _ uint64 // __pad2[0] +} + +// ioUringCqe is the 16-byte completion queue entry. +type ioUringCqe struct { + UserData uint64 + Res int32 + Flags uint32 +} + +// ioUringParams is passed to io_uring_setup. 
+type ioUringParams struct { + SqEntries uint32 + CqEntries uint32 + Flags uint32 + SqThreadCPU uint32 + SqThreadIdle uint32 + Features uint32 + WqFd uint32 + Resv [3]uint32 + SqOff ioUringSqringOffsets + CqOff ioUringCqringOffsets +} + +type ioUringSqringOffsets struct { + Head uint32 + Tail uint32 + RingMask uint32 + RingEntries uint32 + Flags uint32 + Dropped uint32 + Array uint32 + Resv1 uint32 + Resv2 uint64 +} + +type ioUringCqringOffsets struct { + Head uint32 + Tail uint32 + RingMask uint32 + RingEntries uint32 + Overflow uint32 + Cqes uint32 + Flags uint32 + Resv1 uint32 + Resv2 uint64 +} + +// ----------------------------------------------------------------------- +// IoUring is the main ring handle +// ----------------------------------------------------------------------- + +// IoUring wraps a single io_uring instance with SQ/CQ ring mappings. +type IoUring struct { + fd int + + // SQ ring mapped memory + sqRingPtr []byte + sqMask uint32 + sqEntries uint32 + sqHead *uint32 // kernel-updated + sqTail *uint32 // user-updated + sqFlags *uint32 // kernel-updated (NEED_WAKEUP etc.) + sqArray unsafe.Pointer + sqeTail uint32 // local tracking of next SQE slot + sqeHead uint32 // local tracking of submitted SQEs + sqesMmap []byte + sqesBase unsafe.Pointer // base pointer to SQE array + sqRingSz int + cqRingSz int + sqesSz int + singleMmap bool + + // CQ ring mapped memory + cqRingPtr []byte + cqMask uint32 + cqEntries uint32 + cqHead *uint32 // user-updated + cqTail *uint32 // kernel-updated + cqesBase unsafe.Pointer + + // Setup flags + flags uint32 + + // Mutex for concurrent SQE submission from multiple goroutines + mu sync.Mutex + + // Diagnostic counter -- limits debug output to first N failures + debugCount int +} + +// NewIoUring creates a new io_uring instance with the given queue depth. +// flags can be 0 for normal mode. 
+func NewIoUring(entries uint32, flags uint32) (*IoUring, error) { + var params ioUringParams + params.Flags = flags + + fd, _, errno := syscall.Syscall(sysIOUringSetup, uintptr(entries), uintptr(unsafe.Pointer(¶ms)), 0) + if errno != 0 { + return nil, fmt.Errorf("io_uring_setup failed: %w", errno) + } + + ring := &IoUring{ + fd: int(fd), + flags: params.Flags, + } + + if err := ring.mapRings(¶ms); err != nil { + syscall.Close(ring.fd) + return nil, err + } + + return ring, nil +} + +func (r *IoUring) mapRings(p *ioUringParams) error { + sqOff := &p.SqOff + cqOff := &p.CqOff + + // Calculate SQ ring size + r.sqRingSz = int(sqOff.Array + p.SqEntries*4) // Array + entries*sizeof(uint32) + + // Calculate CQ ring size + r.cqRingSz = int(cqOff.Cqes + p.CqEntries*uint32(unsafe.Sizeof(ioUringCqe{}))) + + // Check if kernel supports single mmap for both rings + r.singleMmap = (p.Features & 1) != 0 // IORING_FEAT_SINGLE_MMAP = 1 + if r.singleMmap { + if r.cqRingSz > r.sqRingSz { + r.sqRingSz = r.cqRingSz + } + } + + // Map SQ ring + var err error + r.sqRingPtr, err = unix.Mmap(r.fd, iouringOffSQRing, r.sqRingSz, + unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED|unix.MAP_POPULATE) + if err != nil { + return fmt.Errorf("mmap SQ ring: %w", err) + } + + // Map CQ ring (same or separate mapping) + if r.singleMmap { + r.cqRingPtr = r.sqRingPtr + } else { + r.cqRingPtr, err = unix.Mmap(r.fd, iouringOffCQRing, r.cqRingSz, + unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED|unix.MAP_POPULATE) + if err != nil { + unix.Munmap(r.sqRingPtr) + return fmt.Errorf("mmap CQ ring: %w", err) + } + } + + // Map SQE array + r.sqesSz = int(p.SqEntries) * int(unsafe.Sizeof(ioUringSqe{})) + r.sqesMmap, err = unix.Mmap(r.fd, iouringOffSQEs, r.sqesSz, + unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED|unix.MAP_POPULATE) + if err != nil { + unix.Munmap(r.sqRingPtr) + if !r.singleMmap { + unix.Munmap(r.cqRingPtr) + } + return fmt.Errorf("mmap SQEs: %w", err) + } + r.sqesBase = 
unsafe.Pointer(&r.sqesMmap[0]) + + // Set up SQ ring pointers + sqBase := unsafe.Pointer(&r.sqRingPtr[0]) + r.sqHead = (*uint32)(unsafe.Add(sqBase, sqOff.Head)) + r.sqTail = (*uint32)(unsafe.Add(sqBase, sqOff.Tail)) + r.sqFlags = (*uint32)(unsafe.Add(sqBase, sqOff.Flags)) + r.sqMask = *(*uint32)(unsafe.Add(sqBase, sqOff.RingMask)) + r.sqEntries = *(*uint32)(unsafe.Add(sqBase, sqOff.RingEntries)) + r.sqArray = unsafe.Add(sqBase, sqOff.Array) + + // Set up CQ ring pointers + cqBase := unsafe.Pointer(&r.cqRingPtr[0]) + r.cqHead = (*uint32)(unsafe.Add(cqBase, cqOff.Head)) + r.cqTail = (*uint32)(unsafe.Add(cqBase, cqOff.Tail)) + r.cqMask = *(*uint32)(unsafe.Add(cqBase, cqOff.RingMask)) + r.cqEntries = *(*uint32)(unsafe.Add(cqBase, cqOff.RingEntries)) + r.cqesBase = unsafe.Add(cqBase, cqOff.Cqes) + + return nil +} + +// Close releases all resources associated with the ring. +func (r *IoUring) Close() { + unix.Munmap(r.sqesMmap) + unix.Munmap(r.sqRingPtr) + if !r.singleMmap { + unix.Munmap(r.cqRingPtr) + } + syscall.Close(r.fd) +} + +// ----------------------------------------------------------------------- +// SQE helpers +// ----------------------------------------------------------------------- + +func (r *IoUring) getSqeAt(idx uint32) *ioUringSqe { + return (*ioUringSqe)(unsafe.Add(r.sqesBase, uintptr(idx)*unsafe.Sizeof(ioUringSqe{}))) +} + +func (r *IoUring) getCqeAt(idx uint32) *ioUringCqe { + return (*ioUringCqe)(unsafe.Add(r.cqesBase, uintptr(idx)*unsafe.Sizeof(ioUringCqe{}))) +} + +func (r *IoUring) sqArrayAt(idx uint32) *uint32 { + return (*uint32)(unsafe.Add(r.sqArray, uintptr(idx)*4)) +} + +// getSqe returns the next available SQE, or nil if the SQ is full. 
+func (r *IoUring) getSqe() *ioUringSqe { + head := atomic.LoadUint32(r.sqHead) + next := r.sqeTail + 1 + if next-head > r.sqEntries { + return nil // SQ full + } + sqe := r.getSqeAt(r.sqeTail & r.sqMask) + r.sqeTail++ + // Zero out the SQE + *sqe = ioUringSqe{} + return sqe +} + +// flushSq flushes locally queued SQEs into the kernel-visible SQ ring. +func (r *IoUring) flushSq() uint32 { + tail := *r.sqTail + toSubmit := r.sqeTail - r.sqeHead + if toSubmit == 0 { + return tail - atomic.LoadUint32(r.sqHead) + } + for ; toSubmit > 0; toSubmit-- { + *r.sqArrayAt(tail & r.sqMask) = r.sqeHead & r.sqMask + tail++ + r.sqeHead++ + } + atomic.StoreUint32(r.sqTail, tail) + return tail - atomic.LoadUint32(r.sqHead) +} + +// ----------------------------------------------------------------------- +// Submission and completion +// ----------------------------------------------------------------------- + +func ioUringEnter(fd int, toSubmit, minComplete, flags uint32) (int, error) { + ret, _, errno := syscall.Syscall6(sysIOUringEnter, + uintptr(fd), uintptr(toSubmit), uintptr(minComplete), uintptr(flags), 0, 0) + if errno != 0 { + return int(ret), errno + } + return int(ret), nil +} + +// submit flushes SQEs and calls io_uring_enter if needed. +// Retries automatically on EINTR (signal interruption). 
+func (r *IoUring) submit(waitNr uint32) (int, error) { + submitted := r.flushSq() + var flags uint32 = 0 + + // If not using SQPOLL, we always need to enter + if r.flags&iouringSetupSQPoll == 0 { + if waitNr > 0 { + flags |= iouringEnterGetEvents + } + for { + ret, err := ioUringEnter(r.fd, submitted, waitNr, flags) + if err == syscall.EINTR { + continue + } + return ret, err + } + } + + // SQPOLL: only enter if kernel thread needs wakeup + if atomic.LoadUint32(r.sqFlags)&iouringSQNeedWakeup != 0 { + flags |= iouringEnterSQWakeup + } + if waitNr > 0 { + flags |= iouringEnterGetEvents + } + if flags != 0 { + for { + ret, err := ioUringEnter(r.fd, submitted, waitNr, flags) + if err == syscall.EINTR { + continue + } + return ret, err + } + } + return int(submitted), nil +} + +// waitCqe waits for at least one CQE to be available and returns it. +// The caller MUST call SeenCqe after processing. +func (r *IoUring) waitCqe() (*ioUringCqe, error) { + for { + head := atomic.LoadUint32(r.cqHead) + tail := atomic.LoadUint32(r.cqTail) + if head != tail { + cqe := r.getCqeAt(head & r.cqMask) + return cqe, nil + } + // No CQE available, ask the kernel + _, err := ioUringEnter(r.fd, 0, 1, iouringEnterGetEvents) + if err != nil { + if err == syscall.EINTR { + continue // signal interrupted the syscall; retry + } + return nil, err + } + } +} + +// seenCqe advances the CQ head by 1, releasing the CQE slot. 
+func (r *IoUring) seenCqe() { + atomic.StoreUint32(r.cqHead, atomic.LoadUint32(r.cqHead)+1) +} + +// ----------------------------------------------------------------------- +// PrepRead / PrepWrite helpers +// ----------------------------------------------------------------------- + +func prepRead(sqe *ioUringSqe, fd int, buf []byte, offset uint64) { + if len(buf) == 0 { + sqe.Opcode = iouringOpNop + return + } + sqe.Opcode = iouringOpRead + sqe.Fd = int32(fd) + sqe.Addr = uint64(uintptr(unsafe.Pointer(&buf[0]))) + sqe.Len = uint32(len(buf)) + sqe.Off = offset +} + +func prepWrite(sqe *ioUringSqe, fd int, buf []byte, offset uint64) { + if len(buf) == 0 { + sqe.Opcode = iouringOpNop + return + } + sqe.Opcode = iouringOpWrite + sqe.Fd = int32(fd) + sqe.Addr = uint64(uintptr(unsafe.Pointer(&buf[0]))) + sqe.Len = uint32(len(buf)) + sqe.Off = offset +} + +// ----------------------------------------------------------------------- +// High-level thread-safe API +// ----------------------------------------------------------------------- + +// SubmitRead submits a pread and waits for completion. Thread-safe. +// Returns bytes read or an error. 
+func (r *IoUring) SubmitRead(fd int, buf []byte, offset uint64) (int, error) { + if len(buf) == 0 { + return 0, nil + } + + r.mu.Lock() + + sqe := r.getSqe() + if sqe == nil { + r.mu.Unlock() + return 0, fmt.Errorf("io_uring: SQ full, no SQE available") + } + prepRead(sqe, fd, buf, offset) + // Tag the SQE so we can verify the CQE belongs to this request + sqe.UserData = offset + + submitted, err := r.submit(1) + if err != nil { + r.mu.Unlock() + return 0, fmt.Errorf("io_uring_enter failed: %w", err) + } + + cqe, err := r.waitCqe() + if err != nil { + r.mu.Unlock() + return 0, fmt.Errorf("io_uring wait cqe: %w", err) + } + + res := cqe.Res + userData := cqe.UserData + cqeFlags := cqe.Flags + r.seenCqe() + r.mu.Unlock() + + if res < 0 { + return 0, fmt.Errorf("io_uring pread errno %d (%s), fd=%d off=%d len=%d submitted=%d ud=%d", + -res, syscall.Errno(-res), fd, offset, len(buf), submitted, userData) + } + + // Diagnostic: if io_uring returned 0 (EOF) or short read, compare with syscall.Pread + if r.debugCount < 20 && int(res) != len(buf) { + r.debugCount++ + pn, perr := syscall.Pread(fd, buf, int64(offset)) + // Also stat the fd to check file size + var stat syscall.Stat_t + fstatErr := syscall.Fstat(fd, &stat) + var fsize int64 + if fstatErr == nil { + fsize = stat.Size + } + fmt.Printf("[io_uring diag] fd=%d off=%d len=%d uring_res=%d uring_ud=%d uring_flags=%d "+ + "submitted=%d pread_n=%d pread_err=%v filesize=%d fstat_err=%v sqeHead=%d sqeTail=%d\n", + fd, offset, len(buf), res, userData, cqeFlags, + submitted, pn, perr, fsize, fstatErr, r.sqeHead, r.sqeTail) + } + + return int(res), nil +} + +// SubmitWriteBatch submits N pwrite operations in a single io_uring_enter call +// and waits for all completions. Thread-safe. +// Returns per-chunk bytes written. On error, partial results may be returned. 
+func (r *IoUring) SubmitWriteBatch(fd int, bufs [][]byte, offsets []uint64) ([]int, error) { + n := len(bufs) + if n == 0 { + return nil, nil + } + + r.mu.Lock() + defer r.mu.Unlock() + + // Prepare all SQEs + for i := 0; i < n; i++ { + sqe := r.getSqe() + if sqe == nil { + return nil, fmt.Errorf("io_uring: SQ full, need %d slots but ring has %d", n, r.sqEntries) + } + prepWrite(sqe, fd, bufs[i], offsets[i]) + sqe.UserData = uint64(i) + } + + // Submit all at once; kernel waits for all completions + _, err := r.submit(uint32(n)) + if err != nil { + return nil, fmt.Errorf("io_uring_enter: %w", err) + } + + var startTime time.Time + if metrics.Enabled() { + startTime = time.Now() + } + + // Drain all CQEs (order may differ from submission) + results := make([]int, n) + for i := 0; i < n; i++ { + cqe, err := r.waitCqe() + if err != nil { + return results, fmt.Errorf("io_uring waitCqe: %w", err) + } + idx := int(cqe.UserData) + res := cqe.Res + r.seenCqe() + + if res < 0 { + return results, fmt.Errorf("io_uring pwrite errno %d (%s), fd=%d off=%d len=%d", + -res, syscall.Errno(-res), fd, offsets[idx], len(bufs[idx])) + } + if idx >= 0 && idx < n { + results[idx] = int(res) + } + + if metrics.Enabled() { + metrics.Timing(metrics.KEY_PWRITE_LATENCY, time.Since(startTime), []string{}) + } + } + + return results, nil +} + +// SubmitWrite submits a pwrite and waits for completion. Thread-safe. +// Returns bytes written or an error. 
+func (r *IoUring) SubmitWrite(fd int, buf []byte, offset uint64) (int, error) { + if len(buf) == 0 { + return 0, nil + } + + r.mu.Lock() + + sqe := r.getSqe() + if sqe == nil { + r.mu.Unlock() + return 0, fmt.Errorf("io_uring: SQ full, no SQE available") + } + prepWrite(sqe, fd, buf, offset) + + _, err := r.submit(1) + if err != nil { + r.mu.Unlock() + return 0, fmt.Errorf("io_uring_enter failed: %w", err) + } + + cqe, err := r.waitCqe() + if err != nil { + r.mu.Unlock() + return 0, fmt.Errorf("io_uring wait cqe: %w", err) + } + + res := cqe.Res + r.seenCqe() + r.mu.Unlock() + + if res < 0 { + return 0, fmt.Errorf("io_uring pwrite failed: errno %d (%s)", -res, syscall.Errno(-res)) + } + return int(res), nil +} diff --git a/flashring/internal/fs/iouring_test.go b/flashring/internal/fs/iouring_test.go new file mode 100644 index 00000000..37f1cfa7 --- /dev/null +++ b/flashring/internal/fs/iouring_test.go @@ -0,0 +1,103 @@ +//go:build linux +// +build linux + +package fs + +import ( + "os" + "syscall" + "testing" + "unsafe" +) + +func TestIoUringBasicRead(t *testing.T) { + // 1. Create a temp file with known data + f, err := os.CreateTemp("", "iouring_test_*") + if err != nil { + t.Fatal(err) + } + defer os.Remove(f.Name()) + + data := make([]byte, 4096) + for i := range data { + data[i] = byte(i % 251) // non-zero pattern + } + if _, err := f.Write(data); err != nil { + t.Fatal(err) + } + if err := f.Sync(); err != nil { + t.Fatal(err) + } + f.Close() + + // 2. Open with O_DIRECT | O_RDONLY + fd, err := syscall.Open(f.Name(), syscall.O_RDONLY|syscall.O_DIRECT, 0) + if err != nil { + t.Fatalf("open O_DIRECT: %v", err) + } + defer syscall.Close(fd) + + // 3. Create io_uring ring + ring, err := NewIoUring(32, 0) + if err != nil { + t.Fatalf("NewIoUring: %v", err) + } + defer ring.Close() + + // 4. Allocate aligned buffer + buf := AlignedBlock(4096, 4096) + + // 5. 
Submit read via io_uring + n, err := ring.SubmitRead(fd, buf, 0) + if err != nil { + t.Fatalf("SubmitRead: %v", err) + } + if n != 4096 { + t.Fatalf("SubmitRead returned %d bytes, expected 4096", n) + } + + // 6. Verify data + for i := 0; i < 4096; i++ { + if buf[i] != data[i] { + t.Fatalf("data mismatch at byte %d: got %d, want %d", i, buf[i], data[i]) + } + } + t.Logf("io_uring read of 4096 bytes succeeded and data matches") + + // 7. Test a second read (to verify ring reuse works) + buf2 := AlignedBlock(4096, 4096) + n2, err := ring.SubmitRead(fd, buf2, 0) + if err != nil { + t.Fatalf("SubmitRead #2: %v", err) + } + if n2 != 4096 { + t.Fatalf("SubmitRead #2 returned %d bytes, expected 4096", n2) + } + for i := 0; i < 4096; i++ { + if buf2[i] != data[i] { + t.Fatalf("data mismatch #2 at byte %d: got %d, want %d", i, buf2[i], data[i]) + } + } + t.Logf("io_uring second read also succeeded") + + // 8. Test multiple sequential reads to exercise ring cycling + for iter := 0; iter < 100; iter++ { + buf3 := AlignedBlock(4096, 4096) + n3, err := ring.SubmitRead(fd, buf3, 0) + if err != nil { + t.Fatalf("SubmitRead iter %d: %v", iter, err) + } + if n3 != 4096 { + t.Fatalf("SubmitRead iter %d returned %d bytes, expected 4096", iter, n3) + } + } + t.Logf("100 sequential io_uring reads succeeded") +} + +// AlignedBlock returns a 4096-byte-aligned buffer. +func AlignedBlock(size, alignment int) []byte { + raw := make([]byte, size+alignment) + addr := uintptr(unsafe.Pointer(&raw[0])) + off := (alignment - int(addr%uintptr(alignment))) % alignment + return raw[off : off+size] +} diff --git a/flashring/internal/fs/iouring_wrapper.go b/flashring/internal/fs/iouring_wrapper.go new file mode 100644 index 00000000..b059e4ed --- /dev/null +++ b/flashring/internal/fs/iouring_wrapper.go @@ -0,0 +1,40 @@ +//go:build linux +// +build linux + +package fs + +import ( + "fmt" +) + +// IOUringFile wraps an existing WrapAppendFile with an io_uring ring for async I/O. 
+// It does NOT own the WrapAppendFile -- the caller manages its lifecycle. +type IOUringFile struct { + *WrapAppendFile // embed existing file (shared, not owned) + ring *IoUring // our raw io_uring instance + depth uint32 // submission queue depth +} + +// NewIOUringFile attaches an io_uring ring to an existing WrapAppendFile. +// The WrapAppendFile is shared (not duplicated) -- writes and reads use +// the same file descriptors, so offset tracking stays in sync. +// ringDepth controls the SQ/CQ size (64-256 is a good starting point). +// flags can be 0 for normal mode. +func NewIOUringFile(waf *WrapAppendFile, ringDepth uint32, flags uint32) (*IOUringFile, error) { + ring, err := NewIoUring(ringDepth, flags) + if err != nil { + return nil, fmt.Errorf("io_uring init failed: %w", err) + } + + return &IOUringFile{ + WrapAppendFile: waf, + ring: ring, + depth: ringDepth, + }, nil +} + +// Close releases only the io_uring ring. The underlying WrapAppendFile +// is NOT closed here since it is shared with the shard. 
+func (f *IOUringFile) Close() { + f.ring.Close() +} diff --git a/flashring/internal/fs/wrap_file.go b/flashring/internal/fs/wrap_file.go index fc91e006..3ef52fa8 100644 --- a/flashring/internal/fs/wrap_file.go +++ b/flashring/internal/fs/wrap_file.go @@ -6,7 +6,9 @@ package fs import ( "os" "syscall" + "time" + "github.com/Meesho/BharatMLStack/flashring/pkg/metrics" "golang.org/x/sys/unix" ) @@ -25,6 +27,7 @@ type WrapAppendFile struct { WriteFile *os.File // write file ReadFile *os.File // read file Stat *Stat // file statistics + WriteRing *IoUring // optional io_uring ring for batched writes } func NewWrapAppendFile(config FileConfig) (*WrapAppendFile, error) { @@ -72,20 +75,96 @@ func (r *WrapAppendFile) Pwrite(buf []byte) (currentPhysicalOffset int64, err er return 0, ErrBufNoAlign } } + var startTime time.Time + if metrics.Enabled() { + startTime = time.Now() + } n, err := syscall.Pwrite(r.WriteFd, buf, r.PhysicalWriteOffset) + if metrics.Enabled() { + metrics.Timing(metrics.KEY_PWRITE_LATENCY, time.Since(startTime), []string{}) + } if err != nil { return 0, err } + r.PhysicalWriteOffset += int64(n) if r.PhysicalWriteOffset >= r.MaxFileSize { r.wrapped = true r.PhysicalWriteOffset = r.PhysicalStartOffset } r.LogicalCurrentOffset += int64(n) - r.Stat.WriteCount++ + return r.PhysicalWriteOffset, nil } +// PwriteBatch writes a large buffer in chunkSize pieces via io_uring. +// Chunks are submitted in sub-batches that fit within the ring's SQ depth, +// so arbitrarily large buffers work regardless of ring size. +// Returns total bytes written and the final PhysicalWriteOffset. +// Requires WriteRing to be set; falls back to sequential Pwrite if nil. 
+func (r *WrapAppendFile) PwriteBatch(buf []byte, chunkSize int) (totalWritten int, fileOffset int64, err error) { + if r.WriteRing == nil { + // Fallback: sequential pwrite + for written := 0; written < len(buf); written += chunkSize { + end := written + chunkSize + if end > len(buf) { + end = len(buf) + } + fileOffset, err = r.Pwrite(buf[written:end]) + if err != nil { + return written, fileOffset, err + } + totalWritten += end - written + } + return totalWritten, fileOffset, nil + } + + if r.WriteDirectIO { + if !isAlignedBuffer(buf, r.blockSize) { + return 0, 0, ErrBufNoAlign + } + } + + // Maximum SQEs per submission -- capped to ring depth. + maxPerBatch := int(r.WriteRing.sqEntries) + + for written := 0; written < len(buf); { + // Build a sub-batch that fits within the ring + var bufs [][]byte + var offsets []uint64 + + for i := 0; i < maxPerBatch && written < len(buf); i++ { + end := written + chunkSize + if end > len(buf) { + end = len(buf) + } + bufs = append(bufs, buf[written:end]) + offsets = append(offsets, uint64(r.PhysicalWriteOffset)) + + // Advance write offset, handle ring-buffer wrap + r.PhysicalWriteOffset += int64(end - written) + if r.PhysicalWriteOffset >= r.MaxFileSize { + r.wrapped = true + r.PhysicalWriteOffset = r.PhysicalStartOffset + } + written = end + } + + results, serr := r.WriteRing.SubmitWriteBatch(r.WriteFd, bufs, offsets) + if serr != nil { + return totalWritten, r.PhysicalWriteOffset, serr + } + + for _, n := range results { + totalWritten += n + r.LogicalCurrentOffset += int64(n) + r.Stat.WriteCount++ + } + } + + return totalWritten, r.PhysicalWriteOffset, nil +} + func (r *WrapAppendFile) TrimHeadIfNeeded() bool { if r.wrapped && r.PhysicalWriteOffset == r.PhysicalStartOffset { return true @@ -126,7 +205,14 @@ func (r *WrapAppendFile) Pread(fileOffset int64, buf []byte) (int32, error) { return 0, ErrFileOffsetOutOfRange } + var startTime time.Time + if metrics.Enabled() { + startTime = time.Now() + } n, err := 
syscall.Pread(r.ReadFd, buf, fileOffset) + if metrics.Enabled() { + metrics.Timing(metrics.KEY_PREAD_LATENCY, time.Since(startTime), []string{}) + } // flags := unix.RWF_HIPRI // optionally: | unix.RWF_NOWAIT // n, err := preadv2(r.ReadFd, buf, fileOffset, flags) if err != nil { @@ -136,7 +222,97 @@ func (r *WrapAppendFile) Pread(fileOffset int64, buf []byte) (int32, error) { return int32(n), nil } +// ValidateReadOffset checks the read window and wraps the offset for ring-buffer +// files. Returns the physical file offset to use, or an error. +// Mirrors the validation logic in PreadAsync / Pread so callers that bypass +// PreadAsync (e.g. the batched io_uring path) get identical safety checks. +func (r *WrapAppendFile) ValidateReadOffset(fileOffset int64, bufLen int) (int64, error) { + if r.ReadDirectIO { + if !isAlignedOffset(fileOffset, r.blockSize) { + return 0, ErrOffsetNotAligned + } + } + + readEnd := fileOffset + int64(bufLen) + valid := false + + if !r.wrapped { + valid = fileOffset >= r.PhysicalStartOffset && readEnd <= r.PhysicalWriteOffset + } else { + fileOffset = fileOffset % r.MaxFileSize + readEnd = readEnd % r.MaxFileSize + if fileOffset >= r.PhysicalStartOffset { + valid = readEnd <= r.MaxFileSize + } else { + valid = readEnd <= r.PhysicalWriteOffset + } + } + if !valid { + return 0, ErrFileOffsetOutOfRange + } + + return fileOffset, nil +} + +// PreadAsync submits a pread via io_uring and waits for completion. +// Thread-safe: multiple goroutines can call this concurrently on the same IOUringFile. +// Applies the same read-window validation and offset wrapping as Pread so that +// stale index entries (pointing past MaxFileSize) are rejected cheaply without +// hitting the kernel. 
+func (f *IOUringFile) PreadAsync(fileOffset int64, buf []byte) (int, error) { + if f.ReadDirectIO { + if !isAlignedOffset(fileOffset, f.blockSize) { + return 0, ErrOffsetNotAligned + } + if !isAlignedBuffer(buf, f.blockSize) { + return 0, ErrBufNoAlign + } + } + + // Validate read window and wrap offset (mirrors Pread logic exactly) + readEnd := fileOffset + int64(len(buf)) + valid := false + + if !f.wrapped { + // Single valid region: [PhysicalStartOffset, PhysicalWriteOffset) + valid = fileOffset >= f.PhysicalStartOffset && readEnd <= f.PhysicalWriteOffset + } else { + // Ring buffer has wrapped -- map the logical offset back into [0, MaxFileSize) + fileOffset = fileOffset % f.MaxFileSize + readEnd = readEnd % f.MaxFileSize + if fileOffset >= f.PhysicalStartOffset { + valid = readEnd <= f.MaxFileSize + } else { + valid = readEnd <= f.PhysicalWriteOffset + } + } + if !valid { + return 0, ErrFileOffsetOutOfRange + } + + var startTime time.Time + if metrics.Enabled() { + startTime = time.Now() + } + n, err := f.ring.SubmitRead(f.ReadFd, buf, uint64(fileOffset)) + if metrics.Enabled() { + metrics.Incr(metrics.KEY_PREAD_COUNT, []string{}) + metrics.Timing(metrics.KEY_PREAD_LATENCY, time.Since(startTime), []string{}) + } + if err != nil { + return 0, err + } + + f.Stat.ReadCount++ + return n, nil +} + func (r *WrapAppendFile) TrimHead() (err error) { + + var startTime time.Time + if metrics.Enabled() { + startTime = time.Now() + } if r.WriteDirectIO { if !isAlignedOffset(r.PhysicalStartOffset, r.blockSize) { return ErrOffsetNotAligned @@ -150,7 +326,10 @@ func (r *WrapAppendFile) TrimHead() (err error) { if r.PhysicalStartOffset >= r.MaxFileSize { r.PhysicalStartOffset = 0 } - r.Stat.PunchHoleCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_PUNCH_HOLE_COUNT, []string{}) + metrics.Timing(metrics.KEY_TRIM_HEAD_LATENCY, time.Since(startTime), []string{}) + } return nil } diff --git a/flashring/internal/indicesV3/delete_manager.go 
b/flashring/internal/indicesV3/delete_manager.go index 6b218915..c6e632db 100644 --- a/flashring/internal/indicesV3/delete_manager.go +++ b/flashring/internal/indicesV3/delete_manager.go @@ -1,6 +1,7 @@ package indicesv2 import ( + "errors" "fmt" "github.com/Meesho/BharatMLStack/flashring/internal/fs" @@ -62,6 +63,9 @@ func (dm *DeleteManager) ExecuteDeleteIfNeeded() error { if trimNeeded || nextAddNeedsDelete { dm.deleteInProgress = true dm.deleteCount = int(dm.memtableData[dm.toBeDeletedMemId] / dm.deleteAmortizedStep) + if dm.deleteCount == 0 { + dm.deleteCount = int(dm.memtableData[dm.toBeDeletedMemId] % dm.deleteAmortizedStep) + } memIdAtHead, err := dm.keyIndex.PeekMemIdAtHead() if err != nil { return err @@ -69,8 +73,9 @@ func (dm *DeleteManager) ExecuteDeleteIfNeeded() error { if memIdAtHead != dm.toBeDeletedMemId { return fmt.Errorf("memIdAtHead: %d, toBeDeletedMemId: %d", memIdAtHead, dm.toBeDeletedMemId) } + dm.wrapFile.TrimHead() - return nil + return errors.New("trim needed retry this write") } return nil } diff --git a/flashring/internal/indicesV3/index.go b/flashring/internal/indicesV3/index.go index 29261585..aa4b3556 100644 --- a/flashring/internal/indicesV3/index.go +++ b/flashring/internal/indicesV3/index.go @@ -7,7 +7,6 @@ import ( "github.com/Meesho/BharatMLStack/flashring/internal/maths" "github.com/cespare/xxhash/v2" - "github.com/rs/zerolog/log" "github.com/zeebo/xxh3" ) @@ -22,20 +21,22 @@ const ( ) type Index struct { - rm sync.Map + mu *sync.RWMutex + rm map[uint64]int rb *RingBuffer mc *maths.MorrisLogCounter startAt int64 hashBits int } -func NewIndex(hashBits int, rbInitial, rbMax, deleteAmortizedStep int) *Index { +func NewIndex(hashBits int, rbInitial, rbMax, deleteAmortizedStep int, mu *sync.RWMutex) *Index { if ByteOrder == nil { loadByteOrder() } // rm := make(map[uint64]int) return &Index{ - rm: sync.Map{}, + mu: mu, + rm: make(map[uint64]int), rb: NewRingBuffer(rbInitial, rbMax), mc: maths.New(12), startAt: time.Now().Unix(), @@ 
-52,15 +53,15 @@ func (i *Index) Put(key string, length, ttlInMinutes uint16, memId, offset uint3 delta := uint16(expiryAt - (i.startAt / 60)) encode(key, length, delta, lastAccess, freq, memId, offset, entry) - if headIdx, ok := i.rm.Load(hlo); !ok { + if headIdx, ok := i.rm[hlo]; !ok { encodeHashNextPrev(hhi, hlo, -1, -1, hashNextPrev) - i.rm.Store(hlo, idx) + i.rm[hlo] = idx return } else { - _, headHashNextPrev, _ := i.rb.Get(int(headIdx.(int))) + _, headHashNextPrev, _ := i.rb.Get(int(headIdx)) encodeUpdatePrev(int32(idx), headHashNextPrev) - encodeHashNextPrev(hhi, hlo, -1, int32(headIdx.(int)), hashNextPrev) - i.rm.Store(hlo, idx) + encodeHashNextPrev(hhi, hlo, -1, int32(headIdx), hashNextPrev) + i.rm[hlo] = idx return } @@ -68,9 +69,14 @@ func (i *Index) Put(key string, length, ttlInMinutes uint16, memId, offset uint3 func (i *Index) Get(key string) (length, lastAccess, remainingTTL uint16, freq uint64, memId, offset uint32, status Status) { hhi, hlo := hash128(key) - if idx, ok := i.rm.Load(hlo); ok { - entry, hashNextPrev, _ := i.rb.Get(int(idx.(int))) + + i.mu.RLock() + idx, ok := i.rm[hlo] + i.mu.RUnlock() + + if ok { for { + entry, hashNextPrev, _ := i.rb.Get(int(idx)) if isHashMatch(hhi, hlo, hashNextPrev) { length, deltaExptime, lastAccess, freq, memId, offset := decode(entry) exptime := int(deltaExptime) + int(i.startAt/60) @@ -96,6 +102,9 @@ func (i *Index) Get(key string) (length, lastAccess, remainingTTL uint16, freq u } func (ix *Index) Delete(count int) (uint32, int) { + if count == 0 { + return 0, 0 + } for i := 0; i < count; i++ { deleted, deletedHashNextPrev, deletedIdx, next := ix.rb.Delete() if deleted == nil { @@ -103,15 +112,15 @@ func (ix *Index) Delete(count int) (uint32, int) { } delMemId, _ := decodeMemIdOffset(deleted) deletedHlo := decodeHashLo(deletedHashNextPrev) - mapIdx, ok := ix.rm.Load(deletedHlo) - if ok && mapIdx.(int) == deletedIdx { - ix.rm.Delete(deletedHlo) + mapIdx, ok := ix.rm[deletedHlo] + if ok && mapIdx == 
deletedIdx { + delete(ix.rm, deletedHlo) } else if ok && hasPrev(deletedHashNextPrev) { prevIdx := decodePrev(deletedHashNextPrev) _, hashNextPrev, _ := ix.rb.Get(int(prevIdx)) encodeUpdateNext(-1, hashNextPrev) } else { - log.Warn().Msgf("broken link. Entry in RB but cannot be linked to map. deletedIdx: %d", deletedIdx) + //log.Warn().Msgf("broken link. Entry in RB but cannot be linked to map. deletedIdx: %d", deletedIdx) } nextMemId, _ := decodeMemIdOffset(next) diff --git a/flashring/internal/indicesV3/index_test.go b/flashring/internal/indicesV3/index_test.go index 3eecea9d..fe4ca081 100644 --- a/flashring/internal/indicesV3/index_test.go +++ b/flashring/internal/indicesV3/index_test.go @@ -2,17 +2,19 @@ package indicesv2 import ( "fmt" + "sync" "testing" ) func TestIndexAddRbMax(t *testing.T) { loadByteOrder() + mu := &sync.RWMutex{} // Use equal initial and max capacity for the fixed-size ring buffer. rbMax := 1000_000 rbInitial := rbMax hashBits := 16 - idx := NewIndex(hashBits, rbInitial, rbMax, 1) + idx := NewIndex(hashBits, rbInitial, rbMax, 1, mu) // Insert exactly rbMax distinct keys for i := 0; i < rbMax; i++ { @@ -64,7 +66,7 @@ func TestIndexDeleteAndGet(t *testing.T) { rbMax := 99 rbInitial := rbMax hashBits := 16 - idx := NewIndex(hashBits, rbInitial, rbMax, 1) + idx := NewIndex(hashBits, rbInitial, rbMax, 1, nil) // Insert exactly rbMax distinct keys in order for i := 0; i < 33; i++ { @@ -137,11 +139,13 @@ func TestIndexDeleteAndGet(t *testing.T) { func TestIndexDeleteAndGetOverlappingHash(t *testing.T) { loadByteOrder() + mu := &sync.RWMutex{} + // Keep this small and fast rbMax := 99 rbInitial := rbMax hashBits := 16 - idx := NewIndex(hashBits, rbInitial, rbMax, 1) + idx := NewIndex(hashBits, rbInitial, rbMax, 1, mu) // Insert exactly rbMax distinct keys in order for i := 0; i < 33; i++ { diff --git a/flashring/internal/maths/estimator.go b/flashring/internal/maths/estimator.go index f477d96e..154298e1 100644 --- 
a/flashring/internal/maths/estimator.go +++ b/flashring/internal/maths/estimator.go @@ -5,6 +5,8 @@ package maths import ( "math" "time" + + "github.com/rs/zerolog/log" ) const ( @@ -75,6 +77,7 @@ func (g *GridSearchEstimator) RecordHitRate(hitRate float64) { stat.HitRate = (stat.HitRate*float64(stat.Trials) + hitRate) / float64(stat.Trials+1) stat.Trials++ if stat.HitRate < g.bestHitRate*0.9 { + log.Error().Msgf("GridSearchRestarted: hitRate %v bestHitRate %v", stat.HitRate, g.bestHitRate) g.RestartGridSearch() } return @@ -130,6 +133,10 @@ func (g *GridSearchEstimator) GenerateRefinedGrid(base WeightTuple, steps int, d refined := make([]WeightTuple, 0, (2*steps+1)*(2*steps+1)) for i := -steps; i <= steps; i++ { for j := -steps; j <= steps; j++ { + + if i == 0 && j == 0 { + continue + } wf := base.WFreq + float64(i)*delta la := base.WLA + float64(j)*delta if math.Abs(wf-base.WFreq) < g.epsilon && math.Abs(la-base.WLA) < g.epsilon { diff --git a/flashring/internal/memtables/manager.go b/flashring/internal/memtables/manager.go index a86fb108..3c313017 100644 --- a/flashring/internal/memtables/manager.go +++ b/flashring/internal/memtables/manager.go @@ -3,6 +3,7 @@ package memtables import ( "github.com/Meesho/BharatMLStack/flashring/internal/allocators" "github.com/Meesho/BharatMLStack/flashring/internal/fs" + "github.com/Meesho/BharatMLStack/flashring/pkg/metrics" "github.com/rs/zerolog/log" ) @@ -16,11 +17,6 @@ type MemtableManager struct { nextFileOffset int64 nextId uint32 semaphore chan int - stats Stats -} - -type Stats struct { - Flushes int64 } func NewMemtableManager(file *fs.WrapAppendFile, capacity int32) (*MemtableManager, error) { @@ -62,7 +58,6 @@ func NewMemtableManager(file *fs.WrapAppendFile, capacity int32) (*MemtableManag nextFileOffset: 2 * int64(capacity), nextId: 2, semaphore: make(chan int, 1), - stats: Stats{}, } return memtableManager, nil } @@ -92,7 +87,9 @@ func (mm *MemtableManager) flushConsumer(memtable *Memtable) { memtable.Id = 
mm.nextId mm.nextId++ mm.nextFileOffset += int64(n) - mm.stats.Flushes++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_MEMTABLE_FLUSH_COUNT, append(metrics.GetShardTag(memtable.ShardIdx), metrics.GetMemtableTag(memtable.Id)...)) + } } func (mm *MemtableManager) Flush() error { diff --git a/flashring/internal/memtables/manager_bench_test.go b/flashring/internal/memtables/manager_bench_test.go index 28738185..c29c0e52 100644 --- a/flashring/internal/memtables/manager_bench_test.go +++ b/flashring/internal/memtables/manager_bench_test.go @@ -48,7 +48,7 @@ func Benchmark_Puts(b *testing.B) { } } - b.ReportMetric(float64(manager.stats.Flushes), "flushes") + // b.ReportMetric(float64(manager.stats.Flushes), "flushes") b.ReportMetric(float64(b.N*16*1024)/1024/1024, "MB/s") b.ReportAllocs() diff --git a/flashring/internal/memtables/memtable.go b/flashring/internal/memtables/memtable.go index bc92f0ff..3be40e4b 100644 --- a/flashring/internal/memtables/memtable.go +++ b/flashring/internal/memtables/memtable.go @@ -4,7 +4,6 @@ import ( "errors" "github.com/Meesho/BharatMLStack/flashring/internal/fs" - "github.com/rs/zerolog/log" ) var ( @@ -25,6 +24,7 @@ type Memtable struct { readyForFlush bool next *Memtable prev *Memtable + ShardIdx uint32 } type MemtableConfig struct { @@ -32,6 +32,7 @@ type MemtableConfig struct { id uint32 page *fs.AlignedPage file *fs.WrapAppendFile + shardIdx uint32 } func NewMemtable(config MemtableConfig) (*Memtable, error) { @@ -49,6 +50,7 @@ func NewMemtable(config MemtableConfig) (*Memtable, error) { } return &Memtable{ Id: config.id, + ShardIdx: config.shardIdx, capacity: config.capacity, currentOffset: 0, file: config.file, @@ -98,15 +100,23 @@ func (m *Memtable) Flush() (n int, fileOffset int64, err error) { if !m.readyForFlush { return 0, 0, ErrMemtableNotReadyForFlush } - fileOffset, err = m.file.Pwrite(m.page.Buf) + + chunkSize := fs.BLOCK_SIZE + numChunks := len(m.page.Buf) / chunkSize + if len(m.page.Buf)%chunkSize != 0 { + 
numChunks++ + } + + // PwriteBatch submits all chunks in one io_uring_enter when WriteRing is + // set, otherwise falls back to sequential pwrite internally. + totalWritten, fileOffset, err := m.file.PwriteBatch(m.page.Buf, chunkSize) if err != nil { return 0, 0, err - } else { - log.Debug().Msgf("Flushed memtable %d to file %d", m.Id, fileOffset) } + m.currentOffset = 0 m.readyForFlush = false - return len(m.page.Buf), fileOffset, nil + return totalWritten, fileOffset, nil } func (m *Memtable) Discard() { diff --git a/flashring/internal/pools/leaky_pool.go b/flashring/internal/pools/leaky_pool.go index b2a59487..afcd1b2e 100644 --- a/flashring/internal/pools/leaky_pool.go +++ b/flashring/internal/pools/leaky_pool.go @@ -11,7 +11,6 @@ type LeakyPool struct { usage int idx int lock sync.RWMutex - stats *Stats } type Stats struct { @@ -34,7 +33,6 @@ func NewLeakyPool(config LeakyPoolConfig) *LeakyPool { usage: 0, idx: -1, preDrefHook: nil, - stats: &Stats{Usage: 0, Capacity: config.Capacity}, } } diff --git a/flashring/internal/shard/batch_reader.go b/flashring/internal/shard/batch_reader.go index 3896834b..c6d462be 100644 --- a/flashring/internal/shard/batch_reader.go +++ b/flashring/internal/shard/batch_reader.go @@ -68,7 +68,6 @@ func (br *BatchReader) processBatches() { return case firstReq := <-br.requests: batch := br.collectBatch(firstReq) - br.shardCache.Stats.BatchTracker.RecordBatchSize(len(batch)) br.executeBatch(batch) } } diff --git a/flashring/internal/shard/batch_reader_v2.go b/flashring/internal/shard/batch_reader_v2.go index 2aa99b09..fb614321 100644 --- a/flashring/internal/shard/batch_reader_v2.go +++ b/flashring/internal/shard/batch_reader_v2.go @@ -94,7 +94,6 @@ func (br *BatchReaderV2) processBatchesV2() { return case firstReq := <-br.Requests: batch := br.collectBatchV2(firstReq) - br.shardCache.Stats.BatchTracker.RecordBatchSize(len(batch)) br.executeBatchV2(batch) } } diff --git a/flashring/internal/shard/batch_tracker.go 
b/flashring/internal/shard/batch_tracker.go deleted file mode 100644 index 5658d0e2..00000000 --- a/flashring/internal/shard/batch_tracker.go +++ /dev/null @@ -1,55 +0,0 @@ -package filecache - -import ( - "sort" - "sync" -) - -type BatchTracker struct { - mu sync.RWMutex - getBatch []int - maxSamples int - getIndex int -} - -// const defaultMaxSamples = 100000 - -func NewBatchTracker() *BatchTracker { - return &BatchTracker{ - getBatch: make([]int, defaultMaxSamples), - maxSamples: defaultMaxSamples, - } -} - -func (bt *BatchTracker) RecordBatchSize(batchSize int) { - bt.mu.Lock() - defer bt.mu.Unlock() - bt.getBatch[bt.getIndex] = batchSize - bt.getIndex = (bt.getIndex + 1) % bt.maxSamples -} - -func (bt *BatchTracker) GetBatchSizePercentiles() (p25, p50, p99 int) { - bt.mu.RLock() - defer bt.mu.RUnlock() - - samples := bt.getIndex - if samples > int(bt.maxSamples) { - samples = int(bt.maxSamples) - } - - if samples == 0 { - return 0, 0, 0 - } - - batchSizesCopy := make([]int, samples) - copy(batchSizesCopy, bt.getBatch[:samples]) - sort.Slice(batchSizesCopy, func(i, j int) bool { - return batchSizesCopy[i] < batchSizesCopy[j] - }) - - p25 = batchSizesCopy[int(float64(samples)*0.25)] - p50 = batchSizesCopy[int(float64(samples)*0.50)] - p99 = batchSizesCopy[int(float64(samples)*0.99)] - - return p25, p50, p99 -} diff --git a/flashring/internal/shard/latency_tracker.go b/flashring/internal/shard/latency_tracker.go deleted file mode 100644 index eeb109c8..00000000 --- a/flashring/internal/shard/latency_tracker.go +++ /dev/null @@ -1,96 +0,0 @@ -package filecache - -import ( - "sort" - "sync" - "time" -) - -type LatencyTracker struct { - mu sync.RWMutex - getLatencies []time.Duration - putLatencies []time.Duration - maxSamples int - getIndex int - putIndex int - getCount int64 - putCount int64 -} - -const defaultMaxSamples = 100000 - -func NewLatencyTracker() *LatencyTracker { - return &LatencyTracker{ - getLatencies: make([]time.Duration, defaultMaxSamples), - 
putLatencies: make([]time.Duration, defaultMaxSamples), - maxSamples: defaultMaxSamples, - } -} - -func (lt *LatencyTracker) RecordGet(duration time.Duration) { - lt.mu.Lock() - defer lt.mu.Unlock() - lt.getLatencies[lt.getIndex] = duration - lt.getIndex = (lt.getIndex + 1) % lt.maxSamples - lt.getCount++ -} - -func (lt *LatencyTracker) RecordPut(duration time.Duration) { - lt.mu.Lock() - defer lt.mu.Unlock() - lt.putLatencies[lt.putIndex] = duration - lt.putIndex = (lt.putIndex + 1) % lt.maxSamples - lt.putCount++ -} - -func (lt *LatencyTracker) GetLatencyPercentiles() (p25, p50, p99 time.Duration) { - lt.mu.RLock() - defer lt.mu.RUnlock() - - samples := lt.getCount - if samples > int64(lt.maxSamples) { - samples = int64(lt.maxSamples) - } - - if samples == 0 { - return 0, 0, 0 - } - - latenciesCopy := make([]time.Duration, samples) - copy(latenciesCopy, lt.getLatencies[:samples]) - sort.Slice(latenciesCopy, func(i, j int) bool { - return latenciesCopy[i] < latenciesCopy[j] - }) - - p25 = latenciesCopy[int(float64(samples)*0.25)] - p50 = latenciesCopy[int(float64(samples)*0.50)] - p99 = latenciesCopy[int(float64(samples)*0.99)] - - return p25, p50, p99 -} - -func (lt *LatencyTracker) PutLatencyPercentiles() (p25, p50, p99 time.Duration) { - lt.mu.RLock() - defer lt.mu.RUnlock() - - samples := lt.putCount - if samples > int64(lt.maxSamples) { - samples = int64(lt.maxSamples) - } - - if samples == 0 { - return 0, 0, 0 - } - - latenciesCopy := make([]time.Duration, samples) - copy(latenciesCopy, lt.putLatencies[:samples]) - sort.Slice(latenciesCopy, func(i, j int) bool { - return latenciesCopy[i] < latenciesCopy[j] - }) - - p25 = latenciesCopy[int(float64(samples)*0.25)] - p50 = latenciesCopy[int(float64(samples)*0.50)] - p99 = latenciesCopy[int(float64(samples)*0.99)] - - return p25, p50, p99 -} diff --git a/flashring/internal/shard/shard_cache.go b/flashring/internal/shard/shard_cache.go index 78e19deb..4796be8b 100644 --- a/flashring/internal/shard/shard_cache.go 
+++ b/flashring/internal/shard/shard_cache.go @@ -11,18 +11,20 @@ import ( indices "github.com/Meesho/BharatMLStack/flashring/internal/indicesV3" "github.com/Meesho/BharatMLStack/flashring/internal/maths" "github.com/Meesho/BharatMLStack/flashring/internal/memtables" + "github.com/Meesho/BharatMLStack/flashring/pkg/metrics" "github.com/rs/zerolog/log" ) type ShardCache struct { keyIndex *indices.Index file *fs.WrapAppendFile + ioFile *fs.IOUringFile + batchReader *fs.ParallelBatchIoUringReader // global batched io_uring reader (shared across shards) mm *memtables.MemtableManager readPageAllocator *allocators.SlabAlignedPageAllocator dm *indices.DeleteManager predictor *maths.Predictor startAt int64 - Stats *Stats //batching reads BatchReader *BatchReaderV2 @@ -30,21 +32,8 @@ type ShardCache struct { //Lockless read and write ReadCh chan *ReadRequestV2 WriteCh chan *WriteRequestV2 -} -type Stats struct { - KeyNotFoundCount int - KeyExpiredCount int - BadDataCount int - BadLengthCount int - BadCR32Count int - BadKeyCount int - MemIdCount map[uint32]int - LastDeletedMemId uint32 - DeletedKeyCount int - BadCRCMemIds map[uint32]int - BadKeyMemIds map[uint32]int - BatchTracker *BatchTracker + ShardIdx uint32 } type ShardCacheConfig struct { @@ -64,6 +53,16 @@ type ShardCacheConfig struct { EnableBatching bool BatchWindow time.Duration MaxBatchSize int + + //lockless + EnableLockless bool + + // Global batched io_uring reader (shared across all shards). + // When set, disk reads go through this instead of the per-shard IOUringFile. + BatchIoUringReader *fs.ParallelBatchIoUringReader + + // Dedicated io_uring ring for batched writes (shared across all shards). 
+ WriteRing *fs.IoUring } func NewShardCache(config ShardCacheConfig, sl *sync.RWMutex) *ShardCache { @@ -83,12 +82,12 @@ func NewShardCache(config ShardCacheConfig, sl *sync.RWMutex) *ShardCache { if err != nil { log.Panic().Err(err).Msg("Failed to create memtable manager") } - ki := indices.NewIndex(0, config.RbInitial, config.RbMax, config.DeleteAmortizedStep) + ki := indices.NewIndex(0, config.RbInitial, config.RbMax, config.DeleteAmortizedStep, sl) sizeClasses := make([]allocators.SizeClass, 0) i := fs.BLOCK_SIZE iMax := (1 << 16) for i < iMax { - sizeClasses = append(sizeClasses, allocators.SizeClass{Size: i, MinCount: 1000}) + sizeClasses = append(sizeClasses, allocators.SizeClass{Size: i, MinCount: 20}) i *= 2 } readPageAllocator, err := allocators.NewSlabAlignedPageAllocator(allocators.SlabAlignedPageAllocatorConfig{SizeClasses: sizeClasses}) @@ -96,6 +95,12 @@ func NewShardCache(config ShardCacheConfig, sl *sync.RWMutex) *ShardCache { log.Panic().Err(err).Msg("Failed to create read page allocator") } dm := indices.NewDeleteManager(ki, file, config.DeleteAmortizedStep) + + // Attach the dedicated write ring so memtable flushes use batched io_uring. + if config.WriteRing != nil { + file.WriteRing = config.WriteRing + } + sc := &ShardCache{ keyIndex: ki, mm: memtableManager, @@ -104,12 +109,18 @@ func NewShardCache(config ShardCacheConfig, sl *sync.RWMutex) *ShardCache { dm: dm, predictor: config.Predictor, startAt: time.Now().Unix(), - Stats: &Stats{ - MemIdCount: make(map[uint32]int), - BadCRCMemIds: make(map[uint32]int), - BadKeyMemIds: make(map[uint32]int), - BatchTracker: NewBatchTracker(), - }, + } + + if config.BatchIoUringReader != nil { + // Use the global batched io_uring reader (shared across all shards). + sc.batchReader = config.BatchIoUringReader + } else { + // Fallback: per-shard io_uring ring for backward compatibility. 
+ ioFile, err := fs.NewIOUringFile(file, 256, 0) + if err != nil { + log.Panic().Err(err).Msg("Failed to create io_uring file") + } + sc.ioFile = ioFile } // Initialize batch reader if enabled @@ -120,10 +131,13 @@ func NewShardCache(config ShardCacheConfig, sl *sync.RWMutex) *ShardCache { }, sc, sl) } - sc.ReadCh = make(chan *ReadRequestV2, 500) - sc.WriteCh = make(chan *WriteRequestV2, 500) + if config.EnableLockless { - go sc.startReadWriteRoutines() + sc.ReadCh = make(chan *ReadRequestV2, 500) + sc.WriteCh = make(chan *WriteRequestV2, 500) + + go sc.startReadWriteRoutines() + } return sc } @@ -163,19 +177,26 @@ func (fc *ShardCache) Put(key string, value []byte, ttlMinutes uint16) error { indices.ByteOrder.PutUint32(buf[0:4], crc) fc.keyIndex.Put(key, length, ttlMinutes, mtId, uint32(offset)) fc.dm.IncMemtableKeyCount(mtId) - fc.Stats.MemIdCount[mtId]++ return nil } func (fc *ShardCache) Get(key string) (bool, []byte, uint16, bool, bool) { length, lastAccess, remainingTTL, freq, memId, offset, status := fc.keyIndex.Get(key) if status == indices.StatusNotFound { - fc.Stats.KeyNotFoundCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_KEY_NOT_FOUND_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, false, false } + if metrics.Enabled() { + metrics.Timing(metrics.KEY_DATA_LENGTH, time.Duration(length), metrics.GetShardTag(fc.ShardIdx)) + } + if status == indices.StatusExpired { - fc.Stats.KeyExpiredCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_KEY_EXPIRED_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, true, false } @@ -190,32 +211,40 @@ func (fc *ShardCache) Get(key string) (bool, []byte, uint16, bool, bool) { memtableExists = false } if !memtableExists { - bufPtr := BufPool.Get().(*[]byte) - buf = *bufPtr - defer BufPool.Put(bufPtr) + if metrics.Enabled() { + metrics.Incr(metrics.KEY_MEMTABLE_MISS, metrics.GetShardTag(fc.ShardIdx)) + } + buf = make([]byte, length) fileOffset := 
uint64(memId)*uint64(fc.mm.Capacity) + uint64(offset) - n := fc.readFromDisk(int64(fileOffset), length, buf) + n := fc.readFromDiskAsync(int64(fileOffset), length, buf) if n != int(length) { - fc.Stats.BadLengthCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_LENGTH_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, false, shouldReWrite } } else { + if metrics.Enabled() { + metrics.Incr(metrics.KEY_MEMTABLE_HIT, metrics.GetShardTag(fc.ShardIdx)) + } buf, exists = mt.GetBufForRead(int(offset), length) if !exists { panic("memtable exists but buf not found") } } gotCR32 := indices.ByteOrder.Uint32(buf[0:4]) - computedCR32 := crc32.ChecksumIEEE(buf[4:]) + computedCR32 := crc32.ChecksumIEEE(buf[4:length]) gotKey := string(buf[4 : 4+len(key)]) if gotCR32 != computedCR32 { - fc.Stats.BadCR32Count++ - fc.Stats.BadCRCMemIds[memId]++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_CR32_COUNT, append(metrics.GetShardTag(fc.ShardIdx), metrics.GetMemtableTag(memId)...)) + } return false, nil, 0, false, shouldReWrite } if gotKey != key { - fc.Stats.BadKeyCount++ - fc.Stats.BadKeyMemIds[memId]++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_KEY_COUNT, append(metrics.GetShardTag(fc.ShardIdx), metrics.GetMemtableTag(memId)...)) + } return false, nil, 0, false, shouldReWrite } valLen := int(length) - 4 - len(key) @@ -228,12 +257,16 @@ func (fc *ShardCache) Get(key string) (bool, []byte, uint16, bool, bool) { func (fc *ShardCache) GetFastPath(key string) (bool, []byte, uint16, bool, bool) { length, lastAccess, remainingTTL, freq, memId, offset, status := fc.keyIndex.Get(key) if status == indices.StatusNotFound { - fc.Stats.KeyNotFoundCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_KEY_NOT_FOUND_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, false, false // needsSlowPath = false (not found) } if status == indices.StatusExpired { - fc.Stats.KeyExpiredCount++ + if metrics.Enabled() { + 
metrics.Incr(metrics.KEY_KEY_EXPIRED_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, true, false // needsSlowPath = false (expired) } @@ -254,18 +287,20 @@ func (fc *ShardCache) GetFastPath(key string) (bool, []byte, uint16, bool, bool) gotCR32 := indices.ByteOrder.Uint32(buf[0:4]) computedCR32 := crc32.ChecksumIEEE(buf[4:]) if gotCR32 != computedCR32 { - fc.Stats.BadCR32Count++ - fc.Stats.BadCRCMemIds[memId]++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_CR32_COUNT, append(metrics.GetShardTag(fc.ShardIdx), metrics.GetMemtableTag(memId)...)) + } _, currMemId, _ := fc.mm.GetMemtable() shouldReWrite := fc.predictor.Predict(uint64(freq), uint64(lastAccess), memId, currMemId) - _ = shouldReWrite // Not returning shouldReWrite in fast path for simplicity + _ = shouldReWrite return false, nil, 0, false, false } gotKey := string(buf[4 : 4+len(key)]) if gotKey != key { - fc.Stats.BadKeyCount++ - fc.Stats.BadKeyMemIds[memId]++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_KEY_COUNT, append(metrics.GetShardTag(fc.ShardIdx), metrics.GetMemtableTag(memId)...)) + } return false, nil, 0, false, false } @@ -278,12 +313,16 @@ func (fc *ShardCache) GetFastPath(key string) (bool, []byte, uint16, bool, bool) func (fc *ShardCache) GetSlowPath(key string) (bool, []byte, uint16, bool, bool) { length, lastAccess, remainingTTL, freq, memId, offset, status := fc.keyIndex.Get(key) if status == indices.StatusNotFound { - fc.Stats.KeyNotFoundCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_KEY_NOT_FOUND_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, false, false } if status == indices.StatusExpired { - fc.Stats.KeyExpiredCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_KEY_EXPIRED_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, true, false } @@ -293,7 +332,6 @@ func (fc *ShardCache) GetSlowPath(key string) (bool, []byte, uint16, bool, bool) // Check memtable again (might have changed since 
fast path check) mt := fc.mm.GetMemtableById(memId) if mt != nil { - // Data is now in memtable, use fast path logic buf, exists := mt.GetBufForRead(int(offset), length) if !exists { panic("memtable exists but buf not found") @@ -301,14 +339,13 @@ func (fc *ShardCache) GetSlowPath(key string) (bool, []byte, uint16, bool, bool) return fc.validateAndReturnBuffer(key, buf, length, memId, remainingTTL, shouldReWrite) } - // Read from disk - bufPtr := BufPool.Get().(*[]byte) - buf := *bufPtr - defer BufPool.Put(bufPtr) + buf := make([]byte, length) fileOffset := uint64(memId)*uint64(fc.mm.Capacity) + uint64(offset) n := fc.readFromDisk(int64(fileOffset), length, buf) if n != int(length) { - fc.Stats.BadLengthCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_LENGTH_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, false, shouldReWrite } @@ -318,17 +355,19 @@ func (fc *ShardCache) GetSlowPath(key string) (bool, []byte, uint16, bool, bool) // validateAndReturnBuffer validates CRC and key, then returns the value func (fc *ShardCache) validateAndReturnBuffer(key string, buf []byte, length uint16, memId uint32, remainingTTL uint16, shouldReWrite bool) (bool, []byte, uint16, bool, bool) { gotCR32 := indices.ByteOrder.Uint32(buf[0:4]) - computedCR32 := crc32.ChecksumIEEE(buf[4:]) + computedCR32 := crc32.ChecksumIEEE(buf[4:length]) if gotCR32 != computedCR32 { - fc.Stats.BadCR32Count++ - fc.Stats.BadCRCMemIds[memId]++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_CR32_COUNT, append(metrics.GetShardTag(fc.ShardIdx), metrics.GetMemtableTag(memId)...)) + } return false, nil, 0, false, shouldReWrite } gotKey := string(buf[4 : 4+len(key)]) if gotKey != key { - fc.Stats.BadKeyCount++ - fc.Stats.BadKeyMemIds[memId]++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_KEY_COUNT, append(metrics.GetShardTag(fc.ShardIdx), metrics.GetMemtableTag(memId)...)) + } return false, nil, 0, false, shouldReWrite } @@ -337,18 +376,68 @@ func (fc 
*ShardCache) validateAndReturnBuffer(key string, buf []byte, length uin } func (fc *ShardCache) readFromDisk(fileOffset int64, length uint16, buf []byte) int { + alignedStartOffset := (fileOffset / fs.BLOCK_SIZE) * fs.BLOCK_SIZE endndOffset := fileOffset + int64(length) endAlignedOffset := ((endndOffset + fs.BLOCK_SIZE - 1) / fs.BLOCK_SIZE) * fs.BLOCK_SIZE alignedReadSize := endAlignedOffset - alignedStartOffset + page := fc.readPageAllocator.Get(int(alignedReadSize)) + fc.file.Pread(alignedStartOffset, page.Buf) + start := int(fileOffset - alignedStartOffset) n := copy(buf, page.Buf[start:start+int(length)]) fc.readPageAllocator.Put(page) return n } +func (fc *ShardCache) readFromDiskAsync(fileOffset int64, length uint16, buf []byte) int { + alignedStartOffset := (fileOffset / fs.BLOCK_SIZE) * fs.BLOCK_SIZE + endndOffset := fileOffset + int64(length) + endAlignedOffset := ((endndOffset + fs.BLOCK_SIZE - 1) / fs.BLOCK_SIZE) * fs.BLOCK_SIZE + alignedReadSize := int(endAlignedOffset - alignedStartOffset) + page := fc.readPageAllocator.Get(alignedReadSize) + + // Use exactly alignedReadSize bytes, not the full page.Buf which may be + // larger due to slab allocator rounding to the next size class. + readBuf := page.Buf[:alignedReadSize] + + var n int + var err error + + if fc.batchReader != nil { + // Batched path: validate offset locally, then submit to the global + // io_uring batch reader which accumulates requests across all shards. + var validOffset int64 + validOffset, err = fc.file.ValidateReadOffset(alignedStartOffset, alignedReadSize) + if err == nil { + n, err = fc.batchReader.Submit(fc.file.ReadFd, readBuf, uint64(validOffset)) + } + } else { + // Per-shard io_uring fallback + n, err = fc.ioFile.PreadAsync(alignedStartOffset, readBuf) + } + + if err != nil || n != alignedReadSize { + // ErrFileOffsetOutOfRange is expected for stale index entries -- don't log. + if err != nil && err != fs.ErrFileOffsetOutOfRange { + log.Warn().Err(err). 
+ Int64("offset", alignedStartOffset). + Int("alignedReadSize", alignedReadSize). + Int("n", n). + Msg("io_uring pread failed") + } + fc.readPageAllocator.Put(page) + return 0 + } + + start := int(fileOffset - alignedStartOffset) + copied := copy(buf, page.Buf[start:start+int(length)]) + fc.readPageAllocator.Put(page) + return copied +} + func (fc *ShardCache) GetRingBufferActiveEntries() int { return fc.keyIndex.GetRB().ActiveEntries() } @@ -360,11 +449,15 @@ func (fc *ShardCache) processBuffer(key string, buf []byte, length uint16) ReadR gotKey := string(buf[4 : 4+len(key)]) if gotCR32 != computedCR32 { - fc.Stats.BadCR32Count++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_CR32_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return ReadResult{Found: false, Error: fmt.Errorf("crc mismatch")} } if gotKey != key { - fc.Stats.BadKeyCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_KEY_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return ReadResult{Found: false, Error: fmt.Errorf("key mismatch")} } diff --git a/flashring/pkg/cache/badger.go b/flashring/pkg/cache/badger.go new file mode 100644 index 00000000..859c4bac --- /dev/null +++ b/flashring/pkg/cache/badger.go @@ -0,0 +1,73 @@ +package internal + +import ( + "time" + + badger "github.com/dgraph-io/badger/v4" +) + +type Badger struct { + cache *badger.DB +} + +func NewBadger(config WrapCacheConfig, logStats bool) (*Badger, error) { + options := badger.DefaultOptions(config.MountPoint) + options.MetricsEnabled = false + + // 1. PRIMARY CACHE (1GB) + // This caches the data blocks themselves. + options.BlockCacheSize = 1024 << 20 + + // 2. INDEX CACHE (512MB) + // This keeps the keys and the structure of the LSM tree in RAM. + // This is the most critical setting for read latency. + options.IndexCacheSize = 512 << 20 + + // 3. WRITE BUFFERS (Memtables) + // We use 3 tables of 64MB each. This allows Badger to handle + // write spikes without blocking. 
(~192MB total) + options.NumMemtables = 40 + options.MemTableSize = 1024 << 20 + + options.ValueThreshold = 1024 + options.SyncWrites = false + + cache, err := badger.Open(options) + if err != nil { + return nil, err + } + bc := &Badger{ + cache: cache, + } + + return bc, nil +} + +func (b *Badger) Put(key string, value []byte, exptimeInMinutes uint16) error { + + err := b.cache.Update(func(txn *badger.Txn) error { + entry := badger.NewEntry([]byte(key), value).WithTTL(time.Duration(exptimeInMinutes) * time.Minute) + err := txn.SetEntry(entry) + return err + }) + return err +} + +func (b *Badger) Get(key string) ([]byte, bool, bool) { + + val := make([]byte, 0) + err := b.cache.View(func(txn *badger.Txn) error { + item, err := txn.Get([]byte(key)) + if err != nil { + return err + } + val, err = item.ValueCopy(val) + + return err + }) + return val, err != badger.ErrKeyNotFound, false +} + +func (b *Badger) Close() error { + return b.cache.Close() +} diff --git a/flashring/pkg/cache/cache.go b/flashring/pkg/cache/cache.go new file mode 100644 index 00000000..96324381 --- /dev/null +++ b/flashring/pkg/cache/cache.go @@ -0,0 +1,403 @@ +package internal + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "sync" + "time" + + "github.com/Meesho/BharatMLStack/flashring/internal/fs" + "github.com/Meesho/BharatMLStack/flashring/internal/maths" + filecache "github.com/Meesho/BharatMLStack/flashring/internal/shard" + "github.com/cespare/xxhash/v2" + "github.com/rs/zerolog/log" + + metrics "github.com/Meesho/BharatMLStack/flashring/pkg/metrics" +) + +/* + Each shard can keep 67M keys + With Round = 1, expected collision (67M)^2/(2*2^62) = 4.87×10^-4 +*/ + +const ( + ROUNDS = 1 + KEYS_PER_SHARD = (1 << 26) + BLOCK_SIZE = 4096 +) + +var ( + ErrNumShardLessThan1 = fmt.Errorf("num shards must be greater than 0") + ErrKeysPerShardLessThan1 = fmt.Errorf("keys per shard must be greater than 0") + ErrKeysPerShardGreaterThan67M = fmt.Errorf("keys per shard must be less than 67M") 
+ ErrMemtableSizeLessThan1 = fmt.Errorf("memtable size must be greater than 0") + ErrMemtableSizeGreaterThan1GB = fmt.Errorf("memtable size must be less than 1GB") + ErrMemtableSizeNotMultipleOf4KB = fmt.Errorf("memtable size must be a multiple of 4KB") + ErrFileSizeLessThan1 = fmt.Errorf("file size must be greater than 0") + ErrFileSizeNotMultipleOf4KB = fmt.Errorf("file size must be a multiple of 4KB") + Seed = xxhash.Sum64String(strconv.Itoa(int(time.Now().UnixNano()))) +) + +type WrapCache struct { + shards []*filecache.ShardCache + shardLocks []sync.RWMutex + predictor *maths.Predictor + batchReader *fs.ParallelBatchIoUringReader // global batched io_uring reader +} + +type WrapCacheConfig struct { + NumShards int + KeysPerShard int + FileSize int64 + MemtableSize int32 + ReWriteScoreThreshold float32 + GridSearchEpsilon float64 + SampleDuration time.Duration + + // Batching reads + EnableBatching bool + BatchWindowMicros int // in microseconds + MaxBatchSize int + + //lockless mode for PutLL/GetLL + EnableLockless bool + + //Badger + MountPoint string +} + +func NewWrapCache(config WrapCacheConfig, mountPoint string) (*WrapCache, error) { + if config.NumShards <= 0 { + return nil, ErrNumShardLessThan1 + } + if config.KeysPerShard <= 0 { + return nil, ErrKeysPerShardLessThan1 + } + if config.KeysPerShard > KEYS_PER_SHARD { + return nil, ErrKeysPerShardGreaterThan67M + } + if config.MemtableSize <= 0 { + return nil, ErrMemtableSizeLessThan1 + } + if config.MemtableSize > 1024*1024*1024 { + return nil, ErrMemtableSizeGreaterThan1GB + } + if config.MemtableSize%BLOCK_SIZE != 0 { + return nil, ErrMemtableSizeNotMultipleOf4KB + } + if config.FileSize <= 0 { + return nil, ErrFileSizeLessThan1 + } + if config.FileSize%BLOCK_SIZE != 0 { + return nil, ErrFileSizeNotMultipleOf4KB + } + + //clear existing data + files, err := os.ReadDir(mountPoint) + if err != nil { + log.Error().Err(err).Msg("Failed to read directory") + panic(err) + } + for _, file := range files { + 
os.Remove(filepath.Join(mountPoint, file.Name())) + } + + weights := []maths.WeightTuple{ + { + WFreq: 0.1, + WLA: 0.1, + }, + { + WFreq: 0.45, + WLA: 0.1, + }, + { + WFreq: 0.9, + WLA: 0.1, + }, + { + WFreq: 0.1, + WLA: 0.45, + }, + { + WFreq: 0.45, + WLA: 0.45, + }, + { + WFreq: 0.9, + WLA: 0.45, + }, + { + WFreq: 0.1, + WLA: 0.9, + }, + { + WFreq: 0.45, + WLA: 0.9, + }, + { + WFreq: 0.9, + WLA: 0.9, + }, + } + MaxMemTableCount := config.FileSize / int64(config.MemtableSize) + predictor := maths.NewPredictor(maths.PredictorConfig{ + ReWriteScoreThreshold: config.ReWriteScoreThreshold, + Weights: weights, + SampleDuration: config.SampleDuration, + MaxMemTableCount: uint32(MaxMemTableCount), + GridSearchEpsilon: config.GridSearchEpsilon, + }) + + // Create a single global batched io_uring reader shared across all shards. + // All disk reads funnel into one channel; the background goroutine collects + // them for up to 1ms and submits them in a single io_uring_enter call. + batchReader, err := fs.NewParallelBatchIoUringReader(fs.BatchIoUringConfig{ + RingDepth: 256, + MaxBatch: 256, + Window: time.Millisecond, + QueueSize: 1024, + }, 2) + if err != nil { + log.Error().Err(err).Msg("Failed to create batched io_uring reader, falling back to per-shard rings") + batchReader = nil + } + + // Separate io_uring ring dedicated to batched writes (memtable flushes). + // Kept separate from the read ring to avoid mutex contention between the + // read batch loop and concurrent flushes. 
+ writeRing, err := fs.NewIoUring(256, 0) + if err != nil { + log.Error().Err(err).Msg("Failed to create io_uring write ring, falling back to sequential pwrite") + writeRing = nil + } + + batchWindow := time.Duration(0) + if config.EnableBatching && config.BatchWindowMicros > 0 { + batchWindow = time.Duration(config.BatchWindowMicros) * time.Microsecond + } + shardLocks := make([]sync.RWMutex, config.NumShards) + shards := make([]*filecache.ShardCache, config.NumShards) + for i := 0; i < config.NumShards; i++ { + shards[i] = filecache.NewShardCache(filecache.ShardCacheConfig{ + MemtableSize: config.MemtableSize, + Rounds: ROUNDS, + RbInitial: config.KeysPerShard, + RbMax: config.KeysPerShard, + DeleteAmortizedStep: 10000, + MaxFileSize: int64(config.FileSize), + BlockSize: BLOCK_SIZE, + Directory: mountPoint, + Predictor: predictor, + + //batching reads + EnableBatching: config.EnableBatching, + BatchWindow: batchWindow, + MaxBatchSize: config.MaxBatchSize, + + //lockless mode for PutLL/GetLL + EnableLockless: config.EnableLockless, + + BatchIoUringReader: batchReader, + WriteRing: writeRing, + }, &shardLocks[i]) + } + + wc := &WrapCache{ + shards: shards, + shardLocks: shardLocks, + predictor: predictor, + batchReader: batchReader, + } + + return wc, nil +} + +func (wc *WrapCache) PutLL(key string, value []byte, exptimeInMinutes uint16) error { + + h32 := wc.Hash(key) + shardIdx := h32 % uint32(len(wc.shards)) + start := time.Now() + + result := filecache.ErrorPool.Get().(chan error) + + wc.shards[shardIdx].WriteCh <- &filecache.WriteRequestV2{ + Key: key, + Value: value, + ExptimeInMinutes: exptimeInMinutes, + Result: result, + } + + if metrics.Enabled() && h32%100 < 10 { + metrics.Incr(metrics.KEY_RINGBUFFER_ACTIVE_ENTRIES, metrics.GetShardTag(shardIdx)) + } + + op := <-result + filecache.ErrorPool.Put(result) + if metrics.Enabled() { + metrics.Incr(metrics.KEY_PUTS, metrics.GetShardTag(shardIdx)) + metrics.Timing(metrics.KEY_PUT_LATENCY, time.Since(start), 
metrics.GetShardTag(shardIdx)) + } + return op +} + +func (wc *WrapCache) GetLL(key string) ([]byte, bool, bool) { + h32 := wc.Hash(key) + shardIdx := h32 % uint32(len(wc.shards)) + + start := time.Now() + + // found, value, _, expired, needsSlowPath := wc.shards[shardIdx].GetFastPath(key) + + // if !needsSlowPath { + // if found && !expired { + // wc.stats[shardIdx].Hits.Add(1) + // } else if expired { + // wc.stats[shardIdx].Expired.Add(1) + // } + + // wc.stats[shardIdx].TotalGets.Add(1) + // wc.stats[shardIdx].LatencyTracker.RecordGet(time.Since(start)) + // return value, found, expired + // } + + result := filecache.ReadResultPool.Get().(chan filecache.ReadResultV2) + + req := filecache.ReadRequestPool.Get().(*filecache.ReadRequestV2) + req.Key = key + req.Result = result + + wc.shards[shardIdx].ReadCh <- req + op := <-result + + filecache.ReadResultPool.Put(result) + filecache.ReadRequestPool.Put(req) + + if metrics.Enabled() { + if op.Found && !op.Expired { + metrics.Incr(metrics.KEY_HITS, metrics.GetShardTag(shardIdx)) + } + if op.Expired { + metrics.Incr(metrics.KEY_EXPIRED_ENTRIES, metrics.GetShardTag(shardIdx)) + } + metrics.Timing(metrics.KEY_GET_LATENCY, time.Since(start), metrics.GetShardTag(shardIdx)) + metrics.Incr(metrics.KEY_GETS, metrics.GetShardTag(shardIdx)) + } + + return op.Data, op.Found, op.Expired +} + +func (wc *WrapCache) Put(key string, value []byte, exptimeInMinutes uint16) error { + + h32 := wc.Hash(key) + shardIdx := h32 % uint32(len(wc.shards)) + + var start time.Time + if metrics.Enabled() { + start = time.Now() + defer func() { + metrics.Timing(metrics.KEY_PUT_LATENCY, time.Since(start), metrics.GetShardTag(shardIdx)) + }() + } + + if metrics.Enabled() { + start = time.Now() + } + wc.shardLocks[shardIdx].Lock() + if metrics.Enabled() { + metrics.Timing(metrics.LATENCY_WLOCK, time.Since(start), []string{}) + } + defer wc.shardLocks[shardIdx].Unlock() + + err := wc.shards[shardIdx].Put(key, value, exptimeInMinutes) + if err != nil { 
+ log.Error().Err(err).Msgf("Put failed for key: %s", key) + return fmt.Errorf("put failed for key: %s", key) + } + if metrics.Enabled() { + metrics.Incr(metrics.KEY_PUTS, metrics.GetShardTag(shardIdx)) + if h32%100 < 10 { + metrics.Incr(metrics.KEY_RINGBUFFER_ACTIVE_ENTRIES, metrics.GetShardTag(shardIdx)) + } + } + + return nil +} + +func (wc *WrapCache) Get(key string) ([]byte, bool, bool) { + h32 := wc.Hash(key) + shardIdx := h32 % uint32(len(wc.shards)) + + var start time.Time + if metrics.Enabled() { + start = time.Now() + defer func() { + metrics.Timing(metrics.KEY_GET_LATENCY, time.Since(start), metrics.GetShardTag(shardIdx)) + }() + } + + var keyFound bool + var val []byte + var valCopy []byte + var remainingTTL uint16 + var expired bool + var shouldReWrite bool + if wc.shards[shardIdx].BatchReader != nil { + reqChan := make(chan filecache.ReadResultV2, 1) + wc.shards[shardIdx].BatchReader.Requests <- &filecache.ReadRequestV2{ + Key: key, + Result: reqChan, + } + result := <-reqChan + keyFound, val, remainingTTL, expired, shouldReWrite = result.Found, result.Data, result.TTL, result.Expired, result.ShouldRewrite + if shouldReWrite { + valCopy = make([]byte, len(val)) + copy(valCopy, val) + } + } else { + + func(key string, shardIdx uint32) { + + keyFound, val, remainingTTL, expired, shouldReWrite = wc.shards[shardIdx].Get(key) + + if shouldReWrite { + //copy val into a safe variable because we are unlocking the shard + // at the end of anon function execution + valCopy = make([]byte, len(val)) + copy(valCopy, val) + val = valCopy + } + }(key, shardIdx) + + } + + if metrics.Enabled() { + if keyFound && !expired { + metrics.Incr(metrics.KEY_HITS, metrics.GetShardTag(shardIdx)) + } + if expired { + metrics.Incr(metrics.KEY_EXPIRED_ENTRIES, metrics.GetShardTag(shardIdx)) + } + metrics.Incr(metrics.KEY_GETS, metrics.GetShardTag(shardIdx)) + if shouldReWrite { + metrics.Incr(metrics.KEY_REWRITES, metrics.GetShardTag(shardIdx)) + } + } + if shouldReWrite { + 
wc.Put(key, valCopy, remainingTTL) + } + + //todo: track hit rate here using + // wc.predictor.Observe(hitRate) + return val, keyFound, expired +} + +func (wc *WrapCache) Hash(key string) uint32 { + return uint32(xxhash.Sum64String(key) ^ Seed) +} + +func (wc *WrapCache) GetShardCache(shardIdx int) *filecache.ShardCache { + return wc.shards[shardIdx] +} diff --git a/flashring/pkg/cache/freecache.go b/flashring/pkg/cache/freecache.go new file mode 100644 index 00000000..f16191c6 --- /dev/null +++ b/flashring/pkg/cache/freecache.go @@ -0,0 +1,40 @@ +package internal + +import ( + "runtime/debug" + + "github.com/coocood/freecache" +) + +type Freecache struct { + cache *freecache.Cache +} + +func NewFreecache(config WrapCacheConfig, logStats bool) (*Freecache, error) { + + cache := freecache.NewCache(int(config.FileSize)) + debug.SetGCPercent(20) + + fc := &Freecache{ + cache: cache, + } + + return fc, nil + +} + +func (c *Freecache) Put(key string, value []byte, exptimeInMinutes uint16) error { + + err := c.cache.Set([]byte(key), value, int(exptimeInMinutes)*60) // expiry is in seconds; propagate Set errors (e.g. entry larger than 1/1024 of cache size) instead of silently dropping them + return err +} + +func (c *Freecache) Get(key string) ([]byte, bool, bool) { + + val, err := c.cache.Get([]byte(key)) + if err != nil { + return nil, false, false + } + + return val, true, false +} diff --git a/flashring/pkg/metrics/metric.go b/flashring/pkg/metrics/metric.go new file mode 100644 index 00000000..e977bb77 --- /dev/null +++ b/flashring/pkg/metrics/metric.go @@ -0,0 +1,204 @@ +package metrics + +import ( + "os" + "strconv" + "strings" + "sync" + "time" + + "github.com/DataDog/datadog-go/v5/statsd" + "github.com/rs/zerolog/log" + "github.com/spf13/viper" +) + +// Flashring metric keys +const ( + KEY_GET_LATENCY = "flashring_get_latency" + KEY_PUT_LATENCY = "flashring_put_latency" + KEY_RTHROUGHPUT = "flashring_rthroughput" + KEY_WTHROUGHPUT = "flashring_wthroughput" + KEY_HITRATE = "flashring_hitrate" + KEY_ACTIVE_ENTRIES = "flashring_active_entries" + KEY_EXPIRED_ENTRIES = "flashring_expired_entries" +
KEY_REWRITES = "flashring_rewrites" + KEY_GETS = "flashring_gets" + KEY_PUTS = "flashring_puts" + KEY_HITS = "flashring_hits" + + KEY_KEY_NOT_FOUND_COUNT = "flashring_key_not_found_count" + KEY_KEY_EXPIRED_COUNT = "flashring_key_expired_count" + KEY_BAD_DATA_COUNT = "flashring_bad_data_count" + KEY_BAD_LENGTH_COUNT = "flashring_bad_length_count" + KEY_BAD_CR32_COUNT = "flashring_bad_cr32_count" + KEY_BAD_KEY_COUNT = "flashring_bad_key_count" + KEY_DELETED_KEY_COUNT = "flashring_deleted_key_count" + + KEY_WRITE_COUNT = "flashring_write_count" + KEY_PUNCH_HOLE_COUNT = "flashring_punch_hole_count" + KEY_PREAD_COUNT = "flashring_pread_count" + + KEY_TRIM_HEAD_LATENCY = "flashring_wrap_file_trim_head_latency" + KEY_PREAD_LATENCY = "flashring_pread_latency" + KEY_PWRITE_LATENCY = "flashring_pwrite_latency" + + KEY_MEMTABLE_FLUSH_COUNT = "flashring_memtable_flush_count" + + LATENCY_RLOCK = "flashring_rlock_latency" + LATENCY_WLOCK = "flashring_wlock_latency" + + KEY_RINGBUFFER_ACTIVE_ENTRIES = "flashring_ringbuffer_active_entries" + KEY_MEMTABLE_ENTRY_COUNT = "flashring_memtable_entry_count" + KEY_MEMTABLE_HIT = "flashring_memtable_hit" + KEY_MEMTABLE_MISS = "flashring_memtable_miss" + KEY_DATA_LENGTH = "flashring_data_length" + KEY_IOURING_SIZE = "flashring_iouring_size" +) + +// Flashring tag keys +const ( + TAG_LATENCY_PERCENTILE = "latency_percentile" + TAG_VALUE_P25 = "p25" + TAG_VALUE_P50 = "p50" + TAG_VALUE_P99 = "p99" + TAG_SHARD_IDX = "shard_idx" + TAG_MEMTABLE_ID = "memtable_id" +) + +// Application-level metric keys +const ( + ApiRequestCount = "api_request_count" + ApiRequestLatency = "api_request_latency" + ExternalApiRequestCount = "external_api_request_count" + ExternalApiRequestLatency = "external_api_request_latency" + DBCallLatency = "db_call_latency" + DBCallCount = "db_call_count" + MethodLatency = "method_latency" + MethodCount = "method_count" +) + +var ( + statsDClient = getDefaultClient() + samplingRate = 0.1 + telegrafAddress = "localhost:8125" + 
appName = "" + initialized = false + once sync.Once + + // When false, all Timing/Count/Incr/Gauge calls are no-ops (zero allocations). + // Controlled by FLASHRING_METRICS_ENABLED env var ("true"/"1" to enable). + // Defaults to false (metrics disabled) when the variable is unset or anything other than "true"/"1". + metricsEnabled = loadMetricsEnabled() +) + +func loadMetricsEnabled() bool { + v := os.Getenv("FLASHRING_METRICS_ENABLED") + if v == "" { + return false + } + return strings.EqualFold(v, "true") || v == "1" +} + +// Init initializes the metrics client +func Init() { + if initialized { + log.Debug().Msgf("Metrics already initialized!") + return + } + once.Do(func() { + var err error + samplingRate = viper.GetFloat64("APP_METRIC_SAMPLING_RATE") + appName = viper.GetString("APP_NAME") + globalTags := getGlobalTags() + + statsDClient, err = statsd.New( + telegrafAddress, + statsd.WithTags(globalTags), + ) + + if err != nil { + log.Panic().AnErr("StatsD client initialization failed", err) + } + log.Info().Msgf("Metrics client initialized with telegraf address - %s, global tags - %v, and "+ + "sampling rate - %f, flashring metrics enabled - %v", telegrafAddress, globalTags, samplingRate, metricsEnabled) + initialized = true + }) +} + +func getDefaultClient() *statsd.Client { + client, _ := statsd.New("localhost:8125") + return client +} + +func getGlobalTags() []string { + env := viper.GetString("APP_ENV") + if len(env) == 0 { + log.Warn().Msg("APP_ENV is not set") + } + service := viper.GetString("APP_NAME") + if len(service) == 0 { + log.Warn().Msg("APP_NAME is not set") + } + return []string{ + TagAsString(TagEnv, env), + TagAsString(TagService, service), + } +} + +// Timing sends timing information. No-op when metrics are disabled.
+func Timing(name string, value time.Duration, tags []string) { + if !metricsEnabled { + return + } + tags = append(tags, TagAsString(TagService, appName)) + err := statsDClient.Timing(name, value, tags, samplingRate) + if err != nil { + log.Warn().AnErr("Error occurred while doing statsd timing", err) + } +} + +// Count increases metric counter by value. No-op when metrics are disabled. +func Count(name string, value int64, tags []string) { + if !metricsEnabled { + return + } + tags = append(tags, TagAsString(TagService, appName)) + err := statsDClient.Count(name, value, tags, samplingRate) + if err != nil { + log.Warn().AnErr("Error occurred while doing statsd count", err) + } +} + +// Incr increases metric counter by 1. No-op when metrics are disabled. +func Incr(name string, tags []string) { + if !metricsEnabled { + return + } + Count(name, 1, tags) +} + +// Gauge sets a gauge value. No-op when metrics are disabled. +func Gauge(name string, value float64, tags []string) { + if !metricsEnabled { + return + } + tags = append(tags, TagAsString(TagService, appName)) + err := statsDClient.Gauge(name, value, tags, samplingRate) + if err != nil { + log.Warn().AnErr("Error occurred while doing statsd gauge", err) + } +} + +// Enabled returns whether flashring metrics are enabled. +// Call sites should check this before allocating tags to avoid heap allocations. 
+func Enabled() bool { + return metricsEnabled +} + +func GetShardTag(shardIdx uint32) []string { + return BuildTag(NewTag(TAG_SHARD_IDX, strconv.Itoa(int(shardIdx)))) +} + +func GetMemtableTag(memtableId uint32) []string { + return BuildTag(NewTag(TAG_MEMTABLE_ID, strconv.Itoa(int(memtableId)))) +} diff --git a/flashring/pkg/metrics/tag.go b/flashring/pkg/metrics/tag.go new file mode 100644 index 00000000..d77ac38e --- /dev/null +++ b/flashring/pkg/metrics/tag.go @@ -0,0 +1,55 @@ +package metrics + +// Tag constants +const ( + TagEnv = "env" + TagService = "service" + TagPath = "path" + TagMethod = "method" + TagHttpStatusCode = "http_status_code" + TagGrpcStatusCode = "grpc_status_code" + TagExternalService = "external_service" + TagExternalServicePath = "external_service_path" + TagExternalServiceMethod = "external_service_method" + TagExternalServiceStatusCode = "external_service_status_code" + TagZkRealtimeTotalUpdateEvent = "zk_realtime_total_update_event" + TagZkRealtimeFailureEvent = "zk_realtime_failure_event" + TagZkRealtimeSuccessEvent = "zk_realtime_success_event" + TagZkRealtimeEventUpdateLatency = "zk_realtime_event_update_latency" + TagCommunicationProtocol = "communication_protocol" + TagUserContext = "user_context" + + TagValueCommunicationProtocolHttp = "http" + TagValueCommunicationProtocolGrpc = "grpc" +) + +type Tag struct { + Name string + Value string +} + +func NewTag(name, value string) Tag { + return Tag{ + Name: name, + Value: value, + } +} + +// BuildTag builds a tag from the given name and value +func BuildTag(tags ...Tag) []string { + allTags := make([]string, 0) + for _, tag := range tags { + allTags = append(allTags, TagAsString(tag.Name, tag.Value)) + } + return allTags +} + +func TagAsString(name string, value string) string { + return name + ":" + value +} + +func UpdateTags(tags *[]string, newTags ...Tag) { + for _, tag := range newTags { + *tags = append(*tags, TagAsString(tag.Name, tag.Value)) + } +}