diff --git a/.vscode/launch.json b/.vscode/launch.json index 2decad3c..e9505d8c 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -26,7 +26,7 @@ "mode": "debug", "program": "${workspaceFolder}/flashring/cmd/flashringtest", "env": { - "PLAN": "readthrough-batched" + "PLAN": "readthrough" } } diff --git a/flashring/cmd/flashringtest/plan_badger.go b/flashring/cmd/flashringtest/plan_badger.go index 4ba266d4..1e06f8fa 100644 --- a/flashring/cmd/flashringtest/plan_badger.go +++ b/flashring/cmd/flashringtest/plan_badger.go @@ -10,7 +10,7 @@ import ( "strings" "sync" - cachepkg "github.com/Meesho/BharatMLStack/flashring/internal/cache" + cachepkg "github.com/Meesho/BharatMLStack/flashring/pkg/cache" "github.com/rs/zerolog" "github.com/rs/zerolog/log" ) diff --git a/flashring/cmd/flashringtest/plan_freecache.go b/flashring/cmd/flashringtest/plan_freecache.go index 0fe6a297..be46daf9 100644 --- a/flashring/cmd/flashringtest/plan_freecache.go +++ b/flashring/cmd/flashringtest/plan_freecache.go @@ -11,7 +11,7 @@ import ( "strings" "sync" - cachepkg "github.com/Meesho/BharatMLStack/flashring/internal/cache" + cachepkg "github.com/Meesho/BharatMLStack/flashring/pkg/cache" "github.com/rs/zerolog" "github.com/rs/zerolog/log" ) diff --git a/flashring/cmd/flashringtest/plan_lockless.go b/flashring/cmd/flashringtest/plan_lockless.go index e946c9af..ea7f8ede 100644 --- a/flashring/cmd/flashringtest/plan_lockless.go +++ b/flashring/cmd/flashringtest/plan_lockless.go @@ -13,7 +13,7 @@ import ( "sync" "time" - cachepkg "github.com/Meesho/BharatMLStack/flashring/internal/cache" + cachepkg "github.com/Meesho/BharatMLStack/flashring/pkg/cache" "github.com/rs/zerolog" "github.com/rs/zerolog/log" ) @@ -35,13 +35,13 @@ func planLockless() { cpuProfile string ) - flag.StringVar(&mountPoint, "mount", "/media/a0d00kc/trishul/", "data directory for shard files") - flag.IntVar(&numShards, "shards", 500, "number of shards") - flag.IntVar(&keysPerShard, "keys-per-shard", 10_00_00, "keys 
per shard") - flag.IntVar(&memtableMB, "memtable-mb", 16, "memtable size in MiB") + flag.StringVar(&mountPoint, "mount", "/mnt/disks/nvme", "data directory for shard files") + flag.IntVar(&numShards, "shards", 100, "number of shards") + flag.IntVar(&keysPerShard, "keys-per-shard", 3_00_000, "keys per shard") + flag.IntVar(&memtableMB, "memtable-mb", 2, "memtable size in MiB") flag.IntVar(&fileSizeMultiplier, "file-size-multiplier", 2, "file size in GiB per shard") - flag.IntVar(&readWorkers, "readers", 8, "number of read workers") - flag.IntVar(&writeWorkers, "writers", 8, "number of write workers") + flag.IntVar(&readWorkers, "readers", 16, "number of read workers") + flag.IntVar(&writeWorkers, "writers", 16, "number of write workers") flag.IntVar(&sampleSecs, "sample-secs", 30, "predictor sampling window in seconds") flag.Int64Var(&iterations, "iterations", 100_000_000, "number of iterations") flag.Float64Var(&aVal, "a", 0.4, "a value for the predictor") @@ -84,7 +84,7 @@ func planLockless() { } memtableSizeInBytes := int32(memtableMB) * 1024 * 1024 - fileSizeInBytes := int64(fileSizeMultiplier) * int64(memtableSizeInBytes) + fileSizeInBytes := int64(fileSizeMultiplier) * 1024 * 1024 * 1024 // fileSizeMultiplier in GiB cfg := cachepkg.WrapCacheConfig{ NumShards: numShards, @@ -95,21 +95,11 @@ func planLockless() { GridSearchEpsilon: 0.0001, SampleDuration: time.Duration(sampleSecs) * time.Second, - // Pass the metrics collector to record cache metrics - MetricsRecorder: InitMetricsCollector(), + //lockless mode for PutLL/GetLL + EnableLockless: true, } - // Set additional input parameters that the cache doesn't know about - metricsCollector.SetShards(numShards) - metricsCollector.SetKeysPerShard(keysPerShard) - metricsCollector.SetReadWorkers(readWorkers) - metricsCollector.SetWriteWorkers(writeWorkers) - metricsCollector.SetPlan("lockless") - - // Start background goroutine to wait for shutdown signal and export CSV - go RunmetricsWaitForShutdown() - - pc, err 
:= cachepkg.NewWrapCache(cfg, mountPoint, logStats) + pc, err := cachepkg.NewWrapCache(cfg, mountPoint) if err != nil { panic(err) } @@ -121,7 +111,7 @@ func planLockless() { missedKeyChanList[i] = make(chan int) } - totalKeys := keysPerShard * numShards + totalKeys := 30_000_000 str1kb := strings.Repeat("a", 1024) str1kb = "%d" + str1kb diff --git a/flashring/cmd/flashringtest/plan_random_gausian.go b/flashring/cmd/flashringtest/plan_random_gausian.go index 3fbaf849..f906e320 100644 --- a/flashring/cmd/flashringtest/plan_random_gausian.go +++ b/flashring/cmd/flashringtest/plan_random_gausian.go @@ -12,7 +12,7 @@ import ( "sync" "time" - cachepkg "github.com/Meesho/BharatMLStack/flashring/internal/cache" + cachepkg "github.com/Meesho/BharatMLStack/flashring/pkg/cache" "github.com/rs/zerolog" "github.com/rs/zerolog/log" ) @@ -83,7 +83,7 @@ func planRandomGaussian() { } memtableSizeInBytes := int32(memtableMB) * 1024 * 1024 - fileSizeInBytes := int64(fileSizeMultiplier) * int64(memtableSizeInBytes) + fileSizeInBytes := int64(fileSizeMultiplier) * 1024 * 1024 * 1024 // fileSizeMultiplier in GiB cfg := cachepkg.WrapCacheConfig{ NumShards: numShards, @@ -95,7 +95,7 @@ func planRandomGaussian() { SampleDuration: time.Duration(sampleSecs) * time.Second, } - pc, err := cachepkg.NewWrapCache(cfg, mountPoint, logStats) + pc, err := cachepkg.NewWrapCache(cfg, mountPoint) if err != nil { panic(err) } diff --git a/flashring/cmd/flashringtest/plan_readthrough_gausian.go b/flashring/cmd/flashringtest/plan_readthrough_gausian.go index 56c6da3d..a311d8f6 100644 --- a/flashring/cmd/flashringtest/plan_readthrough_gausian.go +++ b/flashring/cmd/flashringtest/plan_readthrough_gausian.go @@ -13,7 +13,7 @@ import ( "sync" "time" - cachepkg "github.com/Meesho/BharatMLStack/flashring/internal/cache" + cachepkg "github.com/Meesho/BharatMLStack/flashring/pkg/cache" "github.com/rs/zerolog" "github.com/rs/zerolog/log" ) @@ -24,7 +24,7 @@ func planReadthroughGaussian() { numShards int 
keysPerShard int memtableMB int - fileSizeMultiplier int + fileSizeMultiplier float64 readWorkers int writeWorkers int sampleSecs int @@ -35,13 +35,13 @@ func planReadthroughGaussian() { cpuProfile string ) - flag.StringVar(&mountPoint, "mount", "/media/a0d00kc/trishul/", "data directory for shard files") - flag.IntVar(&numShards, "shards", 500, "number of shards") - flag.IntVar(&keysPerShard, "keys-per-shard", 4_00_00, "keys per shard") - flag.IntVar(&memtableMB, "memtable-mb", 16, "memtable size in MiB") - flag.IntVar(&fileSizeMultiplier, "file-size-multiplier", 2, "file size in GiB per shard") - flag.IntVar(&readWorkers, "readers", 8, "number of read workers") - flag.IntVar(&writeWorkers, "writers", 8, "number of write workers") + flag.StringVar(&mountPoint, "mount", "/mnt/disks/nvme/", "data directory for shard files") + flag.IntVar(&numShards, "shards", 50, "number of shards") + flag.IntVar(&keysPerShard, "keys-per-shard", 6_00_000, "keys per shard") + flag.IntVar(&memtableMB, "memtable-mb", 2, "memtable size in MiB") + flag.Float64Var(&fileSizeMultiplier, "file-size-multiplier", 0.25, "file size in GiB per shard") + flag.IntVar(&readWorkers, "readers", 16, "number of read workers") + flag.IntVar(&writeWorkers, "writers", 16, "number of write workers") flag.IntVar(&sampleSecs, "sample-secs", 30, "predictor sampling window in seconds") flag.Int64Var(&iterations, "iterations", 100_000_000, "number of iterations") flag.Float64Var(&aVal, "a", 0.4, "a value for the predictor") @@ -84,7 +84,7 @@ func planReadthroughGaussian() { } memtableSizeInBytes := int32(memtableMB) * 1024 * 1024 - fileSizeInBytes := int64(fileSizeMultiplier) * int64(memtableSizeInBytes) + fileSizeInBytes := int64(float64(fileSizeMultiplier) * 1024 * 1024 * 1024) // fileSizeMultiplier in GiB cfg := cachepkg.WrapCacheConfig{ NumShards: numShards, @@ -94,22 +94,9 @@ func planReadthroughGaussian() { ReWriteScoreThreshold: 0.8, GridSearchEpsilon: 0.0001, SampleDuration: time.Duration(sampleSecs) * 
time.Second, - - // Pass the metrics collector to record cache metrics - MetricsRecorder: InitMetricsCollector(), } - // Set additional input parameters that the cache doesn't know about - metricsCollector.SetShards(numShards) - metricsCollector.SetKeysPerShard(keysPerShard) - metricsCollector.SetReadWorkers(readWorkers) - metricsCollector.SetWriteWorkers(writeWorkers) - metricsCollector.SetPlan("readthrough") - - // Start background goroutine to wait for shutdown signal and export CSV - go RunmetricsWaitForShutdown() - - pc, err := cachepkg.NewWrapCache(cfg, mountPoint, logStats) + pc, err := cachepkg.NewWrapCache(cfg, mountPoint) if err != nil { panic(err) } @@ -121,7 +108,7 @@ func planReadthroughGaussian() { missedKeyChanList[i] = make(chan int) } - totalKeys := keysPerShard * numShards + totalKeys := 30_000_000 str1kb := strings.Repeat("a", 1024) str1kb = "%d" + str1kb @@ -139,7 +126,7 @@ func planReadthroughGaussian() { key := fmt.Sprintf("key%d", k) val := []byte(fmt.Sprintf(str1kb, k)) if err := pc.Put(key, val, 60); err != nil { - panic(err) + log.Error().Err(err).Msgf("error putting key %s", key) } if k%5000000 == 0 { fmt.Printf("----------------------------------------------prepopulated %d keys\n", k) @@ -158,7 +145,7 @@ func planReadthroughGaussian() { key := fmt.Sprintf("key%d", mk) val := []byte(fmt.Sprintf(str1kb, mk)) if err := pc.Put(key, val, 60); err != nil { - panic(err) + log.Error().Err(err).Msgf("error putting key %s", key) } } }(w) @@ -183,13 +170,14 @@ func planReadthroughGaussian() { } if expired { - panic("key expired") + log.Error().Msgf("key %s expired", key) + // panic("key expired") } if found && string(val) != fmt.Sprintf(str1kb, randomval) { panic("value mismatch") } - if k%5000000 == 0 { + if k%50000 == 0 { fmt.Printf("----------------------------------------------read %d keys %d readerid\n", k, workerID) } } diff --git a/flashring/cmd/flashringtest/plan_readthrough_gausian_batched.go 
b/flashring/cmd/flashringtest/plan_readthrough_gausian_batched.go index fd33e06a..756e0d9b 100644 --- a/flashring/cmd/flashringtest/plan_readthrough_gausian_batched.go +++ b/flashring/cmd/flashringtest/plan_readthrough_gausian_batched.go @@ -13,7 +13,7 @@ import ( "sync" "time" - cachepkg "github.com/Meesho/BharatMLStack/flashring/internal/cache" + cachepkg "github.com/Meesho/BharatMLStack/flashring/pkg/cache" "github.com/rs/zerolog" "github.com/rs/zerolog/log" ) @@ -93,7 +93,7 @@ func planReadthroughGaussianBatched() { } memtableSizeInBytes := int32(memtableMB) * 1024 * 1024 - fileSizeInBytes := int64(fileSizeMultiplier) * int64(memtableSizeInBytes) + fileSizeInBytes := int64(fileSizeMultiplier) * 1024 * 1024 * 1024 // fileSizeMultiplier in GiB cfg := cachepkg.WrapCacheConfig{ NumShards: numShards, @@ -108,22 +108,9 @@ func planReadthroughGaussianBatched() { EnableBatching: enableBatching, BatchWindowMicros: batchWindowMicros, MaxBatchSize: maxBatchSize, - - // Pass the metrics collector to record cache metrics - MetricsRecorder: InitMetricsCollector(), } - // Set additional input parameters that the cache doesn't know about - metricsCollector.SetShards(numShards) - metricsCollector.SetKeysPerShard(keysPerShard) - metricsCollector.SetReadWorkers(readWorkers) - metricsCollector.SetWriteWorkers(writeWorkers) - metricsCollector.SetPlan("readthrough-batched") - - // Start background goroutine to wait for shutdown signal and export CSV - go RunmetricsWaitForShutdown() - - pc, err := cachepkg.NewWrapCache(cfg, mountPoint, logStats) + pc, err := cachepkg.NewWrapCache(cfg, mountPoint) if err != nil { panic(err) } diff --git a/flashring/cmd/flashringtest/runmetrics.go b/flashring/cmd/flashringtest/runmetrics.go deleted file mode 100644 index 5e1aabec..00000000 --- a/flashring/cmd/flashringtest/runmetrics.go +++ /dev/null @@ -1,515 +0,0 @@ -package main - -import ( - "bufio" - "encoding/csv" - "fmt" - "log" - "os" - "os/signal" - "runtime" - "strconv" - "strings" - "sync" - 
"syscall" - "time" -) - -// Define your parameter structure -type RunMetrics struct { - // Input Parameters - Shards int - KeysPerShard int - ReadWorkers int - WriteWorkers int - Plan string - - // Observation Parameters - RP99 time.Duration - RP50 time.Duration - RP25 time.Duration - WP99 time.Duration - WP50 time.Duration - WP25 time.Duration - RThroughput float64 - WThroughput float64 - HitRate float64 - CPUUsage float64 - MemoryUsage float64 -} - -// MetricChannels holds separate channels for each metric type -type MetricChannels struct { - RP99 chan time.Duration - RP50 chan time.Duration - RP25 chan time.Duration - WP99 chan time.Duration - WP50 chan time.Duration - WP25 chan time.Duration - RThroughput chan float64 - WThroughput chan float64 - HitRate chan float64 - CPUUsage chan float64 - MemoryUsage chan float64 -} - -// MetricAverager maintains running averages for a metric -type MetricAverager struct { - mu sync.RWMutex - sum float64 - count int64 - lastValue float64 -} - -func (ma *MetricAverager) Add(value float64) { - if value == 0 { - return // Ignore zero values - } - ma.mu.Lock() - defer ma.mu.Unlock() - ma.sum += value - ma.count++ - ma.lastValue = value -} - -func (ma *MetricAverager) AddDuration(value time.Duration) { - if value == 0 { - return // Ignore zero values - } - ma.mu.Lock() - defer ma.mu.Unlock() - ma.sum += float64(value) - ma.count++ -} - -func (ma *MetricAverager) Average() float64 { - ma.mu.RLock() - defer ma.mu.RUnlock() - if ma.count == 0 { - return 0 - } - return ma.sum / float64(ma.count) -} - -func (ma *MetricAverager) Latest() float64 { - ma.mu.RLock() - defer ma.mu.RUnlock() - return ma.lastValue -} - -func (ma *MetricAverager) Reset() { - ma.mu.Lock() - defer ma.mu.Unlock() - ma.sum = 0 - ma.count = 0 -} - -// MetricsCollector collects and averages all metrics -type MetricsCollector struct { - channels MetricChannels - averagers map[string]*MetricAverager - stopCh chan struct{} - wg sync.WaitGroup - - // Input parameters 
(set once) - Shards int - KeysPerShard int - ReadWorkers int - WriteWorkers int - Plan string -} - -// NewMetricsCollector creates a new metrics collector with channels -func NewMetricsCollector(bufferSize int) *MetricsCollector { - mc := &MetricsCollector{ - channels: MetricChannels{ - RP99: make(chan time.Duration, bufferSize), - RP50: make(chan time.Duration, bufferSize), - RP25: make(chan time.Duration, bufferSize), - WP99: make(chan time.Duration, bufferSize), - WP50: make(chan time.Duration, bufferSize), - WP25: make(chan time.Duration, bufferSize), - RThroughput: make(chan float64, bufferSize), - WThroughput: make(chan float64, bufferSize), - HitRate: make(chan float64, bufferSize), - CPUUsage: make(chan float64, bufferSize), - MemoryUsage: make(chan float64, bufferSize), - }, - averagers: make(map[string]*MetricAverager), - stopCh: make(chan struct{}), - } - - // Initialize averagers for each metric - metricNames := []string{"RThroughput", "RP99", "RP50", "RP25", "WThroughput", "WP99", "WP50", "WP25", "HitRate", "CPUUsage", "MemoryUsage"} - for _, name := range metricNames { - mc.averagers[name] = &MetricAverager{} - } - - return mc -} - -// Start begins collecting metrics from all channels -func (mc *MetricsCollector) Start() { - // Start a goroutine for each metric channel - mc.wg.Add(11) - - go mc.collectMetricDuration(mc.channels.RP99, "RP99") - go mc.collectMetricDuration(mc.channels.RP50, "RP50") - go mc.collectMetricDuration(mc.channels.RP25, "RP25") - go mc.collectMetricDuration(mc.channels.WP99, "WP99") - go mc.collectMetricDuration(mc.channels.WP50, "WP50") - go mc.collectMetricDuration(mc.channels.WP25, "WP25") - go mc.collectMetric(mc.channels.RThroughput, "RThroughput") - go mc.collectMetric(mc.channels.WThroughput, "WThroughput") - go mc.collectMetric(mc.channels.HitRate, "HitRate") - go mc.collectMetric(mc.channels.CPUUsage, "CPUUsage") - go mc.collectMetric(mc.channels.MemoryUsage, "MemoryUsage") -} - -func (mc *MetricsCollector) 
collectMetric(ch chan float64, name string) { - defer mc.wg.Done() - for { - select { - case <-mc.stopCh: - return - case value, ok := <-ch: - if !ok { - return - } - mc.averagers[name].Add(value) - } - } -} - -func (mc *MetricsCollector) collectMetricDuration(ch chan time.Duration, name string) { - defer mc.wg.Done() - for { - select { - case <-mc.stopCh: - return - case value, ok := <-ch: - if !ok { - return - } - mc.averagers[name].AddDuration(value) - } - } -} - -// RecordRP99 sends a value to the RP99 channel -func (mc *MetricsCollector) RecordRP99(value time.Duration) { - select { - case mc.channels.RP99 <- value: - default: // Don't block if channel is full - } -} - -// RecordRP50 sends a value to the RP50 channel -func (mc *MetricsCollector) RecordRP50(value time.Duration) { - select { - case mc.channels.RP50 <- value: - default: - } -} - -// RecordRP25 sends a value to the RP25 channel -func (mc *MetricsCollector) RecordRP25(value time.Duration) { - select { - case mc.channels.RP25 <- value: - default: - } -} - -// RecordWP99 sends a value to the WP99 channel -func (mc *MetricsCollector) RecordWP99(value time.Duration) { - select { - case mc.channels.WP99 <- value: - default: - } -} - -// RecordWP50 sends a value to the WP50 channel -func (mc *MetricsCollector) RecordWP50(value time.Duration) { - select { - case mc.channels.WP50 <- value: - default: - } -} - -// RecordWP25 sends a value to the WP25 channel -func (mc *MetricsCollector) RecordWP25(value time.Duration) { - select { - case mc.channels.WP25 <- value: - default: - } -} - -// RecordRThroughput sends a value to the RThroughput channel -func (mc *MetricsCollector) RecordRThroughput(value float64) { - select { - case mc.channels.RThroughput <- value: - default: - } -} - -// RecordWThroughput sends a value to the WThroughput channel -func (mc *MetricsCollector) RecordWThroughput(value float64) { - select { - case mc.channels.WThroughput <- value: - default: - } -} - -// RecordHitRate sends a value to 
the HitRate channel -func (mc *MetricsCollector) RecordHitRate(value float64) { - select { - case mc.channels.HitRate <- value: - default: - } -} - -// GetAveragedMetrics returns the current averaged metrics -func (mc *MetricsCollector) GetAveragedMetrics() RunMetrics { - return RunMetrics{ - Shards: mc.Shards, - KeysPerShard: mc.KeysPerShard, - ReadWorkers: mc.ReadWorkers, - WriteWorkers: mc.WriteWorkers, - Plan: mc.Plan, - RP99: time.Duration(mc.averagers["RP99"].Average()), - RP50: time.Duration(mc.averagers["RP50"].Average()), - RP25: time.Duration(mc.averagers["RP25"].Average()), - WP99: time.Duration(mc.averagers["WP99"].Average()), - WP50: time.Duration(mc.averagers["WP50"].Average()), - WP25: time.Duration(mc.averagers["WP25"].Average()), - RThroughput: mc.averagers["RThroughput"].Latest(), - WThroughput: mc.averagers["WThroughput"].Latest(), - HitRate: mc.averagers["HitRate"].Average(), - CPUUsage: mc.averagers["CPUUsage"].Average(), - MemoryUsage: mc.averagers["MemoryUsage"].Average(), - } -} - -// ResetAverages resets all averagers to start fresh -func (mc *MetricsCollector) ResetAverages() { - for _, avg := range mc.averagers { - avg.Reset() - } -} - -// Stop stops all collector goroutines -func (mc *MetricsCollector) Stop() { - close(mc.stopCh) - mc.wg.Wait() -} - -// SetShards sets the number of shards (input parameter) -func (mc *MetricsCollector) SetShards(value int) { - mc.Shards = value -} - -// SetKeysPerShard sets the keys per shard (input parameter) -func (mc *MetricsCollector) SetKeysPerShard(value int) { - mc.KeysPerShard = value -} - -// SetReadWorkers sets the number of read workers (input parameter) -func (mc *MetricsCollector) SetReadWorkers(value int) { - mc.ReadWorkers = value -} - -// SetWriteWorkers sets the number of write workers (input parameter) -func (mc *MetricsCollector) SetWriteWorkers(value int) { - mc.WriteWorkers = value -} - -// SetPlan sets the plan name (input parameter) -func (mc *MetricsCollector) SetPlan(value string) 
{ - mc.Plan = value -} - -// Global variable to hold runtime data -var currentMetrics RunMetrics -var metricsCollector *MetricsCollector - -// --- CSV Configuration --- -const CSVFileName = "performance_results.csv" - -// InitMetricsCollector creates and starts the metrics collector, returning it -// so it can be passed to other components (e.g., cache config) -func InitMetricsCollector() *MetricsCollector { - metricsCollector = NewMetricsCollector(100) - metricsCollector.Start() - return metricsCollector -} - -// RunmetricsWaitForShutdown waits for shutdown signal and logs final metrics to CSV -func RunmetricsWaitForShutdown() { - // --- Set up Signal Handling --- - stopChan := make(chan os.Signal, 1) - signal.Notify(stopChan, syscall.SIGINT, syscall.SIGTERM) - - fmt.Println("Program running. Press Ctrl+C to stop and log results to CSV...") - - // --- Wait for Stop Signal --- - <-stopChan - fmt.Println("\nTermination signal received. Stopping work and logging results...") - - // Stop the metrics collector - if metricsCollector != nil { - metricsCollector.Stop() - - // Get final averaged metrics - currentMetrics = metricsCollector.GetAveragedMetrics() - } - - // Get memory usage and CPU usage at this instant - currentMetrics.MemoryUsage = getMemoryUsageMB() - currentMetrics.CPUUsage = getCPUUsagePercent() - - // --- Log Data to CSV --- - if err := logResultsToCSV(); err != nil { - log.Fatalf("FATAL: Failed to log results to CSV: %v", err) - } - - fmt.Printf("Successfully logged results to %s.\n", CSVFileName) - - // Exit the program since we're running in a goroutine - os.Exit(0) -} - -// RunmetricsInit initializes metrics and waits for shutdown (convenience function) -func RunmetricsInit() { - InitMetricsCollector() - RunmetricsWaitForShutdown() -} - -func logResultsToCSV() error { - // 1. Check if the file exists to determine if we need a header row. 
- file, err := os.OpenFile(CSVFileName, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) - if err != nil { - return fmt.Errorf("failed to open CSV file: %w", err) - } - defer file.Close() - - writer := csv.NewWriter(file) - defer writer.Flush() // Crucial to ensure data is written to the file before exiting. - - // The list of all your column headers - header := []string{ - "SHARDS", "KEYS_PER_SHARD", "READ_WORKERS", "WRITE_WORKERS", "PLAN", - "R_THROUGHPUT", "R_P99", "R_P50", "R_P25", "W_THROUGHPUT", "W_P99", "W_P50", "W_P25", - "HIT_RATE", "CPU", "MEMORY", "TIME", - } - - // Determine if the file is new (or empty) and needs the header - fileInfo, _ := file.Stat() - if fileInfo.Size() == 0 { - if err := writer.Write(header); err != nil { - return fmt.Errorf("error writing CSV header: %w", err) - } - } - - // Convert your struct fields into a slice of strings for the CSV writer - dataRow := []string{ - // Input Parameters - strconv.Itoa(currentMetrics.Shards), - strconv.Itoa(currentMetrics.KeysPerShard), - strconv.Itoa(currentMetrics.ReadWorkers), // Convert int to string - strconv.Itoa(currentMetrics.WriteWorkers), - currentMetrics.Plan, - - // Observation Parameters (convert floats to strings) - fmt.Sprintf("%v", currentMetrics.RThroughput), - fmt.Sprintf("%v", currentMetrics.RP99), - fmt.Sprintf("%v", currentMetrics.RP50), - fmt.Sprintf("%v", currentMetrics.RP25), - - fmt.Sprintf("%v", currentMetrics.WThroughput), - fmt.Sprintf("%v", currentMetrics.WP99), - fmt.Sprintf("%v", currentMetrics.WP50), - fmt.Sprintf("%v", currentMetrics.WP25), - - fmt.Sprintf("%v", currentMetrics.HitRate), - fmt.Sprintf("%v", currentMetrics.CPUUsage), - fmt.Sprintf("%v", currentMetrics.MemoryUsage), - fmt.Sprintf("%v", time.Now().In(time.FixedZone("IST", 5*60*60+30*60)).Format("2006-01-02 15:04:05")), - } - - if err := writer.Write(dataRow); err != nil { - return fmt.Errorf("error writing CSV data row: %w", err) - } - - return nil -} - -// getMemoryUsageMB returns the current memory usage 
of this process in MB -func getMemoryUsageMB() float64 { - var m runtime.MemStats - runtime.ReadMemStats(&m) - // Alloc is bytes of allocated heap objects - return float64(m.Alloc) / 1024 / 1024 -} - -// getSystemMemoryUsageMB returns the total system memory used by this process in MB -func getSystemMemoryUsageMB() float64 { - var m runtime.MemStats - runtime.ReadMemStats(&m) - // Sys is the total bytes of memory obtained from the OS - return float64(m.Sys) / 1024 / 1024 -} - -// getCPUUsagePercent returns the CPU usage percentage for this process -// It measures CPU usage over a short interval -func getCPUUsagePercent() float64 { - // Read initial CPU stats - idle1, total1 := getCPUStats() - time.Sleep(100 * time.Millisecond) - // Read CPU stats again - idle2, total2 := getCPUStats() - - idleDelta := float64(idle2 - idle1) - totalDelta := float64(total2 - total1) - - if totalDelta == 0 { - return 0 - } - - cpuUsage := (1.0 - idleDelta/totalDelta) * 100.0 - return cpuUsage -} - -// getCPUStats reads /proc/stat and returns idle and total CPU time -func getCPUStats() (idle, total uint64) { - file, err := os.Open("/proc/stat") - if err != nil { - return 0, 0 - } - defer file.Close() - - scanner := bufio.NewScanner(file) - for scanner.Scan() { - line := scanner.Text() - if strings.HasPrefix(line, "cpu ") { - fields := strings.Fields(line) - if len(fields) < 5 { - return 0, 0 - } - // fields: cpu user nice system idle iowait irq softirq steal guest guest_nice - var values []uint64 - for _, field := range fields[1:] { - val, err := strconv.ParseUint(field, 10, 64) - if err != nil { - continue - } - values = append(values, val) - total += val - } - if len(values) >= 4 { - idle = values[3] // idle is the 4th value - } - break - } - } - return idle, total -} diff --git a/flashring/go.mod b/flashring/go.mod index f02d9663..206adab3 100644 --- a/flashring/go.mod +++ b/flashring/go.mod @@ -13,7 +13,23 @@ require ( ) require ( - github.com/dgraph-io/badger/v4 v4.9.0 // indirect 
+ github.com/Microsoft/go-winio v0.5.0 // indirect + github.com/fsnotify/fsnotify v1.9.0 // indirect + github.com/go-viper/mapstructure/v2 v2.4.0 // indirect + github.com/pelletier/go-toml/v2 v2.2.4 // indirect + github.com/sagikazarmark/locafero v0.11.0 // indirect + github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8 // indirect + github.com/spf13/afero v1.15.0 // indirect + github.com/spf13/cast v1.10.0 // indirect + github.com/spf13/pflag v1.0.10 // indirect + github.com/subosito/gotenv v1.6.0 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/text v0.28.0 // indirect +) + +require ( + github.com/DataDog/datadog-go/v5 v5.8.2 + github.com/dgraph-io/badger/v4 v4.9.0 github.com/dgraph-io/ristretto/v2 v2.2.0 // indirect github.com/dustin/go-humanize v1.0.1 // indirect github.com/go-logr/logr v1.4.3 // indirect @@ -23,6 +39,7 @@ require ( github.com/klauspost/cpuid/v2 v2.3.0 // indirect github.com/mattn/go-colorable v0.1.14 // indirect github.com/mattn/go-isatty v0.0.20 // indirect + github.com/spf13/viper v1.21.0 go.opentelemetry.io/auto/sdk v1.1.0 // indirect go.opentelemetry.io/otel v1.37.0 // indirect go.opentelemetry.io/otel/metric v1.37.0 // indirect diff --git a/flashring/go.sum b/flashring/go.sum index 6c22ab66..5d69f8d2 100644 --- a/flashring/go.sum +++ b/flashring/go.sum @@ -1,42 +1,92 @@ +github.com/DataDog/datadog-go/v5 v5.8.2 h1:9IEfH1Mw9AjWwhAMqCAkhbxjuJeMxm2ARX2VdgL+ols= +github.com/DataDog/datadog-go/v5 v5.8.2/go.mod h1:K9kcYBlxkcPP8tvvjZZKs/m1edNAUFzBbdpTUKfCsuw= +github.com/Microsoft/go-winio v0.5.0 h1:Elr9Wn+sGKPlkaBvwu4mTrxtmOp3F3yV9qhaHbXGjwU= +github.com/Microsoft/go-winio v0.5.0/go.mod h1:JPGBdM1cNvN/6ISo+n8V5iA4v8pBzdOpzfwIujj1a84= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 
github.com/coocood/freecache v1.2.4 h1:UdR6Yz/X1HW4fZOuH0Z94KwG851GWOSknua5VUbb/5M= github.com/coocood/freecache v1.2.4/go.mod h1:RBUWa/Cy+OHdfTGFEhEuE1pMCMX51Ncizj7rthiQ3vk= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgraph-io/badger/v4 v4.9.0 h1:tpqWb0NewSrCYqTvywbcXOhQdWcqephkVkbBmaaqHzc= github.com/dgraph-io/badger/v4 v4.9.0/go.mod h1:5/MEx97uzdPUHR4KtkNt8asfI2T4JiEiQlV7kWUo8c0= github.com/dgraph-io/ristretto/v2 v2.2.0 h1:bkY3XzJcXoMuELV8F+vS8kzNgicwQFAaGINAEJdWGOM= github.com/dgraph-io/ristretto/v2 v2.2.0/go.mod h1:RZrm63UmcBAaYWC1DotLYBmTvgkrs0+XhBd7Npn7/zI= +github.com/dgryski/go-farm v0.0.0-20240924180020-3414d57e47da h1:aIftn67I1fkbMa512G+w+Pxci9hJPB8oMnkcP3iZF38= +github.com/dgryski/go-farm v0.0.0-20240924180020-3414d57e47da/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= +github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= +github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= 
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-viper/mapstructure/v2 v2.4.0 h1:EBsztssimR/CONLSZZ04E8qAkxNYq4Qp9LvH92wZUgs= +github.com/go-viper/mapstructure/v2 v2.4.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/golang/mock v1.6.0/go.mod h1:p6yTPP+5HYm5mzsMV8JkE6ZKdX+/wYM6Hr+LicevLPs= github.com/google/flatbuffers v25.2.10+incompatible h1:F3vclr7C3HpB1k9mxCGRMXq6FdUalZ6H/pNX4FP1v0Q= github.com/google/flatbuffers v25.2.10+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= -github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4= -github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y= github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= -github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= github.com/mattn/go-colorable v0.1.14/go.mod 
h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= +github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0= github.com/rs/zerolog v1.34.0 h1:k43nTLIwcTVQAncfCw4KZ2VY6ukYoZaBPNOE8txlOeY= github.com/rs/zerolog v1.34.0/go.mod h1:bJsvje4Z08ROH4Nhs5iH600c3IkWhwp44iRc54W6wYQ= +github.com/sagikazarmark/locafero v0.11.0 h1:1iurJgmM9G3PA/I+wWYIOw/5SyBtxapeHDcg+AAIFXc= +github.com/sagikazarmark/locafero v0.11.0/go.mod h1:nVIGvgyzw595SUSUE6tvCp3YYTeHs15MvlmU87WwIik= +github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= +github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8 h1:+jumHNA0Wrelhe64i8F6HNlS8pkoyMv5sreGx2Ry5Rw= +github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8/go.mod h1:3n1Cwaq1E1/1lhQhtRK2ts/ZwZEhjcQeJQ1RuC6Q/8U= +github.com/spf13/afero v1.15.0 h1:b/YBCLWAJdFWJTN9cLhiXXcD7mzKn9Dm86dNnfyQw1I= +github.com/spf13/afero v1.15.0/go.mod 
h1:NC2ByUVxtQs4b3sIUphxK0NioZnmxgyCrfzeuq8lxMg= +github.com/spf13/cast v1.10.0 h1:h2x0u2shc1QuLHfxi+cTJvs30+ZAHOGRic8uyGTDWxY= +github.com/spf13/cast v1.10.0/go.mod h1:jNfB8QC9IA6ZuY2ZjDp0KtFO2LZZlg4S/7bzP6qqeHo= +github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= +github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/viper v1.21.0 h1:x5S+0EU27Lbphp4UKm1C+1oQO+rKx36vfCoaVebLFSU= +github.com/spf13/viper v1.21.0/go.mod h1:P0lhsswPGWD/1lZJ9ny3fYnVqxiegrlNrEmgLjbTCAY= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= +github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= +github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= @@ -49,14 +99,46 @@ go.opentelemetry.io/otel/metric v1.37.0 
h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/Wgbsd go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= -golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.1/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A= 
google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/flashring/internal/cache/badger.go b/flashring/internal/cache/badger.go deleted file mode 100644 index 7ff8c691..00000000 --- a/flashring/internal/cache/badger.go +++ /dev/null @@ -1,135 +0,0 @@ -package internal - -import ( - "sync/atomic" - "time" - - filecache "github.com/Meesho/BharatMLStack/flashring/internal/shard" - badger "github.com/dgraph-io/badger/v4" - "github.com/rs/zerolog/log" -) - -type Badger struct { - cache *badger.DB - stats *CacheStats -} - -func NewBadger(config WrapCacheConfig, logStats bool) (*Badger, error) { - options := badger.DefaultOptions(config.MountPoint) - options.MetricsEnabled = false - - // 1. PRIMARY CACHE (1GB) - // This caches the data blocks themselves. - options.BlockCacheSize = 1024 << 20 - - // 2. INDEX CACHE (512MB) - // This keeps the keys and the structure of the LSM tree in RAM. - // This is the most critical setting for read latency. - options.IndexCacheSize = 512 << 20 - - // 3. WRITE BUFFERS (Memtables) - // We use 3 tables of 64MB each. This allows Badger to handle - // write spikes without blocking. 
(~192MB total) - options.NumMemtables = 40 - options.MemTableSize = 1024 << 20 - - options.ValueThreshold = 1024 - options.SyncWrites = false - - cache, err := badger.Open(options) - if err != nil { - return nil, err - } - bc := &Badger{ - cache: cache, - stats: &CacheStats{ - Hits: atomic.Uint64{}, - TotalGets: atomic.Uint64{}, - TotalPuts: atomic.Uint64{}, - ReWrites: atomic.Uint64{}, - Expired: atomic.Uint64{}, - ShardWiseActiveEntries: atomic.Uint64{}, - LatencyTracker: filecache.NewLatencyTracker(), - }, - } - - if logStats { - go func() { - sleepDuration := 10 * time.Second - var prevTotalGets, prevTotalPuts uint64 - for { - time.Sleep(sleepDuration) - - totalGets := bc.stats.TotalGets.Load() - totalPuts := bc.stats.TotalPuts.Load() - getsPerSec := float64(totalGets-prevTotalGets) / sleepDuration.Seconds() - putsPerSec := float64(totalPuts-prevTotalPuts) / sleepDuration.Seconds() - - log.Info().Msgf("Shard %d HitRate: %v", 0, cache.BlockCacheMetrics().Hits()) - log.Info().Msgf("Shard %d Expired: %v", 0, cache.BlockCacheMetrics().Misses()) - log.Info().Msgf("Shard %d Total: %v", 0, cache.BlockCacheMetrics().KeysEvicted()) - log.Info().Msgf("Gets/sec: %v", getsPerSec) - log.Info().Msgf("Puts/sec: %v", putsPerSec) - - getP25, getP50, getP99 := bc.stats.LatencyTracker.GetLatencyPercentiles() - putP25, putP50, putP99 := bc.stats.LatencyTracker.PutLatencyPercentiles() - - log.Info().Msgf("Get Count: %v", totalGets) - log.Info().Msgf("Put Count: %v", totalPuts) - log.Info().Msgf("Get Latencies - P25: %v, P50: %v, P99: %v", getP25, getP50, getP99) - log.Info().Msgf("Put Latencies - P25: %v, P50: %v, P99: %v", putP25, putP50, putP99) - - prevTotalGets = totalGets - prevTotalPuts = totalPuts - } - }() - } - - return bc, nil -} - -func (b *Badger) Put(key string, value []byte, exptimeInMinutes uint16) error { - - start := time.Now() - defer func() { - b.stats.LatencyTracker.RecordPut(time.Since(start)) - }() - - b.stats.TotalPuts.Add(1) - err := b.cache.Update(func(txn 
*badger.Txn) error { - entry := badger.NewEntry([]byte(key), value).WithTTL(time.Duration(exptimeInMinutes) * time.Minute) - err := txn.SetEntry(entry) - return err - }) - return err -} - -func (b *Badger) Get(key string) ([]byte, bool, bool) { - - start := time.Now() - defer func() { - b.stats.LatencyTracker.RecordGet(time.Since(start)) - }() - - b.stats.TotalGets.Add(1) - - val := make([]byte, 0) - err := b.cache.View(func(txn *badger.Txn) error { - item, err := txn.Get([]byte(key)) - if err != nil { - return err - } - val, err = item.ValueCopy(val) - - if err != nil { - b.stats.Hits.Add(1) - } - - return err - }) - return val, err != badger.ErrKeyNotFound, false -} - -func (b *Badger) Close() error { - return b.cache.Close() -} diff --git a/flashring/internal/cache/cache.go b/flashring/internal/cache/cache.go deleted file mode 100644 index 74755251..00000000 --- a/flashring/internal/cache/cache.go +++ /dev/null @@ -1,457 +0,0 @@ -package internal - -import ( - "fmt" - "strconv" - "sync" - "sync/atomic" - "time" - - "github.com/Meesho/BharatMLStack/flashring/internal/maths" - filecache "github.com/Meesho/BharatMLStack/flashring/internal/shard" - "github.com/cespare/xxhash/v2" - "github.com/rs/zerolog/log" -) - -/* - Each shard can keep 67M keys - With Round = 1, expected collision (67M)^2/(2*2^62) = 4.87×10^-4 -*/ - -const ( - ROUNDS = 1 - KEYS_PER_SHARD = (1 << 26) - BLOCK_SIZE = 4096 -) - -var ( - ErrNumShardLessThan1 = fmt.Errorf("num shards must be greater than 0") - ErrKeysPerShardLessThan1 = fmt.Errorf("keys per shard must be greater than 0") - ErrKeysPerShardGreaterThan67M = fmt.Errorf("keys per shard must be less than 67M") - ErrMemtableSizeLessThan1 = fmt.Errorf("memtable size must be greater than 0") - ErrMemtableSizeGreaterThan1GB = fmt.Errorf("memtable size must be less than 1GB") - ErrMemtableSizeNotMultipleOf4KB = fmt.Errorf("memtable size must be a multiple of 4KB") - ErrFileSizeLessThan1 = fmt.Errorf("file size must be greater than 0") - 
ErrFileSizeNotMultipleOf4KB = fmt.Errorf("file size must be a multiple of 4KB") - Seed = xxhash.Sum64String(strconv.Itoa(int(time.Now().UnixNano()))) -) - -type WrapCache struct { - shards []*filecache.ShardCache - shardLocks []sync.RWMutex - predictor *maths.Predictor - stats []*CacheStats - metricsRecorder MetricsRecorder -} - -type CacheStats struct { - Hits atomic.Uint64 - TotalGets atomic.Uint64 - TotalPuts atomic.Uint64 - ReWrites atomic.Uint64 - Expired atomic.Uint64 - ShardWiseActiveEntries atomic.Uint64 - LatencyTracker *filecache.LatencyTracker - BatchTracker *filecache.BatchTracker -} - -// MetricsRecorder is an interface for recording metrics from the cache -// Implement this interface to receive metrics from the cache layer -type MetricsRecorder interface { - // Input parameters - SetShards(value int) - SetKeysPerShard(value int) - SetReadWorkers(value int) - SetWriteWorkers(value int) - SetPlan(value string) - - // Observation metrics - RecordRP99(value time.Duration) - RecordRP50(value time.Duration) - RecordRP25(value time.Duration) - RecordWP99(value time.Duration) - RecordWP50(value time.Duration) - RecordWP25(value time.Duration) - RecordRThroughput(value float64) - RecordWThroughput(value float64) - RecordHitRate(value float64) -} - -type WrapCacheConfig struct { - NumShards int - KeysPerShard int - FileSize int64 - MemtableSize int32 - ReWriteScoreThreshold float32 - GridSearchEpsilon float64 - SampleDuration time.Duration - - // Batching reads - EnableBatching bool - BatchWindowMicros int // in microseconds - MaxBatchSize int - - // Optional metrics recorder - MetricsRecorder MetricsRecorder - - //Badger - MountPoint string -} - -func NewWrapCache(config WrapCacheConfig, mountPoint string, logStats bool) (*WrapCache, error) { - if config.NumShards <= 0 { - return nil, ErrNumShardLessThan1 - } - if config.KeysPerShard <= 0 { - return nil, ErrKeysPerShardLessThan1 - } - if config.KeysPerShard > KEYS_PER_SHARD { - return nil, 
ErrKeysPerShardGreaterThan67M - } - if config.MemtableSize <= 0 { - return nil, ErrMemtableSizeLessThan1 - } - if config.MemtableSize > 1024*1024*1024 { - return nil, ErrMemtableSizeGreaterThan1GB - } - if config.MemtableSize%BLOCK_SIZE != 0 { - return nil, ErrMemtableSizeNotMultipleOf4KB - } - if config.FileSize <= 0 { - return nil, ErrFileSizeLessThan1 - } - if config.FileSize%BLOCK_SIZE != 0 { - return nil, ErrFileSizeNotMultipleOf4KB - } - weights := []maths.WeightTuple{ - { - WFreq: 0.1, - WLA: 0.1, - }, - { - WFreq: 0.45, - WLA: 0.1, - }, - { - WFreq: 0.9, - WLA: 0.1, - }, - { - WFreq: 0.1, - WLA: 0.45, - }, - { - WFreq: 0.45, - WLA: 0.45, - }, - { - WFreq: 0.9, - WLA: 0.45, - }, - { - WFreq: 0.1, - WLA: 0.9, - }, - { - WFreq: 0.45, - WLA: 0.9, - }, - { - WFreq: 0.9, - WLA: 0.9, - }, - } - MaxMemTableCount := config.FileSize / int64(config.MemtableSize) - predictor := maths.NewPredictor(maths.PredictorConfig{ - ReWriteScoreThreshold: config.ReWriteScoreThreshold, - Weights: weights, - SampleDuration: config.SampleDuration, - MaxMemTableCount: uint32(MaxMemTableCount), - GridSearchEpsilon: config.GridSearchEpsilon, - }) - - batchWindow := time.Duration(0) - if config.EnableBatching && config.BatchWindowMicros > 0 { - batchWindow = time.Duration(config.BatchWindowMicros) * time.Microsecond - } - shardLocks := make([]sync.RWMutex, config.NumShards) - shards := make([]*filecache.ShardCache, config.NumShards) - for i := 0; i < config.NumShards; i++ { - shards[i] = filecache.NewShardCache(filecache.ShardCacheConfig{ - MemtableSize: config.MemtableSize, - Rounds: ROUNDS, - RbInitial: config.KeysPerShard, - RbMax: config.KeysPerShard, - DeleteAmortizedStep: 10000, - MaxFileSize: int64(config.FileSize), - BlockSize: BLOCK_SIZE, - Directory: mountPoint, - Predictor: predictor, - - //batching reads - EnableBatching: config.EnableBatching, - BatchWindow: batchWindow, - MaxBatchSize: config.MaxBatchSize, - }, &shardLocks[i]) - } - - stats := make([]*CacheStats, 
config.NumShards) - for i := 0; i < config.NumShards; i++ { - stats[i] = &CacheStats{LatencyTracker: filecache.NewLatencyTracker(), BatchTracker: filecache.NewBatchTracker()} - } - wc := &WrapCache{ - shards: shards, - shardLocks: shardLocks, - predictor: predictor, - stats: stats, - metricsRecorder: config.MetricsRecorder, - } - if logStats { - - go func() { - sleepDuration := 10 * time.Second - // perShardPrevTotalGets := make([]uint64, config.NumShards) - // perShardPrevTotalPuts := make([]uint64, config.NumShards) - combinedPrevTotalGets := uint64(0) - combinedPrevTotalPuts := uint64(0) - for { - time.Sleep(sleepDuration) - - combinedTotalGets := uint64(0) - combinedTotalPuts := uint64(0) - combinedHits := uint64(0) - combinedReWrites := uint64(0) - combinedExpired := uint64(0) - combinedShardWiseActiveEntries := uint64(0) - for i := 0; i < config.NumShards; i++ { - combinedTotalGets += wc.stats[i].TotalGets.Load() - combinedTotalPuts += wc.stats[i].TotalPuts.Load() - combinedHits += wc.stats[i].Hits.Load() - combinedReWrites += wc.stats[i].ReWrites.Load() - combinedExpired += wc.stats[i].Expired.Load() - combinedShardWiseActiveEntries += wc.stats[i].ShardWiseActiveEntries.Load() - } - - combinedHitRate := float64(0) - if combinedTotalGets > 0 { - combinedHitRate = float64(combinedHits) / float64(combinedTotalGets) - } - - log.Info().Msgf("Combined HitRate: %v", combinedHitRate) - log.Info().Msgf("Combined ReWrites: %v", combinedReWrites) - log.Info().Msgf("Combined Expired: %v", combinedExpired) - log.Info().Msgf("Combined Total: %v", combinedTotalGets) - log.Info().Msgf("Combined Puts/sec: %v", float64(combinedTotalPuts-combinedPrevTotalPuts)/float64(sleepDuration.Seconds())) - log.Info().Msgf("Combined Gets/sec: %v", float64(combinedTotalGets-combinedPrevTotalGets)/float64(sleepDuration.Seconds())) - log.Info().Msgf("Combined ShardWiseActiveEntries: %v", combinedShardWiseActiveEntries) - - combinedGetP25, combinedGetP50, combinedGetP99 := 
wc.stats[0].LatencyTracker.GetLatencyPercentiles() - combinedPutP25, combinedPutP50, combinedPutP99 := wc.stats[0].LatencyTracker.PutLatencyPercentiles() - - log.Info().Msgf("Combined Get Count: %v", combinedTotalGets) - log.Info().Msgf("Combined Put Count: %v", combinedTotalPuts) - log.Info().Msgf("Combined Get Latencies - P25: %v, P50: %v, P99: %v", combinedGetP25, combinedGetP50, combinedGetP99) - log.Info().Msgf("Combined Put Latencies - P25: %v, P50: %v, P99: %v", combinedPutP25, combinedPutP50, combinedPutP99) - - combinedGetBatchP25, combinedGetBatchP50, combinedGetBatchP99 := wc.shards[0].Stats.BatchTracker.GetBatchSizePercentiles() - log.Info().Msgf("Combined Get Batch Sizes - P25: %v, P50: %v, P99: %v", combinedGetBatchP25, combinedGetBatchP50, combinedGetBatchP99) - - // Send metrics to the recorder if configured - if wc.metricsRecorder != nil { - rThroughput := float64(combinedTotalGets-combinedPrevTotalGets) / sleepDuration.Seconds() - wThroughput := float64(combinedTotalPuts-combinedPrevTotalPuts) / sleepDuration.Seconds() - - wc.metricsRecorder.RecordRP25(combinedGetP25) - wc.metricsRecorder.RecordRP50(combinedGetP50) - wc.metricsRecorder.RecordRP99(combinedGetP99) - wc.metricsRecorder.RecordWP25(combinedPutP25) - wc.metricsRecorder.RecordWP50(combinedPutP50) - wc.metricsRecorder.RecordWP99(combinedPutP99) - wc.metricsRecorder.RecordRThroughput(rThroughput) - wc.metricsRecorder.RecordWThroughput(wThroughput) - wc.metricsRecorder.RecordHitRate(combinedHitRate) - } - - combinedPrevTotalGets = combinedTotalGets - combinedPrevTotalPuts = combinedTotalPuts - - /* disabling per shard stats for now - for i := 0; i < config.NumShards; i++ { - log.Info().Msgf("Shard %d has %d active entries", i, wc.stats[i].ShardWiseActiveEntries.Load()) - total := wc.stats[i].TotalGets.Load() - hits := wc.stats[i].Hits.Load() - hitRate := float64(0) - if total > 0 { - hitRate = float64(hits) / float64(total) - } - log.Info().Msgf("Shard %d HitRate: %v", i, hitRate) - 
log.Info().Msgf("Shard %d ReWrites: %v", i, wc.stats[i].ReWrites.Load()) - log.Info().Msgf("Shard %d Expired: %v", i, wc.stats[i].Expired.Load()) - log.Info().Msgf("Shard %d Total: %v", i, total) - log.Info().Msgf("Gets/sec: %v", float64(total-perShardPrevTotalGets[i])/float64(sleepDuration.Seconds())) - log.Info().Msgf("Puts/sec: %v", float64(wc.stats[i].TotalPuts.Load()-perShardPrevTotalPuts[i])/float64(sleepDuration.Seconds())) - perShardPrevTotalGets[i] = total - perShardPrevTotalPuts[i] = wc.stats[i].TotalPuts.Load() - - getP25, getP50, getP99 := wc.stats[i].LatencyTracker.GetLatencyPercentiles() - putP25, putP50, putP99 := wc.stats[i].LatencyTracker.PutLatencyPercentiles() - - log.Info().Msgf("Get Count: %v", wc.stats[i].TotalGets.Load()) - log.Info().Msgf("Put Count: %v", wc.stats[i].TotalPuts.Load()) - log.Info().Msgf("Get Latencies - P25: %v, P50: %v, P99: %v", getP25, getP50, getP99) - log.Info().Msgf("Put Latencies - P25: %v, P50: %v, P99: %v", putP25, putP50, putP99) - - } - */ - log.Info().Msgf("GridSearchActive: %v", wc.predictor.GridSearchEstimator.IsGridSearchActive()) - } - }() - } - return wc, nil -} - -func (wc *WrapCache) PutLL(key string, value []byte, exptimeInMinutes uint16) error { - - h32 := wc.Hash(key) - shardIdx := h32 % uint32(len(wc.shards)) - start := time.Now() - - result := filecache.ErrorPool.Get().(chan error) - - wc.shards[shardIdx].WriteCh <- &filecache.WriteRequestV2{ - Key: key, - Value: value, - ExptimeInMinutes: exptimeInMinutes, - Result: result, - } - - if h32%100 < 10 { - wc.stats[shardIdx].ShardWiseActiveEntries.Store(uint64(wc.shards[shardIdx].GetRingBufferActiveEntries())) - } - - op := <-result - filecache.ErrorPool.Put(result) - wc.stats[shardIdx].TotalPuts.Add(1) - wc.stats[shardIdx].LatencyTracker.RecordPut(time.Since(start)) - return op -} - -func (wc *WrapCache) GetLL(key string) ([]byte, bool, bool) { - h32 := wc.Hash(key) - shardIdx := h32 % uint32(len(wc.shards)) - - start := time.Now() - - found, value, _, 
expired, needsSlowPath := wc.shards[shardIdx].GetFastPath(key) - - if !needsSlowPath { - if found && !expired { - wc.stats[shardIdx].Hits.Add(1) - } else if expired { - wc.stats[shardIdx].Expired.Add(1) - } - - wc.stats[shardIdx].TotalGets.Add(1) - wc.stats[shardIdx].LatencyTracker.RecordGet(time.Since(start)) - return value, found, expired - } - - result := filecache.ReadResultPool.Get().(chan filecache.ReadResultV2) - - req := filecache.ReadRequestPool.Get().(*filecache.ReadRequestV2) - req.Key = key - req.Result = result - - wc.shards[shardIdx].ReadCh <- req - op := <-result - - filecache.ReadResultPool.Put(result) - filecache.ReadRequestPool.Put(req) - - if op.Found && !op.Expired { - wc.stats[shardIdx].Hits.Add(1) - } - if op.Expired { - wc.stats[shardIdx].Expired.Add(1) - } - wc.stats[shardIdx].LatencyTracker.RecordGet(time.Since(start)) - wc.stats[shardIdx].TotalGets.Add(1) - - return op.Data, op.Found, op.Expired -} - -func (wc *WrapCache) Put(key string, value []byte, exptimeInMinutes uint16) error { - - h32 := wc.Hash(key) - shardIdx := h32 % uint32(len(wc.shards)) - - start := time.Now() - defer func() { - wc.stats[shardIdx].LatencyTracker.RecordPut(time.Since(start)) - }() - - wc.shardLocks[shardIdx].Lock() - defer wc.shardLocks[shardIdx].Unlock() - wc.putLocked(shardIdx, h32, key, value, exptimeInMinutes) - return nil -} - -func (wc *WrapCache) putLocked(shardIdx uint32, h32 uint32, key string, value []byte, exptimeInMinutes uint16) { - wc.shards[shardIdx].Put(key, value, exptimeInMinutes) - wc.stats[shardIdx].TotalPuts.Add(1) - if h32%100 < 10 { - wc.stats[shardIdx].ShardWiseActiveEntries.Store(uint64(wc.shards[shardIdx].GetRingBufferActiveEntries())) - } -} - -func (wc *WrapCache) Get(key string) ([]byte, bool, bool) { - h32 := wc.Hash(key) - shardIdx := h32 % uint32(len(wc.shards)) - - start := time.Now() - defer func() { - wc.stats[shardIdx].LatencyTracker.RecordGet(time.Since(start)) - }() - - var keyFound bool - var val []byte - var remainingTTL 
uint16 - var expired bool - var shouldReWrite bool - if wc.shards[shardIdx].BatchReader != nil { - reqChan := make(chan filecache.ReadResultV2, 1) - wc.shards[shardIdx].BatchReader.Requests <- &filecache.ReadRequestV2{ - Key: key, - Result: reqChan, - } - result := <-reqChan - - keyFound, val, remainingTTL, expired, shouldReWrite = result.Found, result.Data, result.TTL, result.Expired, result.ShouldRewrite - } else { - wc.shardLocks[shardIdx].RLock() - defer wc.shardLocks[shardIdx].RUnlock() - keyFound, val, remainingTTL, expired, shouldReWrite = wc.shards[shardIdx].Get(key) - } - - if keyFound && !expired { - wc.stats[shardIdx].Hits.Add(1) - } - if expired { - wc.stats[shardIdx].Expired.Add(1) - } - wc.stats[shardIdx].TotalGets.Add(1) - if shouldReWrite { - wc.stats[shardIdx].ReWrites.Add(1) - wc.putLocked(shardIdx, h32, key, val, remainingTTL) - } - wc.predictor.Observe(float64(wc.stats[shardIdx].Hits.Load()) / float64(wc.stats[shardIdx].TotalGets.Load())) - return val, keyFound, expired -} - -func (wc *WrapCache) Hash(key string) uint32 { - return uint32(xxhash.Sum64String(key) ^ Seed) -} - -func (wc *WrapCache) GetShardCache(shardIdx int) *filecache.ShardCache { - return wc.shards[shardIdx] -} diff --git a/flashring/internal/cache/freecache.go b/flashring/internal/cache/freecache.go deleted file mode 100644 index df0f0f75..00000000 --- a/flashring/internal/cache/freecache.go +++ /dev/null @@ -1,96 +0,0 @@ -package internal - -import ( - "runtime/debug" - "sync/atomic" - "time" - - filecache "github.com/Meesho/BharatMLStack/flashring/internal/shard" - "github.com/coocood/freecache" - "github.com/rs/zerolog/log" -) - -type Freecache struct { - cache *freecache.Cache - stats *CacheStats -} - -func NewFreecache(config WrapCacheConfig, logStats bool) (*Freecache, error) { - - cache := freecache.NewCache(int(config.FileSize)) - debug.SetGCPercent(20) - - fc := &Freecache{ - cache: cache, - stats: &CacheStats{ - Hits: atomic.Uint64{}, - TotalGets: atomic.Uint64{}, - 
TotalPuts: atomic.Uint64{}, - ReWrites: atomic.Uint64{}, - Expired: atomic.Uint64{}, - ShardWiseActiveEntries: atomic.Uint64{}, - LatencyTracker: filecache.NewLatencyTracker(), - }, - } - - if logStats { - go func() { - sleepDuration := 10 * time.Second - var prevTotalGets, prevTotalPuts uint64 - for { - time.Sleep(sleepDuration) - - totalGets := fc.stats.TotalGets.Load() - totalPuts := fc.stats.TotalPuts.Load() - getsPerSec := float64(totalGets-prevTotalGets) / sleepDuration.Seconds() - putsPerSec := float64(totalPuts-prevTotalPuts) / sleepDuration.Seconds() - - log.Info().Msgf("Shard %d HitRate: %v", 0, cache.HitRate()) - log.Info().Msgf("Shard %d Expired: %v", 0, cache.ExpiredCount()) - log.Info().Msgf("Shard %d Total: %v", 0, cache.EntryCount()) - log.Info().Msgf("Gets/sec: %v", getsPerSec) - log.Info().Msgf("Puts/sec: %v", putsPerSec) - - getP25, getP50, getP99 := fc.stats.LatencyTracker.GetLatencyPercentiles() - putP25, putP50, putP99 := fc.stats.LatencyTracker.PutLatencyPercentiles() - - log.Info().Msgf("Get Count: %v", totalGets) - log.Info().Msgf("Put Count: %v", totalPuts) - log.Info().Msgf("Get Latencies - P25: %v, P50: %v, P99: %v", getP25, getP50, getP99) - log.Info().Msgf("Put Latencies - P25: %v, P50: %v, P99: %v", putP25, putP50, putP99) - - prevTotalGets = totalGets - prevTotalPuts = totalPuts - } - }() - } - - return fc, nil - -} - -func (c *Freecache) Put(key string, value []byte, exptimeInMinutes uint16) error { - start := time.Now() - defer func() { - c.stats.LatencyTracker.RecordPut(time.Since(start)) - }() - - c.stats.TotalPuts.Add(1) - c.cache.Set([]byte(key), value, int(exptimeInMinutes)*60) - return nil -} - -func (c *Freecache) Get(key string) ([]byte, bool, bool) { - start := time.Now() - defer func() { - c.stats.LatencyTracker.RecordGet(time.Since(start)) - }() - - c.stats.TotalGets.Add(1) - val, err := c.cache.Get([]byte(key)) - if err != nil { - return nil, false, false - } - c.stats.Hits.Add(1) - return val, true, false -} diff --git 
a/flashring/internal/fs/aligned_page.go b/flashring/internal/fs/aligned_page.go index c499ae36..099ccd9d 100644 --- a/flashring/internal/fs/aligned_page.go +++ b/flashring/internal/fs/aligned_page.go @@ -4,8 +4,6 @@ package fs import ( - "runtime/pprof" - "golang.org/x/sys/unix" ) @@ -16,7 +14,7 @@ const ( MAP_ANON = unix.MAP_ANON ) -var mmapProf = pprof.NewProfile("mmap") // will show up in /debug/pprof/ +// var mmapProf = pprof.NewProfile("mmap") // will show up in /debug/pprof/ type AlignedPage struct { Buf []byte @@ -28,9 +26,9 @@ func NewAlignedPage(pageSize int) *AlignedPage { if err != nil { panic(err) } - if pageSize > 0 { - mmapProf.Add(&b[0], pageSize) // attribute sz bytes to this callsite - } + // if pageSize > 0 { + // mmapProf.Add(&b[0], pageSize) // attribute sz bytes to this callsite + // } return &AlignedPage{ Buf: b, mmap: b, @@ -38,9 +36,9 @@ func NewAlignedPage(pageSize int) *AlignedPage { } func Unmap(p *AlignedPage) error { - if len(p.mmap) > 0 { - mmapProf.Remove(&p.mmap[0]) // release from custom profile - } + // if len(p.mmap) > 0 { + // mmapProf.Remove(&p.mmap[0]) // release from custom profile + // } if p.mmap != nil { err := unix.Munmap(p.mmap) if err != nil { diff --git a/flashring/internal/fs/batch_iouring.go b/flashring/internal/fs/batch_iouring.go new file mode 100644 index 00000000..13c8267f --- /dev/null +++ b/flashring/internal/fs/batch_iouring.go @@ -0,0 +1,322 @@ +//go:build linux +// +build linux + +package fs + +import ( + "fmt" + "sync" + "sync/atomic" + "syscall" + "time" + + "github.com/Meesho/BharatMLStack/flashring/pkg/metrics" +) + +// batchReadResult holds the outcome of a single batched pread. +type batchReadResult struct { + N int + Err error +} + +// batchReadRequest is a pread submitted to the batch reader. 
+type batchReadRequest struct { + fd int + buf []byte + offset uint64 + done chan batchReadResult +} + +var batchReqPool = sync.Pool{ + New: func() interface{} { + return &batchReadRequest{ + done: make(chan batchReadResult, 1), + } + }, +} + +// BatchIoUringReader collects pread requests from multiple goroutines into a +// single channel and submits them as one io_uring batch. This amortizes the +// syscall overhead (1 io_uring_enter instead of N) and lets NVMe process +// multiple commands in parallel (queue depth > 1). +// +// Collection uses non-blocking channel drain: after receiving the first +// request, it drains whatever else is already queued (no timer). Under load +// this provides natural batching; under low load single requests go out +// with zero added latency. +// +// CQEs are dispatched individually as they complete (no head-of-line blocking). +type BatchIoUringReader struct { + ring *IoUring + reqCh chan *batchReadRequest + maxBatch int + window time.Duration // wait up to this for more requests before submit (0 = drain only) + closeCh chan struct{} + wg sync.WaitGroup +} + +// BatchIoUringConfig configures the batch reader. +type BatchIoUringConfig struct { + RingDepth uint32 // io_uring SQ/CQ size (default 256) + MaxBatch int // max requests per batch (capped to RingDepth) + Window time.Duration // wait up to this for requests to accumulate before submit (e.g. 500*time.Microsecond); 0 = drain only, no wait + QueueSize int // channel buffer size (default 1024) +} + +// NewBatchIoUringReader creates a batch reader with its own io_uring ring +// and starts the background collection goroutine. 
+func NewBatchIoUringReader(cfg BatchIoUringConfig) (*BatchIoUringReader, error) { + if cfg.RingDepth == 0 { + cfg.RingDepth = 256 + } + if cfg.MaxBatch == 0 || cfg.MaxBatch > int(cfg.RingDepth) { + cfg.MaxBatch = int(cfg.RingDepth) + } + if cfg.QueueSize == 0 { + cfg.QueueSize = 1024 + } + + ring, err := NewIoUring(cfg.RingDepth, 0) + if err != nil { + return nil, fmt.Errorf("batch io_uring init: %w", err) + } + + b := &BatchIoUringReader{ + ring: ring, + reqCh: make(chan *batchReadRequest, cfg.QueueSize), + maxBatch: cfg.MaxBatch, + window: cfg.Window, + closeCh: make(chan struct{}), + } + b.wg.Add(1) + go b.loop() + return b, nil +} + +// Submit sends a pread request into the batch channel and blocks until the +// io_uring completion is received. Thread-safe; called from many goroutines. +func (b *BatchIoUringReader) Submit(fd int, buf []byte, offset uint64) (int, error) { + if len(buf) == 0 { + return 0, nil + } + + var startTime time.Time + if metrics.Enabled() { + startTime = time.Now() + } + + req := batchReqPool.Get().(*batchReadRequest) + req.fd = fd + req.buf = buf + req.offset = offset + + b.reqCh <- req + + result := <-req.done + n, err := result.N, result.Err + if metrics.Enabled() { + metrics.Timing(metrics.KEY_PREAD_LATENCY, time.Since(startTime), []string{}) + } + + // Reset and return to pool + req.fd = 0 + req.buf = nil + req.offset = 0 + batchReqPool.Put(req) + + return n, err +} + +// Close shuts down the collection goroutine and releases the io_uring ring. +func (b *BatchIoUringReader) Close() { + close(b.closeCh) + b.wg.Wait() + b.ring.Close() +} + +// loop is the single background goroutine that collects and submits batches. +// +// Phase 1: block on first request (no timer ticking when idle). +// Phase 2: non-blocking drain of whatever else is already queued. +// Phase 3: submit the batch and dispatch CQEs as they complete. 
+func (b *BatchIoUringReader) loop() { + defer b.wg.Done() + + batch := make([]*batchReadRequest, 0, b.maxBatch) + + for { + // Phase 1: block until the first request arrives + select { + case req := <-b.reqCh: + batch = append(batch, req) + case <-b.closeCh: + return + } + + // Phase 2: drain with optional wait — if window > 0, wait up to window + // for more requests; otherwise non-blocking drain only. + var timer *time.Timer + if b.window > 0 { + timer = time.NewTimer(b.window) + } + drain: + for len(batch) < b.maxBatch { + if b.window > 0 { + select { + case req := <-b.reqCh: + batch = append(batch, req) + case <-timer.C: + break drain + case <-b.closeCh: + if timer != nil { + timer.Stop() + } + return + } + } else { + select { + case req := <-b.reqCh: + batch = append(batch, req) + default: + break drain + } + } + } + if timer != nil { + timer.Stop() + } + + // Phase 3: submit and dispatch + b.submitBatch(batch) + batch = batch[:0] + } +} + +// submitBatch prepares N SQEs, submits them (fire-and-forget), then dispatches +// each CQE individually as it completes. Fast reads are dispatched immediately +// without waiting for slow reads in the same batch (no head-of-line blocking). 
+func (b *BatchIoUringReader) submitBatch(batch []*batchReadRequest) { + if metrics.Enabled() { + metrics.Timing(metrics.KEY_IOURING_SIZE, time.Duration(len(batch))*time.Millisecond, []string{}) + } + n := len(batch) + if n == 0 { + return + } + + b.ring.mu.Lock() + + // Prepare SQEs + prepared := 0 + for i, req := range batch { + sqe := b.ring.getSqe() + if sqe == nil { + // SQ full -- error the rest + for j := i; j < n; j++ { + batch[j].done <- batchReadResult{ + Err: fmt.Errorf("io_uring: SQ full, batch=%d depth=%d", n, b.ring.sqEntries), + } + } + break + } + prepRead(sqe, req.fd, req.buf, req.offset) + sqe.UserData = uint64(i) // index for CQE matching + prepared++ + } + + if prepared == 0 { + b.ring.mu.Unlock() + return + } + + // Submit SQEs but do NOT wait for completions (waitNr=0). + // The kernel starts processing I/O immediately; we dispatch each CQE + // as it arrives below, so fast reads aren't blocked by slow ones. + _, err := b.ring.submit(0) + if err != nil { + b.ring.mu.Unlock() + for i := 0; i < prepared; i++ { + batch[i].done <- batchReadResult{Err: fmt.Errorf("io_uring_enter: %w", err)} + } + return + } + + // Dispatch CQEs one-by-one as they complete. + completed := 0 + for completed < prepared { + cqe, err := b.ring.waitCqe() + if err != nil { + // Catastrophic ring error -- unblock all unsatisfied callers. 
+ b.ring.mu.Unlock() + for i := 0; i < n; i++ { + select { + case batch[i].done <- batchReadResult{Err: fmt.Errorf("io_uring waitCqe: %w", err)}: + default: // already sent + } + } + return + } + + idx := int(cqe.UserData) + res := cqe.Res + b.ring.seenCqe() + completed++ + + if idx < 0 || idx >= prepared { + continue // unexpected UserData; skip + } + + if res < 0 { + batch[idx].done <- batchReadResult{ + Err: fmt.Errorf("io_uring pread errno %d (%s), fd=%d off=%d len=%d", + -res, syscall.Errno(-res), batch[idx].fd, batch[idx].offset, len(batch[idx].buf)), + } + } else { + batch[idx].done <- batchReadResult{N: int(res)} + } + } + + b.ring.mu.Unlock() +} + +// ParallelBatchIoUringReader distributes pread requests across N independent +// BatchIoUringReader instances (each with its own io_uring ring and goroutine) +// using round-robin. This removes the single-ring serialization bottleneck and +// lets NVMe service requests across multiple hardware queues in parallel. +type ParallelBatchIoUringReader struct { + readers []*BatchIoUringReader + next atomic.Uint64 +} + +// NewParallelBatchIoUringReader creates numRings independent batch readers. +// Each ring gets its own io_uring instance and background goroutine. +func NewParallelBatchIoUringReader(cfg BatchIoUringConfig, numRings int) (*ParallelBatchIoUringReader, error) { + if numRings <= 0 { + numRings = 1 + } + readers := make([]*BatchIoUringReader, numRings) + for i := 0; i < numRings; i++ { + r, err := NewBatchIoUringReader(cfg) + if err != nil { + for j := 0; j < i; j++ { + readers[j].Close() + } + return nil, fmt.Errorf("parallel batch reader ring %d: %w", i, err) + } + readers[i] = r + } + return &ParallelBatchIoUringReader{readers: readers}, nil +} + +// Submit routes the pread to the next ring via round-robin. Thread-safe. 
+func (p *ParallelBatchIoUringReader) Submit(fd int, buf []byte, offset uint64) (int, error) { + idx := p.next.Add(1) % uint64(len(p.readers)) + return p.readers[idx].Submit(fd, buf, offset) +} + +// Close shuts down all underlying batch readers. +func (p *ParallelBatchIoUringReader) Close() { + for _, r := range p.readers { + r.Close() + } +} diff --git a/flashring/internal/fs/fs.go b/flashring/internal/fs/fs.go index 186e524e..b69be0a4 100644 --- a/flashring/internal/fs/fs.go +++ b/flashring/internal/fs/fs.go @@ -32,6 +32,7 @@ var ( ErrFileSizeExceeded = errors.New("file size exceeded. Please punch hole") ErrFileOffsetOutOfRange = errors.New("file offset is out of range") ErrOffsetNotAligned = errors.New("offset is not aligned to block size") + ErrReadTimeout = errors.New("read timeout") ) type Stat struct { diff --git a/flashring/internal/fs/iouring.go b/flashring/internal/fs/iouring.go new file mode 100644 index 00000000..4b5b18b3 --- /dev/null +++ b/flashring/internal/fs/iouring.go @@ -0,0 +1,585 @@ +//go:build linux +// +build linux + +// Package fs provides a minimal io_uring implementation using raw syscalls. +// No external dependencies beyond golang.org/x/sys/unix are needed. +// Compatible with Go 1.24+ (no go:linkname usage). 
+package fs + +import ( + "fmt" + "sync" + "sync/atomic" + "syscall" + "time" + "unsafe" + + "github.com/Meesho/BharatMLStack/flashring/pkg/metrics" + "golang.org/x/sys/unix" +) + +// ----------------------------------------------------------------------- +// io_uring syscall numbers (amd64) +// ----------------------------------------------------------------------- + +const ( + sysIOUringSetup = 425 + sysIOUringEnter = 426 + sysIOUringRegister = 427 +) + +// ----------------------------------------------------------------------- +// io_uring constants +// ----------------------------------------------------------------------- + +const ( + // Setup flags + iouringSetupSQPoll = 1 << 1 + + // Enter flags + iouringEnterGetEvents = 1 << 0 + iouringEnterSQWakeup = 1 << 1 + + // SQ flags (read from kernel-shared memory) + iouringSQNeedWakeup = 1 << 0 + + // Opcodes + iouringOpNop = 0 + iouringOpRead = 22 + iouringOpWrite = 23 + + // offsets for mmap + iouringOffSQRing = 0 + iouringOffCQRing = 0x8000000 + iouringOffSQEs = 0x10000000 +) + +// ----------------------------------------------------------------------- +// io_uring kernel structures (must match kernel ABI exactly) +// ----------------------------------------------------------------------- + +// ioUringSqe is the 64-byte submission queue entry. +type ioUringSqe struct { + Opcode uint8 + Flags uint8 + IoPrio uint16 + Fd int32 + Off uint64 // union: off / addr2 + Addr uint64 // union: addr / splice_off_in + Len uint32 + OpFlags uint32 // union: rw_flags, etc. + UserData uint64 + BufIndex uint16 // union: buf_index / buf_group + _ uint16 // personality + _ int32 // splice_fd_in / file_index + _ uint64 // addr3 + _ uint64 // __pad2[0] +} + +// ioUringCqe is the 16-byte completion queue entry. +type ioUringCqe struct { + UserData uint64 + Res int32 + Flags uint32 +} + +// ioUringParams is passed to io_uring_setup. 
+type ioUringParams struct { + SqEntries uint32 + CqEntries uint32 + Flags uint32 + SqThreadCPU uint32 + SqThreadIdle uint32 + Features uint32 + WqFd uint32 + Resv [3]uint32 + SqOff ioUringSqringOffsets + CqOff ioUringCqringOffsets +} + +type ioUringSqringOffsets struct { + Head uint32 + Tail uint32 + RingMask uint32 + RingEntries uint32 + Flags uint32 + Dropped uint32 + Array uint32 + Resv1 uint32 + Resv2 uint64 +} + +type ioUringCqringOffsets struct { + Head uint32 + Tail uint32 + RingMask uint32 + RingEntries uint32 + Overflow uint32 + Cqes uint32 + Flags uint32 + Resv1 uint32 + Resv2 uint64 +} + +// ----------------------------------------------------------------------- +// IoUring is the main ring handle +// ----------------------------------------------------------------------- + +// IoUring wraps a single io_uring instance with SQ/CQ ring mappings. +type IoUring struct { + fd int + + // SQ ring mapped memory + sqRingPtr []byte + sqMask uint32 + sqEntries uint32 + sqHead *uint32 // kernel-updated + sqTail *uint32 // user-updated + sqFlags *uint32 // kernel-updated (NEED_WAKEUP etc.) + sqArray unsafe.Pointer + sqeTail uint32 // local tracking of next SQE slot + sqeHead uint32 // local tracking of submitted SQEs + sqesMmap []byte + sqesBase unsafe.Pointer // base pointer to SQE array + sqRingSz int + cqRingSz int + sqesSz int + singleMmap bool + + // CQ ring mapped memory + cqRingPtr []byte + cqMask uint32 + cqEntries uint32 + cqHead *uint32 // user-updated + cqTail *uint32 // kernel-updated + cqesBase unsafe.Pointer + + // Setup flags + flags uint32 + + // Mutex for concurrent SQE submission from multiple goroutines + mu sync.Mutex + + // Diagnostic counter -- limits debug output to first N failures + debugCount int +} + +// NewIoUring creates a new io_uring instance with the given queue depth. +// flags can be 0 for normal mode. 
+func NewIoUring(entries uint32, flags uint32) (*IoUring, error) { + var params ioUringParams + params.Flags = flags + + fd, _, errno := syscall.Syscall(sysIOUringSetup, uintptr(entries), uintptr(unsafe.Pointer(¶ms)), 0) + if errno != 0 { + return nil, fmt.Errorf("io_uring_setup failed: %w", errno) + } + + ring := &IoUring{ + fd: int(fd), + flags: params.Flags, + } + + if err := ring.mapRings(¶ms); err != nil { + syscall.Close(ring.fd) + return nil, err + } + + return ring, nil +} + +func (r *IoUring) mapRings(p *ioUringParams) error { + sqOff := &p.SqOff + cqOff := &p.CqOff + + // Calculate SQ ring size + r.sqRingSz = int(sqOff.Array + p.SqEntries*4) // Array + entries*sizeof(uint32) + + // Calculate CQ ring size + r.cqRingSz = int(cqOff.Cqes + p.CqEntries*uint32(unsafe.Sizeof(ioUringCqe{}))) + + // Check if kernel supports single mmap for both rings + r.singleMmap = (p.Features & 1) != 0 // IORING_FEAT_SINGLE_MMAP = 1 + if r.singleMmap { + if r.cqRingSz > r.sqRingSz { + r.sqRingSz = r.cqRingSz + } + } + + // Map SQ ring + var err error + r.sqRingPtr, err = unix.Mmap(r.fd, iouringOffSQRing, r.sqRingSz, + unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED|unix.MAP_POPULATE) + if err != nil { + return fmt.Errorf("mmap SQ ring: %w", err) + } + + // Map CQ ring (same or separate mapping) + if r.singleMmap { + r.cqRingPtr = r.sqRingPtr + } else { + r.cqRingPtr, err = unix.Mmap(r.fd, iouringOffCQRing, r.cqRingSz, + unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED|unix.MAP_POPULATE) + if err != nil { + unix.Munmap(r.sqRingPtr) + return fmt.Errorf("mmap CQ ring: %w", err) + } + } + + // Map SQE array + r.sqesSz = int(p.SqEntries) * int(unsafe.Sizeof(ioUringSqe{})) + r.sqesMmap, err = unix.Mmap(r.fd, iouringOffSQEs, r.sqesSz, + unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED|unix.MAP_POPULATE) + if err != nil { + unix.Munmap(r.sqRingPtr) + if !r.singleMmap { + unix.Munmap(r.cqRingPtr) + } + return fmt.Errorf("mmap SQEs: %w", err) + } + r.sqesBase = 
unsafe.Pointer(&r.sqesMmap[0]) + + // Set up SQ ring pointers + sqBase := unsafe.Pointer(&r.sqRingPtr[0]) + r.sqHead = (*uint32)(unsafe.Add(sqBase, sqOff.Head)) + r.sqTail = (*uint32)(unsafe.Add(sqBase, sqOff.Tail)) + r.sqFlags = (*uint32)(unsafe.Add(sqBase, sqOff.Flags)) + r.sqMask = *(*uint32)(unsafe.Add(sqBase, sqOff.RingMask)) + r.sqEntries = *(*uint32)(unsafe.Add(sqBase, sqOff.RingEntries)) + r.sqArray = unsafe.Add(sqBase, sqOff.Array) + + // Set up CQ ring pointers + cqBase := unsafe.Pointer(&r.cqRingPtr[0]) + r.cqHead = (*uint32)(unsafe.Add(cqBase, cqOff.Head)) + r.cqTail = (*uint32)(unsafe.Add(cqBase, cqOff.Tail)) + r.cqMask = *(*uint32)(unsafe.Add(cqBase, cqOff.RingMask)) + r.cqEntries = *(*uint32)(unsafe.Add(cqBase, cqOff.RingEntries)) + r.cqesBase = unsafe.Add(cqBase, cqOff.Cqes) + + return nil +} + +// Close releases all resources associated with the ring. +func (r *IoUring) Close() { + unix.Munmap(r.sqesMmap) + unix.Munmap(r.sqRingPtr) + if !r.singleMmap { + unix.Munmap(r.cqRingPtr) + } + syscall.Close(r.fd) +} + +// ----------------------------------------------------------------------- +// SQE helpers +// ----------------------------------------------------------------------- + +func (r *IoUring) getSqeAt(idx uint32) *ioUringSqe { + return (*ioUringSqe)(unsafe.Add(r.sqesBase, uintptr(idx)*unsafe.Sizeof(ioUringSqe{}))) +} + +func (r *IoUring) getCqeAt(idx uint32) *ioUringCqe { + return (*ioUringCqe)(unsafe.Add(r.cqesBase, uintptr(idx)*unsafe.Sizeof(ioUringCqe{}))) +} + +func (r *IoUring) sqArrayAt(idx uint32) *uint32 { + return (*uint32)(unsafe.Add(r.sqArray, uintptr(idx)*4)) +} + +// getSqe returns the next available SQE, or nil if the SQ is full. 
+func (r *IoUring) getSqe() *ioUringSqe { + head := atomic.LoadUint32(r.sqHead) + next := r.sqeTail + 1 + if next-head > r.sqEntries { + return nil // SQ full + } + sqe := r.getSqeAt(r.sqeTail & r.sqMask) + r.sqeTail++ + // Zero out the SQE + *sqe = ioUringSqe{} + return sqe +} + +// flushSq flushes locally queued SQEs into the kernel-visible SQ ring. +func (r *IoUring) flushSq() uint32 { + tail := *r.sqTail + toSubmit := r.sqeTail - r.sqeHead + if toSubmit == 0 { + return tail - atomic.LoadUint32(r.sqHead) + } + for ; toSubmit > 0; toSubmit-- { + *r.sqArrayAt(tail & r.sqMask) = r.sqeHead & r.sqMask + tail++ + r.sqeHead++ + } + atomic.StoreUint32(r.sqTail, tail) + return tail - atomic.LoadUint32(r.sqHead) +} + +// ----------------------------------------------------------------------- +// Submission and completion +// ----------------------------------------------------------------------- + +func ioUringEnter(fd int, toSubmit, minComplete, flags uint32) (int, error) { + ret, _, errno := syscall.Syscall6(sysIOUringEnter, + uintptr(fd), uintptr(toSubmit), uintptr(minComplete), uintptr(flags), 0, 0) + if errno != 0 { + return int(ret), errno + } + return int(ret), nil +} + +// submit flushes SQEs and calls io_uring_enter if needed. +// Retries automatically on EINTR (signal interruption). 
+func (r *IoUring) submit(waitNr uint32) (int, error) { + submitted := r.flushSq() + var flags uint32 = 0 + + // If not using SQPOLL, we always need to enter + if r.flags&iouringSetupSQPoll == 0 { + if waitNr > 0 { + flags |= iouringEnterGetEvents + } + for { + ret, err := ioUringEnter(r.fd, submitted, waitNr, flags) + if err == syscall.EINTR { + continue + } + return ret, err + } + } + + // SQPOLL: only enter if kernel thread needs wakeup + if atomic.LoadUint32(r.sqFlags)&iouringSQNeedWakeup != 0 { + flags |= iouringEnterSQWakeup + } + if waitNr > 0 { + flags |= iouringEnterGetEvents + } + if flags != 0 { + for { + ret, err := ioUringEnter(r.fd, submitted, waitNr, flags) + if err == syscall.EINTR { + continue + } + return ret, err + } + } + return int(submitted), nil +} + +// waitCqe waits for at least one CQE to be available and returns it. +// The caller MUST call SeenCqe after processing. +func (r *IoUring) waitCqe() (*ioUringCqe, error) { + for { + head := atomic.LoadUint32(r.cqHead) + tail := atomic.LoadUint32(r.cqTail) + if head != tail { + cqe := r.getCqeAt(head & r.cqMask) + return cqe, nil + } + // No CQE available, ask the kernel + _, err := ioUringEnter(r.fd, 0, 1, iouringEnterGetEvents) + if err != nil { + if err == syscall.EINTR { + continue // signal interrupted the syscall; retry + } + return nil, err + } + } +} + +// seenCqe advances the CQ head by 1, releasing the CQE slot. 
+func (r *IoUring) seenCqe() { + atomic.StoreUint32(r.cqHead, atomic.LoadUint32(r.cqHead)+1) +} + +// ----------------------------------------------------------------------- +// PrepRead / PrepWrite helpers +// ----------------------------------------------------------------------- + +func prepRead(sqe *ioUringSqe, fd int, buf []byte, offset uint64) { + if len(buf) == 0 { + sqe.Opcode = iouringOpNop + return + } + sqe.Opcode = iouringOpRead + sqe.Fd = int32(fd) + sqe.Addr = uint64(uintptr(unsafe.Pointer(&buf[0]))) + sqe.Len = uint32(len(buf)) + sqe.Off = offset +} + +func prepWrite(sqe *ioUringSqe, fd int, buf []byte, offset uint64) { + if len(buf) == 0 { + sqe.Opcode = iouringOpNop + return + } + sqe.Opcode = iouringOpWrite + sqe.Fd = int32(fd) + sqe.Addr = uint64(uintptr(unsafe.Pointer(&buf[0]))) + sqe.Len = uint32(len(buf)) + sqe.Off = offset +} + +// ----------------------------------------------------------------------- +// High-level thread-safe API +// ----------------------------------------------------------------------- + +// SubmitRead submits a pread and waits for completion. Thread-safe. +// Returns bytes read or an error. 
+func (r *IoUring) SubmitRead(fd int, buf []byte, offset uint64) (int, error) { + if len(buf) == 0 { + return 0, nil + } + + r.mu.Lock() + + sqe := r.getSqe() + if sqe == nil { + r.mu.Unlock() + return 0, fmt.Errorf("io_uring: SQ full, no SQE available") + } + prepRead(sqe, fd, buf, offset) + // Tag the SQE so we can verify the CQE belongs to this request + sqe.UserData = offset + + submitted, err := r.submit(1) + if err != nil { + r.mu.Unlock() + return 0, fmt.Errorf("io_uring_enter failed: %w", err) + } + + cqe, err := r.waitCqe() + if err != nil { + r.mu.Unlock() + return 0, fmt.Errorf("io_uring wait cqe: %w", err) + } + + res := cqe.Res + userData := cqe.UserData + cqeFlags := cqe.Flags + r.seenCqe() + r.mu.Unlock() + + if res < 0 { + return 0, fmt.Errorf("io_uring pread errno %d (%s), fd=%d off=%d len=%d submitted=%d ud=%d", + -res, syscall.Errno(-res), fd, offset, len(buf), submitted, userData) + } + + // Diagnostic: if io_uring returned 0 (EOF) or short read, compare with syscall.Pread + if r.debugCount < 20 && int(res) != len(buf) { + r.debugCount++ + pn, perr := syscall.Pread(fd, buf, int64(offset)) + // Also stat the fd to check file size + var stat syscall.Stat_t + fstatErr := syscall.Fstat(fd, &stat) + var fsize int64 + if fstatErr == nil { + fsize = stat.Size + } + fmt.Printf("[io_uring diag] fd=%d off=%d len=%d uring_res=%d uring_ud=%d uring_flags=%d "+ + "submitted=%d pread_n=%d pread_err=%v filesize=%d fstat_err=%v sqeHead=%d sqeTail=%d\n", + fd, offset, len(buf), res, userData, cqeFlags, + submitted, pn, perr, fsize, fstatErr, r.sqeHead, r.sqeTail) + } + + return int(res), nil +} + +// SubmitWriteBatch submits N pwrite operations in a single io_uring_enter call +// and waits for all completions. Thread-safe. +// Returns per-chunk bytes written. On error, partial results may be returned. 
+func (r *IoUring) SubmitWriteBatch(fd int, bufs [][]byte, offsets []uint64) ([]int, error) { + n := len(bufs) + if n == 0 { + return nil, nil + } + + r.mu.Lock() + defer r.mu.Unlock() + + // Prepare all SQEs + for i := 0; i < n; i++ { + sqe := r.getSqe() + if sqe == nil { + return nil, fmt.Errorf("io_uring: SQ full, need %d slots but ring has %d", n, r.sqEntries) + } + prepWrite(sqe, fd, bufs[i], offsets[i]) + sqe.UserData = uint64(i) + } + + // Submit all at once; kernel waits for all completions + _, err := r.submit(uint32(n)) + if err != nil { + return nil, fmt.Errorf("io_uring_enter: %w", err) + } + + var startTime time.Time + if metrics.Enabled() { + startTime = time.Now() + } + + // Drain all CQEs (order may differ from submission) + results := make([]int, n) + for i := 0; i < n; i++ { + cqe, err := r.waitCqe() + if err != nil { + return results, fmt.Errorf("io_uring waitCqe: %w", err) + } + idx := int(cqe.UserData) + res := cqe.Res + r.seenCqe() + + if res < 0 { + return results, fmt.Errorf("io_uring pwrite errno %d (%s), fd=%d off=%d len=%d", + -res, syscall.Errno(-res), fd, offsets[idx], len(bufs[idx])) + } + if idx >= 0 && idx < n { + results[idx] = int(res) + } + + if metrics.Enabled() { + metrics.Timing(metrics.KEY_PWRITE_LATENCY, time.Since(startTime), []string{}) + } + } + + return results, nil +} + +// SubmitWrite submits a pwrite and waits for completion. Thread-safe. +// Returns bytes written or an error. 
+func (r *IoUring) SubmitWrite(fd int, buf []byte, offset uint64) (int, error) { + if len(buf) == 0 { + return 0, nil + } + + r.mu.Lock() + + sqe := r.getSqe() + if sqe == nil { + r.mu.Unlock() + return 0, fmt.Errorf("io_uring: SQ full, no SQE available") + } + prepWrite(sqe, fd, buf, offset) + + _, err := r.submit(1) + if err != nil { + r.mu.Unlock() + return 0, fmt.Errorf("io_uring_enter failed: %w", err) + } + + cqe, err := r.waitCqe() + if err != nil { + r.mu.Unlock() + return 0, fmt.Errorf("io_uring wait cqe: %w", err) + } + + res := cqe.Res + r.seenCqe() + r.mu.Unlock() + + if res < 0 { + return 0, fmt.Errorf("io_uring pwrite failed: errno %d (%s)", -res, syscall.Errno(-res)) + } + return int(res), nil +} diff --git a/flashring/internal/fs/iouring_test.go b/flashring/internal/fs/iouring_test.go new file mode 100644 index 00000000..37f1cfa7 --- /dev/null +++ b/flashring/internal/fs/iouring_test.go @@ -0,0 +1,103 @@ +//go:build linux +// +build linux + +package fs + +import ( + "os" + "syscall" + "testing" + "unsafe" +) + +func TestIoUringBasicRead(t *testing.T) { + // 1. Create a temp file with known data + f, err := os.CreateTemp("", "iouring_test_*") + if err != nil { + t.Fatal(err) + } + defer os.Remove(f.Name()) + + data := make([]byte, 4096) + for i := range data { + data[i] = byte(i % 251) // non-zero pattern + } + if _, err := f.Write(data); err != nil { + t.Fatal(err) + } + if err := f.Sync(); err != nil { + t.Fatal(err) + } + f.Close() + + // 2. Open with O_DIRECT | O_RDONLY + fd, err := syscall.Open(f.Name(), syscall.O_RDONLY|syscall.O_DIRECT, 0) + if err != nil { + t.Fatalf("open O_DIRECT: %v", err) + } + defer syscall.Close(fd) + + // 3. Create io_uring ring + ring, err := NewIoUring(32, 0) + if err != nil { + t.Fatalf("NewIoUring: %v", err) + } + defer ring.Close() + + // 4. Allocate aligned buffer + buf := AlignedBlock(4096, 4096) + + // 5. 
Submit read via io_uring + n, err := ring.SubmitRead(fd, buf, 0) + if err != nil { + t.Fatalf("SubmitRead: %v", err) + } + if n != 4096 { + t.Fatalf("SubmitRead returned %d bytes, expected 4096", n) + } + + // 6. Verify data + for i := 0; i < 4096; i++ { + if buf[i] != data[i] { + t.Fatalf("data mismatch at byte %d: got %d, want %d", i, buf[i], data[i]) + } + } + t.Logf("io_uring read of 4096 bytes succeeded and data matches") + + // 7. Test a second read (to verify ring reuse works) + buf2 := AlignedBlock(4096, 4096) + n2, err := ring.SubmitRead(fd, buf2, 0) + if err != nil { + t.Fatalf("SubmitRead #2: %v", err) + } + if n2 != 4096 { + t.Fatalf("SubmitRead #2 returned %d bytes, expected 4096", n2) + } + for i := 0; i < 4096; i++ { + if buf2[i] != data[i] { + t.Fatalf("data mismatch #2 at byte %d: got %d, want %d", i, buf2[i], data[i]) + } + } + t.Logf("io_uring second read also succeeded") + + // 8. Test multiple sequential reads to exercise ring cycling + for iter := 0; iter < 100; iter++ { + buf3 := AlignedBlock(4096, 4096) + n3, err := ring.SubmitRead(fd, buf3, 0) + if err != nil { + t.Fatalf("SubmitRead iter %d: %v", iter, err) + } + if n3 != 4096 { + t.Fatalf("SubmitRead iter %d returned %d bytes, expected 4096", iter, n3) + } + } + t.Logf("100 sequential io_uring reads succeeded") +} + +// AlignedBlock returns a 4096-byte-aligned buffer. +func AlignedBlock(size, alignment int) []byte { + raw := make([]byte, size+alignment) + addr := uintptr(unsafe.Pointer(&raw[0])) + off := (alignment - int(addr%uintptr(alignment))) % alignment + return raw[off : off+size] +} diff --git a/flashring/internal/fs/iouring_wrapper.go b/flashring/internal/fs/iouring_wrapper.go new file mode 100644 index 00000000..b059e4ed --- /dev/null +++ b/flashring/internal/fs/iouring_wrapper.go @@ -0,0 +1,40 @@ +//go:build linux +// +build linux + +package fs + +import ( + "fmt" +) + +// IOUringFile wraps an existing WrapAppendFile with an io_uring ring for async I/O. 
+// It does NOT own the WrapAppendFile -- the caller manages its lifecycle. +type IOUringFile struct { + *WrapAppendFile // embed existing file (shared, not owned) + ring *IoUring // our raw io_uring instance + depth uint32 // submission queue depth +} + +// NewIOUringFile attaches an io_uring ring to an existing WrapAppendFile. +// The WrapAppendFile is shared (not duplicated) -- writes and reads use +// the same file descriptors, so offset tracking stays in sync. +// ringDepth controls the SQ/CQ size (64-256 is a good starting point). +// flags can be 0 for normal mode. +func NewIOUringFile(waf *WrapAppendFile, ringDepth uint32, flags uint32) (*IOUringFile, error) { + ring, err := NewIoUring(ringDepth, flags) + if err != nil { + return nil, fmt.Errorf("io_uring init failed: %w", err) + } + + return &IOUringFile{ + WrapAppendFile: waf, + ring: ring, + depth: ringDepth, + }, nil +} + +// Close releases only the io_uring ring. The underlying WrapAppendFile +// is NOT closed here since it is shared with the shard. 
+func (f *IOUringFile) Close() { + f.ring.Close() +} diff --git a/flashring/internal/fs/wrap_file.go b/flashring/internal/fs/wrap_file.go index fc91e006..3ef52fa8 100644 --- a/flashring/internal/fs/wrap_file.go +++ b/flashring/internal/fs/wrap_file.go @@ -6,7 +6,9 @@ package fs import ( "os" "syscall" + "time" + "github.com/Meesho/BharatMLStack/flashring/pkg/metrics" "golang.org/x/sys/unix" ) @@ -25,6 +27,7 @@ type WrapAppendFile struct { WriteFile *os.File // write file ReadFile *os.File // read file Stat *Stat // file statistics + WriteRing *IoUring // optional io_uring ring for batched writes } func NewWrapAppendFile(config FileConfig) (*WrapAppendFile, error) { @@ -72,20 +75,96 @@ func (r *WrapAppendFile) Pwrite(buf []byte) (currentPhysicalOffset int64, err er return 0, ErrBufNoAlign } } + var startTime time.Time + if metrics.Enabled() { + startTime = time.Now() + } n, err := syscall.Pwrite(r.WriteFd, buf, r.PhysicalWriteOffset) + if metrics.Enabled() { + metrics.Timing(metrics.KEY_PWRITE_LATENCY, time.Since(startTime), []string{}) + } if err != nil { return 0, err } + r.PhysicalWriteOffset += int64(n) if r.PhysicalWriteOffset >= r.MaxFileSize { r.wrapped = true r.PhysicalWriteOffset = r.PhysicalStartOffset } r.LogicalCurrentOffset += int64(n) - r.Stat.WriteCount++ + return r.PhysicalWriteOffset, nil } +// PwriteBatch writes a large buffer in chunkSize pieces via io_uring. +// Chunks are submitted in sub-batches that fit within the ring's SQ depth, +// so arbitrarily large buffers work regardless of ring size. +// Returns total bytes written and the final PhysicalWriteOffset. +// Requires WriteRing to be set; falls back to sequential Pwrite if nil. 
+func (r *WrapAppendFile) PwriteBatch(buf []byte, chunkSize int) (totalWritten int, fileOffset int64, err error) { + if r.WriteRing == nil { + // Fallback: sequential pwrite + for written := 0; written < len(buf); written += chunkSize { + end := written + chunkSize + if end > len(buf) { + end = len(buf) + } + fileOffset, err = r.Pwrite(buf[written:end]) + if err != nil { + return written, fileOffset, err + } + totalWritten += end - written + } + return totalWritten, fileOffset, nil + } + + if r.WriteDirectIO { + if !isAlignedBuffer(buf, r.blockSize) { + return 0, 0, ErrBufNoAlign + } + } + + // Maximum SQEs per submission -- capped to ring depth. + maxPerBatch := int(r.WriteRing.sqEntries) + + for written := 0; written < len(buf); { + // Build a sub-batch that fits within the ring + var bufs [][]byte + var offsets []uint64 + + for i := 0; i < maxPerBatch && written < len(buf); i++ { + end := written + chunkSize + if end > len(buf) { + end = len(buf) + } + bufs = append(bufs, buf[written:end]) + offsets = append(offsets, uint64(r.PhysicalWriteOffset)) + + // Advance write offset, handle ring-buffer wrap + r.PhysicalWriteOffset += int64(end - written) + if r.PhysicalWriteOffset >= r.MaxFileSize { + r.wrapped = true + r.PhysicalWriteOffset = r.PhysicalStartOffset + } + written = end + } + + results, serr := r.WriteRing.SubmitWriteBatch(r.WriteFd, bufs, offsets) + if serr != nil { + return totalWritten, r.PhysicalWriteOffset, serr + } + + for _, n := range results { + totalWritten += n + r.LogicalCurrentOffset += int64(n) + r.Stat.WriteCount++ + } + } + + return totalWritten, r.PhysicalWriteOffset, nil +} + func (r *WrapAppendFile) TrimHeadIfNeeded() bool { if r.wrapped && r.PhysicalWriteOffset == r.PhysicalStartOffset { return true @@ -126,7 +205,14 @@ func (r *WrapAppendFile) Pread(fileOffset int64, buf []byte) (int32, error) { return 0, ErrFileOffsetOutOfRange } + var startTime time.Time + if metrics.Enabled() { + startTime = time.Now() + } n, err := 
syscall.Pread(r.ReadFd, buf, fileOffset) + if metrics.Enabled() { + metrics.Timing(metrics.KEY_PREAD_LATENCY, time.Since(startTime), []string{}) + } // flags := unix.RWF_HIPRI // optionally: | unix.RWF_NOWAIT // n, err := preadv2(r.ReadFd, buf, fileOffset, flags) if err != nil { @@ -136,7 +222,97 @@ func (r *WrapAppendFile) Pread(fileOffset int64, buf []byte) (int32, error) { return int32(n), nil } +// ValidateReadOffset checks the read window and wraps the offset for ring-buffer +// files. Returns the physical file offset to use, or an error. +// Mirrors the validation logic in PreadAsync / Pread so callers that bypass +// PreadAsync (e.g. the batched io_uring path) get identical safety checks. +func (r *WrapAppendFile) ValidateReadOffset(fileOffset int64, bufLen int) (int64, error) { + if r.ReadDirectIO { + if !isAlignedOffset(fileOffset, r.blockSize) { + return 0, ErrOffsetNotAligned + } + } + + readEnd := fileOffset + int64(bufLen) + valid := false + + if !r.wrapped { + valid = fileOffset >= r.PhysicalStartOffset && readEnd <= r.PhysicalWriteOffset + } else { + fileOffset = fileOffset % r.MaxFileSize + readEnd = readEnd % r.MaxFileSize + if fileOffset >= r.PhysicalStartOffset { + valid = readEnd <= r.MaxFileSize + } else { + valid = readEnd <= r.PhysicalWriteOffset + } + } + if !valid { + return 0, ErrFileOffsetOutOfRange + } + + return fileOffset, nil +} + +// PreadAsync submits a pread via io_uring and waits for completion. +// Thread-safe: multiple goroutines can call this concurrently on the same IOUringFile. +// Applies the same read-window validation and offset wrapping as Pread so that +// stale index entries (pointing past MaxFileSize) are rejected cheaply without +// hitting the kernel. 
+func (f *IOUringFile) PreadAsync(fileOffset int64, buf []byte) (int, error) { + if f.ReadDirectIO { + if !isAlignedOffset(fileOffset, f.blockSize) { + return 0, ErrOffsetNotAligned + } + if !isAlignedBuffer(buf, f.blockSize) { + return 0, ErrBufNoAlign + } + } + + // Validate read window and wrap offset (mirrors Pread logic exactly) + readEnd := fileOffset + int64(len(buf)) + valid := false + + if !f.wrapped { + // Single valid region: [PhysicalStartOffset, PhysicalWriteOffset) + valid = fileOffset >= f.PhysicalStartOffset && readEnd <= f.PhysicalWriteOffset + } else { + // Ring buffer has wrapped -- map the logical offset back into [0, MaxFileSize) + fileOffset = fileOffset % f.MaxFileSize + readEnd = readEnd % f.MaxFileSize + if fileOffset >= f.PhysicalStartOffset { + valid = readEnd <= f.MaxFileSize + } else { + valid = readEnd <= f.PhysicalWriteOffset + } + } + if !valid { + return 0, ErrFileOffsetOutOfRange + } + + var startTime time.Time + if metrics.Enabled() { + startTime = time.Now() + } + n, err := f.ring.SubmitRead(f.ReadFd, buf, uint64(fileOffset)) + if metrics.Enabled() { + metrics.Incr(metrics.KEY_PREAD_COUNT, []string{}) + metrics.Timing(metrics.KEY_PREAD_LATENCY, time.Since(startTime), []string{}) + } + if err != nil { + return 0, err + } + + f.Stat.ReadCount++ + return n, nil +} + func (r *WrapAppendFile) TrimHead() (err error) { + + var startTime time.Time + if metrics.Enabled() { + startTime = time.Now() + } if r.WriteDirectIO { if !isAlignedOffset(r.PhysicalStartOffset, r.blockSize) { return ErrOffsetNotAligned @@ -150,7 +326,10 @@ func (r *WrapAppendFile) TrimHead() (err error) { if r.PhysicalStartOffset >= r.MaxFileSize { r.PhysicalStartOffset = 0 } - r.Stat.PunchHoleCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_PUNCH_HOLE_COUNT, []string{}) + metrics.Timing(metrics.KEY_TRIM_HEAD_LATENCY, time.Since(startTime), []string{}) + } return nil } diff --git a/flashring/internal/indicesV3/delete_manager.go 
b/flashring/internal/indicesV3/delete_manager.go index 6b218915..c6e632db 100644 --- a/flashring/internal/indicesV3/delete_manager.go +++ b/flashring/internal/indicesV3/delete_manager.go @@ -1,6 +1,7 @@ package indicesv2 import ( + "errors" "fmt" "github.com/Meesho/BharatMLStack/flashring/internal/fs" @@ -62,6 +63,9 @@ func (dm *DeleteManager) ExecuteDeleteIfNeeded() error { if trimNeeded || nextAddNeedsDelete { dm.deleteInProgress = true dm.deleteCount = int(dm.memtableData[dm.toBeDeletedMemId] / dm.deleteAmortizedStep) + if dm.deleteCount == 0 { + dm.deleteCount = int(dm.memtableData[dm.toBeDeletedMemId] % dm.deleteAmortizedStep) + } memIdAtHead, err := dm.keyIndex.PeekMemIdAtHead() if err != nil { return err @@ -69,8 +73,9 @@ func (dm *DeleteManager) ExecuteDeleteIfNeeded() error { if memIdAtHead != dm.toBeDeletedMemId { return fmt.Errorf("memIdAtHead: %d, toBeDeletedMemId: %d", memIdAtHead, dm.toBeDeletedMemId) } + dm.wrapFile.TrimHead() - return nil + return errors.New("trim needed retry this write") } return nil } diff --git a/flashring/internal/indicesV3/index.go b/flashring/internal/indicesV3/index.go index 29261585..aa4b3556 100644 --- a/flashring/internal/indicesV3/index.go +++ b/flashring/internal/indicesV3/index.go @@ -7,7 +7,6 @@ import ( "github.com/Meesho/BharatMLStack/flashring/internal/maths" "github.com/cespare/xxhash/v2" - "github.com/rs/zerolog/log" "github.com/zeebo/xxh3" ) @@ -22,20 +21,22 @@ const ( ) type Index struct { - rm sync.Map + mu *sync.RWMutex + rm map[uint64]int rb *RingBuffer mc *maths.MorrisLogCounter startAt int64 hashBits int } -func NewIndex(hashBits int, rbInitial, rbMax, deleteAmortizedStep int) *Index { +func NewIndex(hashBits int, rbInitial, rbMax, deleteAmortizedStep int, mu *sync.RWMutex) *Index { if ByteOrder == nil { loadByteOrder() } // rm := make(map[uint64]int) return &Index{ - rm: sync.Map{}, + mu: mu, + rm: make(map[uint64]int), rb: NewRingBuffer(rbInitial, rbMax), mc: maths.New(12), startAt: time.Now().Unix(), @@ 
-52,15 +53,15 @@ func (i *Index) Put(key string, length, ttlInMinutes uint16, memId, offset uint3 delta := uint16(expiryAt - (i.startAt / 60)) encode(key, length, delta, lastAccess, freq, memId, offset, entry) - if headIdx, ok := i.rm.Load(hlo); !ok { + if headIdx, ok := i.rm[hlo]; !ok { encodeHashNextPrev(hhi, hlo, -1, -1, hashNextPrev) - i.rm.Store(hlo, idx) + i.rm[hlo] = idx return } else { - _, headHashNextPrev, _ := i.rb.Get(int(headIdx.(int))) + _, headHashNextPrev, _ := i.rb.Get(int(headIdx)) encodeUpdatePrev(int32(idx), headHashNextPrev) - encodeHashNextPrev(hhi, hlo, -1, int32(headIdx.(int)), hashNextPrev) - i.rm.Store(hlo, idx) + encodeHashNextPrev(hhi, hlo, -1, int32(headIdx), hashNextPrev) + i.rm[hlo] = idx return } @@ -68,9 +69,14 @@ func (i *Index) Put(key string, length, ttlInMinutes uint16, memId, offset uint3 func (i *Index) Get(key string) (length, lastAccess, remainingTTL uint16, freq uint64, memId, offset uint32, status Status) { hhi, hlo := hash128(key) - if idx, ok := i.rm.Load(hlo); ok { - entry, hashNextPrev, _ := i.rb.Get(int(idx.(int))) + + i.mu.RLock() + idx, ok := i.rm[hlo] + i.mu.RUnlock() + + if ok { for { + entry, hashNextPrev, _ := i.rb.Get(int(idx)) if isHashMatch(hhi, hlo, hashNextPrev) { length, deltaExptime, lastAccess, freq, memId, offset := decode(entry) exptime := int(deltaExptime) + int(i.startAt/60) @@ -96,6 +102,9 @@ func (i *Index) Get(key string) (length, lastAccess, remainingTTL uint16, freq u } func (ix *Index) Delete(count int) (uint32, int) { + if count == 0 { + return 0, 0 + } for i := 0; i < count; i++ { deleted, deletedHashNextPrev, deletedIdx, next := ix.rb.Delete() if deleted == nil { @@ -103,15 +112,15 @@ func (ix *Index) Delete(count int) (uint32, int) { } delMemId, _ := decodeMemIdOffset(deleted) deletedHlo := decodeHashLo(deletedHashNextPrev) - mapIdx, ok := ix.rm.Load(deletedHlo) - if ok && mapIdx.(int) == deletedIdx { - ix.rm.Delete(deletedHlo) + mapIdx, ok := ix.rm[deletedHlo] + if ok && mapIdx == 
deletedIdx { + delete(ix.rm, deletedHlo) } else if ok && hasPrev(deletedHashNextPrev) { prevIdx := decodePrev(deletedHashNextPrev) _, hashNextPrev, _ := ix.rb.Get(int(prevIdx)) encodeUpdateNext(-1, hashNextPrev) } else { - log.Warn().Msgf("broken link. Entry in RB but cannot be linked to map. deletedIdx: %d", deletedIdx) + //log.Warn().Msgf("broken link. Entry in RB but cannot be linked to map. deletedIdx: %d", deletedIdx) } nextMemId, _ := decodeMemIdOffset(next) diff --git a/flashring/internal/indicesV3/index_test.go b/flashring/internal/indicesV3/index_test.go index 3eecea9d..fe4ca081 100644 --- a/flashring/internal/indicesV3/index_test.go +++ b/flashring/internal/indicesV3/index_test.go @@ -2,17 +2,19 @@ package indicesv2 import ( "fmt" + "sync" "testing" ) func TestIndexAddRbMax(t *testing.T) { loadByteOrder() + mu := &sync.RWMutex{} // Use equal initial and max capacity for the fixed-size ring buffer. rbMax := 1000_000 rbInitial := rbMax hashBits := 16 - idx := NewIndex(hashBits, rbInitial, rbMax, 1) + idx := NewIndex(hashBits, rbInitial, rbMax, 1, mu) // Insert exactly rbMax distinct keys for i := 0; i < rbMax; i++ { @@ -64,7 +66,7 @@ func TestIndexDeleteAndGet(t *testing.T) { rbMax := 99 rbInitial := rbMax hashBits := 16 - idx := NewIndex(hashBits, rbInitial, rbMax, 1) + idx := NewIndex(hashBits, rbInitial, rbMax, 1, nil) // Insert exactly rbMax distinct keys in order for i := 0; i < 33; i++ { @@ -137,11 +139,13 @@ func TestIndexDeleteAndGet(t *testing.T) { func TestIndexDeleteAndGetOverlappingHash(t *testing.T) { loadByteOrder() + mu := &sync.RWMutex{} + // Keep this small and fast rbMax := 99 rbInitial := rbMax hashBits := 16 - idx := NewIndex(hashBits, rbInitial, rbMax, 1) + idx := NewIndex(hashBits, rbInitial, rbMax, 1, mu) // Insert exactly rbMax distinct keys in order for i := 0; i < 33; i++ { diff --git a/flashring/internal/maths/estimator.go b/flashring/internal/maths/estimator.go index f477d96e..154298e1 100644 --- 
a/flashring/internal/maths/estimator.go +++ b/flashring/internal/maths/estimator.go @@ -5,6 +5,8 @@ package maths import ( "math" "time" + + "github.com/rs/zerolog/log" ) const ( @@ -75,6 +77,7 @@ func (g *GridSearchEstimator) RecordHitRate(hitRate float64) { stat.HitRate = (stat.HitRate*float64(stat.Trials) + hitRate) / float64(stat.Trials+1) stat.Trials++ if stat.HitRate < g.bestHitRate*0.9 { + log.Error().Msgf("GridSearchRestarted: hitRate %v bestHitRate %v", stat.HitRate, g.bestHitRate) g.RestartGridSearch() } return @@ -130,6 +133,10 @@ func (g *GridSearchEstimator) GenerateRefinedGrid(base WeightTuple, steps int, d refined := make([]WeightTuple, 0, (2*steps+1)*(2*steps+1)) for i := -steps; i <= steps; i++ { for j := -steps; j <= steps; j++ { + + if i == 0 && j == 0 { + continue + } wf := base.WFreq + float64(i)*delta la := base.WLA + float64(j)*delta if math.Abs(wf-base.WFreq) < g.epsilon && math.Abs(la-base.WLA) < g.epsilon { diff --git a/flashring/internal/memtables/manager.go b/flashring/internal/memtables/manager.go index a86fb108..3c313017 100644 --- a/flashring/internal/memtables/manager.go +++ b/flashring/internal/memtables/manager.go @@ -3,6 +3,7 @@ package memtables import ( "github.com/Meesho/BharatMLStack/flashring/internal/allocators" "github.com/Meesho/BharatMLStack/flashring/internal/fs" + "github.com/Meesho/BharatMLStack/flashring/pkg/metrics" "github.com/rs/zerolog/log" ) @@ -16,11 +17,6 @@ type MemtableManager struct { nextFileOffset int64 nextId uint32 semaphore chan int - stats Stats -} - -type Stats struct { - Flushes int64 } func NewMemtableManager(file *fs.WrapAppendFile, capacity int32) (*MemtableManager, error) { @@ -62,7 +58,6 @@ func NewMemtableManager(file *fs.WrapAppendFile, capacity int32) (*MemtableManag nextFileOffset: 2 * int64(capacity), nextId: 2, semaphore: make(chan int, 1), - stats: Stats{}, } return memtableManager, nil } @@ -92,7 +87,9 @@ func (mm *MemtableManager) flushConsumer(memtable *Memtable) { memtable.Id = 
mm.nextId mm.nextId++ mm.nextFileOffset += int64(n) - mm.stats.Flushes++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_MEMTABLE_FLUSH_COUNT, append(metrics.GetShardTag(memtable.ShardIdx), metrics.GetMemtableTag(memtable.Id)...)) + } } func (mm *MemtableManager) Flush() error { diff --git a/flashring/internal/memtables/manager_bench_test.go b/flashring/internal/memtables/manager_bench_test.go index 28738185..c29c0e52 100644 --- a/flashring/internal/memtables/manager_bench_test.go +++ b/flashring/internal/memtables/manager_bench_test.go @@ -48,7 +48,7 @@ func Benchmark_Puts(b *testing.B) { } } - b.ReportMetric(float64(manager.stats.Flushes), "flushes") + // b.ReportMetric(float64(manager.stats.Flushes), "flushes") b.ReportMetric(float64(b.N*16*1024)/1024/1024, "MB/s") b.ReportAllocs() diff --git a/flashring/internal/memtables/memtable.go b/flashring/internal/memtables/memtable.go index bc92f0ff..3be40e4b 100644 --- a/flashring/internal/memtables/memtable.go +++ b/flashring/internal/memtables/memtable.go @@ -4,7 +4,6 @@ import ( "errors" "github.com/Meesho/BharatMLStack/flashring/internal/fs" - "github.com/rs/zerolog/log" ) var ( @@ -25,6 +24,7 @@ type Memtable struct { readyForFlush bool next *Memtable prev *Memtable + ShardIdx uint32 } type MemtableConfig struct { @@ -32,6 +32,7 @@ type MemtableConfig struct { id uint32 page *fs.AlignedPage file *fs.WrapAppendFile + shardIdx uint32 } func NewMemtable(config MemtableConfig) (*Memtable, error) { @@ -49,6 +50,7 @@ func NewMemtable(config MemtableConfig) (*Memtable, error) { } return &Memtable{ Id: config.id, + ShardIdx: config.shardIdx, capacity: config.capacity, currentOffset: 0, file: config.file, @@ -98,15 +100,23 @@ func (m *Memtable) Flush() (n int, fileOffset int64, err error) { if !m.readyForFlush { return 0, 0, ErrMemtableNotReadyForFlush } - fileOffset, err = m.file.Pwrite(m.page.Buf) + + chunkSize := fs.BLOCK_SIZE + numChunks := len(m.page.Buf) / chunkSize + if len(m.page.Buf)%chunkSize != 0 { + 
numChunks++ + } + + // PwriteBatch submits all chunks in one io_uring_enter when WriteRing is + // set, otherwise falls back to sequential pwrite internally. + totalWritten, fileOffset, err := m.file.PwriteBatch(m.page.Buf, chunkSize) if err != nil { return 0, 0, err - } else { - log.Debug().Msgf("Flushed memtable %d to file %d", m.Id, fileOffset) } + m.currentOffset = 0 m.readyForFlush = false - return len(m.page.Buf), fileOffset, nil + return totalWritten, fileOffset, nil } func (m *Memtable) Discard() { diff --git a/flashring/internal/pools/leaky_pool.go b/flashring/internal/pools/leaky_pool.go index b2a59487..afcd1b2e 100644 --- a/flashring/internal/pools/leaky_pool.go +++ b/flashring/internal/pools/leaky_pool.go @@ -11,7 +11,6 @@ type LeakyPool struct { usage int idx int lock sync.RWMutex - stats *Stats } type Stats struct { @@ -34,7 +33,6 @@ func NewLeakyPool(config LeakyPoolConfig) *LeakyPool { usage: 0, idx: -1, preDrefHook: nil, - stats: &Stats{Usage: 0, Capacity: config.Capacity}, } } diff --git a/flashring/internal/shard/batch_reader.go b/flashring/internal/shard/batch_reader.go index 3896834b..c6d462be 100644 --- a/flashring/internal/shard/batch_reader.go +++ b/flashring/internal/shard/batch_reader.go @@ -68,7 +68,6 @@ func (br *BatchReader) processBatches() { return case firstReq := <-br.requests: batch := br.collectBatch(firstReq) - br.shardCache.Stats.BatchTracker.RecordBatchSize(len(batch)) br.executeBatch(batch) } } diff --git a/flashring/internal/shard/batch_reader_v2.go b/flashring/internal/shard/batch_reader_v2.go index 2aa99b09..fb614321 100644 --- a/flashring/internal/shard/batch_reader_v2.go +++ b/flashring/internal/shard/batch_reader_v2.go @@ -94,7 +94,6 @@ func (br *BatchReaderV2) processBatchesV2() { return case firstReq := <-br.Requests: batch := br.collectBatchV2(firstReq) - br.shardCache.Stats.BatchTracker.RecordBatchSize(len(batch)) br.executeBatchV2(batch) } } diff --git a/flashring/internal/shard/batch_tracker.go 
b/flashring/internal/shard/batch_tracker.go deleted file mode 100644 index 5658d0e2..00000000 --- a/flashring/internal/shard/batch_tracker.go +++ /dev/null @@ -1,55 +0,0 @@ -package filecache - -import ( - "sort" - "sync" -) - -type BatchTracker struct { - mu sync.RWMutex - getBatch []int - maxSamples int - getIndex int -} - -// const defaultMaxSamples = 100000 - -func NewBatchTracker() *BatchTracker { - return &BatchTracker{ - getBatch: make([]int, defaultMaxSamples), - maxSamples: defaultMaxSamples, - } -} - -func (bt *BatchTracker) RecordBatchSize(batchSize int) { - bt.mu.Lock() - defer bt.mu.Unlock() - bt.getBatch[bt.getIndex] = batchSize - bt.getIndex = (bt.getIndex + 1) % bt.maxSamples -} - -func (bt *BatchTracker) GetBatchSizePercentiles() (p25, p50, p99 int) { - bt.mu.RLock() - defer bt.mu.RUnlock() - - samples := bt.getIndex - if samples > int(bt.maxSamples) { - samples = int(bt.maxSamples) - } - - if samples == 0 { - return 0, 0, 0 - } - - batchSizesCopy := make([]int, samples) - copy(batchSizesCopy, bt.getBatch[:samples]) - sort.Slice(batchSizesCopy, func(i, j int) bool { - return batchSizesCopy[i] < batchSizesCopy[j] - }) - - p25 = batchSizesCopy[int(float64(samples)*0.25)] - p50 = batchSizesCopy[int(float64(samples)*0.50)] - p99 = batchSizesCopy[int(float64(samples)*0.99)] - - return p25, p50, p99 -} diff --git a/flashring/internal/shard/latency_tracker.go b/flashring/internal/shard/latency_tracker.go deleted file mode 100644 index eeb109c8..00000000 --- a/flashring/internal/shard/latency_tracker.go +++ /dev/null @@ -1,96 +0,0 @@ -package filecache - -import ( - "sort" - "sync" - "time" -) - -type LatencyTracker struct { - mu sync.RWMutex - getLatencies []time.Duration - putLatencies []time.Duration - maxSamples int - getIndex int - putIndex int - getCount int64 - putCount int64 -} - -const defaultMaxSamples = 100000 - -func NewLatencyTracker() *LatencyTracker { - return &LatencyTracker{ - getLatencies: make([]time.Duration, defaultMaxSamples), - 
putLatencies: make([]time.Duration, defaultMaxSamples), - maxSamples: defaultMaxSamples, - } -} - -func (lt *LatencyTracker) RecordGet(duration time.Duration) { - lt.mu.Lock() - defer lt.mu.Unlock() - lt.getLatencies[lt.getIndex] = duration - lt.getIndex = (lt.getIndex + 1) % lt.maxSamples - lt.getCount++ -} - -func (lt *LatencyTracker) RecordPut(duration time.Duration) { - lt.mu.Lock() - defer lt.mu.Unlock() - lt.putLatencies[lt.putIndex] = duration - lt.putIndex = (lt.putIndex + 1) % lt.maxSamples - lt.putCount++ -} - -func (lt *LatencyTracker) GetLatencyPercentiles() (p25, p50, p99 time.Duration) { - lt.mu.RLock() - defer lt.mu.RUnlock() - - samples := lt.getCount - if samples > int64(lt.maxSamples) { - samples = int64(lt.maxSamples) - } - - if samples == 0 { - return 0, 0, 0 - } - - latenciesCopy := make([]time.Duration, samples) - copy(latenciesCopy, lt.getLatencies[:samples]) - sort.Slice(latenciesCopy, func(i, j int) bool { - return latenciesCopy[i] < latenciesCopy[j] - }) - - p25 = latenciesCopy[int(float64(samples)*0.25)] - p50 = latenciesCopy[int(float64(samples)*0.50)] - p99 = latenciesCopy[int(float64(samples)*0.99)] - - return p25, p50, p99 -} - -func (lt *LatencyTracker) PutLatencyPercentiles() (p25, p50, p99 time.Duration) { - lt.mu.RLock() - defer lt.mu.RUnlock() - - samples := lt.putCount - if samples > int64(lt.maxSamples) { - samples = int64(lt.maxSamples) - } - - if samples == 0 { - return 0, 0, 0 - } - - latenciesCopy := make([]time.Duration, samples) - copy(latenciesCopy, lt.putLatencies[:samples]) - sort.Slice(latenciesCopy, func(i, j int) bool { - return latenciesCopy[i] < latenciesCopy[j] - }) - - p25 = latenciesCopy[int(float64(samples)*0.25)] - p50 = latenciesCopy[int(float64(samples)*0.50)] - p99 = latenciesCopy[int(float64(samples)*0.99)] - - return p25, p50, p99 -} diff --git a/flashring/internal/shard/shard_cache.go b/flashring/internal/shard/shard_cache.go index 78e19deb..4796be8b 100644 --- a/flashring/internal/shard/shard_cache.go 
+++ b/flashring/internal/shard/shard_cache.go @@ -11,18 +11,20 @@ import ( indices "github.com/Meesho/BharatMLStack/flashring/internal/indicesV3" "github.com/Meesho/BharatMLStack/flashring/internal/maths" "github.com/Meesho/BharatMLStack/flashring/internal/memtables" + "github.com/Meesho/BharatMLStack/flashring/pkg/metrics" "github.com/rs/zerolog/log" ) type ShardCache struct { keyIndex *indices.Index file *fs.WrapAppendFile + ioFile *fs.IOUringFile + batchReader *fs.ParallelBatchIoUringReader // global batched io_uring reader (shared across shards) mm *memtables.MemtableManager readPageAllocator *allocators.SlabAlignedPageAllocator dm *indices.DeleteManager predictor *maths.Predictor startAt int64 - Stats *Stats //batching reads BatchReader *BatchReaderV2 @@ -30,21 +32,8 @@ type ShardCache struct { //Lockless read and write ReadCh chan *ReadRequestV2 WriteCh chan *WriteRequestV2 -} -type Stats struct { - KeyNotFoundCount int - KeyExpiredCount int - BadDataCount int - BadLengthCount int - BadCR32Count int - BadKeyCount int - MemIdCount map[uint32]int - LastDeletedMemId uint32 - DeletedKeyCount int - BadCRCMemIds map[uint32]int - BadKeyMemIds map[uint32]int - BatchTracker *BatchTracker + ShardIdx uint32 } type ShardCacheConfig struct { @@ -64,6 +53,16 @@ type ShardCacheConfig struct { EnableBatching bool BatchWindow time.Duration MaxBatchSize int + + //lockless + EnableLockless bool + + // Global batched io_uring reader (shared across all shards). + // When set, disk reads go through this instead of the per-shard IOUringFile. + BatchIoUringReader *fs.ParallelBatchIoUringReader + + // Dedicated io_uring ring for batched writes (shared across all shards). 
+ WriteRing *fs.IoUring } func NewShardCache(config ShardCacheConfig, sl *sync.RWMutex) *ShardCache { @@ -83,12 +82,12 @@ func NewShardCache(config ShardCacheConfig, sl *sync.RWMutex) *ShardCache { if err != nil { log.Panic().Err(err).Msg("Failed to create memtable manager") } - ki := indices.NewIndex(0, config.RbInitial, config.RbMax, config.DeleteAmortizedStep) + ki := indices.NewIndex(0, config.RbInitial, config.RbMax, config.DeleteAmortizedStep, sl) sizeClasses := make([]allocators.SizeClass, 0) i := fs.BLOCK_SIZE iMax := (1 << 16) for i < iMax { - sizeClasses = append(sizeClasses, allocators.SizeClass{Size: i, MinCount: 1000}) + sizeClasses = append(sizeClasses, allocators.SizeClass{Size: i, MinCount: 20}) i *= 2 } readPageAllocator, err := allocators.NewSlabAlignedPageAllocator(allocators.SlabAlignedPageAllocatorConfig{SizeClasses: sizeClasses}) @@ -96,6 +95,12 @@ func NewShardCache(config ShardCacheConfig, sl *sync.RWMutex) *ShardCache { log.Panic().Err(err).Msg("Failed to create read page allocator") } dm := indices.NewDeleteManager(ki, file, config.DeleteAmortizedStep) + + // Attach the dedicated write ring so memtable flushes use batched io_uring. + if config.WriteRing != nil { + file.WriteRing = config.WriteRing + } + sc := &ShardCache{ keyIndex: ki, mm: memtableManager, @@ -104,12 +109,18 @@ func NewShardCache(config ShardCacheConfig, sl *sync.RWMutex) *ShardCache { dm: dm, predictor: config.Predictor, startAt: time.Now().Unix(), - Stats: &Stats{ - MemIdCount: make(map[uint32]int), - BadCRCMemIds: make(map[uint32]int), - BadKeyMemIds: make(map[uint32]int), - BatchTracker: NewBatchTracker(), - }, + } + + if config.BatchIoUringReader != nil { + // Use the global batched io_uring reader (shared across all shards). + sc.batchReader = config.BatchIoUringReader + } else { + // Fallback: per-shard io_uring ring for backward compatibility. 
+ ioFile, err := fs.NewIOUringFile(file, 256, 0) + if err != nil { + log.Panic().Err(err).Msg("Failed to create io_uring file") + } + sc.ioFile = ioFile } // Initialize batch reader if enabled @@ -120,10 +131,13 @@ func NewShardCache(config ShardCacheConfig, sl *sync.RWMutex) *ShardCache { }, sc, sl) } - sc.ReadCh = make(chan *ReadRequestV2, 500) - sc.WriteCh = make(chan *WriteRequestV2, 500) + if config.EnableLockless { - go sc.startReadWriteRoutines() + sc.ReadCh = make(chan *ReadRequestV2, 500) + sc.WriteCh = make(chan *WriteRequestV2, 500) + + go sc.startReadWriteRoutines() + } return sc } @@ -163,19 +177,26 @@ func (fc *ShardCache) Put(key string, value []byte, ttlMinutes uint16) error { indices.ByteOrder.PutUint32(buf[0:4], crc) fc.keyIndex.Put(key, length, ttlMinutes, mtId, uint32(offset)) fc.dm.IncMemtableKeyCount(mtId) - fc.Stats.MemIdCount[mtId]++ return nil } func (fc *ShardCache) Get(key string) (bool, []byte, uint16, bool, bool) { length, lastAccess, remainingTTL, freq, memId, offset, status := fc.keyIndex.Get(key) if status == indices.StatusNotFound { - fc.Stats.KeyNotFoundCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_KEY_NOT_FOUND_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, false, false } + if metrics.Enabled() { + metrics.Timing(metrics.KEY_DATA_LENGTH, time.Duration(length), metrics.GetShardTag(fc.ShardIdx)) + } + if status == indices.StatusExpired { - fc.Stats.KeyExpiredCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_KEY_EXPIRED_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, true, false } @@ -190,32 +211,40 @@ func (fc *ShardCache) Get(key string) (bool, []byte, uint16, bool, bool) { memtableExists = false } if !memtableExists { - bufPtr := BufPool.Get().(*[]byte) - buf = *bufPtr - defer BufPool.Put(bufPtr) + if metrics.Enabled() { + metrics.Incr(metrics.KEY_MEMTABLE_MISS, metrics.GetShardTag(fc.ShardIdx)) + } + buf = make([]byte, length) fileOffset := 
uint64(memId)*uint64(fc.mm.Capacity) + uint64(offset) - n := fc.readFromDisk(int64(fileOffset), length, buf) + n := fc.readFromDiskAsync(int64(fileOffset), length, buf) if n != int(length) { - fc.Stats.BadLengthCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_LENGTH_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, false, shouldReWrite } } else { + if metrics.Enabled() { + metrics.Incr(metrics.KEY_MEMTABLE_HIT, metrics.GetShardTag(fc.ShardIdx)) + } buf, exists = mt.GetBufForRead(int(offset), length) if !exists { panic("memtable exists but buf not found") } } gotCR32 := indices.ByteOrder.Uint32(buf[0:4]) - computedCR32 := crc32.ChecksumIEEE(buf[4:]) + computedCR32 := crc32.ChecksumIEEE(buf[4:length]) gotKey := string(buf[4 : 4+len(key)]) if gotCR32 != computedCR32 { - fc.Stats.BadCR32Count++ - fc.Stats.BadCRCMemIds[memId]++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_CR32_COUNT, append(metrics.GetShardTag(fc.ShardIdx), metrics.GetMemtableTag(memId)...)) + } return false, nil, 0, false, shouldReWrite } if gotKey != key { - fc.Stats.BadKeyCount++ - fc.Stats.BadKeyMemIds[memId]++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_KEY_COUNT, append(metrics.GetShardTag(fc.ShardIdx), metrics.GetMemtableTag(memId)...)) + } return false, nil, 0, false, shouldReWrite } valLen := int(length) - 4 - len(key) @@ -228,12 +257,16 @@ func (fc *ShardCache) Get(key string) (bool, []byte, uint16, bool, bool) { func (fc *ShardCache) GetFastPath(key string) (bool, []byte, uint16, bool, bool) { length, lastAccess, remainingTTL, freq, memId, offset, status := fc.keyIndex.Get(key) if status == indices.StatusNotFound { - fc.Stats.KeyNotFoundCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_KEY_NOT_FOUND_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, false, false // needsSlowPath = false (not found) } if status == indices.StatusExpired { - fc.Stats.KeyExpiredCount++ + if metrics.Enabled() { + 
metrics.Incr(metrics.KEY_KEY_EXPIRED_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, true, false // needsSlowPath = false (expired) } @@ -254,18 +287,20 @@ func (fc *ShardCache) GetFastPath(key string) (bool, []byte, uint16, bool, bool) gotCR32 := indices.ByteOrder.Uint32(buf[0:4]) computedCR32 := crc32.ChecksumIEEE(buf[4:]) if gotCR32 != computedCR32 { - fc.Stats.BadCR32Count++ - fc.Stats.BadCRCMemIds[memId]++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_CR32_COUNT, append(metrics.GetShardTag(fc.ShardIdx), metrics.GetMemtableTag(memId)...)) + } _, currMemId, _ := fc.mm.GetMemtable() shouldReWrite := fc.predictor.Predict(uint64(freq), uint64(lastAccess), memId, currMemId) - _ = shouldReWrite // Not returning shouldReWrite in fast path for simplicity + _ = shouldReWrite return false, nil, 0, false, false } gotKey := string(buf[4 : 4+len(key)]) if gotKey != key { - fc.Stats.BadKeyCount++ - fc.Stats.BadKeyMemIds[memId]++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_KEY_COUNT, append(metrics.GetShardTag(fc.ShardIdx), metrics.GetMemtableTag(memId)...)) + } return false, nil, 0, false, false } @@ -278,12 +313,16 @@ func (fc *ShardCache) GetFastPath(key string) (bool, []byte, uint16, bool, bool) func (fc *ShardCache) GetSlowPath(key string) (bool, []byte, uint16, bool, bool) { length, lastAccess, remainingTTL, freq, memId, offset, status := fc.keyIndex.Get(key) if status == indices.StatusNotFound { - fc.Stats.KeyNotFoundCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_KEY_NOT_FOUND_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, false, false } if status == indices.StatusExpired { - fc.Stats.KeyExpiredCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_KEY_EXPIRED_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, true, false } @@ -293,7 +332,6 @@ func (fc *ShardCache) GetSlowPath(key string) (bool, []byte, uint16, bool, bool) // Check memtable again (might have changed since 
fast path check) mt := fc.mm.GetMemtableById(memId) if mt != nil { - // Data is now in memtable, use fast path logic buf, exists := mt.GetBufForRead(int(offset), length) if !exists { panic("memtable exists but buf not found") @@ -301,14 +339,13 @@ func (fc *ShardCache) GetSlowPath(key string) (bool, []byte, uint16, bool, bool) return fc.validateAndReturnBuffer(key, buf, length, memId, remainingTTL, shouldReWrite) } - // Read from disk - bufPtr := BufPool.Get().(*[]byte) - buf := *bufPtr - defer BufPool.Put(bufPtr) + buf := make([]byte, length) fileOffset := uint64(memId)*uint64(fc.mm.Capacity) + uint64(offset) n := fc.readFromDisk(int64(fileOffset), length, buf) if n != int(length) { - fc.Stats.BadLengthCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_LENGTH_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return false, nil, 0, false, shouldReWrite } @@ -318,17 +355,19 @@ func (fc *ShardCache) GetSlowPath(key string) (bool, []byte, uint16, bool, bool) // validateAndReturnBuffer validates CRC and key, then returns the value func (fc *ShardCache) validateAndReturnBuffer(key string, buf []byte, length uint16, memId uint32, remainingTTL uint16, shouldReWrite bool) (bool, []byte, uint16, bool, bool) { gotCR32 := indices.ByteOrder.Uint32(buf[0:4]) - computedCR32 := crc32.ChecksumIEEE(buf[4:]) + computedCR32 := crc32.ChecksumIEEE(buf[4:length]) if gotCR32 != computedCR32 { - fc.Stats.BadCR32Count++ - fc.Stats.BadCRCMemIds[memId]++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_CR32_COUNT, append(metrics.GetShardTag(fc.ShardIdx), metrics.GetMemtableTag(memId)...)) + } return false, nil, 0, false, shouldReWrite } gotKey := string(buf[4 : 4+len(key)]) if gotKey != key { - fc.Stats.BadKeyCount++ - fc.Stats.BadKeyMemIds[memId]++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_KEY_COUNT, append(metrics.GetShardTag(fc.ShardIdx), metrics.GetMemtableTag(memId)...)) + } return false, nil, 0, false, shouldReWrite } @@ -337,18 +376,68 @@ func (fc 
*ShardCache) validateAndReturnBuffer(key string, buf []byte, length uin } func (fc *ShardCache) readFromDisk(fileOffset int64, length uint16, buf []byte) int { + alignedStartOffset := (fileOffset / fs.BLOCK_SIZE) * fs.BLOCK_SIZE endndOffset := fileOffset + int64(length) endAlignedOffset := ((endndOffset + fs.BLOCK_SIZE - 1) / fs.BLOCK_SIZE) * fs.BLOCK_SIZE alignedReadSize := endAlignedOffset - alignedStartOffset + page := fc.readPageAllocator.Get(int(alignedReadSize)) + fc.file.Pread(alignedStartOffset, page.Buf) + start := int(fileOffset - alignedStartOffset) n := copy(buf, page.Buf[start:start+int(length)]) fc.readPageAllocator.Put(page) return n } +func (fc *ShardCache) readFromDiskAsync(fileOffset int64, length uint16, buf []byte) int { + alignedStartOffset := (fileOffset / fs.BLOCK_SIZE) * fs.BLOCK_SIZE + endndOffset := fileOffset + int64(length) + endAlignedOffset := ((endndOffset + fs.BLOCK_SIZE - 1) / fs.BLOCK_SIZE) * fs.BLOCK_SIZE + alignedReadSize := int(endAlignedOffset - alignedStartOffset) + page := fc.readPageAllocator.Get(alignedReadSize) + + // Use exactly alignedReadSize bytes, not the full page.Buf which may be + // larger due to slab allocator rounding to the next size class. + readBuf := page.Buf[:alignedReadSize] + + var n int + var err error + + if fc.batchReader != nil { + // Batched path: validate offset locally, then submit to the global + // io_uring batch reader which accumulates requests across all shards. + var validOffset int64 + validOffset, err = fc.file.ValidateReadOffset(alignedStartOffset, alignedReadSize) + if err == nil { + n, err = fc.batchReader.Submit(fc.file.ReadFd, readBuf, uint64(validOffset)) + } + } else { + // Per-shard io_uring fallback + n, err = fc.ioFile.PreadAsync(alignedStartOffset, readBuf) + } + + if err != nil || n != alignedReadSize { + // ErrFileOffsetOutOfRange is expected for stale index entries -- don't log. + if err != nil && err != fs.ErrFileOffsetOutOfRange { + log.Warn().Err(err). 
+ Int64("offset", alignedStartOffset). + Int("alignedReadSize", alignedReadSize). + Int("n", n). + Msg("io_uring pread failed") + } + fc.readPageAllocator.Put(page) + return 0 + } + + start := int(fileOffset - alignedStartOffset) + copied := copy(buf, page.Buf[start:start+int(length)]) + fc.readPageAllocator.Put(page) + return copied +} + func (fc *ShardCache) GetRingBufferActiveEntries() int { return fc.keyIndex.GetRB().ActiveEntries() } @@ -360,11 +449,15 @@ func (fc *ShardCache) processBuffer(key string, buf []byte, length uint16) ReadR gotKey := string(buf[4 : 4+len(key)]) if gotCR32 != computedCR32 { - fc.Stats.BadCR32Count++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_CR32_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return ReadResult{Found: false, Error: fmt.Errorf("crc mismatch")} } if gotKey != key { - fc.Stats.BadKeyCount++ + if metrics.Enabled() { + metrics.Incr(metrics.KEY_BAD_KEY_COUNT, metrics.GetShardTag(fc.ShardIdx)) + } return ReadResult{Found: false, Error: fmt.Errorf("key mismatch")} } diff --git a/flashring/pkg/cache/badger.go b/flashring/pkg/cache/badger.go new file mode 100644 index 00000000..859c4bac --- /dev/null +++ b/flashring/pkg/cache/badger.go @@ -0,0 +1,73 @@ +package internal + +import ( + "time" + + badger "github.com/dgraph-io/badger/v4" +) + +type Badger struct { + cache *badger.DB +} + +func NewBadger(config WrapCacheConfig, logStats bool) (*Badger, error) { + options := badger.DefaultOptions(config.MountPoint) + options.MetricsEnabled = false + + // 1. PRIMARY CACHE (1GB) + // This caches the data blocks themselves. + options.BlockCacheSize = 1024 << 20 + + // 2. INDEX CACHE (512MB) + // This keeps the keys and the structure of the LSM tree in RAM. + // This is the most critical setting for read latency. + options.IndexCacheSize = 512 << 20 + + // 3. WRITE BUFFERS (Memtables) + // We use 3 tables of 64MB each. This allows Badger to handle + // write spikes without blocking. 
(~192MB total) + options.NumMemtables = 40 + options.MemTableSize = 1024 << 20 + + options.ValueThreshold = 1024 + options.SyncWrites = false + + cache, err := badger.Open(options) + if err != nil { + return nil, err + } + bc := &Badger{ + cache: cache, + } + + return bc, nil +} + +func (b *Badger) Put(key string, value []byte, exptimeInMinutes uint16) error { + + err := b.cache.Update(func(txn *badger.Txn) error { + entry := badger.NewEntry([]byte(key), value).WithTTL(time.Duration(exptimeInMinutes) * time.Minute) + err := txn.SetEntry(entry) + return err + }) + return err +} + +func (b *Badger) Get(key string) ([]byte, bool, bool) { + + val := make([]byte, 0) + err := b.cache.View(func(txn *badger.Txn) error { + item, err := txn.Get([]byte(key)) + if err != nil { + return err + } + val, err = item.ValueCopy(val) + + return err + }) + return val, err != badger.ErrKeyNotFound, false +} + +func (b *Badger) Close() error { + return b.cache.Close() +} diff --git a/flashring/pkg/cache/cache.go b/flashring/pkg/cache/cache.go new file mode 100644 index 00000000..96324381 --- /dev/null +++ b/flashring/pkg/cache/cache.go @@ -0,0 +1,403 @@ +package internal + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "sync" + "time" + + "github.com/Meesho/BharatMLStack/flashring/internal/fs" + "github.com/Meesho/BharatMLStack/flashring/internal/maths" + filecache "github.com/Meesho/BharatMLStack/flashring/internal/shard" + "github.com/cespare/xxhash/v2" + "github.com/rs/zerolog/log" + + metrics "github.com/Meesho/BharatMLStack/flashring/pkg/metrics" +) + +/* + Each shard can keep 67M keys + With Round = 1, expected collision (67M)^2/(2*2^62) = 4.87×10^-4 +*/ + +const ( + ROUNDS = 1 + KEYS_PER_SHARD = (1 << 26) + BLOCK_SIZE = 4096 +) + +var ( + ErrNumShardLessThan1 = fmt.Errorf("num shards must be greater than 0") + ErrKeysPerShardLessThan1 = fmt.Errorf("keys per shard must be greater than 0") + ErrKeysPerShardGreaterThan67M = fmt.Errorf("keys per shard must be less than 67M") 
+ ErrMemtableSizeLessThan1 = fmt.Errorf("memtable size must be greater than 0") + ErrMemtableSizeGreaterThan1GB = fmt.Errorf("memtable size must be less than 1GB") + ErrMemtableSizeNotMultipleOf4KB = fmt.Errorf("memtable size must be a multiple of 4KB") + ErrFileSizeLessThan1 = fmt.Errorf("file size must be greater than 0") + ErrFileSizeNotMultipleOf4KB = fmt.Errorf("file size must be a multiple of 4KB") + Seed = xxhash.Sum64String(strconv.Itoa(int(time.Now().UnixNano()))) +) + +type WrapCache struct { + shards []*filecache.ShardCache + shardLocks []sync.RWMutex + predictor *maths.Predictor + batchReader *fs.ParallelBatchIoUringReader // global batched io_uring reader +} + +type WrapCacheConfig struct { + NumShards int + KeysPerShard int + FileSize int64 + MemtableSize int32 + ReWriteScoreThreshold float32 + GridSearchEpsilon float64 + SampleDuration time.Duration + + // Batching reads + EnableBatching bool + BatchWindowMicros int // in microseconds + MaxBatchSize int + + //lockless mode for PutLL/GetLL + EnableLockless bool + + //Badger + MountPoint string +} + +func NewWrapCache(config WrapCacheConfig, mountPoint string) (*WrapCache, error) { + if config.NumShards <= 0 { + return nil, ErrNumShardLessThan1 + } + if config.KeysPerShard <= 0 { + return nil, ErrKeysPerShardLessThan1 + } + if config.KeysPerShard > KEYS_PER_SHARD { + return nil, ErrKeysPerShardGreaterThan67M + } + if config.MemtableSize <= 0 { + return nil, ErrMemtableSizeLessThan1 + } + if config.MemtableSize > 1024*1024*1024 { + return nil, ErrMemtableSizeGreaterThan1GB + } + if config.MemtableSize%BLOCK_SIZE != 0 { + return nil, ErrMemtableSizeNotMultipleOf4KB + } + if config.FileSize <= 0 { + return nil, ErrFileSizeLessThan1 + } + if config.FileSize%BLOCK_SIZE != 0 { + return nil, ErrFileSizeNotMultipleOf4KB + } + + //clear existing data + files, err := os.ReadDir(mountPoint) + if err != nil { + log.Error().Err(err).Msg("Failed to read directory") + panic(err) + } + for _, file := range files { + 
os.Remove(filepath.Join(mountPoint, file.Name())) + } + + weights := []maths.WeightTuple{ + { + WFreq: 0.1, + WLA: 0.1, + }, + { + WFreq: 0.45, + WLA: 0.1, + }, + { + WFreq: 0.9, + WLA: 0.1, + }, + { + WFreq: 0.1, + WLA: 0.45, + }, + { + WFreq: 0.45, + WLA: 0.45, + }, + { + WFreq: 0.9, + WLA: 0.45, + }, + { + WFreq: 0.1, + WLA: 0.9, + }, + { + WFreq: 0.45, + WLA: 0.9, + }, + { + WFreq: 0.9, + WLA: 0.9, + }, + } + MaxMemTableCount := config.FileSize / int64(config.MemtableSize) + predictor := maths.NewPredictor(maths.PredictorConfig{ + ReWriteScoreThreshold: config.ReWriteScoreThreshold, + Weights: weights, + SampleDuration: config.SampleDuration, + MaxMemTableCount: uint32(MaxMemTableCount), + GridSearchEpsilon: config.GridSearchEpsilon, + }) + + // Create a single global batched io_uring reader shared across all shards. + // All disk reads funnel into one channel; the background goroutine collects + // them for up to 1ms and submits them in a single io_uring_enter call. + batchReader, err := fs.NewParallelBatchIoUringReader(fs.BatchIoUringConfig{ + RingDepth: 256, + MaxBatch: 256, + Window: time.Millisecond, + QueueSize: 1024, + }, 2) + if err != nil { + log.Error().Err(err).Msg("Failed to create batched io_uring reader, falling back to per-shard rings") + batchReader = nil + } + + // Separate io_uring ring dedicated to batched writes (memtable flushes). + // Kept separate from the read ring to avoid mutex contention between the + // read batch loop and concurrent flushes. 
+ writeRing, err := fs.NewIoUring(256, 0) + if err != nil { + log.Error().Err(err).Msg("Failed to create io_uring write ring, falling back to sequential pwrite") + writeRing = nil + } + + batchWindow := time.Duration(0) + if config.EnableBatching && config.BatchWindowMicros > 0 { + batchWindow = time.Duration(config.BatchWindowMicros) * time.Microsecond + } + shardLocks := make([]sync.RWMutex, config.NumShards) + shards := make([]*filecache.ShardCache, config.NumShards) + for i := 0; i < config.NumShards; i++ { + shards[i] = filecache.NewShardCache(filecache.ShardCacheConfig{ + MemtableSize: config.MemtableSize, + Rounds: ROUNDS, + RbInitial: config.KeysPerShard, + RbMax: config.KeysPerShard, + DeleteAmortizedStep: 10000, + MaxFileSize: int64(config.FileSize), + BlockSize: BLOCK_SIZE, + Directory: mountPoint, + Predictor: predictor, + + //batching reads + EnableBatching: config.EnableBatching, + BatchWindow: batchWindow, + MaxBatchSize: config.MaxBatchSize, + + //lockless mode for PutLL/GetLL + EnableLockless: config.EnableLockless, + + BatchIoUringReader: batchReader, + WriteRing: writeRing, + }, &shardLocks[i]) + } + + wc := &WrapCache{ + shards: shards, + shardLocks: shardLocks, + predictor: predictor, + batchReader: batchReader, + } + + return wc, nil +} + +func (wc *WrapCache) PutLL(key string, value []byte, exptimeInMinutes uint16) error { + + h32 := wc.Hash(key) + shardIdx := h32 % uint32(len(wc.shards)) + start := time.Now() + + result := filecache.ErrorPool.Get().(chan error) + + wc.shards[shardIdx].WriteCh <- &filecache.WriteRequestV2{ + Key: key, + Value: value, + ExptimeInMinutes: exptimeInMinutes, + Result: result, + } + + if metrics.Enabled() && h32%100 < 10 { + metrics.Incr(metrics.KEY_RINGBUFFER_ACTIVE_ENTRIES, metrics.GetShardTag(shardIdx)) + } + + op := <-result + filecache.ErrorPool.Put(result) + if metrics.Enabled() { + metrics.Incr(metrics.KEY_PUTS, metrics.GetShardTag(shardIdx)) + metrics.Timing(metrics.KEY_PUT_LATENCY, time.Since(start), 
metrics.GetShardTag(shardIdx)) + } + return op +} + +func (wc *WrapCache) GetLL(key string) ([]byte, bool, bool) { + h32 := wc.Hash(key) + shardIdx := h32 % uint32(len(wc.shards)) + + start := time.Now() + + // found, value, _, expired, needsSlowPath := wc.shards[shardIdx].GetFastPath(key) + + // if !needsSlowPath { + // if found && !expired { + // wc.stats[shardIdx].Hits.Add(1) + // } else if expired { + // wc.stats[shardIdx].Expired.Add(1) + // } + + // wc.stats[shardIdx].TotalGets.Add(1) + // wc.stats[shardIdx].LatencyTracker.RecordGet(time.Since(start)) + // return value, found, expired + // } + + result := filecache.ReadResultPool.Get().(chan filecache.ReadResultV2) + + req := filecache.ReadRequestPool.Get().(*filecache.ReadRequestV2) + req.Key = key + req.Result = result + + wc.shards[shardIdx].ReadCh <- req + op := <-result + + filecache.ReadResultPool.Put(result) + filecache.ReadRequestPool.Put(req) + + if metrics.Enabled() { + if op.Found && !op.Expired { + metrics.Incr(metrics.KEY_HITS, metrics.GetShardTag(shardIdx)) + } + if op.Expired { + metrics.Incr(metrics.KEY_EXPIRED_ENTRIES, metrics.GetShardTag(shardIdx)) + } + metrics.Timing(metrics.KEY_GET_LATENCY, time.Since(start), metrics.GetShardTag(shardIdx)) + metrics.Incr(metrics.KEY_GETS, metrics.GetShardTag(shardIdx)) + } + + return op.Data, op.Found, op.Expired +} + +func (wc *WrapCache) Put(key string, value []byte, exptimeInMinutes uint16) error { + + h32 := wc.Hash(key) + shardIdx := h32 % uint32(len(wc.shards)) + + var start time.Time + if metrics.Enabled() { + start = time.Now() + defer func() { + metrics.Timing(metrics.KEY_PUT_LATENCY, time.Since(start), metrics.GetShardTag(shardIdx)) + }() + } + + if metrics.Enabled() { + start = time.Now() + } + wc.shardLocks[shardIdx].Lock() + if metrics.Enabled() { + metrics.Timing(metrics.LATENCY_WLOCK, time.Since(start), []string{}) + } + defer wc.shardLocks[shardIdx].Unlock() + + err := wc.shards[shardIdx].Put(key, value, exptimeInMinutes) + if err != nil { 
+ log.Error().Err(err).Msgf("Put failed for key: %s", key) + return fmt.Errorf("put failed for key: %s", key) + } + if metrics.Enabled() { + metrics.Incr(metrics.KEY_PUTS, metrics.GetShardTag(shardIdx)) + if h32%100 < 10 { + metrics.Incr(metrics.KEY_RINGBUFFER_ACTIVE_ENTRIES, metrics.GetShardTag(shardIdx)) + } + } + + return nil +} + +func (wc *WrapCache) Get(key string) ([]byte, bool, bool) { + h32 := wc.Hash(key) + shardIdx := h32 % uint32(len(wc.shards)) + + var start time.Time + if metrics.Enabled() { + start = time.Now() + defer func() { + metrics.Timing(metrics.KEY_GET_LATENCY, time.Since(start), metrics.GetShardTag(shardIdx)) + }() + } + + var keyFound bool + var val []byte + var valCopy []byte + var remainingTTL uint16 + var expired bool + var shouldReWrite bool + if wc.shards[shardIdx].BatchReader != nil { + reqChan := make(chan filecache.ReadResultV2, 1) + wc.shards[shardIdx].BatchReader.Requests <- &filecache.ReadRequestV2{ + Key: key, + Result: reqChan, + } + result := <-reqChan + keyFound, val, remainingTTL, expired, shouldReWrite = result.Found, result.Data, result.TTL, result.Expired, result.ShouldRewrite + if shouldReWrite { + valCopy = make([]byte, len(val)) + copy(valCopy, val) + } + } else { + + func(key string, shardIdx uint32) { + + keyFound, val, remainingTTL, expired, shouldReWrite = wc.shards[shardIdx].Get(key) + + if shouldReWrite { + //copy val into a safe variable because we are unlocking the shard + // at the end of anon function execution + valCopy = make([]byte, len(val)) + copy(valCopy, val) + val = valCopy + } + }(key, shardIdx) + + } + + if metrics.Enabled() { + if keyFound && !expired { + metrics.Incr(metrics.KEY_HITS, metrics.GetShardTag(shardIdx)) + } + if expired { + metrics.Incr(metrics.KEY_EXPIRED_ENTRIES, metrics.GetShardTag(shardIdx)) + } + metrics.Incr(metrics.KEY_GETS, metrics.GetShardTag(shardIdx)) + if shouldReWrite { + metrics.Incr(metrics.KEY_REWRITES, metrics.GetShardTag(shardIdx)) + } + } + if shouldReWrite { + 
wc.Put(key, valCopy, remainingTTL) + } + + //todo: track hit rate here using + // wc.predictor.Observe(hitRate) + return val, keyFound, expired +} + +func (wc *WrapCache) Hash(key string) uint32 { + return uint32(xxhash.Sum64String(key) ^ Seed) +} + +func (wc *WrapCache) GetShardCache(shardIdx int) *filecache.ShardCache { + return wc.shards[shardIdx] +} diff --git a/flashring/pkg/cache/freecache.go b/flashring/pkg/cache/freecache.go new file mode 100644 index 00000000..f16191c6 --- /dev/null +++ b/flashring/pkg/cache/freecache.go @@ -0,0 +1,40 @@ +package internal + +import ( + "runtime/debug" + + "github.com/coocood/freecache" +) + +type Freecache struct { + cache *freecache.Cache +} + +func NewFreecache(config WrapCacheConfig, logStats bool) (*Freecache, error) { + + cache := freecache.NewCache(int(config.FileSize)) + debug.SetGCPercent(20) + + fc := &Freecache{ + cache: cache, + } + + return fc, nil + +} + +func (c *Freecache) Put(key string, value []byte, exptimeInMinutes uint16) error { + + err := c.cache.Set([]byte(key), value, int(exptimeInMinutes)*60) // expiry is in seconds; propagate Set errors (e.g. entry larger than 1/1024 of cache size) instead of silently dropping them + return err +} + +func (c *Freecache) Get(key string) ([]byte, bool, bool) { + + val, err := c.cache.Get([]byte(key)) + if err != nil { + return nil, false, false + } + + return val, true, false +} diff --git a/flashring/pkg/metrics/metric.go b/flashring/pkg/metrics/metric.go new file mode 100644 index 00000000..e977bb77 --- /dev/null +++ b/flashring/pkg/metrics/metric.go @@ -0,0 +1,204 @@ +package metrics + +import ( + "os" + "strconv" + "strings" + "sync" + "time" + + "github.com/DataDog/datadog-go/v5/statsd" + "github.com/rs/zerolog/log" + "github.com/spf13/viper" +) + +// Flashring metric keys +const ( + KEY_GET_LATENCY = "flashring_get_latency" + KEY_PUT_LATENCY = "flashring_put_latency" + KEY_RTHROUGHPUT = "flashring_rthroughput" + KEY_WTHROUGHPUT = "flashring_wthroughput" + KEY_HITRATE = "flashring_hitrate" + KEY_ACTIVE_ENTRIES = "flashring_active_entries" + KEY_EXPIRED_ENTRIES = "flashring_expired_entries" +
KEY_REWRITES = "flashring_rewrites" + KEY_GETS = "flashring_gets" + KEY_PUTS = "flashring_puts" + KEY_HITS = "flashring_hits" + + KEY_KEY_NOT_FOUND_COUNT = "flashring_key_not_found_count" + KEY_KEY_EXPIRED_COUNT = "flashring_key_expired_count" + KEY_BAD_DATA_COUNT = "flashring_bad_data_count" + KEY_BAD_LENGTH_COUNT = "flashring_bad_length_count" + KEY_BAD_CR32_COUNT = "flashring_bad_cr32_count" + KEY_BAD_KEY_COUNT = "flashring_bad_key_count" + KEY_DELETED_KEY_COUNT = "flashring_deleted_key_count" + + KEY_WRITE_COUNT = "flashring_write_count" + KEY_PUNCH_HOLE_COUNT = "flashring_punch_hole_count" + KEY_PREAD_COUNT = "flashring_pread_count" + + KEY_TRIM_HEAD_LATENCY = "flashring_wrap_file_trim_head_latency" + KEY_PREAD_LATENCY = "flashring_pread_latency" + KEY_PWRITE_LATENCY = "flashring_pwrite_latency" + + KEY_MEMTABLE_FLUSH_COUNT = "flashring_memtable_flush_count" + + LATENCY_RLOCK = "flashring_rlock_latency" + LATENCY_WLOCK = "flashring_wlock_latency" + + KEY_RINGBUFFER_ACTIVE_ENTRIES = "flashring_ringbuffer_active_entries" + KEY_MEMTABLE_ENTRY_COUNT = "flashring_memtable_entry_count" + KEY_MEMTABLE_HIT = "flashring_memtable_hit" + KEY_MEMTABLE_MISS = "flashring_memtable_miss" + KEY_DATA_LENGTH = "flashring_data_length" + KEY_IOURING_SIZE = "flashring_iouring_size" +) + +// Flashring tag keys +const ( + TAG_LATENCY_PERCENTILE = "latency_percentile" + TAG_VALUE_P25 = "p25" + TAG_VALUE_P50 = "p50" + TAG_VALUE_P99 = "p99" + TAG_SHARD_IDX = "shard_idx" + TAG_MEMTABLE_ID = "memtable_id" +) + +// Application-level metric keys +const ( + ApiRequestCount = "api_request_count" + ApiRequestLatency = "api_request_latency" + ExternalApiRequestCount = "external_api_request_count" + ExternalApiRequestLatency = "external_api_request_latency" + DBCallLatency = "db_call_latency" + DBCallCount = "db_call_count" + MethodLatency = "method_latency" + MethodCount = "method_count" +) + +var ( + statsDClient = getDefaultClient() + samplingRate = 0.1 + telegrafAddress = "localhost:8125" + 
appName = "" + initialized = false + once sync.Once + + // When false, all Timing/Count/Incr/Gauge calls are no-ops (zero allocations). + // Controlled by FLASHRING_METRICS_ENABLED env var ("true"/"1" to enable). + // Defaults to false (metrics disabled) when the variable is unset or anything other than "true"/"1". + metricsEnabled = loadMetricsEnabled() +) + +func loadMetricsEnabled() bool { + v := os.Getenv("FLASHRING_METRICS_ENABLED") + if v == "" { + return false + } + return strings.EqualFold(v, "true") || v == "1" +} + +// Init initializes the metrics client +func Init() { + if initialized { + log.Debug().Msgf("Metrics already initialized!") + return + } + once.Do(func() { + var err error + samplingRate = viper.GetFloat64("APP_METRIC_SAMPLING_RATE") + appName = viper.GetString("APP_NAME") + globalTags := getGlobalTags() + + statsDClient, err = statsd.New( + telegrafAddress, + statsd.WithTags(globalTags), + ) + + if err != nil { + log.Panic().AnErr("StatsD client initialization failed", err) + } + log.Info().Msgf("Metrics client initialized with telegraf address - %s, global tags - %v, and "+ + "sampling rate - %f, flashring metrics enabled - %v", telegrafAddress, globalTags, samplingRate, metricsEnabled) + initialized = true + }) +} + +func getDefaultClient() *statsd.Client { + client, _ := statsd.New("localhost:8125") + return client +} + +func getGlobalTags() []string { + env := viper.GetString("APP_ENV") + if len(env) == 0 { + log.Warn().Msg("APP_ENV is not set") + } + service := viper.GetString("APP_NAME") + if len(service) == 0 { + log.Warn().Msg("APP_NAME is not set") + } + return []string{ + TagAsString(TagEnv, env), + TagAsString(TagService, service), + } +} + +// Timing sends timing information. No-op when metrics are disabled.
+func Timing(name string, value time.Duration, tags []string) { + if !metricsEnabled { + return + } + tags = append(tags, TagAsString(TagService, appName)) + err := statsDClient.Timing(name, value, tags, samplingRate) + if err != nil { + log.Warn().AnErr("Error occurred while doing statsd timing", err) + } +} + +// Count increases metric counter by value. No-op when metrics are disabled. +func Count(name string, value int64, tags []string) { + if !metricsEnabled { + return + } + tags = append(tags, TagAsString(TagService, appName)) + err := statsDClient.Count(name, value, tags, samplingRate) + if err != nil { + log.Warn().AnErr("Error occurred while doing statsd count", err) + } +} + +// Incr increases metric counter by 1. No-op when metrics are disabled. +func Incr(name string, tags []string) { + if !metricsEnabled { + return + } + Count(name, 1, tags) +} + +// Gauge sets a gauge value. No-op when metrics are disabled. +func Gauge(name string, value float64, tags []string) { + if !metricsEnabled { + return + } + tags = append(tags, TagAsString(TagService, appName)) + err := statsDClient.Gauge(name, value, tags, samplingRate) + if err != nil { + log.Warn().AnErr("Error occurred while doing statsd gauge", err) + } +} + +// Enabled returns whether flashring metrics are enabled. +// Call sites should check this before allocating tags to avoid heap allocations. 
+func Enabled() bool { + return metricsEnabled +} + +func GetShardTag(shardIdx uint32) []string { + return BuildTag(NewTag(TAG_SHARD_IDX, strconv.Itoa(int(shardIdx)))) +} + +func GetMemtableTag(memtableId uint32) []string { + return BuildTag(NewTag(TAG_MEMTABLE_ID, strconv.Itoa(int(memtableId)))) +} diff --git a/flashring/pkg/metrics/tag.go b/flashring/pkg/metrics/tag.go new file mode 100644 index 00000000..d77ac38e --- /dev/null +++ b/flashring/pkg/metrics/tag.go @@ -0,0 +1,55 @@ +package metrics + +// Tag constants +const ( + TagEnv = "env" + TagService = "service" + TagPath = "path" + TagMethod = "method" + TagHttpStatusCode = "http_status_code" + TagGrpcStatusCode = "grpc_status_code" + TagExternalService = "external_service" + TagExternalServicePath = "external_service_path" + TagExternalServiceMethod = "external_service_method" + TagExternalServiceStatusCode = "external_service_status_code" + TagZkRealtimeTotalUpdateEvent = "zk_realtime_total_update_event" + TagZkRealtimeFailureEvent = "zk_realtime_failure_event" + TagZkRealtimeSuccessEvent = "zk_realtime_success_event" + TagZkRealtimeEventUpdateLatency = "zk_realtime_event_update_latency" + TagCommunicationProtocol = "communication_protocol" + TagUserContext = "user_context" + + TagValueCommunicationProtocolHttp = "http" + TagValueCommunicationProtocolGrpc = "grpc" +) + +type Tag struct { + Name string + Value string +} + +func NewTag(name, value string) Tag { + return Tag{ + Name: name, + Value: value, + } +} + +// BuildTag builds a tag from the given name and value +func BuildTag(tags ...Tag) []string { + allTags := make([]string, 0) + for _, tag := range tags { + allTags = append(allTags, TagAsString(tag.Name, tag.Value)) + } + return allTags +} + +func TagAsString(name string, value string) string { + return name + ":" + value +} + +func UpdateTags(tags *[]string, newTags ...Tag) { + for _, tag := range newTags { + *tags = append(*tags, TagAsString(tag.Name, tag.Value)) + } +}