diff --git a/docs/404.html b/docs/404.html index ae98045f..29793efd 100644 --- a/docs/404.html +++ b/docs/404.html @@ -5,7 +5,7 @@ BharatMLStack - + diff --git a/docs/assets/js/4b01b88a.b97bdb87.js b/docs/assets/js/4b01b88a.b97bdb87.js deleted file mode 100644 index 7f412116..00000000 --- a/docs/assets/js/4b01b88a.b97bdb87.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[9095],{762:(e,t,n)=>{n.d(t,{A:()=>i});const i=n.p+"assets/images/bms-7399e8796d2cd24617c432518ce3f312.png"},1737:(e,t,n)=>{n.r(t),n.d(t,{assets:()=>c,contentTitle:()=>o,default:()=>h,frontMatter:()=>a,metadata:()=>i,toc:()=>d});var i=n(3306),s=n(4848),r=n(8453);const a={title:"Beyond Vector RAG: Building Agent Memory That Learns From Experience.",description:"Current agent memory is just search. We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.",slug:"episodic-memory-for-agents",authors:["adarsha"],date:new Date("2026-02-19T00:00:00.000Z"),tags:["ai-agents","memory","architecture","llm","episodic-memory"]},o=void 0,c={authorsImageUrls:[void 0]},d=[{value:"The Gap Nobody Talks About",id:"the-gap-nobody-talks-about",level:2},{value:"What's Wrong With Vector RAG as Memory",id:"whats-wrong-with-vector-rag-as-memory",level:2},{value:"The Architecture: Episodic Memory",id:"the-architecture-episodic-memory",level:2},{value:"Layer 1: Immutable Timeline",id:"layer-1-immutable-timeline",level:3},{value:"Layer 2: Episode Segmentation",id:"layer-2-episode-segmentation",level:3},{value:"Layer 3: Episodic Graph",id:"layer-3-episodic-graph",level:3},{value:"Layer 4: Generalized Facts",id:"layer-4-generalized-facts",level:3},{value:"The Reinforcement Loop",id:"the-reinforcement-loop",level:3},{value:"The Experiment",id:"the-experiment",level:2},{value:"Results",id:"results",level:2},{value:"Decision 
Accuracy",id:"decision-accuracy",level:3},{value:"Where the Gap Opened",id:"where-the-gap-opened",level:3},{value:"Retrieval Quality",id:"retrieval-quality",level:3},{value:"What Didn't Work",id:"what-didnt-work",level:2},{value:"What This Means",id:"what-this-means",level:2},{value:"How It Compares to Existing Solutions",id:"how-it-compares-to-existing-solutions",level:2},{value:"Try It Yourself",id:"try-it-yourself",level:2},{value:"Conclusion",id:"conclusion",level:2}];function l(e){const t={blockquote:"blockquote",code:"code",em:"em",h2:"h2",h3:"h3",hr:"hr",img:"img",li:"li",ol:"ol",p:"p",pre:"pre",strong:"strong",table:"table",tbody:"tbody",td:"td",th:"th",thead:"thead",tr:"tr",ul:"ul",...(0,r.R)(),...e.components};return(0,s.jsxs)(s.Fragment,{children:[(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.img,{alt:"BharatMLStack",src:n(762).A+"",width:"1396",height:"460"}),'\nEvery agent framework on the market will tell you their agents "have memory." What they mean is: they have a vector database.']}),"\n",(0,s.jsx)(t.p,{children:"They chunk text, embed it, store it, and retrieve whatever looks similar at query time. This works for document Q&A. It fails the moment you expect an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach."}),"\n",(0,s.jsx)(t.p,{children:"We are trying to built something different. An episodic memory system where a frozen LLM \u2014 same weights, no retraining \u2014 produces increasingly better decisions over time because the memory feeding it context is continuously evolving."}),"\n",(0,s.jsx)(t.p,{children:"Then we tested it. The results surprised us."}),"\n",(0,s.jsx)(t.h2,{id:"the-gap-nobody-talks-about",children:"The Gap Nobody Talks About"}),"\n",(0,s.jsx)(t.p,{children:"Here's a scenario every engineering team has encountered: AI agent hits a Redis connection pool exhaustion issue. It misdiagnoses it as a database problem. You correct it. 
Next week, a different service has the exact same failure pattern. The agent makes the exact same mistake."}),"\n",(0,s.jsx)(t.p,{children:"Why? Because LLMs don't learn at inference time. Corrections adjust behavior within a conversation. Once the session ends, the lesson is gone. The model weights haven't changed. The next conversation starts from zero."}),"\n",(0,s.jsx)(t.p,{children:'Current "memory" systems don\'t fully address this. They store facts \u2014 user preferences, document chunks, conversation summaries. But facts aren\'t experience. Knowing that "Redis connection pools can exhaust under load" is different from remembering "last time I saw 500 errors under load, I assumed it was the database, I was wrong, it was actually the connection pool, and here\'s the correction I received."'}),"\n",(0,s.jsx)(t.p,{children:"The first is a fact. The second is an episode. The difference matters."}),"\n",(0,s.jsx)(t.h2,{id:"whats-wrong-with-vector-rag-as-memory",children:"What's Wrong With Vector RAG as Memory"}),"\n",(0,s.jsx)(t.p,{children:"We identified five structural gaps in how current agent frameworks handle memory:"}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"No concept of time."})," Two events are either semantically similar or they're not. The system can't represent \"this happened after that\" without distorting similarity scores. An agent can't reason about sequence or causality."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"No concept of situation."})," A production incident and a design review might use the same technical vocabulary. Flat vector search can't distinguish them. Your agent retrieves planning notes when it should be retrieving incident postmortems."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"No outcome tracking."})," The system stores ",(0,s.jsx)(t.em,{children:"what happened"})," but not ",(0,s.jsx)(t.em,{children:"whether it worked"}),". 
A failed approach and a successful one are equally retrievable. The agent has no way to prefer strategies that worked over strategies that didn't."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Summaries destroy evidence."})," Summarization-based memory compresses experience but discards the reasoning chain. The agent loses the ability to explain ",(0,s.jsx)(t.em,{children:"how"})," it arrived at a conclusion. The audit trail is gone."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"No causal links."})," Each memory chunk is independent. There's no way to express that incident A caused decision B, which led to outcome C, which was corrected by approach D. Without this structure, the agent can't traverse chains of reasoning."]}),"\n",(0,s.jsx)(t.p,{children:"These gaps compound. As an agent accumulates more experience, flat vector memory gets noisier, more contradictory, and less useful. The system degrades precisely when it should be improving."}),"\n",(0,s.jsx)(t.h2,{id:"the-architecture-episodic-memory",children:"The Architecture: Episodic Memory"}),"\n",(0,s.jsx)(t.p,{children:"We are building a memory system modeled on how human episodic memory works \u2014 not as a metaphor, but as an engineering specification."}),"\n",(0,s.jsx)(t.p,{children:"The system has four layers:"}),"\n",(0,s.jsx)(t.h3,{id:"layer-1-immutable-timeline",children:"Layer 1: Immutable Timeline"}),"\n",(0,s.jsx)(t.p,{children:"Every piece of agent experience is recorded as an append-only timeline entry. Each entry carries a semantic embedding (what it means), a timestamp (when it happened), and a state label (what situation the agent was in \u2014 debugging, planning, code review, incident response). Entries are never modified, never deleted, never summarized. 
This is the source of truth."}),"\n",(0,s.jsx)(t.h3,{id:"layer-2-episode-segmentation",children:"Layer 2: Episode Segmentation"}),"\n",(0,s.jsx)(t.p,{children:"The system watches the timeline and detects when one coherent unit of experience ends and another begins \u2014 via state transitions, semantic shifts, temporal gaps, or explicit signals. Each episode is a reference into the timeline (not a copy) with a generated summary, an outcome (SUCCESS, FAILURE, PARTIAL, UNKNOWN), decisions made, assumptions held, and corrections received."}),"\n",(0,s.jsx)(t.p,{children:"The outcome field is the most important thing that doesn't exist in any current memory system. Without it, you can't learn from mistakes."}),"\n",(0,s.jsx)(t.h3,{id:"layer-3-episodic-graph",children:"Layer 3: Episodic Graph"}),"\n",(0,s.jsx)(t.p,{children:'Episodes are connected through typed, weighted links: CAUSED_BY, LED_TO, RETRY_OF, LEARNED_FROM, CONTINUATION, CONTRADICTED. Over time, this forms a directed graph that enables traversal by meaning and causality. You can follow the chain: "this incident caused that investigation, which led to a failed fix, which was corrected by this approach."'}),"\n",(0,s.jsx)(t.h3,{id:"layer-4-generalized-facts",children:"Layer 4: Generalized Facts"}),"\n",(0,s.jsx)(t.p,{children:'When multiple episodes exhibit consistent patterns, the system extracts reasoning heuristics: "When services fail immediately after deployment with no traffic change, investigate configuration errors before connection pool problems." Facts are versioned, never overwritten, and maintain links back to supporting and contradicting episodes. When contradicting evidence accumulates, confidence decreases. When confidence drops below a threshold, the fact is revised \u2014 but the old version is preserved.'}),"\n",(0,s.jsx)(t.p,{children:"The LLM sits above all four layers. 
At query time, the system assembles structured context \u2014 relevant episodes with outcomes, applicable facts with confidence scores, causal narratives \u2014 and passes it to the LLM for reasoning. The model reasons over structured memory. It doesn't store or manage memory."}),"\n",(0,s.jsx)(t.h3,{id:"the-reinforcement-loop",children:"The Reinforcement Loop"}),"\n",(0,s.jsx)(t.p,{children:"This is where it comes together:"}),"\n",(0,s.jsxs)(t.ol,{children:["\n",(0,s.jsx)(t.li,{children:"Agent reasons using retrieved episodes and facts"}),"\n",(0,s.jsx)(t.li,{children:"Outcome is detected (CI pass/fail, user correction, test result)"}),"\n",(0,s.jsx)(t.li,{children:"New episode is created with outcome tracking"}),"\n",(0,s.jsx)(t.li,{children:"Links are created between the retrieved episodes and the new episode"}),"\n",(0,s.jsx)(t.li,{children:"Facts are reinforced (if outcome aligned) or contradicted (if outcome conflicted)"}),"\n",(0,s.jsx)(t.li,{children:"If the decision was wrong and corrected, a LEARNED_FROM link is created"}),"\n"]}),"\n",(0,s.jsx)(t.p,{children:"The model weights never change. The memory structure evolves continuously. A frozen LLM produces better decisions over time because it receives better context from richer memory."}),"\n",(0,s.jsx)(t.h2,{id:"the-experiment",children:"The Experiment"}),"\n",(0,s.jsx)(t.p,{children:"We built the full system in Python (~1,000 lines) and tested it head-to-head against a baseline flat-vector RAG agent across a 9-round synthetic debugging scenario. Both agents used the identical LLM (Claude Sonnet 4) for reasoning. 
The only variable was the memory system."}),"\n",(0,s.jsx)(t.p,{children:"The scenario was designed to test five capabilities:"}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{children:"Round Type"}),(0,s.jsx)(t.th,{children:"What It Tests"}),(0,s.jsx)(t.th,{children:"Rounds"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"LEARN"}),(0,s.jsx)(t.td,{children:"Can the agent build experience from failures?"}),(0,s.jsx)(t.td,{children:"1, 2, 4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"RED HERRING"}),(0,s.jsx)(t.td,{children:"Can the agent resist applying a pattern when it doesn't fit?"}),(0,s.jsx)(t.td,{children:"3"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TEST"}),(0,s.jsx)(t.td,{children:"Can the agent apply learned patterns to new services?"}),(0,s.jsx)(t.td,{children:"5, 6"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"SUBTLE"}),(0,s.jsx)(t.td,{children:"Can the agent generalize to different symptoms, same root cause?"}),(0,s.jsx)(t.td,{children:"7"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"CORRECTION"}),(0,s.jsx)(t.td,{children:"After being corrected, does the agent adapt?"}),(0,s.jsx)(t.td,{children:"8, 9"})]})]})]}),"\n",(0,s.jsxs)(t.p,{children:["Rounds 1-4 build experience: three connection pool failures across different services, plus one red herring (a deployment config error that ",(0,s.jsx)(t.em,{children:"looks"})," like a connection pool issue). Rounds 5-7 test whether the agent applies the learned pattern to unfamiliar services and subtle symptom variations. 
Rounds 8-9 are the critical test: the agent is corrected after misdiagnosing a deployment-correlated error, then tested on a near-identical scenario to see if it adapts."]}),"\n",(0,s.jsx)(t.h2,{id:"results",children:"Results"}),"\n",(0,s.jsx)(t.h3,{id:"decision-accuracy",children:"Decision Accuracy"}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{children:"Round"}),(0,s.jsx)(t.th,{children:"Type"}),(0,s.jsx)(t.th,{children:"Episodic Agent"}),(0,s.jsx)(t.th,{children:"Baseline Agent"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"1"}),(0,s.jsx)(t.td,{children:"LEARN"}),(0,s.jsx)(t.td,{children:"\u2717"}),(0,s.jsx)(t.td,{children:"\u2713"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"2"}),(0,s.jsx)(t.td,{children:"LEARN"}),(0,s.jsx)(t.td,{children:"\u2713"}),(0,s.jsx)(t.td,{children:"\u2713"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"3"}),(0,s.jsx)(t.td,{children:"RED 
HERRING"}),(0,s.jsx)(t.td,{children:"\u2717"}),(0,s.jsx)(t.td,{children:"\u2717"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"4"}),(0,s.jsx)(t.td,{children:"LEARN"}),(0,s.jsx)(t.td,{children:"\u2713"}),(0,s.jsx)(t.td,{children:"\u2713"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"5"}),(0,s.jsx)(t.td,{children:"TEST"}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"\u2713"})}),(0,s.jsx)(t.td,{children:"\u2717"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"6"}),(0,s.jsx)(t.td,{children:"TEST"}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"\u2713"})}),(0,s.jsx)(t.td,{children:"\u2717"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"7"}),(0,s.jsx)(t.td,{children:"SUBTLE"}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"\u2713"})}),(0,s.jsx)(t.td,{children:"\u2717"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"8"}),(0,s.jsx)(t.td,{children:"CORRECTION"}),(0,s.jsx)(t.td,{children:"\u2713"}),(0,s.jsx)(t.td,{children:"\u2713"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"9"}),(0,s.jsx)(t.td,{children:"CORRECTION"}),(0,s.jsx)(t.td,{children:"\u2713"}),(0,s.jsx)(t.td,{children:"\u2713"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"Total"})}),(0,s.jsx)(t.td,{}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"7/9 (78%)"})}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"5/9 (56%)"})})]})]})]}),"\n",(0,s.jsx)(t.p,{children:"The episodic agent won 7-5. 
A 40% relative improvement in decision accuracy using the exact same LLM."}),"\n",(0,s.jsx)(t.h3,{id:"where-the-gap-opened",children:"Where the Gap Opened"}),"\n",(0,s.jsx)(t.p,{children:"The episodic agent's advantage concentrated in exactly the rounds designed to test memory quality:"}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Rounds 5-6 (pattern application):"})," The episodic agent cited 4 past failure episodes with connection pool exhaustion as root cause, complete with correction annotations. It correctly identified pool exhaustion in new services. The baseline retrieved disconnected chunks and suggested checking timeout configurations \u2014 a pattern it picked up from the Round 3 red herring."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Round 7 (subtle symptoms \u2014 latency increase, no errors):"}),' Both agents had the same evidence available. The episodic agent\'s retrieval surfaced a diverse set of episodes (thanks to MMR diversity filtering) including the Redis pool exhaustion from Round 6, which primed it to recognize that latency without errors can still be pool contention. The baseline defaulted to "check recent config changes."']}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Round 9 (adaptation after correction):"})," This is the result we're most proud of. Look at the episodic agent's reasoning:"]}),"\n",(0,s.jsxs)(t.blockquote,{children:["\n",(0,s.jsx)(t.p,{children:(0,s.jsx)(t.em,{children:'"Episode 1 directly parallels this situation \u2014 errors spiking immediately after a deployment (v2.4.1 then, v3.1.0 now) with no traffic change. In that case, the root cause was a database migration that dropped an index. 
The generalized fact confirms that deployment-related issues with immediate onset after version changes are more likely caused by configuration errors or missing dependencies than by connection pool problems."'})}),"\n"]}),"\n",(0,s.jsxs)(t.p,{children:["It cited a specific past episode by analogy, quoted a generalized fact, and explained ",(0,s.jsx)(t.em,{children:"why"})," this situation matches the deployment pattern rather than the connection pool pattern. The baseline gave a vaguer assessment."]}),"\n",(0,s.jsx)(t.h3,{id:"retrieval-quality",children:"Retrieval Quality"}),"\n",(0,s.jsx)(t.p,{children:"This is where the structural difference is most visible:"}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{children:"Metric"}),(0,s.jsx)(t.th,{children:"Episodic Agent"}),(0,s.jsx)(t.th,{children:"Baseline Agent"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"Retrieved items with explicit outcome labels"}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"100%"})}),(0,s.jsx)(t.td,{children:"25%"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"Correct pattern applications (Rounds 4-7)"}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"4/4"})}),(0,s.jsx)(t.td,{children:"1/4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"False positives (Rounds 8-9)"}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"0"})}),(0,s.jsx)(t.td,{children:"0"})]})]})]}),"\n",(0,s.jsx)(t.p,{children:"Every item the episodic agent retrieved carried a structured outcome label (SUCCESS or FAILURE) with correction details. 
Only 25% of the baseline's chunks contained any outcome information \u2014 and those were incidental text mentions, not structured labels."}),"\n",(0,s.jsx)(t.p,{children:"The episodic agent correctly applied the connection pool pattern in all four rounds where it was the root cause, and correctly avoided it in both rounds where it wasn't. The baseline applied it correctly once."}),"\n",(0,s.jsx)(t.h2,{id:"what-didnt-work",children:"What Didn't Work"}),"\n",(0,s.jsx)(t.p,{children:"Two things didn't work as anticipated:"}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Round 3 (red herring):"})," Both agents failed. The symptoms looked like connection pool issues, but the root cause was a deployment config change. At this point, the episodic agent had only seen connection pool episodes \u2014 it had no counter-evidence for deployment-correlated errors. You can't distinguish patterns you've only seen one side of. After Round 8 introduced a correction, the agent successfully avoided this mistake in Round 9."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Fact quality variance."}),' Some extracted facts were specific and actionable ("Deployment-related issues with immediate onset are more likely configuration errors"). Others were vague ("Initial symptom-based diagnosis often leads to misidentifying the root cause"). A production system needs a usefulness filter, not just a confidence score.']}),"\n",(0,s.jsx)(t.h2,{id:"what-this-means",children:"What This Means"}),"\n",(0,s.jsx)(t.p,{children:"The most important finding isn't the accuracy improvement. 
It's that the reinforcement loop closes without retraining."}),"\n",(0,s.jsx)(t.p,{children:"In the POC, we observed:"}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsx)(t.li,{children:"Rounds 1-4: Agent encounters failures, episodes recorded with outcomes and corrections"}),"\n",(0,s.jsx)(t.li,{children:'After Round 4: Fact extracted \u2014 "Connection pool exhaustion is a common root cause under load"'}),"\n",(0,s.jsx)(t.li,{children:"Rounds 5-7: Agent applies the pattern with increasing confidence (fact support count grows)"}),"\n",(0,s.jsx)(t.li,{children:"Round 8: Agent encounters a deployment error, correctly identifies it as config, gets corrected"}),"\n",(0,s.jsx)(t.li,{children:'After Round 8: New fact \u2014 "Deployment-related issues with immediate onset are more likely configuration errors"'}),"\n",(0,s.jsx)(t.li,{children:"Round 9: Agent receives near-identical scenario, correctly avoids connection pool pattern, cites the Round 8 correction"}),"\n"]}),"\n",(0,s.jsx)(t.p,{children:"The model didn't change. The memory evolved. That's the whole point."}),"\n",(0,s.jsx)(t.h2,{id:"how-it-compares-to-existing-solutions",children:"How It Compares to Existing Solutions"}),"\n",(0,s.jsx)(t.p,{children:"Agent memory is a fast-moving space with several strong systems, each solving a different slice of the problem:"}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Mem0"})," excels at persistent personalization \u2014 extracting user preferences, managing session context, and reducing token costs through intelligent compression. It's the most production-ready memory layer available and integrates with nearly every agent framework. Its focus is on remembering about users and conversations rather than learning from task-level outcomes, which is a different problem than the one we're exploring here."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Zep/Graphiti"})," is doing some of the most interesting work in temporal knowledge graphs. 
Their bi-temporal model \u2014 tracking both when an event occurred and when it was ingested \u2014 addresses a real structural gap in how agent memory handles changing facts over time. Their episode and entity subgraphs share some philosophical DNA with our approach. Where our work diverges is in outcome tracking and reinforcement: we're specifically focused on whether a decision worked, and using that signal to update memory structure."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Letta (formerly MemGPT)"}),' pioneered self-editing memory \u2014 giving the LLM tools to manage its own memory blocks. This is a powerful paradigm, and their recent work on "Context Repositories" and sleep-time compute suggests they\'re actively pushing toward agents that learn over time. Their team has been transparent that experiential learning is an unsolved problem, which is part of what motivated our exploration.']}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"MemRL (Jan 2026 paper)"})," is the closest to our work academically. It shares the core insight of decoupling stable LLM reasoning from plastic, evolving memory. Their approach uses reinforcement learning to assign utility Q-values to memories, which is elegant but requires training a value function. Our approach is purely structural \u2014 no training step, no Q-values, just graph evolution and LLM-based reasoning over outcomes."]}),"\n",(0,s.jsx)(t.p,{children:"The common thread: most existing systems focus on knowledge persistence \u2014 remembering facts, preferences, and conversation history across sessions. The problem we're exploring is experiential learning \u2014 tracking whether past decisions worked, forming causal chains between episodes, and extracting reasoning heuristics that improve over time. 
These are complementary capabilities that would be needed by an ideal production system."}),"\n",(0,s.jsx)(t.h2,{id:"try-it-yourself",children:"Try It Yourself"}),"\n",(0,s.jsx)(t.p,{children:"The prototype is available in our experiments directory:"}),"\n",(0,s.jsx)(t.pre,{children:(0,s.jsx)(t.code,{children:"experiments/episodic-memory-prototype/\n\u251c\u2500\u2500 memory/ # Timeline, encoder, episodes, graph, facts, retriever, reinforcer\n\u251c\u2500\u2500 agent/ # Episodic memory agent\n\u251c\u2500\u2500 baseline/ # Flat vector RAG agent (comparison)\n\u251c\u2500\u2500 simulator/ # 9-round debugging scenario\n\u251c\u2500\u2500 eval/ # Head-to-head comparison + scoring\n\u2514\u2500\u2500 tests/\n"})}),"\n",(0,s.jsx)(t.p,{children:"To run the comparison:"}),"\n",(0,s.jsx)(t.pre,{children:(0,s.jsx)(t.code,{className:"language-bash",children:"cd experiments/episodic-memory-prototype\npython -m venv .venv && source .venv/bin/activate\npip install -r requirements.txt\nexport ANTHROPIC_API_KEY=sk-ant-...\npython -m eval.compare\n"})}),"\n",(0,s.jsx)(t.p,{children:"Without an API key, it runs in heuristic mode (keyword-based decisions). With a key, both agents use Claude Sonnet for reasoning \u2014 that's where the quality gap becomes visible."}),"\n",(0,s.jsx)(t.h2,{id:"conclusion",children:"Conclusion"}),"\n",(0,s.jsx)(t.p,{children:"This is a 9-round synthetic scenario we designed. It demonstrates the poc architecture works end-to-end and shows where episodic memory provides qualitatively different reasoning. It is not a peer-reviewed benchmark and should not be interpreted as a statistically rigorous claim. 
We're publishing the prototype so others can reproduce and extend the evaluation.\nIf this sparks interest do trigger github discussion."}),"\n",(0,s.jsx)(t.hr,{}),"\n",(0,s.jsx)(t.p,{children:(0,s.jsxs)(t.em,{children:["The episodic memory prototype is available in ",(0,s.jsx)(t.code,{children:"BharatMLStack"})," repo at ",(0,s.jsx)(t.code,{children:"/experiments/episodic-memory-prototype"})]})})]})}function h(e={}){const{wrapper:t}={...(0,r.R)(),...e.components};return t?(0,s.jsx)(t,{...e,children:(0,s.jsx)(l,{...e})}):l(e)}},3306:e=>{e.exports=JSON.parse('{"permalink":"/BharatMLStack/blog/episodic-memory-for-agents","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/episodic-memory-for-agents/index.md","source":"@site/blog/bharatmlstack-history/episodic-memory-for-agents/index.md","title":"Beyond Vector RAG: Building Agent Memory That Learns From Experience.","description":"Current agent memory is just search. We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.","date":"2026-02-19T00:00:00.000Z","tags":[{"inline":true,"label":"ai-agents","permalink":"/BharatMLStack/blog/tags/ai-agents"},{"inline":true,"label":"memory","permalink":"/BharatMLStack/blog/tags/memory"},{"inline":true,"label":"architecture","permalink":"/BharatMLStack/blog/tags/architecture"},{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"episodic-memory","permalink":"/BharatMLStack/blog/tags/episodic-memory"}],"readingTime":11.61,"hasTruncateMarker":true,"authors":[{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"title":"Beyond Vector RAG: Building Agent Memory That Learns From Experience.","description":"Current agent memory is just search. 
We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.","slug":"episodic-memory-for-agents","authors":["adarsha"],"date":"2026-02-19T00:00:00.000Z","tags":["ai-agents","memory","architecture","llm","episodic-memory"]},"unlisted":false,"nextItem":{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","permalink":"/BharatMLStack/blog/llm-inference-optimization-sub-sec-latency"}}')},8453:(e,t,n)=>{n.d(t,{R:()=>a,x:()=>o});var i=n(6540);const s={},r=i.createContext(s);function a(e){const t=i.useContext(r);return i.useMemo(function(){return"function"==typeof e?e(t):{...t,...e}},[t,e])}function o(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(s):e.components||s:a(e.components),i.createElement(r.Provider,{value:t},e.children)}}}]); \ No newline at end of file diff --git a/docs/assets/js/4b01b88a.e818c95d.js b/docs/assets/js/4b01b88a.e818c95d.js new file mode 100644 index 00000000..10572260 --- /dev/null +++ b/docs/assets/js/4b01b88a.e818c95d.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[9095],{762:(e,t,n)=>{n.d(t,{A:()=>i});const i=n.p+"assets/images/bms-7399e8796d2cd24617c432518ce3f312.png"},1737:(e,t,n)=>{n.r(t),n.d(t,{assets:()=>c,contentTitle:()=>o,default:()=>h,frontMatter:()=>a,metadata:()=>i,toc:()=>d});var i=n(3306),s=n(4848),r=n(8453);const a={title:"Beyond Vector RAG: Building Agent Memory That Learns From Experience.",description:"Current agent memory is just search. 
We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.",slug:"episodic-memory-for-agents",authors:["adarsha"],date:new Date("2026-02-19T00:00:00.000Z"),tags:["ai-agents","memory","architecture","llm","episodic-memory"]},o=void 0,c={authorsImageUrls:[void 0]},d=[{value:"The Gap Nobody Talks About",id:"the-gap-nobody-talks-about",level:2},{value:"What's Wrong With Vector RAG as Memory",id:"whats-wrong-with-vector-rag-as-memory",level:2},{value:"The Architecture: Episodic Memory",id:"the-architecture-episodic-memory",level:2},{value:"Layer 1: Immutable Timeline",id:"layer-1-immutable-timeline",level:3},{value:"Layer 2: Episode Segmentation",id:"layer-2-episode-segmentation",level:3},{value:"Layer 3: Episodic Graph",id:"layer-3-episodic-graph",level:3},{value:"Layer 4: Generalized Facts",id:"layer-4-generalized-facts",level:3},{value:"The Reinforcement Loop",id:"the-reinforcement-loop",level:3},{value:"The Experiment",id:"the-experiment",level:2},{value:"Results",id:"results",level:2},{value:"Decision Accuracy",id:"decision-accuracy",level:3},{value:"Where the Gap Opened",id:"where-the-gap-opened",level:3},{value:"Retrieval Quality",id:"retrieval-quality",level:3},{value:"What Didn't Work",id:"what-didnt-work",level:2},{value:"What This Means",id:"what-this-means",level:2},{value:"How It Compares to Existing Solutions",id:"how-it-compares-to-existing-solutions",level:2},{value:"Try It Yourself",id:"try-it-yourself",level:2},{value:"Conclusion",id:"conclusion",level:2}];function l(e){const 
t={blockquote:"blockquote",code:"code",em:"em",h2:"h2",h3:"h3",hr:"hr",img:"img",li:"li",ol:"ol",p:"p",pre:"pre",strong:"strong",table:"table",tbody:"tbody",td:"td",th:"th",thead:"thead",tr:"tr",ul:"ul",...(0,r.R)(),...e.components};return(0,s.jsxs)(s.Fragment,{children:[(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.img,{alt:"BharatMLStack",src:n(762).A+"",width:"1396",height:"460"}),"\nAgent memory has come a long way. Persistent context, vector retrieval, knowledge graphs \u2014 the building blocks are real and getting better fast."]}),"\n",(0,s.jsx)(t.p,{children:'But most of what we call "memory" today is still closer to search: chunk text, embed it, retrieve whatever looks similar at query time. That works well for recalling facts and preferences. It starts to break down when you need an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.'}),"\n",(0,s.jsx)(t.p,{children:"We are trying to experiment something different. An episodic memory system where a frozen LLM \u2014 same weights, no retraining \u2014 produces increasingly better decisions over time because the memory feeding it context is continuously evolving.\nThen we tested it. The results were interesting."}),"\n",(0,s.jsx)(t.h2,{id:"the-gap-nobody-talks-about",children:"The Gap Nobody Talks About"}),"\n",(0,s.jsx)(t.p,{children:"Here's a scenario every engineering team has encountered: AI agent hits a Redis connection pool exhaustion issue. It misdiagnoses it as a database problem. You correct it. Next week, a different service has the exact same failure pattern. The agent makes the exact same mistake."}),"\n",(0,s.jsx)(t.p,{children:"Why? Because LLMs don't learn at inference time. Corrections adjust behavior within a conversation. Once the session ends, the lesson is gone. The model weights haven't changed. The next conversation starts from zero."}),"\n",(0,s.jsx)(t.p,{children:'Current "memory" systems don\'t fully address this. 
They store facts \u2014 user preferences, document chunks, conversation summaries. But facts aren\'t experience. Knowing that "Redis connection pools can exhaust under load" is different from remembering "last time I saw 500 errors under load, I assumed it was the database, I was wrong, it was actually the connection pool, and here\'s the correction I received."'}),"\n",(0,s.jsx)(t.p,{children:"The first is a fact. The second is an episode. The difference matters."}),"\n",(0,s.jsx)(t.h2,{id:"whats-wrong-with-vector-rag-as-memory",children:"What's Wrong With Vector RAG as Memory"}),"\n",(0,s.jsx)(t.p,{children:"We identified five structural gaps in how current agent frameworks handle memory:"}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"No concept of time."})," Two events are either semantically similar or they're not. The system can't represent \"this happened after that\" without distorting similarity scores. An agent can't reason about sequence or causality."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"No concept of situation."})," A production incident and a design review might use the same technical vocabulary. Flat vector search can't distinguish them. Your agent retrieves planning notes when it should be retrieving incident postmortems."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"No outcome tracking."})," The system stores ",(0,s.jsx)(t.em,{children:"what happened"})," but not ",(0,s.jsx)(t.em,{children:"whether it worked"}),". A failed approach and a successful one are equally retrievable. The agent has no way to prefer strategies that worked over strategies that didn't."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Summaries destroy evidence."})," Summarization-based memory compresses experience but discards the reasoning chain. The agent loses the ability to explain ",(0,s.jsx)(t.em,{children:"how"})," it arrived at a conclusion. 
The audit trail is gone."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"No causal links."})," Each memory chunk is independent. There's no way to express that incident A caused decision B, which led to outcome C, which was corrected by approach D. Without this structure, the agent can't traverse chains of reasoning."]}),"\n",(0,s.jsx)(t.p,{children:"These gaps compound. As an agent accumulates more experience, flat vector memory gets noisier, more contradictory, and less useful. The system degrades precisely when it should be improving."}),"\n",(0,s.jsx)(t.h2,{id:"the-architecture-episodic-memory",children:"The Architecture: Episodic Memory"}),"\n",(0,s.jsx)(t.p,{children:"We are building a memory system modeled on how human episodic memory works \u2014 not as a metaphor, but as an engineering specification."}),"\n",(0,s.jsx)(t.p,{children:"The system has four layers:"}),"\n",(0,s.jsx)(t.h3,{id:"layer-1-immutable-timeline",children:"Layer 1: Immutable Timeline"}),"\n",(0,s.jsx)(t.p,{children:"Every piece of agent experience is recorded as an append-only timeline entry. Each entry carries a semantic embedding (what it means), a timestamp (when it happened), and a state label (what situation the agent was in \u2014 debugging, planning, code review, incident response). Entries are never modified, never deleted, never summarized. This is the source of truth."}),"\n",(0,s.jsx)(t.h3,{id:"layer-2-episode-segmentation",children:"Layer 2: Episode Segmentation"}),"\n",(0,s.jsx)(t.p,{children:"The system watches the timeline and detects when one coherent unit of experience ends and another begins \u2014 via state transitions, semantic shifts, temporal gaps, or explicit signals. 
Each episode is a reference into the timeline (not a copy) with a generated summary, an outcome (SUCCESS, FAILURE, PARTIAL, UNKNOWN), decisions made, assumptions held, and corrections received."}),"\n",(0,s.jsx)(t.p,{children:"The outcome field is the most important thing that doesn't exist in any current memory system. Without it, you can't learn from mistakes."}),"\n",(0,s.jsx)(t.h3,{id:"layer-3-episodic-graph",children:"Layer 3: Episodic Graph"}),"\n",(0,s.jsx)(t.p,{children:'Episodes are connected through typed, weighted links: CAUSED_BY, LED_TO, RETRY_OF, LEARNED_FROM, CONTINUATION, CONTRADICTED. Over time, this forms a directed graph that enables traversal by meaning and causality. You can follow the chain: "this incident caused that investigation, which led to a failed fix, which was corrected by this approach."'}),"\n",(0,s.jsx)(t.h3,{id:"layer-4-generalized-facts",children:"Layer 4: Generalized Facts"}),"\n",(0,s.jsx)(t.p,{children:'When multiple episodes exhibit consistent patterns, the system extracts reasoning heuristics: "When services fail immediately after deployment with no traffic change, investigate configuration errors before connection pool problems." Facts are versioned, never overwritten, and maintain links back to supporting and contradicting episodes. When contradicting evidence accumulates, confidence decreases. When confidence drops below a threshold, the fact is revised \u2014 but the old version is preserved.'}),"\n",(0,s.jsx)(t.p,{children:"The LLM sits above all four layers. At query time, the system assembles structured context \u2014 relevant episodes with outcomes, applicable facts with confidence scores, causal narratives \u2014 and passes it to the LLM for reasoning. The model reasons over structured memory. 
It doesn't store or manage memory."}),"\n",(0,s.jsx)(t.h3,{id:"the-reinforcement-loop",children:"The Reinforcement Loop"}),"\n",(0,s.jsx)(t.p,{children:"This is where it comes together:"}),"\n",(0,s.jsxs)(t.ol,{children:["\n",(0,s.jsx)(t.li,{children:"Agent reasons using retrieved episodes and facts"}),"\n",(0,s.jsx)(t.li,{children:"Outcome is detected (CI pass/fail, user correction, test result)"}),"\n",(0,s.jsx)(t.li,{children:"New episode is created with outcome tracking"}),"\n",(0,s.jsx)(t.li,{children:"Links are created between the retrieved episodes and the new episode"}),"\n",(0,s.jsx)(t.li,{children:"Facts are reinforced (if outcome aligned) or contradicted (if outcome conflicted)"}),"\n",(0,s.jsx)(t.li,{children:"If the decision was wrong and corrected, a LEARNED_FROM link is created"}),"\n"]}),"\n",(0,s.jsx)(t.p,{children:"The model weights never change. The memory structure evolves continuously. A frozen LLM produces better decisions over time because it receives better context from richer memory."}),"\n",(0,s.jsx)(t.h2,{id:"the-experiment",children:"The Experiment"}),"\n",(0,s.jsx)(t.p,{children:"We built the full system in Python (~1,000 lines) and tested it head-to-head against a baseline flat-vector RAG agent across a 9-round synthetic debugging scenario. Both agents used the identical LLM (Claude Sonnet 4) for reasoning. 
The only variable was the memory system."}),"\n",(0,s.jsx)(t.p,{children:"The scenario was designed to test five capabilities:"}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{children:"Round Type"}),(0,s.jsx)(t.th,{children:"What It Tests"}),(0,s.jsx)(t.th,{children:"Rounds"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"LEARN"}),(0,s.jsx)(t.td,{children:"Can the agent build experience from failures?"}),(0,s.jsx)(t.td,{children:"1, 2, 4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"RED HERRING"}),(0,s.jsx)(t.td,{children:"Can the agent resist applying a pattern when it doesn't fit?"}),(0,s.jsx)(t.td,{children:"3"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TEST"}),(0,s.jsx)(t.td,{children:"Can the agent apply learned patterns to new services?"}),(0,s.jsx)(t.td,{children:"5, 6"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"SUBTLE"}),(0,s.jsx)(t.td,{children:"Can the agent generalize to different symptoms, same root cause?"}),(0,s.jsx)(t.td,{children:"7"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"CORRECTION"}),(0,s.jsx)(t.td,{children:"After being corrected, does the agent adapt?"}),(0,s.jsx)(t.td,{children:"8, 9"})]})]})]}),"\n",(0,s.jsxs)(t.p,{children:["Rounds 1-4 build experience: three connection pool failures across different services, plus one red herring (a deployment config error that ",(0,s.jsx)(t.em,{children:"looks"})," like a connection pool issue). Rounds 5-7 test whether the agent applies the learned pattern to unfamiliar services and subtle symptom variations. 
Rounds 8-9 are the critical test: the agent is corrected after misdiagnosing a deployment-correlated error, then tested on a near-identical scenario to see if it adapts."]}),"\n",(0,s.jsx)(t.h2,{id:"results",children:"Results"}),"\n",(0,s.jsx)(t.h3,{id:"decision-accuracy",children:"Decision Accuracy"}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{children:"Round"}),(0,s.jsx)(t.th,{children:"Type"}),(0,s.jsx)(t.th,{children:"Episodic Agent"}),(0,s.jsx)(t.th,{children:"Baseline Agent"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"1"}),(0,s.jsx)(t.td,{children:"LEARN"}),(0,s.jsx)(t.td,{children:"\u2717"}),(0,s.jsx)(t.td,{children:"\u2713"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"2"}),(0,s.jsx)(t.td,{children:"LEARN"}),(0,s.jsx)(t.td,{children:"\u2713"}),(0,s.jsx)(t.td,{children:"\u2713"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"3"}),(0,s.jsx)(t.td,{children:"RED 
HERRING"}),(0,s.jsx)(t.td,{children:"\u2717"}),(0,s.jsx)(t.td,{children:"\u2717"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"4"}),(0,s.jsx)(t.td,{children:"LEARN"}),(0,s.jsx)(t.td,{children:"\u2713"}),(0,s.jsx)(t.td,{children:"\u2713"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"5"}),(0,s.jsx)(t.td,{children:"TEST"}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"\u2713"})}),(0,s.jsx)(t.td,{children:"\u2717"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"6"}),(0,s.jsx)(t.td,{children:"TEST"}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"\u2713"})}),(0,s.jsx)(t.td,{children:"\u2717"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"7"}),(0,s.jsx)(t.td,{children:"SUBTLE"}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"\u2713"})}),(0,s.jsx)(t.td,{children:"\u2717"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"8"}),(0,s.jsx)(t.td,{children:"CORRECTION"}),(0,s.jsx)(t.td,{children:"\u2713"}),(0,s.jsx)(t.td,{children:"\u2713"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"9"}),(0,s.jsx)(t.td,{children:"CORRECTION"}),(0,s.jsx)(t.td,{children:"\u2713"}),(0,s.jsx)(t.td,{children:"\u2713"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"Total"})}),(0,s.jsx)(t.td,{}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"7/9 (78%)"})}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"5/9 (56%)"})})]})]})]}),"\n",(0,s.jsx)(t.p,{children:"The episodic agent won 7-5. 
A 40% relative improvement in decision accuracy using the exact same LLM."}),"\n",(0,s.jsx)(t.h3,{id:"where-the-gap-opened",children:"Where the Gap Opened"}),"\n",(0,s.jsx)(t.p,{children:"The episodic agent's advantage concentrated in exactly the rounds designed to test memory quality:"}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Rounds 5-6 (pattern application):"})," The episodic agent cited 4 past failure episodes with connection pool exhaustion as root cause, complete with correction annotations. It correctly identified pool exhaustion in new services. The baseline retrieved disconnected chunks and suggested checking timeout configurations \u2014 a pattern it picked up from the Round 3 red herring."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Round 7 (subtle symptoms \u2014 latency increase, no errors):"}),' Both agents had the same evidence available. The episodic agent\'s retrieval surfaced a diverse set of episodes (thanks to MMR diversity filtering) including the Redis pool exhaustion from Round 6, which primed it to recognize that latency without errors can still be pool contention. The baseline defaulted to "check recent config changes."']}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Round 9 (adaptation after correction):"})," This is the result we're most proud of. Look at the episodic agent's reasoning:"]}),"\n",(0,s.jsxs)(t.blockquote,{children:["\n",(0,s.jsx)(t.p,{children:(0,s.jsx)(t.em,{children:'"Episode 1 directly parallels this situation \u2014 errors spiking immediately after a deployment (v2.4.1 then, v3.1.0 now) with no traffic change. In that case, the root cause was a database migration that dropped an index. 
The generalized fact confirms that deployment-related issues with immediate onset after version changes are more likely caused by configuration errors or missing dependencies than by connection pool problems."'})}),"\n"]}),"\n",(0,s.jsxs)(t.p,{children:["It cited a specific past episode by analogy, quoted a generalized fact, and explained ",(0,s.jsx)(t.em,{children:"why"})," this situation matches the deployment pattern rather than the connection pool pattern. The baseline gave a vaguer assessment."]}),"\n",(0,s.jsx)(t.h3,{id:"retrieval-quality",children:"Retrieval Quality"}),"\n",(0,s.jsx)(t.p,{children:"This is where the structural difference is most visible:"}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{children:"Metric"}),(0,s.jsx)(t.th,{children:"Episodic Agent"}),(0,s.jsx)(t.th,{children:"Baseline Agent"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"Retrieved items with explicit outcome labels"}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"100%"})}),(0,s.jsx)(t.td,{children:"25%"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"Correct pattern applications (Rounds 4-7)"}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"4/4"})}),(0,s.jsx)(t.td,{children:"1/4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"False positives (Rounds 8-9)"}),(0,s.jsx)(t.td,{children:(0,s.jsx)(t.strong,{children:"0"})}),(0,s.jsx)(t.td,{children:"0"})]})]})]}),"\n",(0,s.jsx)(t.p,{children:"Every item the episodic agent retrieved carried a structured outcome label (SUCCESS or FAILURE) with correction details. 
Only 25% of the baseline's chunks contained any outcome information \u2014 and those were incidental text mentions, not structured labels."}),"\n",(0,s.jsx)(t.p,{children:"The episodic agent correctly applied the connection pool pattern in all four rounds where it was the root cause, and correctly avoided it in both rounds where it wasn't. The baseline applied it correctly once."}),"\n",(0,s.jsx)(t.h2,{id:"what-didnt-work",children:"What Didn't Work"}),"\n",(0,s.jsx)(t.p,{children:"Two things didn't work as anticipated:"}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Round 3 (red herring):"})," Both agents failed. The symptoms looked like connection pool issues, but the root cause was a deployment config change. At this point, the episodic agent had only seen connection pool episodes \u2014 it had no counter-evidence for deployment-correlated errors. You can't distinguish patterns you've only seen one side of. After Round 8 introduced a correction, the agent successfully avoided this mistake in Round 9."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Fact quality variance."}),' Some extracted facts were specific and actionable ("Deployment-related issues with immediate onset are more likely configuration errors"). Others were vague ("Initial symptom-based diagnosis often leads to misidentifying the root cause"). A production system needs a usefulness filter, not just a confidence score.']}),"\n",(0,s.jsx)(t.h2,{id:"what-this-means",children:"What This Means"}),"\n",(0,s.jsx)(t.p,{children:"The most important finding isn't the accuracy improvement. 
It's that the reinforcement loop closes without retraining."}),"\n",(0,s.jsx)(t.p,{children:"In the POC, we observed:"}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsx)(t.li,{children:"Rounds 1-4: Agent encounters failures, episodes recorded with outcomes and corrections"}),"\n",(0,s.jsx)(t.li,{children:'After Round 4: Fact extracted \u2014 "Connection pool exhaustion is a common root cause under load"'}),"\n",(0,s.jsx)(t.li,{children:"Rounds 5-7: Agent applies the pattern with increasing confidence (fact support count grows)"}),"\n",(0,s.jsx)(t.li,{children:"Round 8: Agent encounters a deployment error, correctly identifies it as config, gets corrected"}),"\n",(0,s.jsx)(t.li,{children:'After Round 8: New fact \u2014 "Deployment-related issues with immediate onset are more likely configuration errors"'}),"\n",(0,s.jsx)(t.li,{children:"Round 9: Agent receives near-identical scenario, correctly avoids connection pool pattern, cites the Round 8 correction"}),"\n"]}),"\n",(0,s.jsx)(t.p,{children:"The model didn't change. The memory evolved. That's the whole point."}),"\n",(0,s.jsx)(t.h2,{id:"how-it-compares-to-existing-solutions",children:"How It Compares to Existing Solutions"}),"\n",(0,s.jsx)(t.p,{children:"Agent memory is a fast-moving space with several strong systems, each solving a different slice of the problem:"}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Mem0"})," excels at persistent personalization \u2014 extracting user preferences, managing session context, and reducing token costs through intelligent compression. It's the most production-ready memory layer available and integrates with nearly every agent framework. Its focus is on remembering about users and conversations rather than learning from task-level outcomes, which is a different problem than the one we're exploring here."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Zep/Graphiti"})," is doing some of the most interesting work in temporal knowledge graphs. 
Their bi-temporal model \u2014 tracking both when an event occurred and when it was ingested \u2014 addresses a real structural gap in how agent memory handles changing facts over time. Their episode and entity subgraphs share some philosophical DNA with our approach. Where our work diverges is in outcome tracking and reinforcement: we're specifically focused on whether a decision worked, and using that signal to update memory structure."]}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"Letta (formerly MemGPT)"}),' pioneered self-editing memory \u2014 giving the LLM tools to manage its own memory blocks. This is a powerful paradigm, and their recent work on "Context Repositories" and sleep-time compute suggests they\'re actively pushing toward agents that learn over time. Their team has been transparent that experiential learning is an unsolved problem, which is part of what motivated our exploration.']}),"\n",(0,s.jsxs)(t.p,{children:[(0,s.jsx)(t.strong,{children:"MemRL (Jan 2026 paper)"})," is the closest to our work academically. It shares the core insight of decoupling stable LLM reasoning from plastic, evolving memory. Their approach uses reinforcement learning to assign utility Q-values to memories, which is elegant but requires training a value function. Our approach is purely structural \u2014 no training step, no Q-values, just graph evolution and LLM-based reasoning over outcomes."]}),"\n",(0,s.jsx)(t.p,{children:"The common thread: most existing systems focus on knowledge persistence \u2014 remembering facts, preferences, and conversation history across sessions. The problem we're exploring is experiential learning \u2014 tracking whether past decisions worked, forming causal chains between episodes, and extracting reasoning heuristics that improve over time. 
These are complementary capabilities that would be needed by an ideal production system."}),"\n",(0,s.jsx)(t.h2,{id:"try-it-yourself",children:"Try It Yourself"}),"\n",(0,s.jsx)(t.p,{children:"The prototype is available in our experiments directory:"}),"\n",(0,s.jsx)(t.pre,{children:(0,s.jsx)(t.code,{children:"experiments/episodic-memory-prototype/\n\u251c\u2500\u2500 memory/ # Timeline, encoder, episodes, graph, facts, retriever, reinforcer\n\u251c\u2500\u2500 agent/ # Episodic memory agent\n\u251c\u2500\u2500 baseline/ # Flat vector RAG agent (comparison)\n\u251c\u2500\u2500 simulator/ # 9-round debugging scenario\n\u251c\u2500\u2500 eval/ # Head-to-head comparison + scoring\n\u2514\u2500\u2500 tests/\n"})}),"\n",(0,s.jsx)(t.p,{children:"To run the comparison:"}),"\n",(0,s.jsx)(t.pre,{children:(0,s.jsx)(t.code,{className:"language-bash",children:"cd experiments/episodic-memory-prototype\npython -m venv .venv && source .venv/bin/activate\npip install -r requirements.txt\nexport ANTHROPIC_API_KEY=sk-ant-...\npython -m eval.compare\n"})}),"\n",(0,s.jsx)(t.p,{children:"Without an API key, it runs in heuristic mode (keyword-based decisions). With a key, both agents use Claude Sonnet for reasoning \u2014 that's where the quality gap becomes visible."}),"\n",(0,s.jsx)(t.h2,{id:"conclusion",children:"Conclusion"}),"\n",(0,s.jsx)(t.p,{children:"This is a 9-round synthetic scenario we designed. It demonstrates the poc architecture works end-to-end and shows where episodic memory provides qualitatively different reasoning. It is not a peer-reviewed benchmark and should not be interpreted as a statistically rigorous claim. 
We're publishing the prototype so others can reproduce and extend the evaluation.\nIf this sparks interest do trigger github discussion."}),"\n",(0,s.jsx)(t.hr,{}),"\n",(0,s.jsx)(t.p,{children:(0,s.jsxs)(t.em,{children:["The episodic memory prototype is available in ",(0,s.jsx)(t.code,{children:"BharatMLStack"})," repo at ",(0,s.jsx)(t.code,{children:"/experiments/episodic-memory-prototype"})]})})]})}function h(e={}){const{wrapper:t}={...(0,r.R)(),...e.components};return t?(0,s.jsx)(t,{...e,children:(0,s.jsx)(l,{...e})}):l(e)}},3306:e=>{e.exports=JSON.parse('{"permalink":"/BharatMLStack/blog/episodic-memory-for-agents","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/episodic-memory-for-agents/index.md","source":"@site/blog/bharatmlstack-history/episodic-memory-for-agents/index.md","title":"Beyond Vector RAG: Building Agent Memory That Learns From Experience.","description":"Current agent memory is just search. We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.","date":"2026-02-19T00:00:00.000Z","tags":[{"inline":true,"label":"ai-agents","permalink":"/BharatMLStack/blog/tags/ai-agents"},{"inline":true,"label":"memory","permalink":"/BharatMLStack/blog/tags/memory"},{"inline":true,"label":"architecture","permalink":"/BharatMLStack/blog/tags/architecture"},{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"episodic-memory","permalink":"/BharatMLStack/blog/tags/episodic-memory"}],"readingTime":11.67,"hasTruncateMarker":true,"authors":[{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"title":"Beyond Vector RAG: Building Agent Memory That Learns From Experience.","description":"Current agent memory is just search. 
We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.","slug":"episodic-memory-for-agents","authors":["adarsha"],"date":"2026-02-19T00:00:00.000Z","tags":["ai-agents","memory","architecture","llm","episodic-memory"]},"unlisted":false,"nextItem":{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","permalink":"/BharatMLStack/blog/llm-inference-optimization-sub-sec-latency"}}')},8453:(e,t,n)=>{n.d(t,{R:()=>a,x:()=>o});var i=n(6540);const s={},r=i.createContext(s);function a(e){const t=i.useContext(r);return i.useMemo(function(){return"function"==typeof e?e(t):{...t,...e}},[t,e])}function o(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(s):e.components||s:a(e.components),i.createElement(r.Provider,{value:t},e.children)}}}]); \ No newline at end of file diff --git a/docs/assets/js/4dd73b28.f4ce8cb6.js b/docs/assets/js/4dd73b28.06495428.js similarity index 63% rename from docs/assets/js/4dd73b28.f4ce8cb6.js rename to docs/assets/js/4dd73b28.06495428.js index 384a9035..c53aabc3 100644 --- a/docs/assets/js/4dd73b28.f4ce8cb6.js +++ b/docs/assets/js/4dd73b28.06495428.js @@ -1 +1 @@ -"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[4797],{762:(e,t,a)=>{a.d(t,{A:()=>r});const r=a.p+"assets/images/bms-7399e8796d2cd24617c432518ce3f312.png"},3306:e=>{e.exports=JSON.parse('{"permalink":"/BharatMLStack/blog/episodic-memory-for-agents","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/episodic-memory-for-agents/index.md","source":"@site/blog/bharatmlstack-history/episodic-memory-for-agents/index.md","title":"Beyond Vector RAG: Building Agent Memory That Learns From Experience.","description":"Current agent memory is just search. 
We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.","date":"2026-02-19T00:00:00.000Z","tags":[{"inline":true,"label":"ai-agents","permalink":"/BharatMLStack/blog/tags/ai-agents"},{"inline":true,"label":"memory","permalink":"/BharatMLStack/blog/tags/memory"},{"inline":true,"label":"architecture","permalink":"/BharatMLStack/blog/tags/architecture"},{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"episodic-memory","permalink":"/BharatMLStack/blog/tags/episodic-memory"}],"readingTime":11.61,"hasTruncateMarker":true,"authors":[{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"title":"Beyond Vector RAG: Building Agent Memory That Learns From Experience.","description":"Current agent memory is just search. We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.","slug":"episodic-memory-for-agents","authors":["adarsha"],"date":"2026-02-19T00:00:00.000Z","tags":["ai-agents","memory","architecture","llm","episodic-memory"]},"unlisted":false,"nextItem":{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","permalink":"/BharatMLStack/blog/llm-inference-optimization-sub-sec-latency"}}')},4043:(e,t,a)=>{a.r(t),a.d(t,{assets:()=>c,contentTitle:()=>o,default:()=>u,frontMatter:()=>s,metadata:()=>r,toc:()=>m});var r=a(3306),n=a(4848),i=a(8453);const s={title:"Beyond Vector RAG: Building Agent Memory That Learns From Experience.",description:"Current agent memory is just search. 
We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.",slug:"episodic-memory-for-agents",authors:["adarsha"],date:new Date("2026-02-19T00:00:00.000Z"),tags:["ai-agents","memory","architecture","llm","episodic-memory"]},o=void 0,c={authorsImageUrls:[void 0]},m=[];function l(e){const t={img:"img",p:"p",...(0,i.R)(),...e.components};return(0,n.jsxs)(n.Fragment,{children:[(0,n.jsxs)(t.p,{children:[(0,n.jsx)(t.img,{alt:"BharatMLStack",src:a(762).A+"",width:"1396",height:"460"}),'\nEvery agent framework on the market will tell you their agents "have memory." What they mean is: they have a vector database.']}),"\n",(0,n.jsx)(t.p,{children:"They chunk text, embed it, store it, and retrieve whatever looks similar at query time. This works for document Q&A. It fails the moment you expect an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach."}),"\n",(0,n.jsx)(t.p,{children:"We are trying to built something different. An episodic memory system where a frozen LLM \u2014 same weights, no retraining \u2014 produces increasingly better decisions over time because the memory feeding it context is continuously evolving."}),"\n",(0,n.jsx)(t.p,{children:"Then we tested it. 
The results surprised us."})]})}function u(e={}){const{wrapper:t}={...(0,i.R)(),...e.components};return t?(0,n.jsx)(t,{...e,children:(0,n.jsx)(l,{...e})}):l(e)}},8453:(e,t,a)=>{a.d(t,{R:()=>s,x:()=>o});var r=a(6540);const n={},i=r.createContext(n);function s(e){const t=r.useContext(i);return r.useMemo(function(){return"function"==typeof e?e(t):{...t,...e}},[t,e])}function o(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(n):e.components||n:s(e.components),r.createElement(i.Provider,{value:t},e.children)}}}]); \ No newline at end of file +"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[4797],{762:(e,t,a)=>{a.d(t,{A:()=>r});const r=a.p+"assets/images/bms-7399e8796d2cd24617c432518ce3f312.png"},3306:e=>{e.exports=JSON.parse('{"permalink":"/BharatMLStack/blog/episodic-memory-for-agents","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/episodic-memory-for-agents/index.md","source":"@site/blog/bharatmlstack-history/episodic-memory-for-agents/index.md","title":"Beyond Vector RAG: Building Agent Memory That Learns From Experience.","description":"Current agent memory is just search. 
We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.","date":"2026-02-19T00:00:00.000Z","tags":[{"inline":true,"label":"ai-agents","permalink":"/BharatMLStack/blog/tags/ai-agents"},{"inline":true,"label":"memory","permalink":"/BharatMLStack/blog/tags/memory"},{"inline":true,"label":"architecture","permalink":"/BharatMLStack/blog/tags/architecture"},{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"episodic-memory","permalink":"/BharatMLStack/blog/tags/episodic-memory"}],"readingTime":11.67,"hasTruncateMarker":true,"authors":[{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"title":"Beyond Vector RAG: Building Agent Memory That Learns From Experience.","description":"Current agent memory is just search. We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.","slug":"episodic-memory-for-agents","authors":["adarsha"],"date":"2026-02-19T00:00:00.000Z","tags":["ai-agents","memory","architecture","llm","episodic-memory"]},"unlisted":false,"nextItem":{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","permalink":"/BharatMLStack/blog/llm-inference-optimization-sub-sec-latency"}}')},4043:(e,t,a)=>{a.r(t),a.d(t,{assets:()=>c,contentTitle:()=>o,default:()=>u,frontMatter:()=>s,metadata:()=>r,toc:()=>l});var r=a(3306),n=a(4848),i=a(8453);const s={title:"Beyond Vector RAG: Building Agent Memory That Learns From Experience.",description:"Current agent memory is just search. 
We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.",slug:"episodic-memory-for-agents",authors:["adarsha"],date:new Date("2026-02-19T00:00:00.000Z"),tags:["ai-agents","memory","architecture","llm","episodic-memory"]},o=void 0,c={authorsImageUrls:[void 0]},l=[];function m(e){const t={img:"img",p:"p",...(0,i.R)(),...e.components};return(0,n.jsxs)(n.Fragment,{children:[(0,n.jsxs)(t.p,{children:[(0,n.jsx)(t.img,{alt:"BharatMLStack",src:a(762).A+"",width:"1396",height:"460"}),"\nAgent memory has come a long way. Persistent context, vector retrieval, knowledge graphs \u2014 the building blocks are real and getting better fast."]}),"\n",(0,n.jsx)(t.p,{children:'But most of what we call "memory" today is still closer to search: chunk text, embed it, retrieve whatever looks similar at query time. That works well for recalling facts and preferences. It starts to break down when you need an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.'}),"\n",(0,n.jsx)(t.p,{children:"We are trying to experiment something different. An episodic memory system where a frozen LLM \u2014 same weights, no retraining \u2014 produces increasingly better decisions over time because the memory feeding it context is continuously evolving.\nThen we tested it. 
The results were interesting."})]})}function u(e={}){const{wrapper:t}={...(0,i.R)(),...e.components};return t?(0,n.jsx)(t,{...e,children:(0,n.jsx)(m,{...e})}):m(e)}},8453:(e,t,a)=>{a.d(t,{R:()=>s,x:()=>o});var r=a(6540);const n={},i=r.createContext(n);function s(e){const t=r.useContext(i);return r.useMemo(function(){return"function"==typeof e?e(t):{...t,...e}},[t,e])}function o(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(n):e.components||n:s(e.components),r.createElement(i.Provider,{value:t},e.children)}}}]); \ No newline at end of file diff --git a/docs/assets/js/6479fb86.1b301bf1.js b/docs/assets/js/6479fb86.1b301bf1.js deleted file mode 100644 index f4acc381..00000000 --- a/docs/assets/js/6479fb86.1b301bf1.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[5579],{3751:e=>{e.exports=JSON.parse('{"archive":{"blogPosts":[{"id":"episodic-memory-for-agents","metadata":{"permalink":"/BharatMLStack/blog/episodic-memory-for-agents","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/episodic-memory-for-agents/index.md","source":"@site/blog/bharatmlstack-history/episodic-memory-for-agents/index.md","title":"Beyond Vector RAG: Building Agent Memory That Learns From Experience.","description":"Current agent memory is just search. 
We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.","date":"2026-02-19T00:00:00.000Z","tags":[{"inline":true,"label":"ai-agents","permalink":"/BharatMLStack/blog/tags/ai-agents"},{"inline":true,"label":"memory","permalink":"/BharatMLStack/blog/tags/memory"},{"inline":true,"label":"architecture","permalink":"/BharatMLStack/blog/tags/architecture"},{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"episodic-memory","permalink":"/BharatMLStack/blog/tags/episodic-memory"}],"readingTime":11.61,"hasTruncateMarker":true,"authors":[{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"title":"Beyond Vector RAG: Building Agent Memory That Learns From Experience.","description":"Current agent memory is just search. We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.","slug":"episodic-memory-for-agents","authors":["adarsha"],"date":"2026-02-19T00:00:00.000Z","tags":["ai-agents","memory","architecture","llm","episodic-memory"]},"unlisted":false,"nextItem":{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","permalink":"/BharatMLStack/blog/llm-inference-optimization-sub-sec-latency"}},"content":"![BharatMLStack](./bms.png)\\nEvery agent framework on the market will tell you their agents \\"have memory.\\" What they mean is: they have a vector database.\\n\\nThey chunk text, embed it, store it, and retrieve whatever looks similar at query time. This works for document Q&A. 
It fails the moment you expect an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.\\n\\nWe are trying to built something different. An episodic memory system where a frozen LLM \u2014 same weights, no retraining \u2014 produces increasingly better decisions over time because the memory feeding it context is continuously evolving.\\n\\nThen we tested it. The results surprised us.\\n\\n\x3c!-- truncate --\x3e\\n\\n## The Gap Nobody Talks About\\n\\nHere\'s a scenario every engineering team has encountered: AI agent hits a Redis connection pool exhaustion issue. It misdiagnoses it as a database problem. You correct it. Next week, a different service has the exact same failure pattern. The agent makes the exact same mistake.\\n\\nWhy? Because LLMs don\'t learn at inference time. Corrections adjust behavior within a conversation. Once the session ends, the lesson is gone. The model weights haven\'t changed. The next conversation starts from zero.\\n\\nCurrent \\"memory\\" systems don\'t fully address this. They store facts \u2014 user preferences, document chunks, conversation summaries. But facts aren\'t experience. Knowing that \\"Redis connection pools can exhaust under load\\" is different from remembering \\"last time I saw 500 errors under load, I assumed it was the database, I was wrong, it was actually the connection pool, and here\'s the correction I received.\\"\\n\\nThe first is a fact. The second is an episode. The difference matters.\\n\\n## What\'s Wrong With Vector RAG as Memory\\n\\nWe identified five structural gaps in how current agent frameworks handle memory:\\n\\n**No concept of time.** Two events are either semantically similar or they\'re not. The system can\'t represent \\"this happened after that\\" without distorting similarity scores. 
An agent can\'t reason about sequence or causality.\\n\\n**No concept of situation.** A production incident and a design review might use the same technical vocabulary. Flat vector search can\'t distinguish them. Your agent retrieves planning notes when it should be retrieving incident postmortems.\\n\\n**No outcome tracking.** The system stores *what happened* but not *whether it worked*. A failed approach and a successful one are equally retrievable. The agent has no way to prefer strategies that worked over strategies that didn\'t.\\n\\n**Summaries destroy evidence.** Summarization-based memory compresses experience but discards the reasoning chain. The agent loses the ability to explain *how* it arrived at a conclusion. The audit trail is gone.\\n\\n**No causal links.** Each memory chunk is independent. There\'s no way to express that incident A caused decision B, which led to outcome C, which was corrected by approach D. Without this structure, the agent can\'t traverse chains of reasoning.\\n\\nThese gaps compound. As an agent accumulates more experience, flat vector memory gets noisier, more contradictory, and less useful. The system degrades precisely when it should be improving.\\n\\n## The Architecture: Episodic Memory\\n\\nWe are building a memory system modeled on how human episodic memory works \u2014 not as a metaphor, but as an engineering specification.\\n\\nThe system has four layers:\\n\\n### Layer 1: Immutable Timeline\\n\\nEvery piece of agent experience is recorded as an append-only timeline entry. Each entry carries a semantic embedding (what it means), a timestamp (when it happened), and a state label (what situation the agent was in \u2014 debugging, planning, code review, incident response). Entries are never modified, never deleted, never summarized. 
This is the source of truth.\\n\\n### Layer 2: Episode Segmentation\\n\\nThe system watches the timeline and detects when one coherent unit of experience ends and another begins \u2014 via state transitions, semantic shifts, temporal gaps, or explicit signals. Each episode is a reference into the timeline (not a copy) with a generated summary, an outcome (SUCCESS, FAILURE, PARTIAL, UNKNOWN), decisions made, assumptions held, and corrections received.\\n\\nThe outcome field is the most important thing that doesn\'t exist in any current memory system. Without it, you can\'t learn from mistakes.\\n\\n### Layer 3: Episodic Graph\\n\\nEpisodes are connected through typed, weighted links: CAUSED_BY, LED_TO, RETRY_OF, LEARNED_FROM, CONTINUATION, CONTRADICTED. Over time, this forms a directed graph that enables traversal by meaning and causality. You can follow the chain: \\"this incident caused that investigation, which led to a failed fix, which was corrected by this approach.\\"\\n\\n### Layer 4: Generalized Facts\\n\\nWhen multiple episodes exhibit consistent patterns, the system extracts reasoning heuristics: \\"When services fail immediately after deployment with no traffic change, investigate configuration errors before connection pool problems.\\" Facts are versioned, never overwritten, and maintain links back to supporting and contradicting episodes. When contradicting evidence accumulates, confidence decreases. When confidence drops below a threshold, the fact is revised \u2014 but the old version is preserved.\\n\\nThe LLM sits above all four layers. At query time, the system assembles structured context \u2014 relevant episodes with outcomes, applicable facts with confidence scores, causal narratives \u2014 and passes it to the LLM for reasoning. The model reasons over structured memory. It doesn\'t store or manage memory.\\n\\n### The Reinforcement Loop\\n\\nThis is where it comes together:\\n\\n1. Agent reasons using retrieved episodes and facts\\n2. 
Outcome is detected (CI pass/fail, user correction, test result)\\n3. New episode is created with outcome tracking\\n4. Links are created between the retrieved episodes and the new episode\\n5. Facts are reinforced (if outcome aligned) or contradicted (if outcome conflicted)\\n6. If the decision was wrong and corrected, a LEARNED_FROM link is created\\n\\nThe model weights never change. The memory structure evolves continuously. A frozen LLM produces better decisions over time because it receives better context from richer memory.\\n\\n## The Experiment\\n\\nWe built the full system in Python (~1,000 lines) and tested it head-to-head against a baseline flat-vector RAG agent across a 9-round synthetic debugging scenario. Both agents used the identical LLM (Claude Sonnet 4) for reasoning. The only variable was the memory system.\\n\\nThe scenario was designed to test five capabilities:\\n\\n| Round Type | What It Tests | Rounds |\\n|---|---|---|\\n| LEARN | Can the agent build experience from failures? | 1, 2, 4 |\\n| RED HERRING | Can the agent resist applying a pattern when it doesn\'t fit? | 3 |\\n| TEST | Can the agent apply learned patterns to new services? | 5, 6 |\\n| SUBTLE | Can the agent generalize to different symptoms, same root cause? | 7 |\\n| CORRECTION | After being corrected, does the agent adapt? | 8, 9 |\\n\\nRounds 1-4 build experience: three connection pool failures across different services, plus one red herring (a deployment config error that *looks* like a connection pool issue). Rounds 5-7 test whether the agent applies the learned pattern to unfamiliar services and subtle symptom variations. 
Rounds 8-9 are the critical test: the agent is corrected after misdiagnosing a deployment-correlated error, then tested on a near-identical scenario to see if it adapts.\\n\\n## Results\\n\\n### Decision Accuracy\\n\\n| Round | Type | Episodic Agent | Baseline Agent |\\n|---|---|---|---|\\n| 1 | LEARN | \u2717 | \u2713 |\\n| 2 | LEARN | \u2713 | \u2713 |\\n| 3 | RED HERRING | \u2717 | \u2717 |\\n| 4 | LEARN | \u2713 | \u2713 |\\n| 5 | TEST | **\u2713** | \u2717 |\\n| 6 | TEST | **\u2713** | \u2717 |\\n| 7 | SUBTLE | **\u2713** | \u2717 |\\n| 8 | CORRECTION | \u2713 | \u2713 |\\n| 9 | CORRECTION | \u2713 | \u2713 |\\n| **Total** | | **7/9 (78%)** | **5/9 (56%)** |\\n\\nThe episodic agent won 7-5. A 40% relative improvement in decision accuracy using the exact same LLM.\\n\\n### Where the Gap Opened\\n\\nThe episodic agent\'s advantage concentrated in exactly the rounds designed to test memory quality:\\n\\n**Rounds 5-6 (pattern application):** The episodic agent cited 4 past failure episodes with connection pool exhaustion as root cause, complete with correction annotations. It correctly identified pool exhaustion in new services. The baseline retrieved disconnected chunks and suggested checking timeout configurations \u2014 a pattern it picked up from the Round 3 red herring.\\n\\n**Round 7 (subtle symptoms \u2014 latency increase, no errors):** Both agents had the same evidence available. The episodic agent\'s retrieval surfaced a diverse set of episodes (thanks to MMR diversity filtering) including the Redis pool exhaustion from Round 6, which primed it to recognize that latency without errors can still be pool contention. The baseline defaulted to \\"check recent config changes.\\"\\n\\n**Round 9 (adaptation after correction):** This is the result we\'re most proud of. 
Look at the episodic agent\'s reasoning:\\n\\n> *\\"Episode 1 directly parallels this situation \u2014 errors spiking immediately after a deployment (v2.4.1 then, v3.1.0 now) with no traffic change. In that case, the root cause was a database migration that dropped an index. The generalized fact confirms that deployment-related issues with immediate onset after version changes are more likely caused by configuration errors or missing dependencies than by connection pool problems.\\"*\\n\\nIt cited a specific past episode by analogy, quoted a generalized fact, and explained *why* this situation matches the deployment pattern rather than the connection pool pattern. The baseline gave a vaguer assessment.\\n\\n### Retrieval Quality\\n\\nThis is where the structural difference is most visible:\\n\\n| Metric | Episodic Agent | Baseline Agent |\\n|---|---|---|\\n| Retrieved items with explicit outcome labels | **100%** | 25% |\\n| Correct pattern applications (Rounds 4-7) | **4/4** | 1/4 |\\n| False positives (Rounds 8-9) | **0** | 0 |\\n\\nEvery item the episodic agent retrieved carried a structured outcome label (SUCCESS or FAILURE) with correction details. Only 25% of the baseline\'s chunks contained any outcome information \u2014 and those were incidental text mentions, not structured labels.\\n\\nThe episodic agent correctly applied the connection pool pattern in all four rounds where it was the root cause, and correctly avoided it in both rounds where it wasn\'t. The baseline applied it correctly once.\\n\\n## What Didn\'t Work\\n\\nTwo things didn\'t work as anticipated:\\n\\n**Round 3 (red herring):** Both agents failed. The symptoms looked like connection pool issues, but the root cause was a deployment config change. At this point, the episodic agent had only seen connection pool episodes \u2014 it had no counter-evidence for deployment-correlated errors. You can\'t distinguish patterns you\'ve only seen one side of. 
After Round 8 introduced a correction, the agent successfully avoided this mistake in Round 9.\\n\\n**Fact quality variance.** Some extracted facts were specific and actionable (\\"Deployment-related issues with immediate onset are more likely configuration errors\\"). Others were vague (\\"Initial symptom-based diagnosis often leads to misidentifying the root cause\\"). A production system needs a usefulness filter, not just a confidence score.\\n\\n## What This Means\\n\\nThe most important finding isn\'t the accuracy improvement. It\'s that the reinforcement loop closes without retraining.\\n\\nIn the POC, we observed:\\n\\n- Rounds 1-4: Agent encounters failures, episodes recorded with outcomes and corrections\\n- After Round 4: Fact extracted \u2014 \\"Connection pool exhaustion is a common root cause under load\\"\\n- Rounds 5-7: Agent applies the pattern with increasing confidence (fact support count grows)\\n- Round 8: Agent encounters a deployment error, correctly identifies it as config, gets corrected\\n- After Round 8: New fact \u2014 \\"Deployment-related issues with immediate onset are more likely configuration errors\\"\\n- Round 9: Agent receives near-identical scenario, correctly avoids connection pool pattern, cites the Round 8 correction\\n\\nThe model didn\'t change. The memory evolved. That\'s the whole point.\\n\\n## How It Compares to Existing Solutions\\n\\nAgent memory is a fast-moving space with several strong systems, each solving a different slice of the problem:\\n\\n**Mem0** excels at persistent personalization \u2014 extracting user preferences, managing session context, and reducing token costs through intelligent compression. It\'s the most production-ready memory layer available and integrates with nearly every agent framework. 
Its focus is on remembering about users and conversations rather than learning from task-level outcomes, which is a different problem than the one we\'re exploring here.\\n\\n**Zep/Graphiti** is doing some of the most interesting work in temporal knowledge graphs. Their bi-temporal model \u2014 tracking both when an event occurred and when it was ingested \u2014 addresses a real structural gap in how agent memory handles changing facts over time. Their episode and entity subgraphs share some philosophical DNA with our approach. Where our work diverges is in outcome tracking and reinforcement: we\'re specifically focused on whether a decision worked, and using that signal to update memory structure.\\n\\n**Letta (formerly MemGPT)** pioneered self-editing memory \u2014 giving the LLM tools to manage its own memory blocks. This is a powerful paradigm, and their recent work on \\"Context Repositories\\" and sleep-time compute suggests they\'re actively pushing toward agents that learn over time. Their team has been transparent that experiential learning is an unsolved problem, which is part of what motivated our exploration.\\n\\n**MemRL (Jan 2026 paper)** is the closest to our work academically. It shares the core insight of decoupling stable LLM reasoning from plastic, evolving memory. Their approach uses reinforcement learning to assign utility Q-values to memories, which is elegant but requires training a value function. Our approach is purely structural \u2014 no training step, no Q-values, just graph evolution and LLM-based reasoning over outcomes.\\n\\n\\nThe common thread: most existing systems focus on knowledge persistence \u2014 remembering facts, preferences, and conversation history across sessions. The problem we\'re exploring is experiential learning \u2014 tracking whether past decisions worked, forming causal chains between episodes, and extracting reasoning heuristics that improve over time. 
These are complementary capabilities that would be needed by an ideal production system.\\n\\n## Try It Yourself\\n\\nThe prototype is available in our experiments directory:\\n\\n```\\nexperiments/episodic-memory-prototype/\\n\u251c\u2500\u2500 memory/ # Timeline, encoder, episodes, graph, facts, retriever, reinforcer\\n\u251c\u2500\u2500 agent/ # Episodic memory agent\\n\u251c\u2500\u2500 baseline/ # Flat vector RAG agent (comparison)\\n\u251c\u2500\u2500 simulator/ # 9-round debugging scenario\\n\u251c\u2500\u2500 eval/ # Head-to-head comparison + scoring\\n\u2514\u2500\u2500 tests/\\n```\\n\\nTo run the comparison:\\n\\n```bash\\ncd experiments/episodic-memory-prototype\\npython -m venv .venv && source .venv/bin/activate\\npip install -r requirements.txt\\nexport ANTHROPIC_API_KEY=sk-ant-...\\npython -m eval.compare\\n```\\n\\nWithout an API key, it runs in heuristic mode (keyword-based decisions). With a key, both agents use Claude Sonnet for reasoning \u2014 that\'s where the quality gap becomes visible.\\n\\n\\n## Conclusion\\nThis is a 9-round synthetic scenario we designed. It demonstrates the poc architecture works end-to-end and shows where episodic memory provides qualitatively different reasoning. It is not a peer-reviewed benchmark and should not be interpreted as a statistically rigorous claim. 
We\'re publishing the prototype so others can reproduce and extend the evaluation.\\nIf this sparks interest do trigger github discussion.\\n\\n---\\n\\n*The episodic memory prototype is available in `BharatMLStack` repo at `/experiments/episodic-memory-prototype`*"},{"id":"llm-inference-optimization-sub-sec-latency","metadata":{"permalink":"/BharatMLStack/blog/llm-inference-optimization-sub-sec-latency","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/llm-inference-optimization/index.md","source":"@site/blog/bharatmlstack-history/llm-inference-optimization/index.md","title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","description":"A practical guide to the optimization techniques behind sub-second LLM inference\u2014covering paged KV caching, INT4 AWQ and FP8 quantization, kernel fusion, inflight batching, parallelism strategies, and speculative decoding, with production benchmarks on L4 and A100 GPUs.","date":"2025-06-02T00:00:00.000Z","tags":[{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"vllm","permalink":"/BharatMLStack/blog/tags/vllm"},{"inline":true,"label":"tensorrt-llm","permalink":"/BharatMLStack/blog/tags/tensorrt-llm"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":4.88,"hasTruncateMarker":false,"authors":[{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null}],"frontMatter":{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","description":"A practical guide to the optimization techniques behind sub-second LLM inference\u2014covering paged KV caching, 
INT4 AWQ and FP8 quantization, kernel fusion, inflight batching, parallelism strategies, and speculative decoding, with production benchmarks on L4 and A100 GPUs.","authors":["jaya"],"slug":"llm-inference-optimization-sub-sec-latency","date":"2025-6-2","tags":["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"Beyond Vector RAG: Building Agent Memory That Learns From Experience.","permalink":"/BharatMLStack/blog/episodic-memory-for-agents"},"nextItem":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/multi-engine-llm-inferencing-platform"}},"content":"![BharatMLStack](./bms.png)\\nRaw execution of Large Language Models is inherently expensive and memory-intensive. To achieve sub-second latency and high throughput, we implement a multi-layered optimization strategy that targets the entire inference stack\u2014from memory management to kernel execution.\\n\\n## 1. Advanced Memory Management: Paged & Prefix KV Caching\\n\\nThe most significant bottleneck in LLM inference is not always compute, but memory bandwidth\u2014specifically managing the Key-Value (KV) cache.\\n\\n### Paged KV caching\\n\\nStandard caching suffers from fragmentation. We use **Paged KV caching**, which operates similarly to an operating system\'s virtual memory: the KV cache is divided into non-contiguous blocks. This lets us serve larger batch sizes without running out of memory.\\n\\n### KV cache quantization\\n\\nTo further maximize available memory, we implement **KV cache quantization** (e.g., FP8). 
By compressing stored attention keys and values from 16-bit to 8-bit, we nearly double the effective context window capacity of the GPU, allowing longer conversations or larger batches without materially degrading quality.\\n\\n### Prefix caching (the \\"voice bot\\" optimizer)\\n\\nFor use cases like GenAI voice bots where the system prompt (e.g., \\"You are a helpful assistant...\\") is static across thousands of requests, we enable **prefix caching**.\\n\\n- **Impact**: By reusing pre-computed KV states for common prefixes, we achieve a cache hit rate of ~90%. This reduces **Time To First Token (TTFT)** by skipping redundant computation of the system prompt.\\n\\n## 2. Aggressive Quantization (INT4 AWQ & FP8)\\n\\nRunning models in their native 16-bit precision (BF16) restricts maximum batch size and throughput. We use quantization to shrink model weights without sacrificing accuracy.\\n\\n### INT4 AWQ (Activation-aware Weight Quantization)\\n\\nFor the Llama 3 family, we use **AWQ** to compress weights to 4 bits. This reduces model size by ~75%, allowing larger models to fit into L4 GPU memory and significantly improving token generation speed.\\n\\n### FP8 precision\\n\\nFor NVIDIA Hopper (H100) architectures, we are exploring **FP8 quantization**, leveraging native FP8 tensor cores to accelerate matrix multiplications while maintaining a higher dynamic range than integer quantization.\\n\\n- **Verification**: We validate quantized models by comparing dot-product similarity of embeddings against the FP16 baseline, consistently achieving **>99% similarity**.\\n\\n## 3. 
Kernel Fusion & Custom Plugins\\n\\nTo minimize overhead from launching thousands of small GPU operations, we fuse them into monolithic kernels using NVIDIA TensorRT plugins.\\n\\n- **Flash attention & FMHA**: We enable **Fused Multi-Head Attention (FMHA)** combined with flash attention to reduce memory reads/writes.\\n- **GEMM plugins**: We use specialized **GEMM** plugins to accelerate transformer linear layers.\\n- **Removing input padding**: Instead of padding short sequences to match the longest, we remove input padding so the GPU processes only valid tokens.\\n\\n## 4. Inflight (Continuous) Batching\\n\\nTraditional static batching waits for all requests in a batch to finish before returning results\u2014so one long response delays everyone else.\\n\\nWe implement **inflight batching**: as soon as one request completes, its slot is freed and filled by a new request from the queue. This keeps GPUs saturated and decouples latency of short queries from long ones.\\n\\n## 5. Parallelism Strategies: Scaling Beyond One GPU\\n\\nFor large models (e.g., 70B+ parameters) that cannot fit into the VRAM of a single GPU, we use parallelism strategies.\\n\\n- **Tensor parallelism (TP)**: Split weight matrices across multiple GPUs (e.g., 4\xd7 L4 or 8\xd7 A100). Each GPU computes a shard and outputs are reduced at every layer.\\n- **Pipeline parallelism (PP)**: Split model layers across GPUs to pipeline compute (e.g., while one GPU computes later layers for Request A, another starts early layers for Request B).\\n\\n## 6. Speculative Decoding\\n\\nTo reduce inter-token latency (ITL), we explore **speculative decoding**.\\n\\n- **Mechanism**: A smaller, faster \\"draft\\" model speculatively generates a short token sequence (e.g., 5 tokens).\\n- **Verification**: The larger target model verifies those tokens in one parallel forward pass. If correct, we effectively generate multiple tokens per large-model step; if not, we discard and regenerate. 
This is effective for predictable text, improving perceived generation speed.\\n\\n## Few Benchmarks\\n\\nBelow are a couple of representative use cases and performance numbers.\\n\\n### Search query rewriting\\n\\n- **LLM**: Fine-tuned llama-3.2-1B\\n- **Input & output token length**: ~10\u201320\\n- **Response type**: Non-streaming\\n\\n| Inference runtime | Hardware | Max requests/sec | Max p99 latency |\\n| --- | --- | ---: | ---: |\\n| TensorRT-LLM | 4 \xd7 L4 GPUs (multi-GPU) | 1000 | 95 ms |\\n| TensorRT-LLM | 1 \xd7 A100 40 GB GPU | 1000 | 69 ms |\\n\\n### Voice bot query\\n\\n- **LLM**: Llama-3.1-8B\\n- **Input token length**: ~1900\u20132000\\n- **Output token length**: ~200\\n- **Response type**: Streaming\\n\\n| Inference runtime | Concurrency | p99 TTFT (ms) | p99 ITL (ms) | Token throughput (tokens/sec) | Request throughput (req/sec) | Hardware |\\n| --- | ---: | ---: | ---: | ---: | ---: | --- |\\n| TensorRT-LLM | 1 | 36.27 | 22.78 | 45.66 | 0.23 | L4 |\\n| TensorRT-LLM | 2 | 49.81 | 23.21 | 89.37 | 0.45 | L4 |\\n| TensorRT-LLM | 4 | 55.33 | 36.62 | 153.39 | 0.78 | L4 |\\n| TensorRT-LLM | 8 | 66.5 | 39.11 | 279.88 | 1.47 | L4 |\\n| TensorRT-LLM | 16 | 131.8 | 30.39 | 547.8 | 2.77 | L4 |\\n| TensorRT-LLM | 32 | 277.22 | 48.02 | 925.7 | 4.78 | L4 |\\n| TensorRT-LLM | 64 | 498.52 | 71.62 | 1,164.40 | 6.2 | L4 |\\n| TensorRT-LLM | 128 | 677.31 | 120.37 | 1,445.18 | 7.69 | L4 |\\n| TensorRT-LLM | 256 | 1,926.31 | 216.88 | 1,600.81 | 8.52 | L4 |\\n| TensorRT-LLM | 1 | 21.17 | 9.24 | 130.05 | 0.68 | A100 |\\n| TensorRT-LLM | 2 | 25.78 | 9.21 | 264.5 | 1.35 | A100 |\\n| TensorRT-LLM | 4 | 28.52 | 10.99 | 437.69 | 2.27 | A100 |\\n| TensorRT-LLM | 8 | 34.4 | 12.61 | 760.49 | 3.96 | A100 |\\n| TensorRT-LLM | 16 | 68.03 | 14.32 | 1,343.80 | 7.01 | A100 |\\n| TensorRT-LLM | 32 | 185.96 | 16.82 | 2,287.30 | 11.92 | A100 |\\n| TensorRT-LLM | 64 | 136.87 | 21.17 | 3,625.22 | 18.89 | A100 |\\n| TensorRT-LLM | 128 | 463.78 | 34.15 | 4,456.51 | 23.24 | A100 |\\n| 
TensorRT-LLM | 256 | 890.12 | 59.18 | 5,188.24 | 27.05 | A100 |\\n\\n## Conclusion\\n\\nHigh-performance LLM inference is fundamentally a systems engineering problem: memory efficiency, kernel execution, batching strategy, and parallelism determine real-world latency and throughput. Techniques such as paged KV caching, aggressive quantization, kernel fusion, and inflight batching improve GPU utilization while reducing latency and memory pressure.\\n\\nThese optimizations enable the platform to deliver sub-second responses, sustain high concurrency, and efficiently serve both lightweight and long-context workloads. By continuously optimizing across the full inference stack, we keep LLM serving scalable, cost-efficient, and production-ready for real-time AI applications."},{"id":"multi-engine-llm-inferencing-platform","metadata":{"permalink":"/BharatMLStack/blog/multi-engine-llm-inferencing-platform","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/llm-inferencing-platform/index.md","source":"@site/blog/bharatmlstack-history/llm-inferencing-platform/index.md","title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","description":"A deep dive into building a production-grade LLM inference platform\u2014covering the full LLMOps lifecycle from model onboarding and automated compilation to multi-engine serving with TensorRT-LLM, vLLM, and Dynamo, along with cold-start mitigation and LLM-specific 
observability.","date":"2025-03-29T00:00:00.000Z","tags":[{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"vllm","permalink":"/BharatMLStack/blog/tags/vllm"},{"inline":true,"label":"tensorrt-llm","permalink":"/BharatMLStack/blog/tags/tensorrt-llm"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":13.31,"hasTruncateMarker":false,"authors":[{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null}],"frontMatter":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","description":"A deep dive into building a production-grade LLM inference platform\u2014covering the full LLMOps lifecycle from model onboarding and automated compilation to multi-engine serving with TensorRT-LLM, vLLM, and Dynamo, along with cold-start mitigation and LLM-specific observability.","authors":["jaya"],"slug":"multi-engine-llm-inferencing-platform","date":"2025-3-29","tags":["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","permalink":"/BharatMLStack/blog/llm-inference-optimization-sub-sec-latency"},"nextItem":{"title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","permalink":"/BharatMLStack/blog/scaling-model-inference-and-embedding-search"}},"content":"![BharatMLStack](./bms.png)\\nServing large language models in production introduces new challenges across infrastructure, performance optimization, and operational lifecycle management. 
The LLM Inference Platform addresses these challenges by providing a unified system for deploying and managing open-source and fine-tuned LLMs at scale.\\n\\nThe platform implements a complete LLMOps lifecycle \u2014 from model registration and automated compilation to deployment, runtime optimization, and monitoring. Designed as a self-service environment, users can onboard models directly from open repositories such as Hugging Face or upload custom fine-tuned models, and deploy them using a single-click workflow with no manual infrastructure or configuration steps required.\\n\\nIn addition to fully automated deployment, the platform allows users to select and apply custom inference optimization techniques \u2014 such as quantization strategies, batching configurations, and runtime-specific performance enhancements \u2014 enabling teams to balance latency, throughput, and cost based on their use case. The goal is to reduce operational friction while enabling high-performance, production-grade LLM inference.\\n\\n## Why LLM Inference Is not just bigger ML model serving\\n\\nLarge language model (LLM) inference introduces a fundamentally different set of challenges compared to traditional machine learning inference. While classical ML models typically perform a single forward pass to produce a fixed prediction, LLMs operate as autoregressive systems, generating outputs token by token based on previously generated context. This difference dramatically changes how inference systems must be designed, optimized, and scaled.\\n\\n### Autoregressive Generation and Sequential Computation:\\n\\nUnlike traditional models such as classifiers or recommenders \u2014 where inference cost is relatively constant \u2014 LLMs generate responses incrementally. Each new token depends on all previously generated tokens, making inference inherently sequential and dynamic. 
This means latency and compute requirements vary significantly depending on prompt length and output size, introducing complexity in scheduling and resource allocation.\\nBecause tokens cannot be generated fully in parallel during decoding, GPUs may become underutilized without specialized batching and scheduling strategies. This has led to the development of dedicated LLM inference engines optimized for token-level execution.\\n\\n### Prefill and Decode Phases:\\n\\nLLM inference typically consists of two distinct stages:\\n\\n- Prefill phase \u2014 the model processes the input prompt and builds internal representations. This stage is compute-heavy and highly parallelizable.\\n- Decode phase \u2014 the model generates tokens sequentially, predicting one token at a time using previously generated context.\\n\\nThe decode stage often becomes memory-bound rather than compute-bound, which creates new performance bottlenecks compared to traditional ML workloads.\\n\\n### Context Management and KV Caching:\\n\\nAnother fundamental difference lies in how LLMs maintain context. Transformer-based models rely on attention mechanisms that require access to past token representations. To avoid recomputing these representations repeatedly, inference engines use key-value (KV) caching, which stores intermediate activations from previous tokens.\\nKV caching significantly improves performance by eliminating redundant computation, but it introduces new challenges:\\n\\n- Memory consumption grows with sequence length and batch size\\n- GPU memory becomes a critical bottleneck\\n- Efficient memory management becomes essential for scaling concurrent requests\\n\\nThis tradeoff between compute efficiency and memory usage is unique to LLM inference workloads.\\n\\n### Dynamic and Irregular Workloads:\\n\\nTraditional ML inference typically operates on fixed-size inputs with predictable latency. 
In contrast, LLM requests vary widely in prompt length, output length, and runtime behavior. As a result:\\n\\n- Batch sizes must be dynamic rather than static\\n- Requests may enter and leave batches asynchronously\\n- Scheduling systems must continuously rebalance workloads to maximize GPU utilization\\n\\nThese characteristics require specialized serving architectures that differ significantly from standard ML serving pipelines.\\n\\n### Streaming and User Experience Constraints:\\n\\nAnother distinguishing factor is the expectation of real-time streaming responses. Instead of returning a single output, LLM systems often stream tokens to users as they are generated. \\nBecause of these differences \u2014 sequential generation, growing memory requirements, dynamic workloads, and streaming constraints \u2014 LLM inference cannot be treated as a simple extension of existing ML serving systems. Production platforms must incorporate specialized runtime engines, advanced optimization techniques, and observability tailored specifically to LLM workloads.\\n\\n## LLMOps: High-Level Architecture \\n\\n![LLM Architecture](./llm-plat.png)\\n\\nThe LLM Inference Framework is designed as a fully automated, end-to-end system for deploying and operating open-source and fine-tuned large language models at scale. The architecture abstracts the complexity of model optimization, hardware selection, deployment, and runtime management into a unified workflow that enables users to move from raw model weights to production-ready inference endpoints with minimal manual intervention.\\n\\nOur LLM Inference Framework is architected not just as a serving engine, but as a complete lifecycle management system. As illustrated in the high-level design below, the platform automates the journey of a model through seven distinct stages, ensuring reproducibility, performance, and scalability.\\n\\n1. 
Onboarding & Registration (The Source of Truth)\\n\\n The lifecycle begins with the Data Scientist or engineer.\\n\\n - Model Ingestion: Users onboard models\u2014whether open-source (Hugging Face, NeMo) or internally fine-tuned\u2014via the Truffle Box SDK/UI.\\n - LLM + Prompt Registry: Unlike traditional systems that only track model weights, our registry is a unified control plane. It stores both the Model Artifacts and the Prompt Templates. This allows Data Scientists to register and version-control prompts (e.g., \\"customer_support_v2\\") independently of the application code.\\n\\n2. The \\"Black Box\\" Build Engine\\n\\n Once a model is registered, the Automated LLM Compiler + Quantizer Module kicks off a background job on ephemeral GPU resources.\\n\\n - Transformation: The raw model is converted into a TRT-LLM Checkpoint.\\n - Quantization: The system automatically applies quantization algorithms (like INT4 AWQ or FP8) to reduce memory footprint.\\n - Engine Building: Finally, it compiles a highly optimized TRT Engine specifically tuned for the target hardware.\\n\\n3. Intelligent Profiling & Validation\\n\\n Before deployment, the new engine passes through the Hardware & Inference Runtime Profiler.\\n\\n - Benchmarking: This module empirically tests the engine against various hardware configurations (L4 vs. A100) and runtimes (TRT-LLM vs. vLLM).\\n - Optimization: It recommends the optimal configuration that meets latency SLAs (Time-To-First-Token) while minimizing cost.\\n\\n4. Smart Artifact Generation & Distribution\\n\\n To solve the Kubernetes \\"Cold Start\\" problem, the LLM Serving Artifacts Generation module packages the model using a bifurcated strategy:\\n\\n - Standard Models: Artifacts are uploaded to Cloud Storage (GCS) and downloaded by pods at startup.\\n - Very Large Models: For massive models (>8GB) where network downloads are too slow, the system pre-caches the model onto Secondary Boot Disks. 
These disks are attached directly to new GPU nodes during autoscaling, eliminating download wait times.\\n\\n5. Image Streaming & Deployment\\n\\n Simultaneously, the inference runtime container images are pulled from the Artifact Registry.\\n\\n - Image Streaming: We utilize container image streaming to allow pods to start initializing while the massive Triton/Dynamo container layers are still downloading, further shaving seconds off the startup time.\\n\\n6. The Inference Runtime (Kubernetes)\\n\\n The workload lands on Kubernetes with Autoscaling.\\n\\n - Dynamic Backends: Depending on the profile generated in Stage 3, the pod initializes either TensorRT-LLM (for throughput) or vLLM (for flexibility), or spins up a Dynamo worker for distributed inference.\\n - Data Loading: The pod either downloads the model from Cloud Storage or mounts the pre-warmed Secondary Boot Disk (\\"Pull from Disk\\").\\n\\n7. Client Interaction & Observability\\n\\n Finally, the LLM Inference Client executes the request.\\n\\n - Prompt Injection: The client pulls the specific prompt template ID from the Registry, ensuring the exact versioned instructions are used.\\n - Streaming Response: The request is sent via gRPC, and tokens are streamed back to the user in real-time.\\n\\n8. Observability: Monitoring the Pulse of GenAI\\n\\n In traditional microservices, success is measured by CPU utilization and request latency (p99). For Large Language Models, these metrics are insufficient. A user doesn\'t care if the GPU is at 80% utilization; they care about how fast the first word appears and how smoothly the rest of the sentence follows.\\n\\n To capture the true user experience, our platform instrumentation focuses on three critical LLM-specific metrics:\\n\\n 1. 
Time to First Token (TTFT)\\n - Definition: TTFT measures the time elapsed from the moment a request is received until the very first token is generated and streamed back to the user.\\n - Why it matters: This represents the \\"Prefill Phase\\" latency\u2014the time the model takes to process the input prompt and load weights. A high TTFT makes the application feel unresponsive or \\"hung.\\"\\n - Optimization: We closely monitor TTFT to ensure our Prefix Caching is effective (aiming for high cache hitrates), which drastically lowers this metric by skipping redundant prompt processing.\\n\\n 2. Inter-Token Latency (ITL)\\n - Definition: ITL measures the average time interval between the generation of consecutive tokens during the \\"Decode Phase\\".\\n - Why it matters: This defines the \\"perceived speed\\" of reading. Even if the first token is fast (low TTFT), high ITL makes the text generation look \\"jerky\\" or slow to the user.\\n - Benchmarks: In our testing with Llama 3.1, we track p99 ITL to ensure it stays below human reading speeds to maintain a natural conversational flow.\\n\\n 3. Token Throughput vs. Request Throughput\\n - We distinguish between two types of throughput to balance system efficiency with user load:\\n - Token Throughput (tokens/sec): The total number of tokens generated across all concurrent requests. This measures the raw compute efficiency of the GPU and the effectiveness of batching.\\n - Request Throughput (req/sec): The number of distinct user queries served per second. We use this to determine autoscaling thresholds, ensuring we scale out before the queue depth impacts ITL.\\n\\n 4. 
The Monitoring Stack\\n - Real-time Dashboards: We utilize Grafana to visualize these streaming metrics in real-time, allowing on-call engineers to spot \\"slow generation\\" incidents that generic \\"500 error\\" alerts would miss.\\n - Request Tracing: Since Triton Inference Server does not log request payloads by default, we integrate a Helix Client to asynchronously publish request logs to Log Tables. This allows us to trace a specific \\"slow\\" request back to its prompt to understand if a complex input caused the latency spike.\\n\\n## Supported Inference backends (TensorRT LLM, Dynamo & vLLM)\\n\\nTailored for the Use Case: We do not believe in a \\"one-size-fits-all\\" approach to inference. Different use cases\u2014whether a real-time voice bot requiring ultra-low, sub-second latency or a massive reasoning task requiring huge context windows\u2014demand different runtime characteristics. Our platform is designed to be runtime-agnostic, allowing us to automatically select and tailor the best engine based on the specific requirements of the application:\\n\\n1. TensorRT-LLM: The High-Performance Standard\\n\\n Suitable for: High-throughput production workloads where latency is critical (e.g., customer support chat, real-time voice bots).\\n\\n TensorRT-LLM serves as our default backend for these scenarios. 
Our internal benchmarks on Llama 3.1 and 3.2 models demonstrated that a tuned TensorRT-LLM engine significantly outperforms standard runtimes, especially when utilizing INT4 AWQ and FP8 quantization.\\n\\n Key optimizations we tailor for these high-load cases include:\\n\\n - Optimized execution via TensorRT engine compilation\\n - Quantization-aware execution for reduced memory usage and improved throughput\\n - Inflight Batching: Allowing requests to be processed continuously without waiting for the entire batch to finish, drastically improving GPU utilization.\\n - Custom Plugins: Enabling specific NVIDIA plugins like the GEMM plugin and GPT Attention plugin to accelerate matrix multiplications and attention mechanisms.\\n\\n2. Dynamo: Distributed Inference for Reasoning Models\\n\\n Suitable for: Very large \\"reasoning\\" models (70B+) or scenarios requiring massive context windows where a single GPU\'s memory is insufficient.\\n\\n For these memory-bound tasks, we utilize Dynamo, a low-latency distributed inference framework. Unlike monolithic servers, Dynamo disaggregates the inference process to scale resources horizontally:\\n\\n - KV Aware Routing: A specialized router directs requests to workers that already hold the relevant Key-Value (KV) cache, minimizing redundant computation.\\n - Prefill vs. Decode Split: The workload is divided into Prefill Workers (processing the prompt) and Decode Workers (generating tokens), allowing us to scale the compute-heavy \\"reading\\" phase independently from the memory-heavy \\"writing\\" phase.\\n - Distributed execution across multiple GPU resources\\n\\n3. 
vLLM: The Flexible Baseline\\n\\n Suitable for: Rapid prototyping, testing new model architectures, or low-traffic internal tools where ease of deployment outweighs raw throughput.\\n\\n While TensorRT-LLM is optimized for maximum speed, vLLM provides a robust and flexible baseline.\\n\\n - High throughput through dynamic batching and efficient memory utilization\\n - Paged KV cache management for handling long contexts and concurrent requests\\n - Strong support for open-source model ecosystems\\n - Rapid Adoption: It allows us to onboard new model architectures immediately without waiting for a custom TensorRT build.\\n - Benchmarking Insight: In our internal tests, vLLM provided a strong baseline but often lacked the specific max-token optimizations present in our custom TRT engines. We use it strategically for initial testing before committing to a full TensorRT optimization pipeline.\\n\\n## Conclusion\\n\\nLarge language model inference introduces a fundamentally new class of infrastructure challenges\u2014where performance is governed not just by raw compute, but by memory efficiency, intelligent scheduling, runtime specialization, and lifecycle automation. Unlike traditional ML serving, LLM inference requires systems that understand token-level execution, manage rapidly growing context state, and continuously balance latency, throughput, and cost under highly dynamic workloads.\\n\\nThe LLM Inference Framework addresses these challenges by transforming inference into a fully automated, reproducible lifecycle\u2014from model onboarding and compilation to deployment, optimization, and observability. 
By integrating automated quantization and engine compilation, intelligent runtime selection, cold-start mitigation strategies, and LLM-specific observability metrics such as Time-to-First-Token and Inter-Token Latency, the platform ensures both high performance and operational simplicity.\\n\\nEqually important, the framework is designed with flexibility and future evolution in mind. Its runtime-agnostic architecture enables seamless adoption of emerging inference engines, hardware accelerators, and optimization techniques without requiring platform redesign. This ensures that teams can continuously leverage advancements in the rapidly evolving LLM ecosystem while maintaining consistent operational workflows.\\n\\nUltimately, the goal of the platform is to make production-scale LLM deployment as seamless and reliable as traditional software deployment\u2014allowing teams to focus on building intelligent applications rather than managing infrastructure complexity. By combining lifecycle automation, runtime optimization, and deep observability, the LLM Inference Framework provides a scalable foundation for delivering fast, cost-efficient, and production-ready LLM experiences.\\n\\n## Future Explorations\\n\\nWhile we have achieved significant milestones in latency and throughput, the landscape of GenAI is evolving rapidly. Our roadmap focuses on increasing flexibility, reducing costs, and enhancing reliability for enterprise-grade workloads. Here is what we are building next:\\n\\n- TPU Support: To diversify our hardware supply chain and further optimize cost-per-token, we are evaluating Google Cloud TPUs to bake it into our platform. By leveraging the JAX and PyTorch/XLA ecosystems, we aim to unlock the massive throughput potential of TPU v5e chips, particularly for our open-source Llama models. 
This will allow the hardware profiler to dynamically choose between NVIDIA GPUs and Google TPUs based on real-time availability and price-performance metrics.\\n- Multi-LoRA Serving (Serverless Experience): Currently, deploying a fine-tuned model requires a dedicated GPU. We are building support for Multi-LoRA serving, which will allow us to serve hundreds of unique, fine-tuned adapters on top of a single frozen base model. This will drastically reduce costs for multi-tenant applications, enabling a \\"serverless\\" experience where specific fine-tunes are hot-swapped instantly per request.\\n- Spot Instance Orchestration: To further optimize cloud costs, we are developing fault-tolerant mechanisms to run inference workloads on Spot Instances. By implementing aggressive checkpointing and seamless request draining, we aim to leverage cheaper, preemptible compute capacity without interrupting the user\'s streaming experience.\\n- Semantic Caching Layer: We plan to move beyond standard Prefix Caching to implement Semantic Caching. By using a vector database to fetch responses for semantically similar queries (e.g., \\"How do I reset my password?\\" vs. \\"Password reset steps\\"), we can bypass the GPU entirely for repetitive queries, reducing latency to near-zero.\\n- Context-Aware Autoscaling: Standard CPU/GPU utilization metrics are often insufficient signals for scaling LLMs. We are working on KV-cache pressure metrics for autoscaling. This ensures that we scale out before the memory fills up, preventing eviction-based slowdowns during traffic spikes.\\n- Online Evaluation & Guardrails: We are integrating a lightweight \\"Trust Layer\\" into the proxy. 
This will allow for low-latency input/output filtering (Guardrails) and asynchronous \\"LLM-as-a-Judge\\" evaluation pipelines to monitor response quality in production, not just system health."},{"id":"scaling-model-inference-and-embedding-search","metadata":{"permalink":"/BharatMLStack/blog/scaling-model-inference-and-embedding-search","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/scaling-model-inference-and-embedding-search/index.md","source":"@site/blog/bharatmlstack-history/scaling-model-inference-and-embedding-search/index.md","title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","description":"How Meesho scaled model inference with self-hosted Triton on GKE\u2014slashing latency and costs by 65%\u2014and built a real-time embedding search system on Qdrant to power personalized recommendations at scale.","date":"2024-05-21T00:00:00.000Z","tags":[{"inline":true,"label":"model-inference","permalink":"/BharatMLStack/blog/tags/model-inference"},{"inline":true,"label":"embedding-search","permalink":"/BharatMLStack/blog/tags/embedding-search"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":3.55,"hasTruncateMarker":false,"authors":[{"name":"Aditya Kumar","title":"Lead Software Engineer @ Meesho","url":"https://github.com/Adit2607","imageURL":"https://github.com/Adit2607.png","key":"aditya","page":null},{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null},{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"title":"Cracking the 
Code: Scaling Model Inference & Real-Time Embedding Search","description":"How Meesho scaled model inference with self-hosted Triton on GKE\u2014slashing latency and costs by 65%\u2014and built a real-time embedding search system on Qdrant to power personalized recommendations at scale.","authors":["aditya","jaya","adarsha"],"slug":"scaling-model-inference-and-embedding-search","date":"2024-05-21T00:00:00.000Z","tags":["model-inference","embedding-search","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/multi-engine-llm-inferencing-platform"},"nextItem":{"title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","permalink":"/BharatMLStack/blog/building-meeshos-mlplatform-lessons-from-first-gen"}},"content":"![BharatMLStack](./bms.png)\\nBy mid-2023, we had transformed our ML stack\u2014building a real-time feature store, optimizing model retrieval, and fine-tuning ranking. But two critical gaps remained:\\n\\n- \ud83d\udd39 Scaling model inference without hitting infrastructure roadblocks\\n- \ud83d\udd39 Moving embedding search from batch to real-time for candidate generation\\n\\nHere\u2019s how we tackled these last-mile challenges, broke free from infrastructure constraints, and built a cost-efficient, high-performance system.\\n\\n## Breaking Free from the Scalability Ceiling\\n\\n### The Model Serving Bottleneck\u2014A Wake-Up Call\\n\\nJuly 2023. With just months left for the Mega Blockbuster Sale (MBS), we noticed a serious issue\u2014scaling our model-serving infrastructure was taking 10\u201315 minutes. 
In real-time ML, that\u2019s an eternity.\\nIn one of our war rooms, we ran a quick experiment:\\n\\n- \ud83d\ude80 We deployed an XGBoost model on a self-hosted Triton Inference Server running on a 16-core machine.\\n- \ud83d\ude80 Fired requests and compared the outputs with our existing cloud-hosted setup.\\n- \ud83d\ude80 The results matched\u2014perfectly.\\n\\nThat moment changed everything. We prepped a backup Triton setup on EKS, just in case our cloud provider couldn\'t allocate enough compute resources in time. Luckily, they did\u2014but the seed was planted.\\nThen in October, just two weeks before MBS, we got an alarming response from our infrastructure team:\\n \\"Node availability may be an issue.\\"\\nWith no time to waste, we moved 30% of real-time ML traffic to our self-hosted Triton cluster. The results?\\n\\n- \u2705 p99 latency dropped from 90\u2013100ms to 30\u201340ms\\n- \u2705 Triton handled significantly higher throughput on fewer resources\\n- \u2705 No model changes were needed\\n\\nMBS ran without a hitch, proving that self-hosted inference was the way forward.\\n\\n### Scaling Triton on GKE\\n\\nThis left us with two choices:\\n\\n- 1\ufe0f\u20e3 Port models to a managed cloud inference service, investing time in learning a new deployment stack\\n- 2\ufe0f\u20e3 Scale our existing Triton setup on GKE, optimizing for cost and performance\\n\\nWe went with Option 2\u2014and it slashed inference costs to 35% of what we previously paid, while giving us full control over scaling and optimizations.\\n\\n### Fixing the Cold Start Problem\\n\\nAs we onboarded more deep learning (DL) models, we hit a new bottleneck, new inference pods took 7\u20139 minutes to spin up.\\n\\nAfter profiling, we found the culprits:\\n\\n- Triton\u2019s base image\u2014a massive 5GB\\n- Model binaries\u2014often 1GB+\\n- Startup delay\u2014mostly due to downloading and initializing these assets\\n\\nTo fix this, we built a lightweight Triton image, stripping unused 
components and shrinking the size to 900MB. This cut cold start times drastically, making auto-scaling faster and smoother.\\n\\n## Embedding Search: The Last Piece of the Puzzle\\n\\nBy mid-2023, most of our ML stack had gone real-time\u2014except for Candidate Generation (CG), which still ran in batch mode. To truly power real-time recommendations, we needed an online embedding search system.\\n\\n### Choosing the Right Vector Database\\n\\nWe benchmarked three production-ready vector DBs across key parameters:\\n\\n- Milvus\\n- Qdrant\\n- Weaviate\\n\\nAfter extensive POCs, Qdrant stood out for its:\\n\\n- \u2705 Blazing-fast search latency on high-dimensional vectors\\n- \u2705 Efficient memory usage, crucial for in-memory workloads\\n- \u2705 Support for upserts and soft deletes, vital for Ads use cases\\n- \u2705 gRPC + REST APIs, making integration seamless\\n- \u2705 Powerful filtering, allowing fine-tuned retrieval (e.g., filtering Ads by category, active status, etc.)\\n\\nAt its core, Qdrant uses HNSW indexing, delivering both high recall and low-latency nearest-neighbor search\u2014a perfect fit for our needs.\\n\\n### Embedding Freshness & Real-Time Updates\\n\\nTo ensure embeddings stayed up to date, we built a dual ingestion pipeline:\\n\\n- \ud83d\udccc Daily Refresh: A bulk pipeline updated embeddings overnight\\n- \ud83d\udccc Real-Time Updates: Ads events triggered immediate upserts/deletes\\n\\nThis setup powered real-time \\"Similar Products\\" recommendations on the product page and became the foundation for Ads Candidate Generation, ensuring the right ads surfaced in milliseconds.\\n\\n![Skye](./vss.png)\\n\\n## Final Takeaways: Scaling Smartly for Real-Time ML\\n\\n- \ud83d\ude80 Self-hosted inference on Triton gave us lower cost, faster scaling, and better performance than managed services\\n- \ud83d\ude80 Building a custom Triton image reduced cold starts, improving responsiveness\\n- \ud83d\ude80 Qdrant-based embedding search enabled 
real-time personalization at scale\\n- \ud83d\ude80 Real-time updates for embeddings unlocked dynamic, up-to-date recommendations\\n\\nBy early 2024, Meesho\u2019s ML stack had evolved into a fully real-time, scalable, and cost-efficient system, setting the foundation for even bigger leaps ahead."},{"id":"building-meeshos-mlplatform-lessons-from-first-gen","metadata":{"permalink":"/BharatMLStack/blog/building-meeshos-mlplatform-lessons-from-first-gen","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/building-meeshos-mlplatform-lessons-from-first-gen/index.md","source":"@site/blog/bharatmlstack-history/building-meeshos-mlplatform-lessons-from-first-gen/index.md","title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","description":"Lessons from scaling Meesho\'s first-gen ML platform\u2014building Inferflow for no-code feature retrieval, migrating from Cassandra to ScyllaDB, optimizing the Interaction Store with tiered storage, and cutting infra costs by 60% while hitting 1M QPS.","date":"2023-04-10T00:00:00.000Z","tags":[{"inline":true,"label":"inferflow","permalink":"/BharatMLStack/blog/tags/inferflow"},{"inline":true,"label":"interaction-store","permalink":"/BharatMLStack/blog/tags/interaction-store"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":6.25,"hasTruncateMarker":false,"authors":[{"name":"Bhawani Singh","title":"Architect @ Meesho","url":"https://github.com/singh-bhawani","imageURL":"https://github.com/singh-bhawani.png","key":"bhawani","page":null},{"name":"Jigar Dave","title":"Lead Software Engineer @ Meesho","url":"https://github.com/jigarpatel26","imageURL":"https://github.com/jigarpatel26.png","key":"jigar","page":null},{"name":"Adarsha 
Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","description":"Lessons from scaling Meesho\'s first-gen ML platform\u2014building Inferflow for no-code feature retrieval, migrating from Cassandra to ScyllaDB, optimizing the Interaction Store with tiered storage, and cutting infra costs by 60% while hitting 1M QPS.","authors":["bhawani","jigar","adarsha"],"slug":"building-meeshos-mlplatform-lessons-from-first-gen","date":"2023-4-10","tags":["inferflow","interaction-store","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","permalink":"/BharatMLStack/blog/scaling-model-inference-and-embedding-search"},"nextItem":{"title":"Building Meesho\u2019s ML Platform: From Chaos to Cutting-Edge (Part 1)","permalink":"/BharatMLStack/blog/building-meeshos-mlplatform"}},"content":"![BharatMLStack](./bms.png)\\nBy late 2022, we had built something we were truly proud of\u2014a real-time ML serving system with a DAG-based executor, a feature store, and an interaction store powering key ranking and personalization models. It was a major milestone, the culmination of months of effort from data scientists, ML engineers, and backend teams. Our system was live, and we were ready to push the boundaries of experimentation.\\nAnd it worked. Mostly.\\nBut soon, cracks appeared. Every new model needed custom feature retrieval logic, DAGs became dense and unmanageable, and scaling turned into a constant firefight. Costs surged, and infra bottlenecks slowed experimentation. 
Our system worked, but it wasn\u2019t built for scale.\\nThis is the story of how we tackled these challenges\u2014building Inferflow for seamless feature retrieval, optimizing real-time infra, and cutting costs while scaling to millions of QPS.\\n\\n### The Cost of Success\\nEvery new Ranker model required its own feature set, often pulling from different entities. Each addition meant:\\n\\n- Adding new DAG nodes in IOP\\n- Writing custom logic to fetch features from multiple sources (e.g., user, product, user \xd7 category)\\n- Inferring intermediate features (e.g., extracting category from a product to fetch user \xd7 category data)\\n- Optimizing I/O and dealing with the inevitable bugs\\n\\nWhat began as clean DAGs soon turned into a tangled web of cross-dependent graphs. Every experimentation cycle meant new nodes, new dependencies, and slower iterations.\\n\\n### Scaling Pains (and Cassandra\u2019s Limits)\\nAt some point, we were hitting:\\n\\n- 250\u2013300K reads/sec\\n- 1M writes/sec (during lean hours)\\n\\nAll of this ran on Cassandra. While its distributed architecture had been proven in production, operating large-scale clusters came with considerable infrastructure overhead. Our proof-of-concept (POC) demonstrated throughput of around 100K ops/sec, but as we scaled further, the challenges grew. Ensuring node health, optimizing compaction, and maintaining storage balance became increasingly demanding. 
We also observed latency spikes under heavy load, alongside a sharp increase in total cost of ownership.\\n\\n### Interaction Store Woes\\nOur interaction store was another ticking time bomb:\\n\\n- \ud83d\udea8 Clusters kept growing in size and cost\\n- \ud83d\udea8 Latency spikes became increasingly frequent\\n- \ud83d\udea8 The DMC proxy occasionally lost locality of nodes against shards, causing cross-node communication and degraded performance\\n\\nEach time this happened, we had to manually rebalance shards just to restore stable latency, making operations unsustainable at scale.\\n\\n### Silver Linings\\nDespite the chaos, the system was live and delivering value:\\n\\n- Real-time infrastructure was in production\\n- Costs dropped by 60\u201370% compared to offline personalization\\n- New experiments rolled out faster and more successfully\\n- User engagement metrics improved\\n\\nIt wasn\u2019t perfect. It was far from easy. But it worked\u2014and that counted for a lot.\\n\\n### Round Two: Solving the Top 2 Bottlenecks\\nWith the first-gen system stretched to its limits, we stepped back. Conversations with data scientists and backend engineers revealed three recurring pain points:\\n\\n1. Coding feature retrieval logic for every new model was becoming unsustainable\\n2. ML scale was exploding\u2014bringing rising infra costs with it\\n3. 
Real-time embedding search was the next big unlock\\n\\nWe tackled them one by one\u2014starting with the biggest pain point.\\n\\n#### Problem 1: No-Code Feature Retrieval for Model Inference\\nWe noticed a pattern: for personalized ranking, models needed features from:\\n\\n- \u2705 Product\\n- \u2705 User\\n- \u2705 User \xd7 Category\\n- \u2705 Region, cohort, sub-category, etc.\\n\\nA key insight emerged: Entities that contribute features for a model always map back to the context entities.\\n\\n![MP Dag](./mp-dag.png)\\n\\nWith this, we designed Inferflow, a graph-driven feature retrieval and model orchestration system:\\n\\n- 1\ufe0f\u20e3 Inferflow takes a modelId and context IDs (e.g., userId, productIds)\\n- 2\ufe0f\u20e3 Loads a pre-defined feature retrieval graph from ZooKeeper\\n- 3\ufe0f\u20e3 Executes the graph to resolve entity relationships dynamically\\n- 4\ufe0f\u20e3 Outputs a 2D matrix of feature vectors\\n\\n\ud83d\udca1 The impact?\\n\\n- \ud83d\ude80 No more custom feature retrieval code\u2014just graph updates in config\\n- \ud83d\ude80 Feature consistency across experiments\\n- \ud83d\ude80 Faster iteration cycles for ranking, fraud detection, and beyond\\n\\nHere\u2019s a visual example that shows how this graph plays out during execution. We further extended the graph to call multiple models as needed:\\n![MP matrix](./mp-matrix.png)\\nWe built Inferflow in GoLang, using gRPC and Proto3 serialization for efficiency.\\n\\n#### Problem 2: Scaling Without Breaking the Bank\\nWith more ML use cases coming online, we needed to cut costs without compromising performance. 
We focused on:\\n\\n- \ud83d\udd39 Online Feature Store\\n- \ud83d\udd39 Interaction Store\\n\\n#### Optimizing the Online Feature Store\\nOur costs were concentrated in:\\n\\n- \ud83d\udccc Database (Cassandra)\\n- \ud83d\udccc Cache (Redis)\\n- \ud83d\udccc Running Pods (Java services)\\n\\n1\ufe0f\u20e3 Replacing Cassandra with ScyllaDB\\nAs we hit the operational limits of large Cassandra clusters, we transitioned to ScyllaDB, which offered a seamless drop-in replacement without major code changes. The switch brought significant benefits:\\n\\n- Throughput: Matched or exceeded Cassandra\'s performance under identical workloads, even under high concurrency.\\n- Latency: Achieved consistently lower P99 latencies due to ScyllaDB\'s shard-per-core architecture and better I/O utilization.\\n- Cost Efficiency: Reduced infra footprint by ~70% through better CPU and memory efficiency, eliminating the need for over-provisioned nodes.\\n\\n2\ufe0f\u20e3 Finding the Right Cache\\nTo reduce backend load and improve response times, we benchmarked multiple caching solutions\u2014Memcached, KeyDB, and Dragonfly\u2014under real production traffic patterns. Dragonfly stood out due to its robust architecture and operational simplicity:\\n\\n- Data Skew Handling: Efficiently managed extreme key hotness and uneven access patterns without performance degradation.\\n- Throughput: Delivered consistently high throughput, even with large object sizes and concurrent access.\\n- Ease of Adoption: Acted as a drop-in Redis replacement with full protocol compatibility\u2014no changes needed in application code or client libraries.\\n\\n3\ufe0f\u20e3 Moving to GoLang for Cost-Efficient Serving\\nJava services were memory-heavy\u2014so we rewrote core services in GoLang. 
The results?\\n\\n\u2705 Memory usage dropped by ~80%\\n\u2705 CPU utilization was significantly lower\\n\u2705 Faster, more efficient deployments\\n\\n#### Optimizing the Interaction Store\\nWe realized that we only need a user\u2019s interaction data in Redis when they open the app. So, we implemented a tiered storage approach:\\n\\n- \ud83d\udccc Cold Tier (ScyllaDB)\u2014Stores click, order, wishlist events\\n- \ud83d\udccc Hot Tier (Redis)\u2014Loads a user\u2019s past interactions only when they open the app\\n\\nSmart Offloading: We introduced an inactivity tracker to detect when a user session ends. At that point, Redis data was flushed back to Scylla, reducing unnecessary writes.\\n\\n![InteractionStore](./interaction-str.png)\\n#### Results\\n\\n- Online Feature Store hit 1M QPS for the first time during the 2023 Mega Blockbuster Sale\u2014without breaking a sweat\\n- Infra costs for Online Feature Store and Interaction Store dropped by ~60%\\n\\n#### The Catch: Our ML Hosting Hit a Hard Limit\\nWhile planning for 2023 MBS, we ran into a critical scalability bottleneck:\\n\\n- \u274c Insufficient compute availability in our region for ML instances\\n- \u274c Couldn\u2019t provision enough nodes to handle real-time inference at scale\\n\\nThis forced us to rethink where and how we hosted our models. The existing setup was great for prototyping\u2014but it wasn\u2019t built to handle the bursty, high-QPS demands of real-world production workloads.\\n\\n### Conclusion: From Firefighting to Future-Proofing\\nWhat started as an ambitious experiment turned into a real-time ML infrastructure that powered millions of requests per second. We battled scaling pains, rethought feature retrieval with Inferflow, and rebuilt our infra stack for efficiency\u2014driving down costs while improving experimentation velocity.\\nBut new challenges emerged. Our infrastructure could now handle scale, but our ML model hosting setup hit a hard limit. 
With compute availability bottlenecks threatening real-time inference, we faced a critical decision: how do we make model serving as scalable and cost-efficient as the rest of our stack? That\u2019s the next piece of the puzzle\u2014and the story of Part 3."},{"id":"building-meeshos-mlplatform","metadata":{"permalink":"/BharatMLStack/blog/building-meeshos-mlplatform","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/building-meeshos-mlplatform-from-chaos-to-cutting-edge/index.md","source":"@site/blog/bharatmlstack-history/building-meeshos-mlplatform-from-chaos-to-cutting-edge/index.md","title":"Building Meesho\u2019s ML Platform: From Chaos to Cutting-Edge (Part 1)","description":"How Meesho transitioned from batch-based recommendations to a real-time ML platform\u2014building an Online Feature Store, Interaction Store, and DAG execution framework that became BharatMLStack.","date":"2022-11-15T00:00:00.000Z","tags":[{"inline":true,"label":"online-feature-store","permalink":"/BharatMLStack/blog/tags/online-feature-store"},{"inline":true,"label":"interaction-store","permalink":"/BharatMLStack/blog/tags/interaction-store"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"}],"readingTime":10.19,"hasTruncateMarker":false,"authors":[{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null},{"name":"Aditya Kumar","title":"Lead Software Engineer @ Meesho","url":"https://github.com/Adit2607","imageURL":"https://github.com/Adit2607.png","key":"aditya","page":null},{"name":"Bhawani Singh","title":"Architect @ Meesho","url":"https://github.com/singh-bhawani","imageURL":"https://github.com/singh-bhawani.png","key":"bhawani","page":null},{"name":"Jigar Dave","title":"Lead Software Engineer @ 
Meesho","url":"https://github.com/jigarpatel26","imageURL":"https://github.com/jigarpatel26.png","key":"jigar","page":null}],"frontMatter":{"title":"Building Meesho\u2019s ML Platform: From Chaos to Cutting-Edge (Part 1)","description":"How Meesho transitioned from batch-based recommendations to a real-time ML platform\u2014building an Online Feature Store, Interaction Store, and DAG execution framework that became BharatMLStack.","slug":"building-meeshos-mlplatform","authors":["adarsha","aditya","bhawani","jigar"],"date":"2022-11-15T00:00:00.000Z","tags":["online-feature-store","interaction-store","mlplatform","meesho"]},"unlisted":false,"prevItem":{"title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","permalink":"/BharatMLStack/blog/building-meeshos-mlplatform-lessons-from-first-gen"}},"content":"![BharatMLStack](./bms.png)\\nIt all started in early 2022, over a casual Friday evening catch-up. Like many great origin stories, this one began with friendly banter between a group of backend engineers and data scientists. As the conversations unfolded, so did the roasting\u2014until one remark hit a little too close to home:\\n\\n*\\"Why are we still crunching data for Monthly Active Users (MAU) when the next day it\u2019s all about Daily Active Users (DAU)?\\"*\\n\\nThe laughter died down, and the question lingered. When we regrouped on Monday\u2014clear-headed and slightly reflective\u2014we decided to dig into the numbers. What they discovered was quite revealing: a large portion of compute resources wasn\u2019t being put to good use.\\nMuch of the system\u2019s effort was spent supporting users who weren\u2019t actively engaging, and even for new users, the experience wasn\u2019t optimized to make a meaningful impact.\\n\\nAt the same time, Meesho had just launched a company-wide initiative to reduce costs\u2014and every team had to contribute. 
This realization sparked the journey that would eventually lead to the **Meesho ML Platform**, known today as **BharatMLStack**.\\n\\n![Alt Text](./old-batch-arch.png)\\n\\nBefore the ML Platform, our recommendation and ranking pipelines followed a batch processing approach:\\n- **Data Ingestion**: The Data Platform team executed ETL jobs to ingest raw user data\u2014including user profiles, interaction logs, and product impressions\u2014into designated S3 buckets.\\n- **Layer 1**: Embedding Generation: On the Data Science side, Spark jobs pulled data from multiple S3 sources, cleaned and preprocessed it, and applied matrix factorization to generate user and item embeddings. The processed data and embeddings were then stored back in S3 in a structured format.\\n- **Layer 2**: Candidate Generation (CG): In this stage, Spark jobs leveraged embeddings and historical interaction data to generate candidate recommendations for users. These candidate lists were subsequently written to S3.\\n- **Layer 3**: Ranking and Merging \u2013 A final round of processing ranked the generated candidates using ML models, combined different candidate lists, and stored the final ranked recommendations in a caching system.\\n- **Serving**: A microservice retrieved ranked recommendations from an in-memory data store via exposed APIs, delivering personalized listings across key surfaces such as \\"For You\\" and Category Landing Pages (CLP).\\n\\nThis approach held up well\u2014until Meesho started seeing a significant surge in traffic.\\n\\n## The Turning Point: From Batch to Real-Time\\n\\nAt this time, the team was iterating on new **Ranker models**, and real-time inference seemed like the next logical step. But Rankers needed **real-time feature retrieval**, which meant an **online feature store** had to be built first.\\n\\nExploring open-source options led to **cost vs. 
performance trade-offs**, but Meesho\u2019s surging traffic meant that **latency and stability were non-negotiable**. After multiple debates and stakeholder discussions, a bold decision was made:\\n\\n*We would build our own feature store.*\\n\\nMeanwhile, efforts began to bring **Candidate Generators (CGs)** to real-time. The challenge? **Storing and retrieving user interactions quickly enough** to power real-time recommendations.\\n\\nAs the team dove deeper, a new roadblock emerged: \\nOur ML jobs were orchestrated using **Airflow DAGs**, giving data scientists flexibility in experimentation. But transitioning to real-time execution threatened this agility. Every change would now require backend engineering support, **slowing down iteration cycles**.\\n\\nThat\u2019s when the idea struck: \\nWe needed a **framework for real-time DAG execution**\u2014one that preserved the same flexibility as Airflow but worked for **streaming data**.\\n\\nThis moment shaped the **next phase of our journey**.\\n\\n## First Generation Design\\n\\n![Alt Text](./first-gen-arch.png)\\n\\n# Laying the Groundwork: The First-Gen ML Platform\\n\\nTo solve these challenges, the team built three foundational components:\\n\\n\\n### 1. IOP Framework: A Real-Time DAG Executor\\n\\n- **Reusable Nodes**: Each DAG node (e.g., an invocation to a CG service, a ranker, or a filter) had to be implemented only once. After that, it could be reused across any workflow by referencing it in config.\\n- **Config-driven Dynamic Graphs**: Execution graphs were defined as adjacency lists stored in **ZooKeeper**, allowing teams to modify the sequence or structure of operations without touching application code.\\n- **Plug-and-play CGs**: The Candidate Generator interface was preserved, so a single CG node could call any CG service by passing `cg_name` in the request. 
This drastically reduced the code surface area and improved maintainability.\\n- **Production-Grade DAGs**: DAGs were designed to execute in **low-latency real-time environments**, with support for **parallel execution, retries, and branching**.\\n\\n[More about IOP DAG](https://www.meesho.io/blog/rebuilding-meeshos-ranking-platform)\\n\\n\\n### 2. Online Feature Store - 0th Version\\n\\n- Used **Cassandra** and **Redis** for low-latency feature serving.\\n- Maintained feature consistency using **Feature Groups** with TTL-based expiry.\\n- A hybrid schema was used: feature keys stored in **ZooKeeper**, data stored in **compact arrays**.\\n\\n\\n### 3. Interaction Store - 0th Version\\n\\n- Captured real-time user interactions like clicks, orders, and add-to-cart events.\\n- Stored event data in **Redis ZSETs (sorted sets)** to enable fast lookups for recommendation engines.\\n- Provided an API to fetch a user\'s **last _k_ interactions** or **interactions within a time window**.\\n\\n\\nWith these components in place, **real-time ML at Meesho became a reality**.\\n\\nThis was just the beginning.\\n\\n## Building the Online Feature Store - 0th Version\\n\\n![Alt text](./online-feature-store-v0.png)\\n\\n### Choosing the Right Tech Stack\\n\\nWe spent considerable time evaluating various databases, caches, and communication protocols for our **online feature store**. 
After carefully weighing **cost, latency, throughput**, and **operational stability**, we settled on a combination of:\\n\\n- **Cassandra** and **Redis** for storage\\n- **gRPC + Proto3** as our communication layer\\n\\n\\n### Streamlining the Data Flow\\n\\nTo keep things simple in the initial version:\\n\\n- **Feature engineering jobs** wrote raw outputs to an **S3 bucket**\\n- A **daily feature push job**:\\n - Read from S3\\n - Grouped related features into **Feature Groups** (ensuring consistency)\\n - Pushed them to **Kafka**\\n\\nFor features requiring frequent updates:\\n\\n- **Ad-hoc jobs** computed features in higher frequency\\n- These jobs pushed to both **Kafka** and **S3** (S3 preserved historical data for future model training)\\n\\n\\n## The Challenges: Data Format and Storage\\n\\nOne of the most critical design challenges was how to store feature data **efficiently and consistently**, especially in databases like **Cassandra** and **Redis**, which come with unique storage constraints.\\n\\nWe had to solve for three key requirements:\\n\\n- ### Feature Consistency\\n When a feature group contains features like `order_count_1h` and `click_count_1h`, both must reflect the **same time window**. Inconsistent updates would lead to **unreliable model predictions**.\\n\\n- ### TTL Granularity\\n Each feature group required an **expiry timestamp**, so that **all features within it expired together**\u2014preserving consistency during reads.\\n\\n- ### Extensibility Across Databases\\n We anticipated that infra needs would evolve. 
To future-proof our system, the data format was designed to be **decoupled from DB-specific layouts**, enabling portability to systems like **ScyllaDB**, **DynamoDB**, **HBase**, or **BigTable**.\\n\\n\\n---\\n\\n## Overcoming Technical Constraints\\nAt the time, we were using Cassandra, which not only imposed a soft limit of 75 columns per row, but also exhibited significant performance degradation as the number of columns increased further, particularly in memory constrained machines. Wide rows caused high memory usage during reads, unpredictable latencies due to heavy deserialization overhead, and inefficiencies during compactions and repairs. This ruled out the naive \\"one column per feature\\" approach. We needed a format that was compact, minimized the number of columns, and remained efficient and portable across different storage systems.\\n\\n## The Solution: Schema Separation\\n\\nWe introduced the concept of Feature Groups\u2014logical groupings of features that must remain consistent with one another.\\nTo represent these groups efficiently, we adopted a layered storage approach:\\n\\n- **Feature Labels (Keys)** were stored in ZooKeeper, serving as the schema.\\n- **Feature Values** were stored as a comma-separated string array in Cassandra or Redis.\\n- **Expiry Timestamp and Schema Version** were appended using a semi-colon delimiter at the end of the string.\\n\\nExample:\\n\\n```bash\\nfeature_1_value,feature_2_value,feature_3_value;expiry_ts\\n```\\n\\nThis format allowed:\\n- Consistent writes and reads at the group level\\n- Easy parsing of feature values using the schema lookup from ZooKeeper\\n- Efficient storage with minimal DB column usage\\n- Support for per-group TTLs and schema evolution\\n\\n## Tracking Changes in Feature Groups\\nFeature groups don\u2019t stay static. As models evolve, features get added, renamed, or removed. 
But schema changes often go live before the data is ready\u2014and stopping ingestion just to wait for everything to align isn\'t feasible.\\n\\n### Common Real-World Scenarios:\\n- A new feature is added to the schema, but ingestion jobs still use the older schema version.\\n- Ongoing writes don\u2019t include the newly added feature, and stopping ingestion would break freshness for existing features.\\n- During serving, models request a mix of old and new features, depending on rollout stages.\\n\\n## The Solution: Schema Versioning\\nWe solved this with versioned feature group schemas, which unlocked several capabilities:\\n- ### Backward Compatibility\\n Older ingestion jobs can continue writing using older schema versions. During reads, the system uses the schema version embedded in the value to interpret the data correctly.\\n- ### Partial Availability Handling \\n During inference, if some features in the request aren\u2019t available (due to rollout delays or missing data), the system serves default values, ensuring the inference call doesn\u2019t fail.\\n- ### Safe Writes Without Pipeline Pauses\\n With schema versioning, we no longer had to stop ingestion pipelines for schema updates. Writes using previous versions can continue safely, and downstream consumers evolve independently.\\nThis design gave us the flexibility to move fast without breaking things\u2014preserving data quality, enabling experimentation, and ensuring reliability at scale.\\n\\n![Alt Text](./schema.png)\\n\\n## Interaction Store - 0th Version\\n\\n![Alt Text](./interaction-store-v0.png)\\n\\nTo power real-time Candidate Generators (CGs), we needed fast access to user behavior signals\u2014like what a user recently clicked, ordered, or added to their cart. 
These interactions form the basis for many real-time recommendations, such as **Similar Products**, **People Also Viewed**, or **Recently Ordered Again**.\\nFor the **0th version** of the Interaction Store, we focused on a design that was **simple, fast, and reliable** \u2014 optimized for high-throughput ingestion and low-latency lookups.\\n\\n## Event Ingestion\\nWe instrumented our backend services to emit key user interaction events to Kafka in real time. These included:\\n- Click\\n- Order\\n- Add to Cart\\n- Wishlist\\n- Share\\n\\nEach event carried essential metadata:\\n- userId \u2014 uniquely identifies the user\\n- productId \u2014 the item being interacted with\\n- timestamp \u2014 the moment the interaction occurred\\n\\nThis decoupled the interaction logging from storage, allowing ingestion and consumption to scale independently.\\n\\n## Storage Design\\nTo store these events, we built Kafka consumers that processed the incoming streams and wrote the data into Redis, using sorted sets (ZSETs) as the primary data structure.\\n\\n### Why Redis?\\nRedis gave us:\\n- **Low-latency** reads and writes\\n- **Time-ordered data** using ZSETs (via score = timestamp)\\n- **Native TTL support**, if needed in later versions\\n- **In-memory performance** \u2014ideal for real-time CGs\\n\\n### Storage Structure\\nEach user\u2019s interactions were stored using a composite key format, uniquely identifying the user and interaction type. 
This structure allowed efficient organization and quick retrieval of recent activity for recommendation generation:\\n\\n```bash\\nuserId_eventType \u2192 ZSET[...(pid, ts)...]\\n```\\n\\nWithin each ZSET:\\n\\n- The **timestamp** served as the score, maintaining temporal order\\n- The **productId** (optionally with metadata) was the **value**\\n\\nThis allowed us to efficiently retrieve the interactions with HTTP-based API server with two query modes:\\n- Fetch the **last k interactions** of a specific type for a given user with `ZREVRANGE(userId_eventType, count)`\\n- Retrieve **all interactions within a time range** (e.g., last 24 hours) with `ZREVRANGEBYSCORE(userId_eventType, timeRange)`\\n\\n### Built-in Guardrails\\nSince Redis was the sole store, we implemented High Availability (HA) to prevent data loss. To optimize memory usage, we also enforced size limits per event type\u2014only storing the last k interactions per user, with older entries getting truncated.\\n\\n## Conclusion: Laying the Foundation for Real-Time ML\\n\\nIn this first phase, we tackled the **fundamentals**\u2014shifting from batch-based recommendations to a **real-time Recommendation** using ML platform that could keep up with Meesho\u2019s growth.\\n\\nWith the **IOP Framework**, **Online Feature Store**, and **Interaction Store**, we built the core infrastructure to support real-time personalization at scale. These wins have already unlocked: \\n- \u2705 Faster, more dynamic recommendations for millions of users. \\n- \u2705 Better infrastructure efficiency, reducing wasted compute power. \\n- \u2705 A flexible, modular system that allows for further experimentation.\\n\\nBut this is just the beginning. 
While we\'ve solved key challenges, **certain roadblocks remain** \u2014from optimizing **cost-performance trade-offs** to **seamlessly evolving schemas**.\\n\\n\\nThis foundational work laid the path for a reliable and scalable **real-time feature serving layer**."}]}}')}}]); \ No newline at end of file diff --git a/docs/assets/js/6479fb86.431b9ea8.js b/docs/assets/js/6479fb86.431b9ea8.js new file mode 100644 index 00000000..4a69b80c --- /dev/null +++ b/docs/assets/js/6479fb86.431b9ea8.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[5579],{3751:e=>{e.exports=JSON.parse('{"archive":{"blogPosts":[{"id":"episodic-memory-for-agents","metadata":{"permalink":"/BharatMLStack/blog/episodic-memory-for-agents","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/episodic-memory-for-agents/index.md","source":"@site/blog/bharatmlstack-history/episodic-memory-for-agents/index.md","title":"Beyond Vector RAG: Building Agent Memory That Learns From Experience.","description":"Current agent memory is just search. 
We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.","date":"2026-02-19T00:00:00.000Z","tags":[{"inline":true,"label":"ai-agents","permalink":"/BharatMLStack/blog/tags/ai-agents"},{"inline":true,"label":"memory","permalink":"/BharatMLStack/blog/tags/memory"},{"inline":true,"label":"architecture","permalink":"/BharatMLStack/blog/tags/architecture"},{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"episodic-memory","permalink":"/BharatMLStack/blog/tags/episodic-memory"}],"readingTime":11.67,"hasTruncateMarker":true,"authors":[{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"title":"Beyond Vector RAG: Building Agent Memory That Learns From Experience.","description":"Current agent memory is just search. We built an episodic memory system that tracks outcomes, forms causal links, extracts reasoning heuristics, and actually learns from failure \u2014 without retraining the model.","slug":"episodic-memory-for-agents","authors":["adarsha"],"date":"2026-02-19T00:00:00.000Z","tags":["ai-agents","memory","architecture","llm","episodic-memory"]},"unlisted":false,"nextItem":{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","permalink":"/BharatMLStack/blog/llm-inference-optimization-sub-sec-latency"}},"content":"![BharatMLStack](./bms.png)\\nAgent memory has come a long way. Persistent context, vector retrieval, knowledge graphs \u2014 the building blocks are real and getting better fast.\\n\\nBut most of what we call \\"memory\\" today is still closer to search: chunk text, embed it, retrieve whatever looks similar at query time. That works well for recalling facts and preferences. 
It starts to break down when you need an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.\\n\\nWe are trying to experiment something different. An episodic memory system where a frozen LLM \u2014 same weights, no retraining \u2014 produces increasingly better decisions over time because the memory feeding it context is continuously evolving.\\nThen we tested it. The results were interesting.\\n\\n\\n\x3c!-- truncate --\x3e\\n\\n## The Gap Nobody Talks About\\n\\nHere\'s a scenario every engineering team has encountered: AI agent hits a Redis connection pool exhaustion issue. It misdiagnoses it as a database problem. You correct it. Next week, a different service has the exact same failure pattern. The agent makes the exact same mistake.\\n\\nWhy? Because LLMs don\'t learn at inference time. Corrections adjust behavior within a conversation. Once the session ends, the lesson is gone. The model weights haven\'t changed. The next conversation starts from zero.\\n\\nCurrent \\"memory\\" systems don\'t fully address this. They store facts \u2014 user preferences, document chunks, conversation summaries. But facts aren\'t experience. Knowing that \\"Redis connection pools can exhaust under load\\" is different from remembering \\"last time I saw 500 errors under load, I assumed it was the database, I was wrong, it was actually the connection pool, and here\'s the correction I received.\\"\\n\\nThe first is a fact. The second is an episode. The difference matters.\\n\\n## What\'s Wrong With Vector RAG as Memory\\n\\nWe identified five structural gaps in how current agent frameworks handle memory:\\n\\n**No concept of time.** Two events are either semantically similar or they\'re not. The system can\'t represent \\"this happened after that\\" without distorting similarity scores. 
An agent can\'t reason about sequence or causality.\\n\\n**No concept of situation.** A production incident and a design review might use the same technical vocabulary. Flat vector search can\'t distinguish them. Your agent retrieves planning notes when it should be retrieving incident postmortems.\\n\\n**No outcome tracking.** The system stores *what happened* but not *whether it worked*. A failed approach and a successful one are equally retrievable. The agent has no way to prefer strategies that worked over strategies that didn\'t.\\n\\n**Summaries destroy evidence.** Summarization-based memory compresses experience but discards the reasoning chain. The agent loses the ability to explain *how* it arrived at a conclusion. The audit trail is gone.\\n\\n**No causal links.** Each memory chunk is independent. There\'s no way to express that incident A caused decision B, which led to outcome C, which was corrected by approach D. Without this structure, the agent can\'t traverse chains of reasoning.\\n\\nThese gaps compound. As an agent accumulates more experience, flat vector memory gets noisier, more contradictory, and less useful. The system degrades precisely when it should be improving.\\n\\n## The Architecture: Episodic Memory\\n\\nWe are building a memory system modeled on how human episodic memory works \u2014 not as a metaphor, but as an engineering specification.\\n\\nThe system has four layers:\\n\\n### Layer 1: Immutable Timeline\\n\\nEvery piece of agent experience is recorded as an append-only timeline entry. Each entry carries a semantic embedding (what it means), a timestamp (when it happened), and a state label (what situation the agent was in \u2014 debugging, planning, code review, incident response). Entries are never modified, never deleted, never summarized. 
This is the source of truth.\\n\\n### Layer 2: Episode Segmentation\\n\\nThe system watches the timeline and detects when one coherent unit of experience ends and another begins \u2014 via state transitions, semantic shifts, temporal gaps, or explicit signals. Each episode is a reference into the timeline (not a copy) with a generated summary, an outcome (SUCCESS, FAILURE, PARTIAL, UNKNOWN), decisions made, assumptions held, and corrections received.\\n\\nThe outcome field is the most important thing that doesn\'t exist in any current memory system. Without it, you can\'t learn from mistakes.\\n\\n### Layer 3: Episodic Graph\\n\\nEpisodes are connected through typed, weighted links: CAUSED_BY, LED_TO, RETRY_OF, LEARNED_FROM, CONTINUATION, CONTRADICTED. Over time, this forms a directed graph that enables traversal by meaning and causality. You can follow the chain: \\"this incident caused that investigation, which led to a failed fix, which was corrected by this approach.\\"\\n\\n### Layer 4: Generalized Facts\\n\\nWhen multiple episodes exhibit consistent patterns, the system extracts reasoning heuristics: \\"When services fail immediately after deployment with no traffic change, investigate configuration errors before connection pool problems.\\" Facts are versioned, never overwritten, and maintain links back to supporting and contradicting episodes. When contradicting evidence accumulates, confidence decreases. When confidence drops below a threshold, the fact is revised \u2014 but the old version is preserved.\\n\\nThe LLM sits above all four layers. At query time, the system assembles structured context \u2014 relevant episodes with outcomes, applicable facts with confidence scores, causal narratives \u2014 and passes it to the LLM for reasoning. The model reasons over structured memory. It doesn\'t store or manage memory.\\n\\n### The Reinforcement Loop\\n\\nThis is where it comes together:\\n\\n1. Agent reasons using retrieved episodes and facts\\n2. 
Outcome is detected (CI pass/fail, user correction, test result)\\n3. New episode is created with outcome tracking\\n4. Links are created between the retrieved episodes and the new episode\\n5. Facts are reinforced (if outcome aligned) or contradicted (if outcome conflicted)\\n6. If the decision was wrong and corrected, a LEARNED_FROM link is created\\n\\nThe model weights never change. The memory structure evolves continuously. A frozen LLM produces better decisions over time because it receives better context from richer memory.\\n\\n## The Experiment\\n\\nWe built the full system in Python (~1,000 lines) and tested it head-to-head against a baseline flat-vector RAG agent across a 9-round synthetic debugging scenario. Both agents used the identical LLM (Claude Sonnet 4) for reasoning. The only variable was the memory system.\\n\\nThe scenario was designed to test five capabilities:\\n\\n| Round Type | What It Tests | Rounds |\\n|---|---|---|\\n| LEARN | Can the agent build experience from failures? | 1, 2, 4 |\\n| RED HERRING | Can the agent resist applying a pattern when it doesn\'t fit? | 3 |\\n| TEST | Can the agent apply learned patterns to new services? | 5, 6 |\\n| SUBTLE | Can the agent generalize to different symptoms, same root cause? | 7 |\\n| CORRECTION | After being corrected, does the agent adapt? | 8, 9 |\\n\\nRounds 1-4 build experience: three connection pool failures across different services, plus one red herring (a deployment config error that *looks* like a connection pool issue). Rounds 5-7 test whether the agent applies the learned pattern to unfamiliar services and subtle symptom variations. 
Rounds 8-9 are the critical test: the agent is corrected after misdiagnosing a deployment-correlated error, then tested on a near-identical scenario to see if it adapts.\\n\\n## Results\\n\\n### Decision Accuracy\\n\\n| Round | Type | Episodic Agent | Baseline Agent |\\n|---|---|---|---|\\n| 1 | LEARN | \u2717 | \u2713 |\\n| 2 | LEARN | \u2713 | \u2713 |\\n| 3 | RED HERRING | \u2717 | \u2717 |\\n| 4 | LEARN | \u2713 | \u2713 |\\n| 5 | TEST | **\u2713** | \u2717 |\\n| 6 | TEST | **\u2713** | \u2717 |\\n| 7 | SUBTLE | **\u2713** | \u2717 |\\n| 8 | CORRECTION | \u2713 | \u2713 |\\n| 9 | CORRECTION | \u2713 | \u2713 |\\n| **Total** | | **7/9 (78%)** | **5/9 (56%)** |\\n\\nThe episodic agent won 7-5. A 40% relative improvement in decision accuracy using the exact same LLM.\\n\\n### Where the Gap Opened\\n\\nThe episodic agent\'s advantage concentrated in exactly the rounds designed to test memory quality:\\n\\n**Rounds 5-6 (pattern application):** The episodic agent cited 4 past failure episodes with connection pool exhaustion as root cause, complete with correction annotations. It correctly identified pool exhaustion in new services. The baseline retrieved disconnected chunks and suggested checking timeout configurations \u2014 a pattern it picked up from the Round 3 red herring.\\n\\n**Round 7 (subtle symptoms \u2014 latency increase, no errors):** Both agents had the same evidence available. The episodic agent\'s retrieval surfaced a diverse set of episodes (thanks to MMR diversity filtering) including the Redis pool exhaustion from Round 6, which primed it to recognize that latency without errors can still be pool contention. The baseline defaulted to \\"check recent config changes.\\"\\n\\n**Round 9 (adaptation after correction):** This is the result we\'re most proud of. 
Look at the episodic agent\'s reasoning:\\n\\n> *\\"Episode 1 directly parallels this situation \u2014 errors spiking immediately after a deployment (v2.4.1 then, v3.1.0 now) with no traffic change. In that case, the root cause was a database migration that dropped an index. The generalized fact confirms that deployment-related issues with immediate onset after version changes are more likely caused by configuration errors or missing dependencies than by connection pool problems.\\"*\\n\\nIt cited a specific past episode by analogy, quoted a generalized fact, and explained *why* this situation matches the deployment pattern rather than the connection pool pattern. The baseline gave a vaguer assessment.\\n\\n### Retrieval Quality\\n\\nThis is where the structural difference is most visible:\\n\\n| Metric | Episodic Agent | Baseline Agent |\\n|---|---|---|\\n| Retrieved items with explicit outcome labels | **100%** | 25% |\\n| Correct pattern applications (Rounds 4-7) | **4/4** | 1/4 |\\n| False positives (Rounds 8-9) | **0** | 0 |\\n\\nEvery item the episodic agent retrieved carried a structured outcome label (SUCCESS or FAILURE) with correction details. Only 25% of the baseline\'s chunks contained any outcome information \u2014 and those were incidental text mentions, not structured labels.\\n\\nThe episodic agent correctly applied the connection pool pattern in all four rounds where it was the root cause, and correctly avoided it in both rounds where it wasn\'t. The baseline applied it correctly once.\\n\\n## What Didn\'t Work\\n\\nTwo things didn\'t work as anticipated:\\n\\n**Round 3 (red herring):** Both agents failed. The symptoms looked like connection pool issues, but the root cause was a deployment config change. At this point, the episodic agent had only seen connection pool episodes \u2014 it had no counter-evidence for deployment-correlated errors. You can\'t distinguish patterns you\'ve only seen one side of. 
After Round 8 introduced a correction, the agent successfully avoided this mistake in Round 9.\\n\\n**Fact quality variance.** Some extracted facts were specific and actionable (\\"Deployment-related issues with immediate onset are more likely configuration errors\\"). Others were vague (\\"Initial symptom-based diagnosis often leads to misidentifying the root cause\\"). A production system needs a usefulness filter, not just a confidence score.\\n\\n## What This Means\\n\\nThe most important finding isn\'t the accuracy improvement. It\'s that the reinforcement loop closes without retraining.\\n\\nIn the POC, we observed:\\n\\n- Rounds 1-4: Agent encounters failures, episodes recorded with outcomes and corrections\\n- After Round 4: Fact extracted \u2014 \\"Connection pool exhaustion is a common root cause under load\\"\\n- Rounds 5-7: Agent applies the pattern with increasing confidence (fact support count grows)\\n- Round 8: Agent encounters a deployment error, correctly identifies it as config, gets corrected\\n- After Round 8: New fact \u2014 \\"Deployment-related issues with immediate onset are more likely configuration errors\\"\\n- Round 9: Agent receives near-identical scenario, correctly avoids connection pool pattern, cites the Round 8 correction\\n\\nThe model didn\'t change. The memory evolved. That\'s the whole point.\\n\\n## How It Compares to Existing Solutions\\n\\nAgent memory is a fast-moving space with several strong systems, each solving a different slice of the problem:\\n\\n**Mem0** excels at persistent personalization \u2014 extracting user preferences, managing session context, and reducing token costs through intelligent compression. It\'s the most production-ready memory layer available and integrates with nearly every agent framework. 
Its focus is on remembering about users and conversations rather than learning from task-level outcomes, which is a different problem than the one we\'re exploring here.\\n\\n**Zep/Graphiti** is doing some of the most interesting work in temporal knowledge graphs. Their bi-temporal model \u2014 tracking both when an event occurred and when it was ingested \u2014 addresses a real structural gap in how agent memory handles changing facts over time. Their episode and entity subgraphs share some philosophical DNA with our approach. Where our work diverges is in outcome tracking and reinforcement: we\'re specifically focused on whether a decision worked, and using that signal to update memory structure.\\n\\n**Letta (formerly MemGPT)** pioneered self-editing memory \u2014 giving the LLM tools to manage its own memory blocks. This is a powerful paradigm, and their recent work on \\"Context Repositories\\" and sleep-time compute suggests they\'re actively pushing toward agents that learn over time. Their team has been transparent that experiential learning is an unsolved problem, which is part of what motivated our exploration.\\n\\n**MemRL (Jan 2026 paper)** is the closest to our work academically. It shares the core insight of decoupling stable LLM reasoning from plastic, evolving memory. Their approach uses reinforcement learning to assign utility Q-values to memories, which is elegant but requires training a value function. Our approach is purely structural \u2014 no training step, no Q-values, just graph evolution and LLM-based reasoning over outcomes.\\n\\n\\nThe common thread: most existing systems focus on knowledge persistence \u2014 remembering facts, preferences, and conversation history across sessions. The problem we\'re exploring is experiential learning \u2014 tracking whether past decisions worked, forming causal chains between episodes, and extracting reasoning heuristics that improve over time. 
These are complementary capabilities that would be needed by an ideal production system.\\n\\n## Try It Yourself\\n\\nThe prototype is available in our experiments directory:\\n\\n```\\nexperiments/episodic-memory-prototype/\\n\u251c\u2500\u2500 memory/ # Timeline, encoder, episodes, graph, facts, retriever, reinforcer\\n\u251c\u2500\u2500 agent/ # Episodic memory agent\\n\u251c\u2500\u2500 baseline/ # Flat vector RAG agent (comparison)\\n\u251c\u2500\u2500 simulator/ # 9-round debugging scenario\\n\u251c\u2500\u2500 eval/ # Head-to-head comparison + scoring\\n\u2514\u2500\u2500 tests/\\n```\\n\\nTo run the comparison:\\n\\n```bash\\ncd experiments/episodic-memory-prototype\\npython -m venv .venv && source .venv/bin/activate\\npip install -r requirements.txt\\nexport ANTHROPIC_API_KEY=sk-ant-...\\npython -m eval.compare\\n```\\n\\nWithout an API key, it runs in heuristic mode (keyword-based decisions). With a key, both agents use Claude Sonnet for reasoning \u2014 that\'s where the quality gap becomes visible.\\n\\n\\n## Conclusion\\nThis is a 9-round synthetic scenario we designed. It demonstrates the poc architecture works end-to-end and shows where episodic memory provides qualitatively different reasoning. It is not a peer-reviewed benchmark and should not be interpreted as a statistically rigorous claim. 
We\'re publishing the prototype so others can reproduce and extend the evaluation.\\nIf this sparks your interest, please start a GitHub discussion.\\n\\n---\\n\\n*The episodic memory prototype is available in `BharatMLStack` repo at `/experiments/episodic-memory-prototype`*"},{"id":"llm-inference-optimization-sub-sec-latency","metadata":{"permalink":"/BharatMLStack/blog/llm-inference-optimization-sub-sec-latency","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/llm-inference-optimization/index.md","source":"@site/blog/bharatmlstack-history/llm-inference-optimization/index.md","title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","description":"A practical guide to the optimization techniques behind sub-second LLM inference\u2014covering paged KV caching, INT4 AWQ and FP8 quantization, kernel fusion, inflight batching, parallelism strategies, and speculative decoding, with production benchmarks on L4 and A100 GPUs.","date":"2025-06-02T00:00:00.000Z","tags":[{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"vllm","permalink":"/BharatMLStack/blog/tags/vllm"},{"inline":true,"label":"tensorrt-llm","permalink":"/BharatMLStack/blog/tags/tensorrt-llm"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":4.88,"hasTruncateMarker":false,"authors":[{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null}],"frontMatter":{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","description":"A practical guide to the optimization techniques behind sub-second LLM inference\u2014covering paged KV caching, 
INT4 AWQ and FP8 quantization, kernel fusion, inflight batching, parallelism strategies, and speculative decoding, with production benchmarks on L4 and A100 GPUs.","authors":["jaya"],"slug":"llm-inference-optimization-sub-sec-latency","date":"2025-6-2","tags":["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"Beyond Vector RAG: Building Agent Memory That Learns From Experience.","permalink":"/BharatMLStack/blog/episodic-memory-for-agents"},"nextItem":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/multi-engine-llm-inferencing-platform"}},"content":"![BharatMLStack](./bms.png)\\nRaw execution of Large Language Models is inherently expensive and memory-intensive. To achieve sub-second latency and high throughput, we implement a multi-layered optimization strategy that targets the entire inference stack\u2014from memory management to kernel execution.\\n\\n## 1. Advanced Memory Management: Paged & Prefix KV Caching\\n\\nThe most significant bottleneck in LLM inference is not always compute, but memory bandwidth\u2014specifically managing the Key-Value (KV) cache.\\n\\n### Paged KV caching\\n\\nStandard caching suffers from fragmentation. We use **Paged KV caching**, which operates similarly to an operating system\'s virtual memory: the KV cache is divided into non-contiguous blocks. This lets us serve larger batch sizes without running out of memory.\\n\\n### KV cache quantization\\n\\nTo further maximize available memory, we implement **KV cache quantization** (e.g., FP8). 
By compressing stored attention keys and values from 16-bit to 8-bit, we nearly double the effective context window capacity of the GPU, allowing longer conversations or larger batches without materially degrading quality.\\n\\n### Prefix caching (the \\"voice bot\\" optimizer)\\n\\nFor use cases like GenAI voice bots where the system prompt (e.g., \\"You are a helpful assistant...\\") is static across thousands of requests, we enable **prefix caching**.\\n\\n- **Impact**: By reusing pre-computed KV states for common prefixes, we achieve a cache hit rate of ~90%. This reduces **Time To First Token (TTFT)** by skipping redundant computation of the system prompt.\\n\\n## 2. Aggressive Quantization (INT4 AWQ & FP8)\\n\\nRunning models in their native 16-bit precision (BF16) restricts maximum batch size and throughput. We use quantization to shrink model weights without sacrificing accuracy.\\n\\n### INT4 AWQ (Activation-aware Weight Quantization)\\n\\nFor the Llama 3 family, we use **AWQ** to compress weights to 4 bits. This reduces model size by ~75%, allowing larger models to fit into L4 GPU memory and significantly improving token generation speed.\\n\\n### FP8 precision\\n\\nFor NVIDIA Hopper (H100) architectures, we are exploring **FP8 quantization**, leveraging native FP8 tensor cores to accelerate matrix multiplications while maintaining a higher dynamic range than integer quantization.\\n\\n- **Verification**: We validate quantized models by comparing dot-product similarity of embeddings against the FP16 baseline, consistently achieving **>99% similarity**.\\n\\n## 3. 
Kernel Fusion & Custom Plugins\\n\\nTo minimize overhead from launching thousands of small GPU operations, we fuse them into monolithic kernels using NVIDIA TensorRT plugins.\\n\\n- **Flash attention & FMHA**: We enable **Fused Multi-Head Attention (FMHA)** combined with flash attention to reduce memory reads/writes.\\n- **GEMM plugins**: We use specialized **GEMM** plugins to accelerate transformer linear layers.\\n- **Removing input padding**: Instead of padding short sequences to match the longest, we remove input padding so the GPU processes only valid tokens.\\n\\n## 4. Inflight (Continuous) Batching\\n\\nTraditional static batching waits for all requests in a batch to finish before returning results\u2014so one long response delays everyone else.\\n\\nWe implement **inflight batching**: as soon as one request completes, its slot is freed and filled by a new request from the queue. This keeps GPUs saturated and decouples latency of short queries from long ones.\\n\\n## 5. Parallelism Strategies: Scaling Beyond One GPU\\n\\nFor large models (e.g., 70B+ parameters) that cannot fit into the VRAM of a single GPU, we use parallelism strategies.\\n\\n- **Tensor parallelism (TP)**: Split weight matrices across multiple GPUs (e.g., 4\xd7 L4 or 8\xd7 A100). Each GPU computes a shard and outputs are reduced at every layer.\\n- **Pipeline parallelism (PP)**: Split model layers across GPUs to pipeline compute (e.g., while one GPU computes later layers for Request A, another starts early layers for Request B).\\n\\n## 6. Speculative Decoding\\n\\nTo reduce inter-token latency (ITL), we explore **speculative decoding**.\\n\\n- **Mechanism**: A smaller, faster \\"draft\\" model speculatively generates a short token sequence (e.g., 5 tokens).\\n- **Verification**: The larger target model verifies those tokens in one parallel forward pass. If correct, we effectively generate multiple tokens per large-model step; if not, we discard and regenerate. 
This is effective for predictable text, improving perceived generation speed.\\n\\n## Few Benchmarks\\n\\nBelow are a couple of representative use cases and performance numbers.\\n\\n### Search query rewriting\\n\\n- **LLM**: Fine-tuned llama-3.2-1B\\n- **Input & output token length**: ~10\u201320\\n- **Response type**: Non-streaming\\n\\n| Inference runtime | Hardware | Max requests/sec | Max p99 latency |\\n| --- | --- | ---: | ---: |\\n| TensorRT-LLM | 4 \xd7 L4 GPUs (multi-GPU) | 1000 | 95 ms |\\n| TensorRT-LLM | 1 \xd7 A100 40 GB GPU | 1000 | 69 ms |\\n\\n### Voice bot query\\n\\n- **LLM**: Llama-3.1-8B\\n- **Input token length**: ~1900\u20132000\\n- **Output token length**: ~200\\n- **Response type**: Streaming\\n\\n| Inference runtime | Concurrency | p99 TTFT (ms) | p99 ITL (ms) | Token throughput (tokens/sec) | Request throughput (req/sec) | Hardware |\\n| --- | ---: | ---: | ---: | ---: | ---: | --- |\\n| TensorRT-LLM | 1 | 36.27 | 22.78 | 45.66 | 0.23 | L4 |\\n| TensorRT-LLM | 2 | 49.81 | 23.21 | 89.37 | 0.45 | L4 |\\n| TensorRT-LLM | 4 | 55.33 | 36.62 | 153.39 | 0.78 | L4 |\\n| TensorRT-LLM | 8 | 66.5 | 39.11 | 279.88 | 1.47 | L4 |\\n| TensorRT-LLM | 16 | 131.8 | 30.39 | 547.8 | 2.77 | L4 |\\n| TensorRT-LLM | 32 | 277.22 | 48.02 | 925.7 | 4.78 | L4 |\\n| TensorRT-LLM | 64 | 498.52 | 71.62 | 1,164.40 | 6.2 | L4 |\\n| TensorRT-LLM | 128 | 677.31 | 120.37 | 1,445.18 | 7.69 | L4 |\\n| TensorRT-LLM | 256 | 1,926.31 | 216.88 | 1,600.81 | 8.52 | L4 |\\n| TensorRT-LLM | 1 | 21.17 | 9.24 | 130.05 | 0.68 | A100 |\\n| TensorRT-LLM | 2 | 25.78 | 9.21 | 264.5 | 1.35 | A100 |\\n| TensorRT-LLM | 4 | 28.52 | 10.99 | 437.69 | 2.27 | A100 |\\n| TensorRT-LLM | 8 | 34.4 | 12.61 | 760.49 | 3.96 | A100 |\\n| TensorRT-LLM | 16 | 68.03 | 14.32 | 1,343.80 | 7.01 | A100 |\\n| TensorRT-LLM | 32 | 185.96 | 16.82 | 2,287.30 | 11.92 | A100 |\\n| TensorRT-LLM | 64 | 136.87 | 21.17 | 3,625.22 | 18.89 | A100 |\\n| TensorRT-LLM | 128 | 463.78 | 34.15 | 4,456.51 | 23.24 | A100 |\\n| 
TensorRT-LLM | 256 | 890.12 | 59.18 | 5,188.24 | 27.05 | A100 |\\n\\n## Conclusion\\n\\nHigh-performance LLM inference is fundamentally a systems engineering problem: memory efficiency, kernel execution, batching strategy, and parallelism determine real-world latency and throughput. Techniques such as paged KV caching, aggressive quantization, kernel fusion, and inflight batching improve GPU utilization while reducing latency and memory pressure.\\n\\nThese optimizations enable the platform to deliver sub-second responses, sustain high concurrency, and efficiently serve both lightweight and long-context workloads. By continuously optimizing across the full inference stack, we keep LLM serving scalable, cost-efficient, and production-ready for real-time AI applications."},{"id":"multi-engine-llm-inferencing-platform","metadata":{"permalink":"/BharatMLStack/blog/multi-engine-llm-inferencing-platform","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/llm-inferencing-platform/index.md","source":"@site/blog/bharatmlstack-history/llm-inferencing-platform/index.md","title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","description":"A deep dive into building a production-grade LLM inference platform\u2014covering the full LLMOps lifecycle from model onboarding and automated compilation to multi-engine serving with TensorRT-LLM, vLLM, and Dynamo, along with cold-start mitigation and LLM-specific 
observability.","date":"2025-03-29T00:00:00.000Z","tags":[{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"vllm","permalink":"/BharatMLStack/blog/tags/vllm"},{"inline":true,"label":"tensorrt-llm","permalink":"/BharatMLStack/blog/tags/tensorrt-llm"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":13.31,"hasTruncateMarker":false,"authors":[{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null}],"frontMatter":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","description":"A deep dive into building a production-grade LLM inference platform\u2014covering the full LLMOps lifecycle from model onboarding and automated compilation to multi-engine serving with TensorRT-LLM, vLLM, and Dynamo, along with cold-start mitigation and LLM-specific observability.","authors":["jaya"],"slug":"multi-engine-llm-inferencing-platform","date":"2025-3-29","tags":["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","permalink":"/BharatMLStack/blog/llm-inference-optimization-sub-sec-latency"},"nextItem":{"title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","permalink":"/BharatMLStack/blog/scaling-model-inference-and-embedding-search"}},"content":"![BharatMLStack](./bms.png)\\nServing large language models in production introduces new challenges across infrastructure, performance optimization, and operational lifecycle management. 
The LLM Inference Platform addresses these challenges by providing a unified system for deploying and managing open-source and fine-tuned LLMs at scale.\\n\\nThe platform implements a complete LLMOps lifecycle \u2014 from model registration and automated compilation to deployment, runtime optimization, and monitoring. Designed as a self-service environment, users can onboard models directly from open repositories such as Hugging Face or upload custom fine-tuned models, and deploy them using a single-click workflow with no manual infrastructure or configuration steps required.\\n\\nIn addition to fully automated deployment, the platform allows users to select and apply custom inference optimization techniques \u2014 such as quantization strategies, batching configurations, and runtime-specific performance enhancements \u2014 enabling teams to balance latency, throughput, and cost based on their use case. The goal is to reduce operational friction while enabling high-performance, production-grade LLM inference.\\n\\n## Why LLM Inference Is not just bigger ML model serving\\n\\nLarge language model (LLM) inference introduces a fundamentally different set of challenges compared to traditional machine learning inference. While classical ML models typically perform a single forward pass to produce a fixed prediction, LLMs operate as autoregressive systems, generating outputs token by token based on previously generated context. This difference dramatically changes how inference systems must be designed, optimized, and scaled.\\n\\n### Autoregressive Generation and Sequential Computation:\\n\\nUnlike traditional models such as classifiers or recommenders \u2014 where inference cost is relatively constant \u2014 LLMs generate responses incrementally. Each new token depends on all previously generated tokens, making inference inherently sequential and dynamic. 
This means latency and compute requirements vary significantly depending on prompt length and output size, introducing complexity in scheduling and resource allocation.\\nBecause tokens cannot be generated fully in parallel during decoding, GPUs may become underutilized without specialized batching and scheduling strategies. This has led to the development of dedicated LLM inference engines optimized for token-level execution.\\n\\n### Prefill and Decode Phases:\\n\\nLLM inference typically consists of two distinct stages:\\n\\n- Prefill phase \u2014 the model processes the input prompt and builds internal representations. This stage is compute-heavy and highly parallelizable.\\n- Decode phase \u2014 the model generates tokens sequentially, predicting one token at a time using previously generated context.\\n\\nThe decode stage often becomes memory-bound rather than compute-bound, which creates new performance bottlenecks compared to traditional ML workloads.\\n\\n### Context Management and KV Caching:\\n\\nAnother fundamental difference lies in how LLMs maintain context. Transformer-based models rely on attention mechanisms that require access to past token representations. To avoid recomputing these representations repeatedly, inference engines use key-value (KV) caching, which stores intermediate activations from previous tokens.\\nKV caching significantly improves performance by eliminating redundant computation, but it introduces new challenges:\\n\\n- Memory consumption grows with sequence length and batch size\\n- GPU memory becomes a critical bottleneck\\n- Efficient memory management becomes essential for scaling concurrent requests\\n\\nThis tradeoff between compute efficiency and memory usage is unique to LLM inference workloads.\\n\\n### Dynamic and Irregular Workloads:\\n\\nTraditional ML inference typically operates on fixed-size inputs with predictable latency. 
In contrast, LLM requests vary widely in prompt length, output length, and runtime behavior. As a result:\\n\\n- Batch sizes must be dynamic rather than static\\n- Requests may enter and leave batches asynchronously\\n- Scheduling systems must continuously rebalance workloads to maximize GPU utilization\\n\\nThese characteristics require specialized serving architectures that differ significantly from standard ML serving pipelines.\\n\\n### Streaming and User Experience Constraints:\\n\\nAnother distinguishing factor is the expectation of real-time streaming responses. Instead of returning a single output, LLM systems often stream tokens to users as they are generated. \\nBecause of these differences \u2014 sequential generation, growing memory requirements, dynamic workloads, and streaming constraints \u2014 LLM inference cannot be treated as a simple extension of existing ML serving systems. Production platforms must incorporate specialized runtime engines, advanced optimization techniques, and observability tailored specifically to LLM workloads.\\n\\n## LLMOps: High-Level Architecture \\n\\n![LLM Architecture](./llm-plat.png)\\n\\nThe LLM Inference Framework is designed as a fully automated, end-to-end system for deploying and operating open-source and fine-tuned large language models at scale. The architecture abstracts the complexity of model optimization, hardware selection, deployment, and runtime management into a unified workflow that enables users to move from raw model weights to production-ready inference endpoints with minimal manual intervention.\\n\\nOur LLM Inference Framework is architected not just as a serving engine, but as a complete lifecycle management system. As illustrated in the high-level design below, the platform automates the journey of a model through seven distinct stages, ensuring reproducibility, performance, and scalability.\\n\\n1. 
Onboarding & Registration (The Source of Truth)\\n\\n The lifecycle begins with the Data Scientist or engineer.\\n\\n - Model Ingestion: Users onboard models\u2014whether open-source (Hugging Face, NeMo) or internally fine-tuned\u2014via the Truffle Box SDK/UI.\\n - LLM + Prompt Registry: Unlike traditional systems that only track model weights, our registry is a unified control plane. It stores both the Model Artifacts and the Prompt Templates. This allows Data Scientists to register and version-control prompts (e.g., \\"customer_support_v2\\") independently of the application code.\\n\\n2. The \\"Black Box\\" Build Engine\\n\\n Once a model is registered, the Automated LLM Compiler + Quantizer Module kicks off a background job on ephemeral GPU resources.\\n\\n - Transformation: The raw model is converted into a TRT-LLM Checkpoint.\\n - Quantization: The system automatically applies quantization algorithms (like INT4 AWQ or FP8) to reduce memory footprint.\\n - Engine Building: Finally, it compiles a highly optimized TRT Engine specifically tuned for the target hardware.\\n\\n3. Intelligent Profiling & Validation\\n\\n Before deployment, the new engine passes through the Hardware & Inference Runtime Profiler.\\n\\n - Benchmarking: This module empirically tests the engine against various hardware configurations (L4 vs. A100) and runtimes (TRT-LLM vs. vLLM).\\n - Optimization: It recommends the optimal configuration that meets latency SLAs (Time-To-First-Token) while minimizing cost.\\n\\n4. Smart Artifact Generation & Distribution\\n\\n To solve the Kubernetes \\"Cold Start\\" problem, the LLM Serving Artifacts Generation module packages the model using a bifurcated strategy:\\n\\n - Standard Models: Artifacts are uploaded to Cloud Storage (GCS) and downloaded by pods at startup.\\n - Very Large Models: For massive models (>8GB) where network downloads are too slow, the system pre-caches the model onto Secondary Boot Disks. 
These disks are attached directly to new GPU nodes during autoscaling, eliminating download wait times.\\n\\n5. Image Streaming & Deployment\\n\\n Simultaneously, the inference runtime container images are pulled from the Artifact Registry.\\n\\n - Image Streaming: We utilize container image streaming to allow pods to start initializing while the massive Triton/Dynamo container layers are still downloading, further shaving seconds off the startup time.\\n\\n6. The Inference Runtime (Kubernetes)\\n\\n The workload lands on Kubernetes with Autoscaling.\\n\\n - Dynamic Backends: Depending on the profile generated in Stage 3, the pod initializes either TensorRT-LLM (for throughput) or vLLM (for flexibility), or spins up a Dynamo worker for distributed inference.\\n - Data Loading: The pod either downloads the model from Cloud Storage or mounts the pre-warmed Secondary Boot Disk (\\"Pull from Disk\\").\\n\\n7. Client Interaction & Observability\\n\\n Finally, the LLM Inference Client executes the request.\\n\\n - Prompt Injection: The client pulls the specific prompt template ID from the Registry, ensuring the exact versioned instructions are used.\\n - Streaming Response: The request is sent via gRPC, and tokens are streamed back to the user in real-time.\\n\\n8. Observability: Monitoring the Pulse of GenAI\\n\\n In traditional microservices, success is measured by CPU utilization and request latency (p99). For Large Language Models, these metrics are insufficient. A user doesn\'t care if the GPU is at 80% utilization; they care about how fast the first word appears and how smoothly the rest of the sentence follows.\\n\\n To capture the true user experience, our platform instrumentation focuses on three critical LLM-specific metrics:\\n\\n 1. 
Time to First Token (TTFT)\\n - Definition: TTFT measures the time elapsed from the moment a request is received until the very first token is generated and streamed back to the user.\\n - Why it matters: This represents the \\"Prefill Phase\\" latency\u2014the time the model takes to process the input prompt and load weights. A high TTFT makes the application feel unresponsive or \\"hung.\\"\\n - Optimization: We closely monitor TTFT to ensure our Prefix Caching is effective (aiming for high cache hit rates), which drastically lowers this metric by skipping redundant prompt processing.\\n\\n 2. Inter-Token Latency (ITL)\\n - Definition: ITL measures the average time interval between the generation of consecutive tokens during the \\"Decode Phase\\".\\n - Why it matters: This defines the \\"perceived speed\\" of reading. Even if the first token is fast (low TTFT), high ITL makes the text generation look \\"jerky\\" or slow to the user.\\n - Benchmarks: In our testing with Llama 3.1, we track p99 ITL to ensure it stays below human reading speeds to maintain a natural conversational flow.\\n\\n 3. Token Throughput vs. Request Throughput\\n - We distinguish between two types of throughput to balance system efficiency with user load:\\n - Token Throughput (tokens/sec): The total number of tokens generated across all concurrent requests. This measures the raw compute efficiency of the GPU and the effectiveness of batching.\\n - Request Throughput (req/sec): The number of distinct user queries served per second. We use this to determine autoscaling thresholds, ensuring we scale out before the queue depth impacts ITL.\\n\\n 4. 
The Monitoring Stack\\n - Real-time Dashboards: We utilize Grafana to visualize these streaming metrics in real-time, allowing on-call engineers to spot \\"slow generation\\" incidents that generic \\"500 error\\" alerts would miss.\\n - Request Tracing: Since Triton Inference Server does not log request payloads by default, we integrate a Helix Client to asynchronously publish request logs to Log Tables. This allows us to trace a specific \\"slow\\" request back to its prompt to understand if a complex input caused the latency spike.\\n\\n## Supported Inference backends (TensorRT LLM, Dynamo & vLLM)\\n\\nTailored for the Use Case: We do not believe in a \\"one-size-fits-all\\" approach to inference. Different use cases\u2014whether a real-time voice bot requiring ultra-low, sub-second latency or a massive reasoning task requiring huge context windows\u2014demand different runtime characteristics. Our platform is designed to be runtime-agnostic, allowing us to automatically select and tailor the best engine based on the specific requirements of the application:\\n\\n1. TensorRT-LLM: The High-Performance Standard\\n\\n Suitable for: High-throughput production workloads where latency is critical (e.g., customer support chat, real-time voice bots).\\n\\n TensorRT-LLM serves as our default backend for these scenarios. 
Our internal benchmarks on Llama 3.1 and 3.2 models demonstrated that a tuned TensorRT-LLM engine significantly outperforms standard runtimes, especially when utilizing INT4 AWQ and FP8 quantization.\\n\\n Key optimizations we tailor for these high-load cases include:\\n\\n - Optimized execution via TensorRT engine compilation\\n - Quantization-aware execution for reduced memory usage and improved throughput\\n - Inflight Batching: Allowing requests to be processed continuously without waiting for the entire batch to finish, drastically improving GPU utilization.\\n - Custom Plugins: Enabling specific NVIDIA plugins like the GEMM plugin and GPT Attention plugin to accelerate matrix multiplications and attention mechanisms.\\n\\n2. Dynamo: Distributed Inference for Reasoning Models\\n\\n Suitable for: Very large \\"reasoning\\" models (70B+) or scenarios requiring massive context windows where a single GPU\'s memory is insufficient.\\n\\n For these memory-bound tasks, we utilize Dynamo, a low-latency distributed inference framework. Unlike monolithic servers, Dynamo disaggregates the inference process to scale resources horizontally:\\n\\n - KV Aware Routing: A specialized router directs requests to workers that already hold the relevant Key-Value (KV) cache, minimizing redundant computation.\\n - Prefill vs. Decode Split: The workload is divided into Prefill Workers (processing the prompt) and Decode Workers (generating tokens), allowing us to scale the compute-heavy \\"reading\\" phase independently from the memory-heavy \\"writing\\" phase.\\n - Distributed execution across multiple GPU resources\\n\\n3. 
vLLM: The Flexible Baseline\\n\\n Suitable for: Rapid prototyping, testing new model architectures, or low-traffic internal tools where ease of deployment outweighs raw throughput.\\n\\n While TensorRT-LLM is optimized for maximum speed, vLLM provides a robust and flexible baseline.\\n\\n - High throughput through dynamic batching and efficient memory utilization\\n - Paged KV cache management for handling long contexts and concurrent requests\\n - Strong support for open-source model ecosystems\\n - Rapid Adoption: It allows us to onboard new model architectures immediately without waiting for a custom TensorRT build.\\n - Benchmarking Insight: In our internal tests, vLLM provided a strong baseline but often lacked the specific max-token optimizations present in our custom TRT engines. We use it strategically for initial testing before committing to a full TensorRT optimization pipeline.\\n\\n## Conclusion\\n\\nLarge language model inference introduces a fundamentally new class of infrastructure challenges\u2014where performance is governed not just by raw compute, but by memory efficiency, intelligent scheduling, runtime specialization, and lifecycle automation. Unlike traditional ML serving, LLM inference requires systems that understand token-level execution, manage rapidly growing context state, and continuously balance latency, throughput, and cost under highly dynamic workloads.\\n\\nThe LLM Inference Framework addresses these challenges by transforming inference into a fully automated, reproducible lifecycle\u2014from model onboarding and compilation to deployment, optimization, and observability. 
By integrating automated quantization and engine compilation, intelligent runtime selection, cold-start mitigation strategies, and LLM-specific observability metrics such as Time-to-First-Token and Inter-Token Latency, the platform ensures both high performance and operational simplicity.\\n\\nEqually important, the framework is designed with flexibility and future evolution in mind. Its runtime-agnostic architecture enables seamless adoption of emerging inference engines, hardware accelerators, and optimization techniques without requiring platform redesign. This ensures that teams can continuously leverage advancements in the rapidly evolving LLM ecosystem while maintaining consistent operational workflows.\\n\\nUltimately, the goal of the platform is to make production-scale LLM deployment as seamless and reliable as traditional software deployment\u2014allowing teams to focus on building intelligent applications rather than managing infrastructure complexity. By combining lifecycle automation, runtime optimization, and deep observability, the LLM Inference Framework provides a scalable foundation for delivering fast, cost-efficient, and production-ready LLM experiences.\\n\\n## Future Explorations\\n\\nWhile we have achieved significant milestones in latency and throughput, the landscape of GenAI is evolving rapidly. Our roadmap focuses on increasing flexibility, reducing costs, and enhancing reliability for enterprise-grade workloads. Here is what we are building next:\\n\\n- TPU Support: To diversify our hardware supply chain and further optimize cost-per-token, we are evaluating Google Cloud TPUs for integration into our platform. By leveraging the JAX and PyTorch/XLA ecosystems, we aim to unlock the massive throughput potential of TPU v5e chips, particularly for our open-source Llama models. 
This will allow the hardware profiler to dynamically choose between NVIDIA GPUs and Google TPUs based on real-time availability and price-performance metrics.\\n- Multi-LoRA Serving (Serverless Experience): Currently, deploying a fine-tuned model requires a dedicated GPU. We are building support for Multi-LoRA serving, which will allow us to serve hundreds of unique, fine-tuned adapters on top of a single frozen base model. This will drastically reduce costs for multi-tenant applications, enabling a \\"serverless\\" experience where specific fine-tunes are hot-swapped instantly per request.\\n- Spot Instance Orchestration: To further optimize cloud costs, we are developing fault-tolerant mechanisms to run inference workloads on Spot Instances. By implementing aggressive checkpointing and seamless request draining, we aim to leverage cheaper, preemptible compute capacity without interrupting the user\'s streaming experience.\\n- Semantic Caching Layer: We plan to move beyond standard Prefix Caching to implement Semantic Caching. By using a vector database to fetch responses for semantically similar queries (e.g., \\"How do I reset my password?\\" vs. \\"Password reset steps\\"), we can bypass the GPU entirely for repetitive queries, reducing latency to near-zero.\\n- Context-Aware Autoscaling: Standard CPU/GPU utilization metrics are often insufficient signals for scaling LLMs. We are working on KV-cache pressure metrics for autoscaling. This ensures that we scale out before the memory fills up, preventing eviction-based slowdowns during traffic spikes.\\n- Online Evaluation & Guardrails: We are integrating a lightweight \\"Trust Layer\\" into the proxy. 
This will allow for low-latency input/output filtering (Guardrails) and asynchronous \\"LLM-as-a-Judge\\" evaluation pipelines to monitor response quality in production, not just system health."},{"id":"scaling-model-inference-and-embedding-search","metadata":{"permalink":"/BharatMLStack/blog/scaling-model-inference-and-embedding-search","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/scaling-model-inference-and-embedding-search/index.md","source":"@site/blog/bharatmlstack-history/scaling-model-inference-and-embedding-search/index.md","title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","description":"How Meesho scaled model inference with self-hosted Triton on GKE\u2014slashing latency and costs by 65%\u2014and built a real-time embedding search system on Qdrant to power personalized recommendations at scale.","date":"2024-05-21T00:00:00.000Z","tags":[{"inline":true,"label":"model-inference","permalink":"/BharatMLStack/blog/tags/model-inference"},{"inline":true,"label":"embedding-search","permalink":"/BharatMLStack/blog/tags/embedding-search"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":3.55,"hasTruncateMarker":false,"authors":[{"name":"Aditya Kumar","title":"Lead Software Engineer @ Meesho","url":"https://github.com/Adit2607","imageURL":"https://github.com/Adit2607.png","key":"aditya","page":null},{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null},{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"title":"Cracking the 
Code: Scaling Model Inference & Real-Time Embedding Search","description":"How Meesho scaled model inference with self-hosted Triton on GKE\u2014slashing latency and costs by 65%\u2014and built a real-time embedding search system on Qdrant to power personalized recommendations at scale.","authors":["aditya","jaya","adarsha"],"slug":"scaling-model-inference-and-embedding-search","date":"2024-05-21T00:00:00.000Z","tags":["model-inference","embedding-search","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/multi-engine-llm-inferencing-platform"},"nextItem":{"title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","permalink":"/BharatMLStack/blog/building-meeshos-mlplatform-lessons-from-first-gen"}},"content":"![BharatMLStack](./bms.png)\\nBy mid-2023, we had transformed our ML stack\u2014building a real-time feature store, optimizing model retrieval, and fine-tuning ranking. But two critical gaps remained:\\n\\n- \ud83d\udd39 Scaling model inference without hitting infrastructure roadblocks\\n- \ud83d\udd39 Moving embedding search from batch to real-time for candidate generation\\n\\nHere\u2019s how we tackled these last-mile challenges, broke free from infrastructure constraints, and built a cost-efficient, high-performance system.\\n\\n## Breaking Free from the Scalability Ceiling\\n\\n### The Model Serving Bottleneck\u2014A Wake-Up Call\\n\\nJuly 2023. With just months left for the Mega Blockbuster Sale (MBS), we noticed a serious issue\u2014scaling our model-serving infrastructure was taking 10\u201315 minutes. 
In real-time ML, that\u2019s an eternity.\\nIn one of our war rooms, we ran a quick experiment:\\n\\n- \ud83d\ude80 We deployed an XGBoost model on a self-hosted Triton Inference Server running on a 16-core machine.\\n- \ud83d\ude80 Fired requests and compared the outputs with our existing cloud-hosted setup.\\n- \ud83d\ude80 The results matched\u2014perfectly.\\n\\nThat moment changed everything. We prepped a backup Triton setup on EKS, just in case our cloud provider couldn\'t allocate enough compute resources in time. Luckily, they did\u2014but the seed was planted.\\nThen in October, just two weeks before MBS, we got an alarming response from our infrastructure team:\\n \\"Node availability may be an issue.\\"\\nWith no time to waste, we moved 30% of real-time ML traffic to our self-hosted Triton cluster. The results?\\n\\n- \u2705 p99 latency dropped from 90\u2013100ms to 30\u201340ms\\n- \u2705 Triton handled significantly higher throughput on fewer resources\\n- \u2705 No model changes were needed\\n\\nMBS ran without a hitch, proving that self-hosted inference was the way forward.\\n\\n### Scaling Triton on GKE\\n\\nThis left us with two choices:\\n\\n- 1\ufe0f\u20e3 Port models to a managed cloud inference service, investing time in learning a new deployment stack\\n- 2\ufe0f\u20e3 Scale our existing Triton setup on GKE, optimizing for cost and performance\\n\\nWe went with Option 2\u2014and it slashed inference costs to 35% of what we previously paid, while giving us full control over scaling and optimizations.\\n\\n### Fixing the Cold Start Problem\\n\\nAs we onboarded more deep learning (DL) models, we hit a new bottleneck: new inference pods took 7\u20139 minutes to spin up.\\n\\nAfter profiling, we found the culprits:\\n\\n- Triton\u2019s base image\u2014a massive 5GB\\n- Model binaries\u2014often 1GB+\\n- Startup delay\u2014mostly due to downloading and initializing these assets\\n\\nTo fix this, we built a lightweight Triton image, stripping unused 
components and shrinking the size to 900MB. This cut cold start times drastically, making auto-scaling faster and smoother.\\n\\n## Embedding Search: The Last Piece of the Puzzle\\n\\nBy mid-2023, most of our ML stack had gone real-time\u2014except for Candidate Generation (CG), which still ran in batch mode. To truly power real-time recommendations, we needed an online embedding search system.\\n\\n### Choosing the Right Vector Database\\n\\nWe benchmarked three production-ready vector DBs across key parameters:\\n\\n- Milvus\\n- Qdrant\\n- Weaviate\\n\\nAfter extensive POCs, Qdrant stood out for its:\\n\\n- \u2705 Blazing-fast search latency on high-dimensional vectors\\n- \u2705 Efficient memory usage, crucial for in-memory workloads\\n- \u2705 Support for upserts and soft deletes, vital for Ads use cases\\n- \u2705 gRPC + REST APIs, making integration seamless\\n- \u2705 Powerful filtering, allowing fine-tuned retrieval (e.g., filtering Ads by category, active status, etc.)\\n\\nAt its core, Qdrant uses HNSW indexing, delivering both high recall and low-latency nearest-neighbor search\u2014a perfect fit for our needs.\\n\\n### Embedding Freshness & Real-Time Updates\\n\\nTo ensure embeddings stayed up to date, we built a dual ingestion pipeline:\\n\\n- \ud83d\udccc Daily Refresh: A bulk pipeline updated embeddings overnight\\n- \ud83d\udccc Real-Time Updates: Ads events triggered immediate upserts/deletes\\n\\nThis setup powered real-time \\"Similar Products\\" recommendations on the product page and became the foundation for Ads Candidate Generation, ensuring the right ads surfaced in milliseconds.\\n\\n![Skye](./vss.png)\\n\\n## Final Takeaways: Scaling Smartly for Real-Time ML\\n\\n- \ud83d\ude80 Self-hosted inference on Triton gave us lower cost, faster scaling, and better performance than managed services\\n- \ud83d\ude80 Building a custom Triton image reduced cold starts, improving responsiveness\\n- \ud83d\ude80 Qdrant-based embedding search enabled 
real-time personalization at scale\\n- \ud83d\ude80 Real-time updates for embeddings unlocked dynamic, up-to-date recommendations\\n\\nBy early 2024, Meesho\u2019s ML stack had evolved into a fully real-time, scalable, and cost-efficient system, setting the foundation for even bigger leaps ahead."},{"id":"building-meeshos-mlplatform-lessons-from-first-gen","metadata":{"permalink":"/BharatMLStack/blog/building-meeshos-mlplatform-lessons-from-first-gen","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/building-meeshos-mlplatform-lessons-from-first-gen/index.md","source":"@site/blog/bharatmlstack-history/building-meeshos-mlplatform-lessons-from-first-gen/index.md","title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","description":"Lessons from scaling Meesho\'s first-gen ML platform\u2014building Inferflow for no-code feature retrieval, migrating from Cassandra to ScyllaDB, optimizing the Interaction Store with tiered storage, and cutting infra costs by 60% while hitting 1M QPS.","date":"2023-04-10T00:00:00.000Z","tags":[{"inline":true,"label":"inferflow","permalink":"/BharatMLStack/blog/tags/inferflow"},{"inline":true,"label":"interaction-store","permalink":"/BharatMLStack/blog/tags/interaction-store"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":6.25,"hasTruncateMarker":false,"authors":[{"name":"Bhawani Singh","title":"Architect @ Meesho","url":"https://github.com/singh-bhawani","imageURL":"https://github.com/singh-bhawani.png","key":"bhawani","page":null},{"name":"Jigar Dave","title":"Lead Software Engineer @ Meesho","url":"https://github.com/jigarpatel26","imageURL":"https://github.com/jigarpatel26.png","key":"jigar","page":null},{"name":"Adarsha 
Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","description":"Lessons from scaling Meesho\'s first-gen ML platform\u2014building Inferflow for no-code feature retrieval, migrating from Cassandra to ScyllaDB, optimizing the Interaction Store with tiered storage, and cutting infra costs by 60% while hitting 1M QPS.","authors":["bhawani","jigar","adarsha"],"slug":"building-meeshos-mlplatform-lessons-from-first-gen","date":"2023-4-10","tags":["inferflow","interaction-store","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","permalink":"/BharatMLStack/blog/scaling-model-inference-and-embedding-search"},"nextItem":{"title":"Building Meesho\u2019s ML Platform: From Chaos to Cutting-Edge (Part 1)","permalink":"/BharatMLStack/blog/building-meeshos-mlplatform"}},"content":"![BharatMLStack](./bms.png)\\nBy late 2022, we had built something we were truly proud of\u2014a real-time ML serving system with a DAG-based executor, a feature store, and an interaction store powering key ranking and personalization models. It was a major milestone, the culmination of months of effort from data scientists, ML engineers, and backend teams. Our system was live, and we were ready to push the boundaries of experimentation.\\nAnd it worked. Mostly.\\nBut soon, cracks appeared. Every new model needed custom feature retrieval logic, DAGs became dense and unmanageable, and scaling turned into a constant firefight. Costs surged, and infra bottlenecks slowed experimentation. 
Our system worked, but it wasn\u2019t built for scale.\\nThis is the story of how we tackled these challenges\u2014building Inferflow for seamless feature retrieval, optimizing real-time infra, and cutting costs while scaling to millions of QPS.\\n\\n### The Cost of Success\\nEvery new Ranker model required its own feature set, often pulling from different entities. Each addition meant:\\n\\n- Adding new DAG nodes in IOP\\n- Writing custom logic to fetch features from multiple sources (e.g., user, product, user \xd7 category)\\n- Inferring intermediate features (e.g., extracting category from a product to fetch user \xd7 category data)\\n- Optimizing I/O and dealing with the inevitable bugs\\n\\nWhat began as clean DAGs soon turned into a tangled web of cross-dependent graphs. Every experimentation cycle meant new nodes, new dependencies, and slower iterations.\\n\\n### Scaling Pains (and Cassandra\u2019s Limits)\\nAt some point, we were hitting:\\n\\n- 250\u2013300K reads/sec\\n- 1M writes/sec (during lean hours)\\n\\nAll of this ran on Cassandra. While its distributed architecture had been proven in production, operating large-scale clusters came with considerable infrastructure overhead. Our proof-of-concept (POC) demonstrated throughput of around 100K ops/sec, but as we scaled further, the challenges grew. Ensuring node health, optimizing compaction, and maintaining storage balance became increasingly demanding. 
We also observed latency spikes under heavy load, alongside a sharp increase in total cost of ownership.\\n\\n### Interaction Store Woes\\nOur interaction store was another ticking time bomb:\\n\\n- \ud83d\udea8 Clusters kept growing in size and cost\\n- \ud83d\udea8 Latency spikes became increasingly frequent\\n- \ud83d\udea8 The DMC proxy occasionally lost locality of nodes against shards, causing cross-node communication and degraded performance\\n\\nEach time this happened, we had to manually rebalance shards just to restore stable latency, making operations unsustainable at scale.\\n\\n### Silver Linings\\nDespite the chaos, the system was live and delivering value:\\n\\n- Real-time infrastructure was in production\\n- Costs dropped by 60\u201370% compared to offline personalization\\n- New experiments rolled out faster and more successfully\\n- User engagement metrics improved\\n\\nIt wasn\u2019t perfect. It was far from easy. But it worked\u2014and that counted for a lot.\\n\\n### Round Two: Solving the Top 2 Bottlenecks\\nWith the first-gen system stretched to its limits, we stepped back. Conversations with data scientists and backend engineers revealed three recurring pain points:\\n\\n1. Coding feature retrieval logic for every new model was becoming unsustainable\\n2. ML scale was exploding\u2014bringing rising infra costs with it\\n3. 
Real-time embedding search was the next big unlock\\n\\nWe tackled them one by one\u2014starting with the biggest pain point.\\n\\n#### Problem 1: No-Code Feature Retrieval for Model Inference\\nWe noticed a pattern: for personalized ranking, models needed features from:\\n\\n- \u2705 Product\\n- \u2705 User\\n- \u2705 User \xd7 Category\\n- \u2705 Region, cohort, sub-category, etc.\\n\\nA key insight emerged: Entities that contribute features for a model always map back to the context entities.\\n\\n![MP Dag](./mp-dag.png)\\n\\nWith this, we designed Inferflow, a graph-driven feature retrieval and model orchestration system:\\n\\n- 1\ufe0f\u20e3 Inferflow takes a modelId and context IDs (e.g., userId, productIds)\\n- 2\ufe0f\u20e3 Loads a pre-defined feature retrieval graph from ZooKeeper\\n- 3\ufe0f\u20e3 Executes the graph to resolve entity relationships dynamically\\n- 4\ufe0f\u20e3 Outputs a 2D matrix of feature vectors\\n\\n\ud83d\udca1 The impact?\\n\\n- \ud83d\ude80 No more custom feature retrieval code\u2014just graph updates in config\\n- \ud83d\ude80 Feature consistency across experiments\\n- \ud83d\ude80 Faster iteration cycles for ranking, fraud detection, and beyond\\n\\nHere\u2019s a visual example that shows how this graph plays out during execution. We further extended the graph to call multiple models as needed:\\n![MP matrix](./mp-matrix.png)\\nWe built Inferflow in GoLang, using gRPC and Proto3 serialization for efficiency.\\n\\n#### Problem 2: Scaling Without Breaking the Bank\\nWith more ML use cases coming online, we needed to cut costs without compromising performance. 
We focused on:\\n\\n- \ud83d\udd39 Online Feature Store\\n- \ud83d\udd39 Interaction Store\\n\\n#### Optimizing the Online Feature Store\\nOur costs were concentrated in:\\n\\n- \ud83d\udccc Database (Cassandra)\\n- \ud83d\udccc Cache (Redis)\\n- \ud83d\udccc Running Pods (Java services)\\n\\n1\ufe0f\u20e3 Replacing Cassandra with ScyllaDB\\nAs we hit the operational limits of large Cassandra clusters, we transitioned to ScyllaDB, which offered a seamless drop-in replacement without major code changes. The switch brought significant benefits:\\n\\n- Throughput: Matched or exceeded Cassandra\'s performance under identical workloads, even under high concurrency.\\n- Latency: Achieved consistently lower P99 latencies due to ScyllaDB\'s shard-per-core architecture and better I/O utilization.\\n- Cost Efficiency: Reduced infra footprint by ~70% through better CPU and memory efficiency, eliminating the need for over-provisioned nodes.\\n\\n2\ufe0f\u20e3 Finding the Right Cache\\nTo reduce backend load and improve response times, we benchmarked multiple caching solutions\u2014Memcached, KeyDB, and Dragonfly\u2014under real production traffic patterns. Dragonfly stood out due to its robust architecture and operational simplicity:\\n\\n- Data Skew Handling: Efficiently managed extreme key hotness and uneven access patterns without performance degradation.\\n- Throughput: Delivered consistently high throughput, even with large object sizes and concurrent access.\\n- Ease of Adoption: Acted as a drop-in Redis replacement with full protocol compatibility\u2014no changes needed in application code or client libraries.\\n\\n3\ufe0f\u20e3 Moving to GoLang for Cost-Efficient Serving\\nJava services were memory-heavy\u2014so we rewrote core services in GoLang. 
The results?\\n\\n\u2705 Memory usage dropped by ~80%\\n\u2705 CPU utilization was significantly lower\\n\u2705 Faster, more efficient deployments\\n\\n#### Optimizing the Interaction Store\\nWe realized that we only need a user\u2019s interaction data in Redis when they open the app. So, we implemented a tiered storage approach:\\n\\n- \ud83d\udccc Cold Tier (ScyllaDB)\u2014Stores click, order, wishlist events\\n- \ud83d\udccc Hot Tier (Redis)\u2014Loads a user\u2019s past interactions only when they open the app\\n\\nSmart Offloading: We introduced an inactivity tracker to detect when a user session ends. At that point, Redis data was flushed back to Scylla, reducing unnecessary writes.\\n\\n![InteractionStore](./interaction-str.png)\\n#### Results\\n\\n- Online Feature Store hit 1M QPS for the first time during the 2023 Mega Blockbuster Sale\u2014without breaking a sweat\\n- Infra costs for Online Feature Store and Interaction Store dropped by ~60%\\n\\n#### The Catch: Our ML Hosting Hit a Hard Limit\\nWhile planning for 2023 MBS, we ran into a critical scalability bottleneck:\\n\\n- \u274c Insufficient compute availability in our region for ML instances\\n- \u274c Couldn\u2019t provision enough nodes to handle real-time inference at scale\\n\\nThis forced us to rethink where and how we hosted our models. The existing setup was great for prototyping\u2014but it wasn\u2019t built to handle the bursty, high-QPS demands of real-world production workloads.\\n\\n### Conclusion: From Firefighting to Future-Proofing\\nWhat started as an ambitious experiment turned into a real-time ML infrastructure that powered millions of requests per second. We battled scaling pains, rethought feature retrieval with Inferflow, and rebuilt our infra stack for efficiency\u2014driving down costs while improving experimentation velocity.\\nBut new challenges emerged. Our infrastructure could now handle scale, but our ML model hosting setup hit a hard limit. 
With compute availability bottlenecks threatening real-time inference, we faced a critical decision: how do we make model serving as scalable and cost-efficient as the rest of our stack? That\u2019s the next piece of the puzzle\u2014and the story of Part 3."},{"id":"building-meeshos-mlplatform","metadata":{"permalink":"/BharatMLStack/blog/building-meeshos-mlplatform","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/building-meeshos-mlplatform-from-chaos-to-cutting-edge/index.md","source":"@site/blog/bharatmlstack-history/building-meeshos-mlplatform-from-chaos-to-cutting-edge/index.md","title":"Building Meesho\u2019s ML Platform: From Chaos to Cutting-Edge (Part 1)","description":"How Meesho transitioned from batch-based recommendations to a real-time ML platform\u2014building an Online Feature Store, Interaction Store, and DAG execution framework that became BharatMLStack.","date":"2022-11-15T00:00:00.000Z","tags":[{"inline":true,"label":"online-feature-store","permalink":"/BharatMLStack/blog/tags/online-feature-store"},{"inline":true,"label":"interaction-store","permalink":"/BharatMLStack/blog/tags/interaction-store"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"}],"readingTime":10.19,"hasTruncateMarker":false,"authors":[{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null},{"name":"Aditya Kumar","title":"Lead Software Engineer @ Meesho","url":"https://github.com/Adit2607","imageURL":"https://github.com/Adit2607.png","key":"aditya","page":null},{"name":"Bhawani Singh","title":"Architect @ Meesho","url":"https://github.com/singh-bhawani","imageURL":"https://github.com/singh-bhawani.png","key":"bhawani","page":null},{"name":"Jigar Dave","title":"Lead Software Engineer @ 
Meesho","url":"https://github.com/jigarpatel26","imageURL":"https://github.com/jigarpatel26.png","key":"jigar","page":null}],"frontMatter":{"title":"Building Meesho\u2019s ML Platform: From Chaos to Cutting-Edge (Part 1)","description":"How Meesho transitioned from batch-based recommendations to a real-time ML platform\u2014building an Online Feature Store, Interaction Store, and DAG execution framework that became BharatMLStack.","slug":"building-meeshos-mlplatform","authors":["adarsha","aditya","bhawani","jigar"],"date":"2022-11-15T00:00:00.000Z","tags":["online-feature-store","interaction-store","mlplatform","meesho"]},"unlisted":false,"prevItem":{"title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","permalink":"/BharatMLStack/blog/building-meeshos-mlplatform-lessons-from-first-gen"}},"content":"![BharatMLStack](./bms.png)\\nIt all started in early 2022, over a casual Friday evening catch-up. Like many great origin stories, this one began with friendly banter between a group of backend engineers and data scientists. As the conversations unfolded, so did the roasting\u2014until one remark hit a little too close to home:\\n\\n*\\"Why are we still crunching data for Monthly Active Users (MAU) when the next day it\u2019s all about Daily Active Users (DAU)?\\"*\\n\\nThe laughter died down, and the question lingered. When we regrouped on Monday\u2014clear-headed and slightly reflective\u2014we decided to dig into the numbers. What we discovered was quite revealing: a large portion of compute resources wasn\u2019t being put to good use.\\nMuch of the system\u2019s effort was spent supporting users who weren\u2019t actively engaging, and even for new users, the experience wasn\u2019t optimized to make a meaningful impact.\\n\\nAt the same time, Meesho had just launched a company-wide initiative to reduce costs\u2014and every team had to contribute. 
This realization sparked the journey that would eventually lead to the **Meesho ML Platform**, known today as **BharatMLStack**.\\n\\n![Alt Text](./old-batch-arch.png)\\n\\nBefore the ML Platform, our recommendation and ranking pipelines followed a batch processing approach:\\n- **Data Ingestion**: The Data Platform team executed ETL jobs to ingest raw user data\u2014including user profiles, interaction logs, and product impressions\u2014into designated S3 buckets.\\n- **Layer 1**: Embedding Generation: On the Data Science side, Spark jobs pulled data from multiple S3 sources, cleaned and preprocessed it, and applied matrix factorization to generate user and item embeddings. The processed data and embeddings were then stored back in S3 in a structured format.\\n- **Layer 2**: Candidate Generation (CG): In this stage, Spark jobs leveraged embeddings and historical interaction data to generate candidate recommendations for users. These candidate lists were subsequently written to S3.\\n- **Layer 3**: Ranking and Merging \u2013 A final round of processing ranked the generated candidates using ML models, combined different candidate lists, and stored the final ranked recommendations in a caching system.\\n- **Serving**: A microservice retrieved ranked recommendations from an in-memory data store via exposed APIs, delivering personalized listings across key surfaces such as \\"For You\\" and Category Landing Pages (CLP).\\n\\nThis approach held up well\u2014until Meesho started seeing a significant surge in traffic.\\n\\n## The Turning Point: From Batch to Real-Time\\n\\nAt this time, the team was iterating on new **Ranker models**, and real-time inference seemed like the next logical step. But Rankers needed **real-time feature retrieval**, which meant an **online feature store** had to be built first.\\n\\nExploring open-source options led to **cost vs. 
performance trade-offs**, but Meesho\u2019s surging traffic meant that **latency and stability were non-negotiable**. After multiple debates and stakeholder discussions, a bold decision was made:\\n\\n*We would build our own feature store.*\\n\\nMeanwhile, efforts began to bring **Candidate Generators (CGs)** to real-time. The challenge? **Storing and retrieving user interactions quickly enough** to power real-time recommendations.\\n\\nAs the team dove deeper, a new roadblock emerged: \\nOur ML jobs were orchestrated using **Airflow DAGs**, giving data scientists flexibility in experimentation. But transitioning to real-time execution threatened this agility. Every change would now require backend engineering support, **slowing down iteration cycles**.\\n\\nThat\u2019s when the idea struck: \\nWe needed a **framework for real-time DAG execution**\u2014one that preserved the same flexibility as Airflow but worked for **streaming data**.\\n\\nThis moment shaped the **next phase of our journey**.\\n\\n## First Generation Design\\n\\n![Alt Text](./first-gen-arch.png)\\n\\n# Laying the Groundwork: The First-Gen ML Platform\\n\\nTo solve these challenges, the team built three foundational components:\\n\\n\\n### 1. IOP Framework: A Real-Time DAG Executor\\n\\n- **Reusable Nodes**: Each DAG node (e.g., an invocation to a CG service, a ranker, or a filter) had to be implemented only once. After that, it could be reused across any workflow by referencing it in config.\\n- **Config-driven Dynamic Graphs**: Execution graphs were defined as adjacency lists stored in **ZooKeeper**, allowing teams to modify the sequence or structure of operations without touching application code.\\n- **Plug-and-play CGs**: The Candidate Generator interface was preserved, so a single CG node could call any CG service by passing `cg_name` in the request. 
This drastically reduced the code surface area and improved maintainability.\\n- **Production-Grade DAGs**: DAGs were designed to execute in **low-latency real-time environments**, with support for **parallel execution, retries, and branching**.\\n\\n[More about IOP DAG](https://www.meesho.io/blog/rebuilding-meeshos-ranking-platform)\\n\\n\\n### 2. Online Feature Store - 0th Version\\n\\n- Used **Cassandra** and **Redis** for low-latency feature serving.\\n- Maintained feature consistency using **Feature Groups** with TTL-based expiry.\\n- A hybrid schema was used: feature keys stored in **ZooKeeper**, data stored in **compact arrays**.\\n\\n\\n### 3. Interaction Store - 0th Version\\n\\n- Captured real-time user interactions like clicks, orders, and add-to-cart events.\\n- Stored event data in **Redis ZSETs (sorted sets)** to enable fast lookups for recommendation engines.\\n- Provided an API to fetch a user\'s **last _k_ interactions** or **interactions within a time window**.\\n\\n\\nWith these components in place, **real-time ML at Meesho became a reality**.\\n\\nThis was just the beginning.\\n\\n## Building the Online Feature Store - 0th Version\\n\\n![Alt text](./online-feature-store-v0.png)\\n\\n### Choosing the Right Tech Stack\\n\\nWe spent considerable time evaluating various databases, caches, and communication protocols for our **online feature store**. 
After carefully weighing **cost, latency, throughput**, and **operational stability**, we settled on a combination of:\\n\\n- **Cassandra** and **Redis** for storage\\n- **gRPC + Proto3** as our communication layer\\n\\n\\n### Streamlining the Data Flow\\n\\nTo keep things simple in the initial version:\\n\\n- **Feature engineering jobs** wrote raw outputs to an **S3 bucket**\\n- A **daily feature push job**:\\n - Read from S3\\n - Grouped related features into **Feature Groups** (ensuring consistency)\\n - Pushed them to **Kafka**\\n\\nFor features requiring frequent updates:\\n\\n- **Ad-hoc jobs** computed features in higher frequency\\n- These jobs pushed to both **Kafka** and **S3** (S3 preserved historical data for future model training)\\n\\n\\n## The Challenges: Data Format and Storage\\n\\nOne of the most critical design challenges was how to store feature data **efficiently and consistently**, especially in databases like **Cassandra** and **Redis**, which come with unique storage constraints.\\n\\nWe had to solve for three key requirements:\\n\\n- ### Feature Consistency\\n When a feature group contains features like `order_count_1h` and `click_count_1h`, both must reflect the **same time window**. Inconsistent updates would lead to **unreliable model predictions**.\\n\\n- ### TTL Granularity\\n Each feature group required an **expiry timestamp**, so that **all features within it expired together**\u2014preserving consistency during reads.\\n\\n- ### Extensibility Across Databases\\n We anticipated that infra needs would evolve. 
To future-proof our system, the data format was designed to be **decoupled from DB-specific layouts**, enabling portability to systems like **ScyllaDB**, **DynamoDB**, **HBase**, or **BigTable**.\\n\\n\\n---\\n\\n## Overcoming Technical Constraints\\nAt the time, we were using Cassandra, which not only imposed a soft limit of 75 columns per row, but also exhibited significant performance degradation as the number of columns increased further, particularly in memory constrained machines. Wide rows caused high memory usage during reads, unpredictable latencies due to heavy deserialization overhead, and inefficiencies during compactions and repairs. This ruled out the naive \\"one column per feature\\" approach. We needed a format that was compact, minimized the number of columns, and remained efficient and portable across different storage systems.\\n\\n## The Solution: Schema Separation\\n\\nWe introduced the concept of Feature Groups\u2014logical groupings of features that must remain consistent with one another.\\nTo represent these groups efficiently, we adopted a layered storage approach:\\n\\n- **Feature Labels (Keys)** were stored in ZooKeeper, serving as the schema.\\n- **Feature Values** were stored as a comma-separated string array in Cassandra or Redis.\\n- **Expiry Timestamp and Schema Version** were appended using a semi-colon delimiter at the end of the string.\\n\\nExample:\\n\\n```bash\\nfeature_1_value,feature_2_value,feature_3_value;expiry_ts\\n```\\n\\nThis format allowed:\\n- Consistent writes and reads at the group level\\n- Easy parsing of feature values using the schema lookup from ZooKeeper\\n- Efficient storage with minimal DB column usage\\n- Support for per-group TTLs and schema evolution\\n\\n## Tracking Changes in Feature Groups\\nFeature groups don\u2019t stay static. As models evolve, features get added, renamed, or removed. 
But schema changes often go live before the data is ready\u2014and stopping ingestion just to wait for everything to align isn\'t feasible.\\n\\n### Common Real-World Scenarios:\\n- A new feature is added to the schema, but ingestion jobs still use the older schema version.\\n- Ongoing writes don\u2019t include the newly added feature, and stopping ingestion would break freshness for existing features.\\n- During serving, models request a mix of old and new features, depending on rollout stages.\\n\\n## The Solution: Schema Versioning\\nWe solved this with versioned feature group schemas, which unlocked several capabilities:\\n- ### Backward Compatibility\\n Older ingestion jobs can continue writing using older schema versions. During reads, the system uses the schema version embedded in the value to interpret the data correctly.\\n- ### Partial Availability Handling \\n During inference, if some features in the request aren\u2019t available (due to rollout delays or missing data), the system serves default values, ensuring the inference call doesn\u2019t fail.\\n- ### Safe Writes Without Pipeline Pauses\\n With schema versioning, we no longer had to stop ingestion pipelines for schema updates. Writes using previous versions can continue safely, and downstream consumers evolve independently.\\nThis design gave us the flexibility to move fast without breaking things\u2014preserving data quality, enabling experimentation, and ensuring reliability at scale.\\n\\n![Alt Text](./schema.png)\\n\\n## Interaction Store - 0th Version\\n\\n![Alt Text](./interaction-store-v0.png)\\n\\nTo power real-time Candidate Generators (CGs), we needed fast access to user behavior signals\u2014like what a user recently clicked, ordered, or added to their cart. 
These interactions form the basis for many real-time recommendations, such as **Similar Products**, **People Also Viewed**, or **Recently Ordered Again**.\\nFor the **0th version** of the Interaction Store, we focused on a design that was **simple, fast, and reliable** \u2014 optimized for high-throughput ingestion and low-latency lookups.\\n\\n## Event Ingestion\\nWe instrumented our backend services to emit key user interaction events to Kafka in real time. These included:\\n- Click\\n- Order\\n- Add to Cart\\n- Wishlist\\n- Share\\n\\nEach event carried essential metadata:\\n- userId \u2014 uniquely identifies the user\\n- productId \u2014 the item being interacted with\\n- timestamp \u2014 the moment the interaction occurred\\n\\nThis decoupled the interaction logging from storage, allowing ingestion and consumption to scale independently.\\n\\n## Storage Design\\nTo store these events, we built Kafka consumers that processed the incoming streams and wrote the data into Redis, using sorted sets (ZSETs) as the primary data structure.\\n\\n### Why Redis?\\nRedis gave us:\\n- **Low-latency** reads and writes\\n- **Time-ordered data** using ZSETs (via score = timestamp)\\n- **Native TTL support**, if needed in later versions\\n- **In-memory performance** \u2014ideal for real-time CGs\\n\\n### Storage Structure\\nEach user\u2019s interactions were stored using a composite key format, uniquely identifying the user and interaction type. 
This structure allowed efficient organization and quick retrieval of recent activity for recommendation generation:\\n\\n```bash\\nuserId_eventType \u2192 ZSET[...(pid, ts)...]\\n```\\n\\nWithin each ZSET:\\n\\n- The **timestamp** served as the score, maintaining temporal order\\n- The **productId** (optionally with metadata) was the **value**\\n\\nThis allowed us to efficiently retrieve the interactions with HTTP-based API server with two query modes:\\n- Fetch the **last k interactions** of a specific type for a given user with `ZREVRANGE(userId_eventType, count)`\\n- Retrieve **all interactions within a time range** (e.g., last 24 hours) with `ZREVRANGEBYSCORE(userId_eventType, timeRange)`\\n\\n### Built-in Guardrails\\nSince Redis was the sole store, we implemented High Availability (HA) to prevent data loss. To optimize memory usage, we also enforced size limits per event type\u2014only storing the last k interactions per user, with older entries getting truncated.\\n\\n## Conclusion: Laying the Foundation for Real-Time ML\\n\\nIn this first phase, we tackled the **fundamentals**\u2014shifting from batch-based recommendations to a **real-time Recommendation** using ML platform that could keep up with Meesho\u2019s growth.\\n\\nWith the **IOP Framework**, **Online Feature Store**, and **Interaction Store**, we built the core infrastructure to support real-time personalization at scale. These wins have already unlocked: \\n- \u2705 Faster, more dynamic recommendations for millions of users. \\n- \u2705 Better infrastructure efficiency, reducing wasted compute power. \\n- \u2705 A flexible, modular system that allows for further experimentation.\\n\\nBut this is just the beginning. 
While we\'ve solved key challenges, **certain roadblocks remain** \u2014from optimizing **cost-performance trade-offs** to **seamlessly evolving schemas**.\\n\\n\\nThis foundational work laid the path for a reliable and scalable **real-time feature serving layer**."}]}}')}}]); \ No newline at end of file diff --git a/docs/assets/js/runtime~main.1a0737d5.js b/docs/assets/js/runtime~main.a356c557.js similarity index 97% rename from docs/assets/js/runtime~main.1a0737d5.js rename to docs/assets/js/runtime~main.a356c557.js index 412e72d4..ec433a43 100644 --- a/docs/assets/js/runtime~main.1a0737d5.js +++ b/docs/assets/js/runtime~main.a356c557.js @@ -1 +1 @@ -(()=>{"use strict";var e,a,c,f,d,b={},t={};function r(e){var a=t[e];if(void 0!==a)return a.exports;var c=t[e]={id:e,loaded:!1,exports:{}};return b[e].call(c.exports,c,c.exports,r),c.loaded=!0,c.exports}r.m=b,r.c=t,e=[],r.O=(a,c,f,d)=>{if(!c){var b=1/0;for(i=0;i=d)&&Object.keys(r.O).every(e=>r.O[e](c[o]))?c.splice(o--,1):(t=!1,d0&&e[i-1][2]>d;i--)e[i]=e[i-1];e[i]=[c,f,d]},r.n=e=>{var a=e&&e.__esModule?()=>e.default:()=>e;return r.d(a,{a:a}),a},c=Object.getPrototypeOf?e=>Object.getPrototypeOf(e):e=>e.__proto__,r.t=function(e,f){if(1&f&&(e=this(e)),8&f)return e;if("object"==typeof e&&e){if(4&f&&e.__esModule)return e;if(16&f&&"function"==typeof e.then)return e}var d=Object.create(null);r.r(d);var b={};a=a||[null,c({}),c([]),c(c)];for(var t=2&f&&e;("object"==typeof t||"function"==typeof t)&&!~a.indexOf(t);t=c(t))Object.getOwnPropertyNames(t).forEach(a=>b[a]=()=>e[a]);return b.default=()=>e,r.d(d,b),d},r.d=(e,a)=>{for(var c in 
a)r.o(a,c)&&!r.o(e,c)&&Object.defineProperty(e,c,{enumerable:!0,get:a[c]})},r.f={},r.e=e=>Promise.all(Object.keys(r.f).reduce((a,c)=>(r.f[c](e,a),a),[])),r.u=e=>"assets/js/"+({9:"845957d4",74:"340c7c5f",149:"e8321834",690:"3e1c5046",770:"aaabe254",940:"3980073a",1009:"50899a24",1065:"44d1c015",1235:"a7456010",1405:"e66382f6",1508:"56eef1be",1537:"8dd2df60",1606:"d152284c",1686:"08daf6b6",1782:"3650a837",1903:"acecf23e",1915:"c4822c4f",1964:"4af50aac",1965:"b0267ac9",1966:"3039fa8c",1999:"f994c8da",2092:"0dae2a8b",2344:"4caa95bf",2379:"4df0e30b",2576:"2303959d",2634:"c4f5d8e4",2711:"9e4087bc",2771:"e8202a51",2951:"9aed321e",3068:"d853e668",3212:"c31e69d4",3239:"23d02069",3249:"ccc49370",3322:"7fa80e1c",3645:"1a64de69",3976:"0e384e19",4064:"99009a21",4134:"393be207",4164:"bcee635f",4197:"a1ba6e62",4212:"621db11d",4416:"93f344c7",4424:"252a9097",4582:"14064408",4797:"4dd73b28",4813:"6875c492",5425:"479eb034",5430:"9796f4b8",5579:"6479fb86",5742:"aba21aa0",5801:"2c62ead1",6027:"1a4fe2b7",6054:"4137b431",6061:"1f391b9e",6063:"bd5b7851",6088:"df502808",6100:"176d210f",6267:"769c1945",6812:"bba9e323",6969:"14eb3368",7098:"a7bd4aaa",7290:"982cae12",7472:"814f3328",7508:"0a89f5c9",7609:"adb039a4",7643:"a6aa9e1f",7720:"fcf4f6ca",7795:"3c208a5b",7813:"74783256",7871:"be9e6e2d",8014:"9d13045e",8209:"01a85c17",8241:"4d1a2db0",8276:"c621f852",8401:"17896441",8439:"23a1b8fc",8465:"8ac6191a",8588:"67d4782a",8591:"ae7a6e8a",8593:"d01bc907",8643:"a2d4c71d",8933:"c7b64fcc",9048:"a94703ab",9095:"4b01b88a",9158:"616111d3",9197:"2d865531",9226:"6bb91276",9473:"ac51638e",9596:"0fff8dc8",9647:"5e95c892",9688:"bf2864cf",9795:"d9861b0f",9824:"8ea48c46",9858:"36994c47",9919:"0413d9af"}[e]||e)+"."+{9:"8290eece",74:"a496fe54",149:"dbcc9814",690:"22a78085",770:"ba3e9f5f",940:"43116f8b",1009:"53577092",1065:"880095c2",1235:"5f9bbb01",1405:"ad26fd04",1508:"ad4e065e",1537:"f10b075c",1606:"0800e671",1686:"852abb6b",1782:"fd1a89f8",1903:"4b2b5a9c",1915:"c80625fe",1964:"59b38fde",1965:"2ed3e1de",1966:
"6e21a8e8",1999:"ba88b6c0",2092:"a19d1bf1",2237:"bfceba09",2344:"ca3bb1d0",2379:"00b3a0ff",2576:"3e6bfee0",2634:"3bbeb6fa",2711:"342bf9bc",2771:"49541ad2",2951:"0ede45c0",3068:"e1234cf0",3212:"0061b133",3239:"48cf6bc2",3249:"471f68d9",3322:"b2123398",3645:"844e372c",3976:"cb894d32",4064:"1a57fa22",4134:"6e979fd2",4164:"b2209c62",4197:"00332bec",4212:"1a835b77",4416:"7cebeb9e",4424:"294bfc5c",4582:"9ca9709f",4797:"f4ce8cb6",4813:"1403aab6",5425:"fc01692f",5430:"34772e40",5579:"1b301bf1",5742:"ed09cce9",5801:"fc5c1b17",6027:"286ed6de",6054:"eda97697",6061:"4acd5995",6063:"1913daf9",6088:"a61c45a0",6100:"cd62be36",6267:"d427d253",6812:"c204228f",6870:"25f53758",6969:"60af715e",7098:"8da7b7a1",7290:"44c377b8",7472:"9351096c",7508:"190be82b",7518:"9525ffbe",7609:"88a13880",7643:"7671586a",7720:"8b12d88e",7795:"9fedc403",7813:"0fa34723",7871:"944ea2f0",8014:"3f255bd8",8209:"9618aedf",8241:"1e2d0e16",8276:"beef9a06",8401:"72377930",8439:"29176600",8465:"8ac511aa",8588:"96733ef0",8591:"58ed54b2",8593:"3a0113c2",8643:"c8f92a2a",8933:"70fc6828",9048:"3a38a667",9095:"b97bdb87",9158:"9f2925b1",9197:"30aab314",9226:"d5be09e8",9473:"0b4da379",9596:"70193857",9647:"6dda521c",9688:"6fc085c5",9795:"449cd5bc",9824:"119645ab",9858:"337a7516",9919:"fc3050c7"}[e]+".js",r.miniCssF=e=>{},r.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||new Function("return this")()}catch(e){if("object"==typeof window)return window}}(),r.o=(e,a)=>Object.prototype.hasOwnProperty.call(e,a),f={},d="docs:",r.l=(e,a,c,b)=>{if(f[e])f[e].push(a);else{var t,o;if(void 0!==c)for(var n=document.getElementsByTagName("script"),i=0;i{t.onerror=t.onload=null,clearTimeout(s);var d=f[e];if(delete f[e],t.parentNode&&t.parentNode.removeChild(t),d&&d.forEach(e=>e(c)),a)return a(c)},s=setTimeout(l.bind(null,void 0,{type:"timeout",target:t}),12e4);t.onerror=l.bind(null,t.onerror),t.onload=l.bind(null,t.onload),o&&document.head.appendChild(t)}},r.r=e=>{"undefined"!=typeof 
Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.p="/BharatMLStack/",r.gca=function(e){return e={14064408:"4582",17896441:"8401",74783256:"7813","845957d4":"9","340c7c5f":"74",e8321834:"149","3e1c5046":"690",aaabe254:"770","3980073a":"940","50899a24":"1009","44d1c015":"1065",a7456010:"1235",e66382f6:"1405","56eef1be":"1508","8dd2df60":"1537",d152284c:"1606","08daf6b6":"1686","3650a837":"1782",acecf23e:"1903",c4822c4f:"1915","4af50aac":"1964",b0267ac9:"1965","3039fa8c":"1966",f994c8da:"1999","0dae2a8b":"2092","4caa95bf":"2344","4df0e30b":"2379","2303959d":"2576",c4f5d8e4:"2634","9e4087bc":"2711",e8202a51:"2771","9aed321e":"2951",d853e668:"3068",c31e69d4:"3212","23d02069":"3239",ccc49370:"3249","7fa80e1c":"3322","1a64de69":"3645","0e384e19":"3976","99009a21":"4064","393be207":"4134",bcee635f:"4164",a1ba6e62:"4197","621db11d":"4212","93f344c7":"4416","252a9097":"4424","4dd73b28":"4797","6875c492":"4813","479eb034":"5425","9796f4b8":"5430","6479fb86":"5579",aba21aa0:"5742","2c62ead1":"5801","1a4fe2b7":"6027","4137b431":"6054","1f391b9e":"6061",bd5b7851:"6063",df502808:"6088","176d210f":"6100","769c1945":"6267",bba9e323:"6812","14eb3368":"6969",a7bd4aaa:"7098","982cae12":"7290","814f3328":"7472","0a89f5c9":"7508",adb039a4:"7609",a6aa9e1f:"7643",fcf4f6ca:"7720","3c208a5b":"7795",be9e6e2d:"7871","9d13045e":"8014","01a85c17":"8209","4d1a2db0":"8241",c621f852:"8276","23a1b8fc":"8439","8ac6191a":"8465","67d4782a":"8588",ae7a6e8a:"8591",d01bc907:"8593",a2d4c71d:"8643",c7b64fcc:"8933",a94703ab:"9048","4b01b88a":"9095","616111d3":"9158","2d865531":"9197","6bb91276":"9226",ac51638e:"9473","0fff8dc8":"9596","5e95c892":"9647",bf2864cf:"9688",d9861b0f:"9795","8ea48c46":"9824","36994c47":"9858","0413d9af":"9919"}[e]||e,r.p+r.u(e)},(()=>{var e={5354:0,1869:0};r.f.j=(a,c)=>{var f=r.o(e,a)?e[a]:void 0;if(0!==f)if(f)c.push(f[2]);else if(/^(1869|5354)$/.test(a))e[a]=0;else{var d=new 
Promise((c,d)=>f=e[a]=[c,d]);c.push(f[2]=d);var b=r.p+r.u(a),t=new Error;r.l(b,c=>{if(r.o(e,a)&&(0!==(f=e[a])&&(e[a]=void 0),f)){var d=c&&("load"===c.type?"missing":c.type),b=c&&c.target&&c.target.src;t.message="Loading chunk "+a+" failed.\n("+d+": "+b+")",t.name="ChunkLoadError",t.type=d,t.request=b,f[1](t)}},"chunk-"+a,a)}},r.O.j=a=>0===e[a];var a=(a,c)=>{var f,d,b=c[0],t=c[1],o=c[2],n=0;if(b.some(a=>0!==e[a])){for(f in t)r.o(t,f)&&(r.m[f]=t[f]);if(o)var i=o(r)}for(a&&a(c);n{"use strict";var e,a,c,f,d,b={},t={};function r(e){var a=t[e];if(void 0!==a)return a.exports;var c=t[e]={id:e,loaded:!1,exports:{}};return b[e].call(c.exports,c,c.exports,r),c.loaded=!0,c.exports}r.m=b,r.c=t,e=[],r.O=(a,c,f,d)=>{if(!c){var b=1/0;for(i=0;i=d)&&Object.keys(r.O).every(e=>r.O[e](c[o]))?c.splice(o--,1):(t=!1,d0&&e[i-1][2]>d;i--)e[i]=e[i-1];e[i]=[c,f,d]},r.n=e=>{var a=e&&e.__esModule?()=>e.default:()=>e;return r.d(a,{a:a}),a},c=Object.getPrototypeOf?e=>Object.getPrototypeOf(e):e=>e.__proto__,r.t=function(e,f){if(1&f&&(e=this(e)),8&f)return e;if("object"==typeof e&&e){if(4&f&&e.__esModule)return e;if(16&f&&"function"==typeof e.then)return e}var d=Object.create(null);r.r(d);var b={};a=a||[null,c({}),c([]),c(c)];for(var t=2&f&&e;("object"==typeof t||"function"==typeof t)&&!~a.indexOf(t);t=c(t))Object.getOwnPropertyNames(t).forEach(a=>b[a]=()=>e[a]);return b.default=()=>e,r.d(d,b),d},r.d=(e,a)=>{for(var c in 
a)r.o(a,c)&&!r.o(e,c)&&Object.defineProperty(e,c,{enumerable:!0,get:a[c]})},r.f={},r.e=e=>Promise.all(Object.keys(r.f).reduce((a,c)=>(r.f[c](e,a),a),[])),r.u=e=>"assets/js/"+({9:"845957d4",74:"340c7c5f",149:"e8321834",690:"3e1c5046",770:"aaabe254",940:"3980073a",1009:"50899a24",1065:"44d1c015",1235:"a7456010",1405:"e66382f6",1508:"56eef1be",1537:"8dd2df60",1606:"d152284c",1686:"08daf6b6",1782:"3650a837",1903:"acecf23e",1915:"c4822c4f",1964:"4af50aac",1965:"b0267ac9",1966:"3039fa8c",1999:"f994c8da",2092:"0dae2a8b",2344:"4caa95bf",2379:"4df0e30b",2576:"2303959d",2634:"c4f5d8e4",2711:"9e4087bc",2771:"e8202a51",2951:"9aed321e",3068:"d853e668",3212:"c31e69d4",3239:"23d02069",3249:"ccc49370",3322:"7fa80e1c",3645:"1a64de69",3976:"0e384e19",4064:"99009a21",4134:"393be207",4164:"bcee635f",4197:"a1ba6e62",4212:"621db11d",4416:"93f344c7",4424:"252a9097",4582:"14064408",4797:"4dd73b28",4813:"6875c492",5425:"479eb034",5430:"9796f4b8",5579:"6479fb86",5742:"aba21aa0",5801:"2c62ead1",6027:"1a4fe2b7",6054:"4137b431",6061:"1f391b9e",6063:"bd5b7851",6088:"df502808",6100:"176d210f",6267:"769c1945",6812:"bba9e323",6969:"14eb3368",7098:"a7bd4aaa",7290:"982cae12",7472:"814f3328",7508:"0a89f5c9",7609:"adb039a4",7643:"a6aa9e1f",7720:"fcf4f6ca",7795:"3c208a5b",7813:"74783256",7871:"be9e6e2d",8014:"9d13045e",8209:"01a85c17",8241:"4d1a2db0",8276:"c621f852",8401:"17896441",8439:"23a1b8fc",8465:"8ac6191a",8588:"67d4782a",8591:"ae7a6e8a",8593:"d01bc907",8643:"a2d4c71d",8933:"c7b64fcc",9048:"a94703ab",9095:"4b01b88a",9158:"616111d3",9197:"2d865531",9226:"6bb91276",9473:"ac51638e",9596:"0fff8dc8",9647:"5e95c892",9688:"bf2864cf",9795:"d9861b0f",9824:"8ea48c46",9858:"36994c47",9919:"0413d9af"}[e]||e)+"."+{9:"8290eece",74:"a496fe54",149:"dbcc9814",690:"22a78085",770:"ba3e9f5f",940:"43116f8b",1009:"53577092",1065:"880095c2",1235:"5f9bbb01",1405:"ad26fd04",1508:"ad4e065e",1537:"f10b075c",1606:"0800e671",1686:"852abb6b",1782:"fd1a89f8",1903:"4b2b5a9c",1915:"c80625fe",1964:"59b38fde",1965:"2ed3e1de",1966:
"6e21a8e8",1999:"ba88b6c0",2092:"a19d1bf1",2237:"bfceba09",2344:"ca3bb1d0",2379:"00b3a0ff",2576:"3e6bfee0",2634:"3bbeb6fa",2711:"342bf9bc",2771:"49541ad2",2951:"0ede45c0",3068:"e1234cf0",3212:"0061b133",3239:"48cf6bc2",3249:"471f68d9",3322:"b2123398",3645:"844e372c",3976:"cb894d32",4064:"1a57fa22",4134:"6e979fd2",4164:"b2209c62",4197:"00332bec",4212:"1a835b77",4416:"7cebeb9e",4424:"294bfc5c",4582:"9ca9709f",4797:"06495428",4813:"1403aab6",5425:"fc01692f",5430:"34772e40",5579:"431b9ea8",5742:"ed09cce9",5801:"fc5c1b17",6027:"286ed6de",6054:"eda97697",6061:"4acd5995",6063:"1913daf9",6088:"a61c45a0",6100:"cd62be36",6267:"d427d253",6812:"c204228f",6870:"25f53758",6969:"60af715e",7098:"8da7b7a1",7290:"44c377b8",7472:"9351096c",7508:"190be82b",7518:"9525ffbe",7609:"88a13880",7643:"7671586a",7720:"8b12d88e",7795:"9fedc403",7813:"0fa34723",7871:"944ea2f0",8014:"3f255bd8",8209:"9618aedf",8241:"1e2d0e16",8276:"beef9a06",8401:"72377930",8439:"29176600",8465:"8ac511aa",8588:"96733ef0",8591:"58ed54b2",8593:"3a0113c2",8643:"c8f92a2a",8933:"70fc6828",9048:"3a38a667",9095:"e818c95d",9158:"9f2925b1",9197:"30aab314",9226:"d5be09e8",9473:"0b4da379",9596:"70193857",9647:"6dda521c",9688:"6fc085c5",9795:"449cd5bc",9824:"119645ab",9858:"337a7516",9919:"fc3050c7"}[e]+".js",r.miniCssF=e=>{},r.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||new Function("return this")()}catch(e){if("object"==typeof window)return window}}(),r.o=(e,a)=>Object.prototype.hasOwnProperty.call(e,a),f={},d="docs:",r.l=(e,a,c,b)=>{if(f[e])f[e].push(a);else{var t,o;if(void 0!==c)for(var n=document.getElementsByTagName("script"),i=0;i{t.onerror=t.onload=null,clearTimeout(s);var d=f[e];if(delete f[e],t.parentNode&&t.parentNode.removeChild(t),d&&d.forEach(e=>e(c)),a)return a(c)},s=setTimeout(l.bind(null,void 0,{type:"timeout",target:t}),12e4);t.onerror=l.bind(null,t.onerror),t.onload=l.bind(null,t.onload),o&&document.head.appendChild(t)}},r.r=e=>{"undefined"!=typeof 
Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.p="/BharatMLStack/",r.gca=function(e){return e={14064408:"4582",17896441:"8401",74783256:"7813","845957d4":"9","340c7c5f":"74",e8321834:"149","3e1c5046":"690",aaabe254:"770","3980073a":"940","50899a24":"1009","44d1c015":"1065",a7456010:"1235",e66382f6:"1405","56eef1be":"1508","8dd2df60":"1537",d152284c:"1606","08daf6b6":"1686","3650a837":"1782",acecf23e:"1903",c4822c4f:"1915","4af50aac":"1964",b0267ac9:"1965","3039fa8c":"1966",f994c8da:"1999","0dae2a8b":"2092","4caa95bf":"2344","4df0e30b":"2379","2303959d":"2576",c4f5d8e4:"2634","9e4087bc":"2711",e8202a51:"2771","9aed321e":"2951",d853e668:"3068",c31e69d4:"3212","23d02069":"3239",ccc49370:"3249","7fa80e1c":"3322","1a64de69":"3645","0e384e19":"3976","99009a21":"4064","393be207":"4134",bcee635f:"4164",a1ba6e62:"4197","621db11d":"4212","93f344c7":"4416","252a9097":"4424","4dd73b28":"4797","6875c492":"4813","479eb034":"5425","9796f4b8":"5430","6479fb86":"5579",aba21aa0:"5742","2c62ead1":"5801","1a4fe2b7":"6027","4137b431":"6054","1f391b9e":"6061",bd5b7851:"6063",df502808:"6088","176d210f":"6100","769c1945":"6267",bba9e323:"6812","14eb3368":"6969",a7bd4aaa:"7098","982cae12":"7290","814f3328":"7472","0a89f5c9":"7508",adb039a4:"7609",a6aa9e1f:"7643",fcf4f6ca:"7720","3c208a5b":"7795",be9e6e2d:"7871","9d13045e":"8014","01a85c17":"8209","4d1a2db0":"8241",c621f852:"8276","23a1b8fc":"8439","8ac6191a":"8465","67d4782a":"8588",ae7a6e8a:"8591",d01bc907:"8593",a2d4c71d:"8643",c7b64fcc:"8933",a94703ab:"9048","4b01b88a":"9095","616111d3":"9158","2d865531":"9197","6bb91276":"9226",ac51638e:"9473","0fff8dc8":"9596","5e95c892":"9647",bf2864cf:"9688",d9861b0f:"9795","8ea48c46":"9824","36994c47":"9858","0413d9af":"9919"}[e]||e,r.p+r.u(e)},(()=>{var e={5354:0,1869:0};r.f.j=(a,c)=>{var f=r.o(e,a)?e[a]:void 0;if(0!==f)if(f)c.push(f[2]);else if(/^(1869|5354)$/.test(a))e[a]=0;else{var d=new 
Promise((c,d)=>f=e[a]=[c,d]);c.push(f[2]=d);var b=r.p+r.u(a),t=new Error;r.l(b,c=>{if(r.o(e,a)&&(0!==(f=e[a])&&(e[a]=void 0),f)){var d=c&&("load"===c.type?"missing":c.type),b=c&&c.target&&c.target.src;t.message="Loading chunk "+a+" failed.\n("+d+": "+b+")",t.name="ChunkLoadError",t.type=d,t.request=b,f[1](t)}},"chunk-"+a,a)}},r.O.j=a=>0===e[a];var a=(a,c)=>{var f,d,b=c[0],t=c[1],o=c[2],n=0;if(b.some(a=>0!==e[a])){for(f in t)r.o(t,f)&&(r.m[f]=t[f]);if(o)var i=o(r)}for(a&&a(c);n Archive | BharatMLStack - + diff --git a/docs/blog/atom.xml b/docs/blog/atom.xml index 347236ca..01db1346 100644 --- a/docs/blog/atom.xml +++ b/docs/blog/atom.xml @@ -14,10 +14,10 @@ 2026-02-19T00:00:00.000Z BharatMLStack -Every agent framework on the market will tell you their agents "have memory." What they mean is: they have a vector database.

-

They chunk text, embed it, store it, and retrieve whatever looks similar at query time. This works for document Q&A. It fails the moment you expect an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

-

We are trying to built something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving.

-

Then we tested it. The results surprised us.

+Agent memory has come a long way. Persistent context, vector retrieval, knowledge graphs — the building blocks are real and getting better fast.

+

But most of what we call "memory" today is still closer to search: chunk text, embed it, retrieve whatever looks similar at query time. That works well for recalling facts and preferences. It starts to break down when you need an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

+

We are trying to experiment with something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving. +Then we tested it. The results were interesting.

The Gap Nobody Talks About​

Here's a scenario every engineering team has encountered: AI agent hits a Redis connection pool exhaustion issue. It misdiagnoses it as a database problem. You correct it. Next week, a different service has the exact same failure pattern. The agent makes the exact same mistake.

Why? Because LLMs don't learn at inference time. Corrections adjust behavior within a conversation. Once the session ends, the lesson is gone. The model weights haven't changed. The next conversation starts from zero.

diff --git a/docs/blog/authors/index.html b/docs/blog/authors/index.html index c0e6b648..347173f8 100644 --- a/docs/blog/authors/index.html +++ b/docs/blog/authors/index.html @@ -5,7 +5,7 @@ Authors | BharatMLStack - + diff --git a/docs/blog/building-meeshos-mlplatform-lessons-from-first-gen/index.html b/docs/blog/building-meeshos-mlplatform-lessons-from-first-gen/index.html index 8bbc4634..94c130a3 100644 --- a/docs/blog/building-meeshos-mlplatform-lessons-from-first-gen/index.html +++ b/docs/blog/building-meeshos-mlplatform-lessons-from-first-gen/index.html @@ -5,7 +5,7 @@ Building Meesho’s ML Platform: Lessons from the First-Gen System (Part 2) | BharatMLStack - + diff --git a/docs/blog/building-meeshos-mlplatform/index.html b/docs/blog/building-meeshos-mlplatform/index.html index 054734ea..dd8b65b9 100644 --- a/docs/blog/building-meeshos-mlplatform/index.html +++ b/docs/blog/building-meeshos-mlplatform/index.html @@ -5,7 +5,7 @@ Building Meesho’s ML Platform: From Chaos to Cutting-Edge (Part 1) | BharatMLStack - + diff --git a/docs/blog/episodic-memory-for-agents/index.html b/docs/blog/episodic-memory-for-agents/index.html index 35cf3a5c..da6e2e68 100644 --- a/docs/blog/episodic-memory-for-agents/index.html +++ b/docs/blog/episodic-memory-for-agents/index.html @@ -5,7 +5,7 @@ Beyond Vector RAG: Building Agent Memory That Learns From Experience. | BharatMLStack - + @@ -13,10 +13,10 @@

Beyond Vector RAG: Building Agent Memory That Learns From Experience.

· 12 min read
Adarsha Das
Senior Architect @ Meesho

BharatMLStack -Every agent framework on the market will tell you their agents "have memory." What they mean is: they have a vector database.

-

They chunk text, embed it, store it, and retrieve whatever looks similar at query time. This works for document Q&A. It fails the moment you expect an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

-

We are trying to built something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving.

-

Then we tested it. The results surprised us.

+Agent memory has come a long way. Persistent context, vector retrieval, knowledge graphs — the building blocks are real and getting better fast.

+

But most of what we call "memory" today is still closer to search: chunk text, embed it, retrieve whatever looks similar at query time. That works well for recalling facts and preferences. It starts to break down when you need an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

+

We are trying to experiment with something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving. +Then we tested it. The results were interesting.

The Gap Nobody Talks About​

Here's a scenario every engineering team has encountered: AI agent hits a Redis connection pool exhaustion issue. It misdiagnoses it as a database problem. You correct it. Next week, a different service has the exact same failure pattern. The agent makes the exact same mistake.

Why? Because LLMs don't learn at inference time. Corrections adjust behavior within a conversation. Once the session ends, the lesson is gone. The model weights haven't changed. The next conversation starts from zero.

diff --git a/docs/blog/index.html b/docs/blog/index.html index e168f16f..3a5bed61 100644 --- a/docs/blog/index.html +++ b/docs/blog/index.html @@ -5,7 +5,7 @@ Blog | BharatMLStack - + @@ -13,10 +13,10 @@

Beyond Vector RAG: Building Agent Memory That Learns From Experience.

· 12 min read
Adarsha Das
Senior Architect @ Meesho

BharatMLStack -Every agent framework on the market will tell you their agents "have memory." What they mean is: they have a vector database.

-

They chunk text, embed it, store it, and retrieve whatever looks similar at query time. This works for document Q&A. It fails the moment you expect an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

-

We are trying to built something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving.

-

Then we tested it. The results surprised us.

LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale

· 5 min read
Jaya Kumar
Lead ML Engineer @ Meesho

BharatMLStack +Agent memory has come a long way. Persistent context, vector retrieval, knowledge graphs — the building blocks are real and getting better fast.

+

But most of what we call "memory" today is still closer to search: chunk text, embed it, retrieve whatever looks similar at query time. That works well for recalling facts and preferences. It starts to break down when you need an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

+

We are trying to experiment with something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving. +Then we tested it. The results were interesting.

LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale

· 5 min read
Jaya Kumar
Lead ML Engineer @ Meesho

BharatMLStack Raw execution of Large Language Models is inherently expensive and memory-intensive. To achieve sub-second latency and high throughput, we implement a multi-layered optimization strategy that targets the entire inference stack—from memory management to kernel execution.

1. Advanced Memory Management: Paged & Prefix KV Caching​

The most significant bottleneck in LLM inference is not always compute, but memory bandwidth—specifically managing the Key-Value (KV) cache.

diff --git a/docs/blog/llm-inference-optimization-sub-sec-latency/index.html b/docs/blog/llm-inference-optimization-sub-sec-latency/index.html index e6b17871..0ff7ad94 100644 --- a/docs/blog/llm-inference-optimization-sub-sec-latency/index.html +++ b/docs/blog/llm-inference-optimization-sub-sec-latency/index.html @@ -5,7 +5,7 @@ LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale | BharatMLStack - + diff --git a/docs/blog/multi-engine-llm-inferencing-platform/index.html b/docs/blog/multi-engine-llm-inferencing-platform/index.html index 29a14417..1401ca41 100644 --- a/docs/blog/multi-engine-llm-inferencing-platform/index.html +++ b/docs/blog/multi-engine-llm-inferencing-platform/index.html @@ -5,7 +5,7 @@ Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving | BharatMLStack - + diff --git a/docs/blog/rss.xml b/docs/blog/rss.xml index f9e5f0c9..0b795f1e 100644 --- a/docs/blog/rss.xml +++ b/docs/blog/rss.xml @@ -15,10 +15,10 @@ Thu, 19 Feb 2026 00:00:00 GMT BharatMLStack -Every agent framework on the market will tell you their agents "have memory." What they mean is: they have a vector database.

-

They chunk text, embed it, store it, and retrieve whatever looks similar at query time. This works for document Q&A. It fails the moment you expect an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

-

We are trying to built something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving.

-

Then we tested it. The results surprised us.

+Agent memory has come a long way. Persistent context, vector retrieval, knowledge graphs — the building blocks are real and getting better fast.

+

But most of what we call "memory" today is still closer to search: chunk text, embed it, retrieve whatever looks similar at query time. That works well for recalling facts and preferences. It starts to break down when you need an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

+

We are trying to experiment with something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving. +Then we tested it. The results were interesting.

The Gap Nobody Talks About​

Here's a scenario every engineering team has encountered: AI agent hits a Redis connection pool exhaustion issue. It misdiagnoses it as a database problem. You correct it. Next week, a different service has the exact same failure pattern. The agent makes the exact same mistake.

Why? Because LLMs don't learn at inference time. Corrections adjust behavior within a conversation. Once the session ends, the lesson is gone. The model weights haven't changed. The next conversation starts from zero.

diff --git a/docs/blog/scaling-model-inference-and-embedding-search/index.html b/docs/blog/scaling-model-inference-and-embedding-search/index.html index 62643c89..a5da21ff 100644 --- a/docs/blog/scaling-model-inference-and-embedding-search/index.html +++ b/docs/blog/scaling-model-inference-and-embedding-search/index.html @@ -5,7 +5,7 @@ Cracking the Code: Scaling Model Inference & Real-Time Embedding Search | BharatMLStack - + diff --git a/docs/blog/tags/ai-agents/index.html b/docs/blog/tags/ai-agents/index.html index 744897c3..c1dd3fa2 100644 --- a/docs/blog/tags/ai-agents/index.html +++ b/docs/blog/tags/ai-agents/index.html @@ -5,7 +5,7 @@ One post tagged with "ai-agents" | BharatMLStack - + @@ -13,9 +13,9 @@

One post tagged with "ai-agents"

View All Tags

Beyond Vector RAG: Building Agent Memory That Learns From Experience.

· 12 min read
Adarsha Das
Senior Architect @ Meesho

BharatMLStack -Every agent framework on the market will tell you their agents "have memory." What they mean is: they have a vector database.

-

They chunk text, embed it, store it, and retrieve whatever looks similar at query time. This works for document Q&A. It fails the moment you expect an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

-

We are trying to built something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving.

-

Then we tested it. The results surprised us.

+Agent memory has come a long way. Persistent context, vector retrieval, knowledge graphs — the building blocks are real and getting better fast.

+

But most of what we call "memory" today is still closer to search: chunk text, embed it, retrieve whatever looks similar at query time. That works well for recalling facts and preferences. It starts to break down when you need an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

+

We are trying to experiment with something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving. +Then we tested it. The results were interesting.

\ No newline at end of file diff --git a/docs/blog/tags/architecture/index.html b/docs/blog/tags/architecture/index.html index 07604d11..817be7a7 100644 --- a/docs/blog/tags/architecture/index.html +++ b/docs/blog/tags/architecture/index.html @@ -5,7 +5,7 @@ One post tagged with "architecture" | BharatMLStack - + @@ -13,9 +13,9 @@

One post tagged with "architecture"

View All Tags

Beyond Vector RAG: Building Agent Memory That Learns From Experience.

· 12 min read
Adarsha Das
Senior Architect @ Meesho

BharatMLStack -Every agent framework on the market will tell you their agents "have memory." What they mean is: they have a vector database.

-

They chunk text, embed it, store it, and retrieve whatever looks similar at query time. This works for document Q&A. It fails the moment you expect an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

-

We are trying to built something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving.

-

Then we tested it. The results surprised us.

+Agent memory has come a long way. Persistent context, vector retrieval, knowledge graphs — the building blocks are real and getting better fast.

+

But most of what we call "memory" today is still closer to search: chunk text, embed it, retrieve whatever looks similar at query time. That works well for recalling facts and preferences. It starts to break down when you need an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

+

We are trying to experiment with something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving. +Then we tested it. The results were interesting.

\ No newline at end of file diff --git a/docs/blog/tags/bharatmlstack/index.html b/docs/blog/tags/bharatmlstack/index.html index 5e4387f7..867a1596 100644 --- a/docs/blog/tags/bharatmlstack/index.html +++ b/docs/blog/tags/bharatmlstack/index.html @@ -5,7 +5,7 @@ 4 posts tagged with "bharatmlstack" | BharatMLStack - + diff --git a/docs/blog/tags/embedding-search/index.html b/docs/blog/tags/embedding-search/index.html index 4582195d..de433cf7 100644 --- a/docs/blog/tags/embedding-search/index.html +++ b/docs/blog/tags/embedding-search/index.html @@ -5,7 +5,7 @@ One post tagged with "embedding-search" | BharatMLStack - + diff --git a/docs/blog/tags/episodic-memory/index.html b/docs/blog/tags/episodic-memory/index.html index 7907450c..8dd34da7 100644 --- a/docs/blog/tags/episodic-memory/index.html +++ b/docs/blog/tags/episodic-memory/index.html @@ -5,7 +5,7 @@ One post tagged with "episodic-memory" | BharatMLStack - + @@ -13,9 +13,9 @@

One post tagged with "episodic-memory"

View All Tags

Beyond Vector RAG: Building Agent Memory That Learns From Experience.

· 12 min read
Adarsha Das
Senior Architect @ Meesho

BharatMLStack -Every agent framework on the market will tell you their agents "have memory." What they mean is: they have a vector database.

-

They chunk text, embed it, store it, and retrieve whatever looks similar at query time. This works for document Q&A. It fails the moment you expect an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

-

We are trying to built something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving.

-

Then we tested it. The results surprised us.

+Agent memory has come a long way. Persistent context, vector retrieval, knowledge graphs — the building blocks are real and getting better fast.

+

But most of what we call "memory" today is still closer to search: chunk text, embed it, retrieve whatever looks similar at query time. That works well for recalling facts and preferences. It starts to break down when you need an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

+

We are trying to experiment with something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving. +Then we tested it. The results were interesting.

\ No newline at end of file diff --git a/docs/blog/tags/index.html b/docs/blog/tags/index.html index 66343915..076ffac2 100644 --- a/docs/blog/tags/index.html +++ b/docs/blog/tags/index.html @@ -5,7 +5,7 @@ Tags | BharatMLStack - + diff --git a/docs/blog/tags/inferflow/index.html b/docs/blog/tags/inferflow/index.html index ab6147b4..f7cfe9b8 100644 --- a/docs/blog/tags/inferflow/index.html +++ b/docs/blog/tags/inferflow/index.html @@ -5,7 +5,7 @@ One post tagged with "inferflow" | BharatMLStack - + diff --git a/docs/blog/tags/interaction-store/index.html b/docs/blog/tags/interaction-store/index.html index 9f01cc0a..58804adf 100644 --- a/docs/blog/tags/interaction-store/index.html +++ b/docs/blog/tags/interaction-store/index.html @@ -5,7 +5,7 @@ 2 posts tagged with "interaction-store" | BharatMLStack - + diff --git a/docs/blog/tags/llm/index.html b/docs/blog/tags/llm/index.html index f4a09331..81eeea46 100644 --- a/docs/blog/tags/llm/index.html +++ b/docs/blog/tags/llm/index.html @@ -5,7 +5,7 @@ 3 posts tagged with "llm" | BharatMLStack - + @@ -13,10 +13,10 @@

3 posts tagged with "llm"

View All Tags

Beyond Vector RAG: Building Agent Memory That Learns From Experience.

· 12 min read
Adarsha Das
Senior Architect @ Meesho

BharatMLStack -Every agent framework on the market will tell you their agents "have memory." What they mean is: they have a vector database.

-

They chunk text, embed it, store it, and retrieve whatever looks similar at query time. This works for document Q&A. It fails the moment you expect an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

-

We are trying to built something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving.

-

Then we tested it. The results surprised us.

LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale

· 5 min read
Jaya Kumar
Lead ML Engineer @ Meesho

BharatMLStack +Agent memory has come a long way. Persistent context, vector retrieval, knowledge graphs — the building blocks are real and getting better fast.

+

But most of what we call "memory" today is still closer to search: chunk text, embed it, retrieve whatever looks similar at query time. That works well for recalling facts and preferences. It starts to break down when you need an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

+

We are trying to experiment with something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving. +Then we tested it. The results were interesting.

LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale

· 5 min read
Jaya Kumar
Lead ML Engineer @ Meesho

BharatMLStack Raw execution of Large Language Models is inherently expensive and memory-intensive. To achieve sub-second latency and high throughput, we implement a multi-layered optimization strategy that targets the entire inference stack—from memory management to kernel execution.

1. Advanced Memory Management: Paged & Prefix KV Caching​

The most significant bottleneck in LLM inference is not always compute, but memory bandwidth—specifically managing the Key-Value (KV) cache.

diff --git a/docs/blog/tags/meesho/index.html b/docs/blog/tags/meesho/index.html index c9d0a9e7..a6e3d5f9 100644 --- a/docs/blog/tags/meesho/index.html +++ b/docs/blog/tags/meesho/index.html @@ -5,7 +5,7 @@ 5 posts tagged with "meesho" | BharatMLStack - + diff --git a/docs/blog/tags/memory/index.html b/docs/blog/tags/memory/index.html index 1b9d91ac..df8215d6 100644 --- a/docs/blog/tags/memory/index.html +++ b/docs/blog/tags/memory/index.html @@ -5,7 +5,7 @@ One post tagged with "memory" | BharatMLStack - + @@ -13,9 +13,9 @@

One post tagged with "memory"

View All Tags

Beyond Vector RAG: Building Agent Memory That Learns From Experience.

· 12 min read
Adarsha Das
Senior Architect @ Meesho

BharatMLStack -Every agent framework on the market will tell you their agents "have memory." What they mean is: they have a vector database.

-

They chunk text, embed it, store it, and retrieve whatever looks similar at query time. This works for document Q&A. It fails the moment you expect an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

-

We are trying to built something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving.

-

Then we tested it. The results surprised us.

+Agent memory has come a long way. Persistent context, vector retrieval, knowledge graphs — the building blocks are real and getting better fast.

+

But most of what we call "memory" today is still closer to search: chunk text, embed it, retrieve whatever looks similar at query time. That works well for recalling facts and preferences. It starts to break down when you need an agent to recall what happened last time, learn from a mistake, or avoid repeating a failed approach.

+

We are trying to experiment with something different. An episodic memory system where a frozen LLM — same weights, no retraining — produces increasingly better decisions over time because the memory feeding it context is continuously evolving. +Then we tested it. The results were interesting.

\ No newline at end of file diff --git a/docs/blog/tags/mlplatform/index.html b/docs/blog/tags/mlplatform/index.html index 9199a6a0..55ef3368 100644 --- a/docs/blog/tags/mlplatform/index.html +++ b/docs/blog/tags/mlplatform/index.html @@ -5,7 +5,7 @@ 5 posts tagged with "mlplatform" | BharatMLStack - + diff --git a/docs/blog/tags/model-inference/index.html b/docs/blog/tags/model-inference/index.html index ab224ac0..ebe156ec 100644 --- a/docs/blog/tags/model-inference/index.html +++ b/docs/blog/tags/model-inference/index.html @@ -5,7 +5,7 @@ One post tagged with "model-inference" | BharatMLStack - + diff --git a/docs/blog/tags/online-feature-store/index.html b/docs/blog/tags/online-feature-store/index.html index e1e61ece..4174fd89 100644 --- a/docs/blog/tags/online-feature-store/index.html +++ b/docs/blog/tags/online-feature-store/index.html @@ -5,7 +5,7 @@ One post tagged with "online-feature-store" | BharatMLStack - + diff --git a/docs/blog/tags/tensorrt-llm/index.html b/docs/blog/tags/tensorrt-llm/index.html index bc3c7b7e..40061f44 100644 --- a/docs/blog/tags/tensorrt-llm/index.html +++ b/docs/blog/tags/tensorrt-llm/index.html @@ -5,7 +5,7 @@ 2 posts tagged with "tensorrt-llm" | BharatMLStack - + diff --git a/docs/blog/tags/vllm/index.html b/docs/blog/tags/vllm/index.html index 1f86cd12..e0924c19 100644 --- a/docs/blog/tags/vllm/index.html +++ b/docs/blog/tags/vllm/index.html @@ -5,7 +5,7 @@ 2 posts tagged with "vllm" | BharatMLStack - + diff --git a/docs/category/go-sdk/index.html b/docs/category/go-sdk/index.html index d824d424..7fc9cba0 100644 --- a/docs/category/go-sdk/index.html +++ b/docs/category/go-sdk/index.html @@ -5,7 +5,7 @@ Go SDK | BharatMLStack - + diff --git a/docs/category/inferflow/index.html b/docs/category/inferflow/index.html index 345bb8f9..40a09723 100644 --- a/docs/category/inferflow/index.html +++ b/docs/category/inferflow/index.html @@ -5,7 +5,7 @@ Inferflow | BharatMLStack - + diff --git a/docs/category/numerix/index.html 
b/docs/category/numerix/index.html index 053f015c..f913409f 100644 --- a/docs/category/numerix/index.html +++ b/docs/category/numerix/index.html @@ -5,7 +5,7 @@ Numerix | BharatMLStack - + diff --git a/docs/category/online-feature-store/index.html b/docs/category/online-feature-store/index.html index 1bf27dc8..ee8747e9 100644 --- a/docs/category/online-feature-store/index.html +++ b/docs/category/online-feature-store/index.html @@ -5,7 +5,7 @@ Online Feature Store | BharatMLStack - + diff --git a/docs/category/predator/index.html b/docs/category/predator/index.html index 9bf0be14..de730787 100644 --- a/docs/category/predator/index.html +++ b/docs/category/predator/index.html @@ -5,7 +5,7 @@ Predator | BharatMLStack - + diff --git a/docs/category/python-sdk/index.html b/docs/category/python-sdk/index.html index 4ec838ba..83f5e379 100644 --- a/docs/category/python-sdk/index.html +++ b/docs/category/python-sdk/index.html @@ -5,7 +5,7 @@ Python SDK | BharatMLStack - + diff --git a/docs/category/quick-start/index.html b/docs/category/quick-start/index.html index e560631d..6f09dac7 100644 --- a/docs/category/quick-start/index.html +++ b/docs/category/quick-start/index.html @@ -5,7 +5,7 @@ Quick Start | BharatMLStack - + diff --git a/docs/category/sdks/index.html b/docs/category/sdks/index.html index 54a55146..6a6d8bc6 100644 --- a/docs/category/sdks/index.html +++ b/docs/category/sdks/index.html @@ -5,7 +5,7 @@ SDKs | BharatMLStack - + diff --git a/docs/category/skye/index.html b/docs/category/skye/index.html index 9283b4dc..60187e30 100644 --- a/docs/category/skye/index.html +++ b/docs/category/skye/index.html @@ -5,7 +5,7 @@ Skye | BharatMLStack - + diff --git a/docs/category/trufflebox-ui/index.html b/docs/category/trufflebox-ui/index.html index e8bbd726..2eddcd14 100644 --- a/docs/category/trufflebox-ui/index.html +++ b/docs/category/trufflebox-ui/index.html @@ -5,7 +5,7 @@ Trufflebox UI | BharatMLStack - + diff --git a/docs/index.html b/docs/index.html index 
025eb8b9..e4a27af4 100644 --- a/docs/index.html +++ b/docs/index.html @@ -5,7 +5,7 @@ BharatMLStack - Open Source ML Infrastructure | BharatMLStack - + diff --git a/docs/inferflow/v1.0.0/architecture/index.html b/docs/inferflow/v1.0.0/architecture/index.html index cf83f705..114b43d4 100644 --- a/docs/inferflow/v1.0.0/architecture/index.html +++ b/docs/inferflow/v1.0.0/architecture/index.html @@ -5,7 +5,7 @@ Architecture | BharatMLStack - + diff --git a/docs/inferflow/v1.0.0/configuration/index.html b/docs/inferflow/v1.0.0/configuration/index.html index c9c96569..c59f82fe 100644 --- a/docs/inferflow/v1.0.0/configuration/index.html +++ b/docs/inferflow/v1.0.0/configuration/index.html @@ -5,7 +5,7 @@ Configuration Guide | BharatMLStack - + diff --git a/docs/inferflow/v1.0.0/functionalities/index.html b/docs/inferflow/v1.0.0/functionalities/index.html index cadf73eb..0bc433fe 100644 --- a/docs/inferflow/v1.0.0/functionalities/index.html +++ b/docs/inferflow/v1.0.0/functionalities/index.html @@ -5,7 +5,7 @@ Key Functionalities | BharatMLStack - + diff --git a/docs/inferflow/v1.0.0/index.html b/docs/inferflow/v1.0.0/index.html index 79d40418..369cf8c3 100644 --- a/docs/inferflow/v1.0.0/index.html +++ b/docs/inferflow/v1.0.0/index.html @@ -5,7 +5,7 @@ v1.0.0 | BharatMLStack - + diff --git a/docs/inferflow/v1.0.0/release-notes/index.html b/docs/inferflow/v1.0.0/release-notes/index.html index 7e838dbf..2230bb7a 100644 --- a/docs/inferflow/v1.0.0/release-notes/index.html +++ b/docs/inferflow/v1.0.0/release-notes/index.html @@ -5,7 +5,7 @@ Release Notes | BharatMLStack - + diff --git a/docs/intro/index.html b/docs/intro/index.html index 86049f63..47c4a121 100644 --- a/docs/intro/index.html +++ b/docs/intro/index.html @@ -5,7 +5,7 @@ BharatMLStack Documentation | BharatMLStack - + diff --git a/docs/markdown-page/index.html b/docs/markdown-page/index.html index 1c9be3fa..efd2be04 100644 --- a/docs/markdown-page/index.html +++ b/docs/markdown-page/index.html @@ -5,7 +5,7 @@ 
Markdown page example | BharatMLStack - + diff --git a/docs/numerix/v1.0.0/architecture/index.html b/docs/numerix/v1.0.0/architecture/index.html index 8bfaedb9..eddc2c8a 100644 --- a/docs/numerix/v1.0.0/architecture/index.html +++ b/docs/numerix/v1.0.0/architecture/index.html @@ -5,7 +5,7 @@ Architecture | BharatMLStack - + diff --git a/docs/numerix/v1.0.0/benchmarks/index.html b/docs/numerix/v1.0.0/benchmarks/index.html index c162c884..0a893dac 100644 --- a/docs/numerix/v1.0.0/benchmarks/index.html +++ b/docs/numerix/v1.0.0/benchmarks/index.html @@ -5,7 +5,7 @@ Benchmarks | BharatMLStack - + diff --git a/docs/numerix/v1.0.0/functionalities/index.html b/docs/numerix/v1.0.0/functionalities/index.html index 3379765f..3fadd62a 100644 --- a/docs/numerix/v1.0.0/functionalities/index.html +++ b/docs/numerix/v1.0.0/functionalities/index.html @@ -5,7 +5,7 @@ Key Functionalities | BharatMLStack - + diff --git a/docs/numerix/v1.0.0/index.html b/docs/numerix/v1.0.0/index.html index bd28189e..4db9a0cd 100644 --- a/docs/numerix/v1.0.0/index.html +++ b/docs/numerix/v1.0.0/index.html @@ -5,7 +5,7 @@ v1.0.0 | BharatMLStack - + diff --git a/docs/numerix/v1.0.0/release-notes/index.html b/docs/numerix/v1.0.0/release-notes/index.html index 37e44b06..7e9490a7 100644 --- a/docs/numerix/v1.0.0/release-notes/index.html +++ b/docs/numerix/v1.0.0/release-notes/index.html @@ -5,7 +5,7 @@ Release Notes | BharatMLStack - + diff --git a/docs/online-feature-store/v1.0.0/architecture/index.html b/docs/online-feature-store/v1.0.0/architecture/index.html index ec1881e0..54c81159 100644 --- a/docs/online-feature-store/v1.0.0/architecture/index.html +++ b/docs/online-feature-store/v1.0.0/architecture/index.html @@ -5,7 +5,7 @@ Architecture | BharatMLStack - + diff --git a/docs/online-feature-store/v1.0.0/benchmarks/index.html b/docs/online-feature-store/v1.0.0/benchmarks/index.html index 711b68ea..adbf2dbf 100644 --- a/docs/online-feature-store/v1.0.0/benchmarks/index.html +++ 
b/docs/online-feature-store/v1.0.0/benchmarks/index.html @@ -5,7 +5,7 @@ Benchmarks | BharatMLStack - + diff --git a/docs/online-feature-store/v1.0.0/data-formats/index.html b/docs/online-feature-store/v1.0.0/data-formats/index.html index 57ba7dbd..c7d2612f 100644 --- a/docs/online-feature-store/v1.0.0/data-formats/index.html +++ b/docs/online-feature-store/v1.0.0/data-formats/index.html @@ -5,7 +5,7 @@ Data Formats | BharatMLStack - + diff --git a/docs/online-feature-store/v1.0.0/functionalities/index.html b/docs/online-feature-store/v1.0.0/functionalities/index.html index 67ef8ef9..a76199e9 100644 --- a/docs/online-feature-store/v1.0.0/functionalities/index.html +++ b/docs/online-feature-store/v1.0.0/functionalities/index.html @@ -5,7 +5,7 @@ Key Functionalities | BharatMLStack - + diff --git a/docs/online-feature-store/v1.0.0/index.html b/docs/online-feature-store/v1.0.0/index.html index 31ed313e..db0c938d 100644 --- a/docs/online-feature-store/v1.0.0/index.html +++ b/docs/online-feature-store/v1.0.0/index.html @@ -5,7 +5,7 @@ v1.0.0 | BharatMLStack - + diff --git a/docs/online-feature-store/v1.0.0/release-notes/index.html b/docs/online-feature-store/v1.0.0/release-notes/index.html index ea73db0d..9acb4730 100644 --- a/docs/online-feature-store/v1.0.0/release-notes/index.html +++ b/docs/online-feature-store/v1.0.0/release-notes/index.html @@ -5,7 +5,7 @@ Release Notes | BharatMLStack - + diff --git a/docs/predator/v1.0.0/architecture/index.html b/docs/predator/v1.0.0/architecture/index.html index ef180095..c4bdfde5 100644 --- a/docs/predator/v1.0.0/architecture/index.html +++ b/docs/predator/v1.0.0/architecture/index.html @@ -5,7 +5,7 @@ Architecture | BharatMLStack - + diff --git a/docs/predator/v1.0.0/functionalities/index.html b/docs/predator/v1.0.0/functionalities/index.html index 62cb6ec1..f09d3af8 100644 --- a/docs/predator/v1.0.0/functionalities/index.html +++ b/docs/predator/v1.0.0/functionalities/index.html @@ -5,7 +5,7 @@ Key Functionalities | 
BharatMLStack - + diff --git a/docs/predator/v1.0.0/index.html b/docs/predator/v1.0.0/index.html index 2f201e7b..b0d0ab02 100644 --- a/docs/predator/v1.0.0/index.html +++ b/docs/predator/v1.0.0/index.html @@ -5,7 +5,7 @@ v1.0.0 | BharatMLStack - + diff --git a/docs/predator/v1.0.0/release-notes/index.html b/docs/predator/v1.0.0/release-notes/index.html index 7415b2de..01d3d9e8 100644 --- a/docs/predator/v1.0.0/release-notes/index.html +++ b/docs/predator/v1.0.0/release-notes/index.html @@ -5,7 +5,7 @@ Release Notes | BharatMLStack - + diff --git a/docs/quick-start/v1.0.0/index.html b/docs/quick-start/v1.0.0/index.html index a5b1f2da..8e9988b0 100644 --- a/docs/quick-start/v1.0.0/index.html +++ b/docs/quick-start/v1.0.0/index.html @@ -5,7 +5,7 @@ v1.0.0 | BharatMLStack - + diff --git a/docs/quick-start/v1.0.0/quick-start/index.html b/docs/quick-start/v1.0.0/quick-start/index.html index dff64b4e..03a3e2f9 100644 --- a/docs/quick-start/v1.0.0/quick-start/index.html +++ b/docs/quick-start/v1.0.0/quick-start/index.html @@ -5,7 +5,7 @@ Quick Start | BharatMLStack - + diff --git a/docs/sdks/go/v1.0.0/feature_client/index.html b/docs/sdks/go/v1.0.0/feature_client/index.html index 0acab76b..52e0d956 100644 --- a/docs/sdks/go/v1.0.0/feature_client/index.html +++ b/docs/sdks/go/v1.0.0/feature_client/index.html @@ -5,7 +5,7 @@ GRPC Feature client | BharatMLStack - + diff --git a/docs/sdks/go/v1.0.0/index.html b/docs/sdks/go/v1.0.0/index.html index b81fc11e..76d6b09f 100644 --- a/docs/sdks/go/v1.0.0/index.html +++ b/docs/sdks/go/v1.0.0/index.html @@ -5,7 +5,7 @@ v1.0.0 | BharatMLStack - + diff --git a/docs/sdks/python/v1.0.0/grpc_feature_client/index.html b/docs/sdks/python/v1.0.0/grpc_feature_client/index.html index 56a3489c..5d93465a 100644 --- a/docs/sdks/python/v1.0.0/grpc_feature_client/index.html +++ b/docs/sdks/python/v1.0.0/grpc_feature_client/index.html @@ -5,7 +5,7 @@ GRPC Feature client | BharatMLStack - + diff --git a/docs/sdks/python/v1.0.0/index.html 
b/docs/sdks/python/v1.0.0/index.html index a63b30ca..ae05960b 100644 --- a/docs/sdks/python/v1.0.0/index.html +++ b/docs/sdks/python/v1.0.0/index.html @@ -5,7 +5,7 @@ v1.0.0 | BharatMLStack - + diff --git a/docs/sdks/python/v1.0.0/spark_feature_push_client/index.html b/docs/sdks/python/v1.0.0/spark_feature_push_client/index.html index 7e3c86ff..ea9238f9 100644 --- a/docs/sdks/python/v1.0.0/spark_feature_push_client/index.html +++ b/docs/sdks/python/v1.0.0/spark_feature_push_client/index.html @@ -5,7 +5,7 @@ Spark client | BharatMLStack - + diff --git a/docs/skye/v1.0.0/architecture/index.html b/docs/skye/v1.0.0/architecture/index.html index 6cff5c7d..398a0d1d 100644 --- a/docs/skye/v1.0.0/architecture/index.html +++ b/docs/skye/v1.0.0/architecture/index.html @@ -5,7 +5,7 @@ Architecture | BharatMLStack - + diff --git a/docs/skye/v1.0.0/functionalities/index.html b/docs/skye/v1.0.0/functionalities/index.html index 68fc8abf..7f233199 100644 --- a/docs/skye/v1.0.0/functionalities/index.html +++ b/docs/skye/v1.0.0/functionalities/index.html @@ -5,7 +5,7 @@ Functionalities | BharatMLStack - + diff --git a/docs/skye/v1.0.0/index.html b/docs/skye/v1.0.0/index.html index c6c19d38..347bb9ee 100644 --- a/docs/skye/v1.0.0/index.html +++ b/docs/skye/v1.0.0/index.html @@ -5,7 +5,7 @@ v1.0.0 | BharatMLStack - + diff --git a/docs/skye/v1.0.0/release-notes/index.html b/docs/skye/v1.0.0/release-notes/index.html index c393e684..b51b7ac8 100644 --- a/docs/skye/v1.0.0/release-notes/index.html +++ b/docs/skye/v1.0.0/release-notes/index.html @@ -5,7 +5,7 @@ Release Notes | BharatMLStack - + diff --git a/docs/trufflebox-ui/v1.0.0/index.html b/docs/trufflebox-ui/v1.0.0/index.html index 85da5a7b..13a938ff 100644 --- a/docs/trufflebox-ui/v1.0.0/index.html +++ b/docs/trufflebox-ui/v1.0.0/index.html @@ -5,7 +5,7 @@ v1.0.0 | BharatMLStack - + diff --git a/docs/trufflebox-ui/v1.0.0/userguide/index.html b/docs/trufflebox-ui/v1.0.0/userguide/index.html index 11eff374..24898469 100644 --- 
a/docs/trufflebox-ui/v1.0.0/userguide/index.html +++ b/docs/trufflebox-ui/v1.0.0/userguide/index.html @@ -5,7 +5,7 @@ User Manual | BharatMLStack - +