From c782ccdcadf9123f6cf90bdfac4cd878c36d63fd Mon Sep 17 00:00:00 2001 From: a0d00kc Date: Sat, 14 Feb 2026 22:20:00 +0530 Subject: [PATCH 01/34] better site --- docs-src/docusaurus.config.js | 4 + docs-src/src/css/custom.css | 701 ++++++++++-- docs-src/src/pages/index.js | 638 ++++++++--- docs-src/src/pages/index.module.css | 999 ++++++++++++++++-- docs-src/src/theme/Root.js | 14 + docs/404.html | 8 +- ...tyles.14b2d0af.css => styles.030f898a.css} | 2 +- docs/assets/js/3aeb33c7.dbef3914.js | 1 + docs/assets/js/428aafcc.1b6a0a9c.js | 1 + docs/assets/js/428aafcc.2c1db158.js | 1 - docs/assets/js/6479fb86.3f75012c.js | 1 + docs/assets/js/6479fb86.96631f8d.js | 1 - docs/assets/js/79ae4ea7.1416ba4f.js | 1 - docs/assets/js/79ae4ea7.1af179c5.js | 1 + docs/assets/js/814f3328.189ef834.js | 1 + docs/assets/js/814f3328.bfb123e8.js | 1 - docs/assets/js/8cdb4121.2549a6bf.js | 1 + docs/assets/js/c4f5d8e4.41d5b3c8.js | 1 - docs/assets/js/c4f5d8e4.e88e308f.js | 1 + docs/assets/js/f9755c6e.8811662b.js | 1 - ...b33c7.b4a8c40f.js => f9755c6e.ac48cb60.js} | 2 +- docs/assets/js/fccc4c42.4690f84a.js | 1 - docs/assets/js/fccc4c42.796edc5f.js | 1 + .../js/{main.3e15e71d.js => main.5b79a858.js} | 4 +- ...CENSE.txt => main.5b79a858.js.LICENSE.txt} | 0 docs/assets/js/runtime~main.2b9c67a0.js | 1 + docs/assets/js/runtime~main.d5e46064.js | 1 - docs/blog/archive/index.html | 8 +- docs/blog/atom.xml | 295 ++---- docs/blog/authors/index.html | 8 +- docs/blog/index.html | 245 +++-- docs/blog/post-five/index.html | 10 +- docs/blog/post-four/index.html | 206 ++++ docs/blog/post-one/index.html | 12 +- docs/blog/post-three/index.html | 275 ++--- docs/blog/post-two/index.html | 8 +- docs/blog/rss.xml | 295 ++---- docs/blog/tags/bharatmlstack/index.html | 259 +++-- docs/blog/tags/embedding-search/index.html | 8 +- docs/blog/tags/index.html | 8 +- docs/blog/tags/inferflow/index.html | 8 +- docs/blog/tags/interaction-store/index.html | 12 +- docs/blog/tags/llm/index.html | 275 +++-- 
docs/blog/tags/meesho/index.html | 243 +++-- docs/blog/tags/mlplatform/index.html | 243 +++-- docs/blog/tags/model-inference/index.html | 8 +- .../blog/tags/online-feature-store/index.html | 12 +- docs/blog/tags/tensorrt-llm/index.html | 275 +++-- docs/blog/tags/vllm/index.html | 275 +++-- docs/category/go-sdk/index.html | 8 +- docs/category/inferflow/index.html | 8 +- docs/category/numerix/index.html | 8 +- docs/category/online-feature-store/index.html | 8 +- docs/category/python-sdk/index.html | 8 +- docs/category/quick-start/index.html | 8 +- docs/category/sdks/index.html | 8 +- docs/category/trufflebox-ui/index.html | 8 +- docs/category/v100/index.html | 8 +- docs/index.html | 15 +- docs/inferflow/v1.0.0/architecture/index.html | 10 +- .../inferflow/v1.0.0/configuration/index.html | 24 +- .../v1.0.0/functionalities/index.html | 20 +- docs/inferflow/v1.0.0/index.html | 8 +- .../inferflow/v1.0.0/release-notes/index.html | 18 +- docs/markdown-page/index.html | 8 +- docs/numerix/v1.0.0/architecture/index.html | 12 +- docs/numerix/v1.0.0/benchmarks/index.html | 8 +- .../numerix/v1.0.0/functionalities/index.html | 12 +- docs/numerix/v1.0.0/index.html | 8 +- docs/numerix/v1.0.0/release-notes/index.html | 16 +- .../v1.0.0/architecture/index.html | 8 +- .../v1.0.0/benchmarks/index.html | 14 +- .../v1.0.0/data-formats/index.html | 14 +- .../v1.0.0/functionalities/index.html | 14 +- docs/online-feature-store/v1.0.0/index.html | 8 +- .../v1.0.0/release-notes/index.html | 14 +- .../quick-start/v1.0.0/quick-start/index.html | 44 +- docs/sdks/go/v1.0.0/feature_client/index.html | 18 +- .../v1.0.0/grpc_feature_client/index.html | 22 +- .../spark_feature_push_client/index.html | 72 +- docs/sitemap.xml | 2 +- .../trufflebox-ui/v1.0.0/userguide/index.html | 28 +- 82 files changed, 4085 insertions(+), 1781 deletions(-) create mode 100644 docs-src/src/theme/Root.js rename docs/assets/css/{styles.14b2d0af.css => styles.030f898a.css} (55%) create mode 100644 
docs/assets/js/3aeb33c7.dbef3914.js create mode 100644 docs/assets/js/428aafcc.1b6a0a9c.js delete mode 100644 docs/assets/js/428aafcc.2c1db158.js create mode 100644 docs/assets/js/6479fb86.3f75012c.js delete mode 100644 docs/assets/js/6479fb86.96631f8d.js delete mode 100644 docs/assets/js/79ae4ea7.1416ba4f.js create mode 100644 docs/assets/js/79ae4ea7.1af179c5.js create mode 100644 docs/assets/js/814f3328.189ef834.js delete mode 100644 docs/assets/js/814f3328.bfb123e8.js create mode 100644 docs/assets/js/8cdb4121.2549a6bf.js delete mode 100644 docs/assets/js/c4f5d8e4.41d5b3c8.js create mode 100644 docs/assets/js/c4f5d8e4.e88e308f.js delete mode 100644 docs/assets/js/f9755c6e.8811662b.js rename docs/assets/js/{3aeb33c7.b4a8c40f.js => f9755c6e.ac48cb60.js} (92%) delete mode 100644 docs/assets/js/fccc4c42.4690f84a.js create mode 100644 docs/assets/js/fccc4c42.796edc5f.js rename docs/assets/js/{main.3e15e71d.js => main.5b79a858.js} (69%) rename docs/assets/js/{main.3e15e71d.js.LICENSE.txt => main.5b79a858.js.LICENSE.txt} (100%) create mode 100644 docs/assets/js/runtime~main.2b9c67a0.js delete mode 100644 docs/assets/js/runtime~main.d5e46064.js create mode 100644 docs/blog/post-four/index.html diff --git a/docs-src/docusaurus.config.js b/docs-src/docusaurus.config.js index 229e0cb2..59f6ea48 100644 --- a/docs-src/docusaurus.config.js +++ b/docs-src/docusaurus.config.js @@ -78,6 +78,10 @@ const config = { ({ // Replace with your project's social card image: 'img/docusaurus-social-card.jpg', + colorMode: { + defaultMode: 'dark', + respectPrefersColorScheme: true, + }, navbar: { title: 'BharatMLStack', items: [ diff --git a/docs-src/src/css/custom.css b/docs-src/src/css/custom.css index ff94defd..68e9c8e2 100644 --- a/docs-src/src/css/custom.css +++ b/docs-src/src/css/custom.css @@ -1,143 +1,636 @@ /** - * Any CSS included here will be global. The classic template - * bundles Infima by default. 
Infima is a CSS framework designed to - * work well for content-centric websites. + * Global theme for BharatMLStack docs site. + * Overrides Infima variables to match the homepage's indigo/purple dark theme. + * Supports both dark (primary) and light modes. */ -/* You can override the default Infima variables here. */ +/* ======================================== + 1. Infima Variable Overrides + ======================================== */ + :root { - /* BharatMLStack brand colors - purple/burgundy theme */ - --ifm-color-primary: #450839; - --ifm-color-primary-dark: #3d0732; - --ifm-color-primary-darker: #39062f; - --ifm-color-primary-darkest: #2f0527; - --ifm-color-primary-light: #4d0940; - --ifm-color-primary-lighter: #510a43; - --ifm-color-primary-lightest: #5d0c4d; + /* Primary palette – indigo */ + --ifm-color-primary: #6366f1; + --ifm-color-primary-dark: #4f46e5; + --ifm-color-primary-darker: #4338ca; + --ifm-color-primary-darkest: #3730a3; + --ifm-color-primary-light: #818cf8; + --ifm-color-primary-lighter: #a5b4fc; + --ifm-color-primary-lightest: #c7d2fe; + + /* Light mode backgrounds and text */ + --ifm-background-color: #f8fafc; + --ifm-background-surface-color: #ffffff; + --ifm-font-color-base: #1e293b; + --ifm-font-color-secondary: #64748b; + --ifm-heading-color: #0f172a; + --ifm-link-color: #6366f1; + --ifm-link-hover-color: #4f46e5; + + /* Code */ --ifm-code-font-size: 95%; - --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1); - - /* Custom BharatMLStack variables with better contrast */ - --bharatml-primary: #450839; - --bharatml-primary-hover: #6a0c59; - --bharatml-secondary: #f9f9f9; - --bharatml-text: #1c1e21; /* Much darker for better contrast */ - --bharatml-text-light: #606770; /* Darker gray for better readability */ + --ifm-code-background: #f1f5f9; + --ifm-code-border-radius: 6px; + --ifm-code-padding-horizontal: 0.4rem; + --ifm-code-padding-vertical: 0.15rem; + --docusaurus-highlighted-code-line-bg: rgba(99, 102, 241, 0.08); + + /* 
Cards, borders, shadows */ + --ifm-card-background-color: #ffffff; + --ifm-global-shadow-lw: 0 2px 8px rgba(0, 0, 0, 0.06); + --ifm-global-shadow-md: 0 4px 16px rgba(0, 0, 0, 0.08); + --ifm-global-shadow-tl: 0 8px 32px rgba(0, 0, 0, 0.1); + --ifm-global-radius: 8px; + + /* Table of contents */ + --ifm-toc-border-color: rgba(0, 0, 0, 0.08); + + /* Navbar height for padding */ + --ifm-navbar-height: 3.75rem; } -/* For readability concerns, you should choose a lighter palette in dark mode. */ +/* Dark mode */ [data-theme='dark'] { - --ifm-color-primary: #8b4582; - --ifm-color-primary-dark: #7d3f75; - --ifm-color-primary-darker: #763c6e; - --ifm-color-primary-darkest: #62315a; - --ifm-color-primary-light: #994b8f; - --ifm-color-primary-lighter: #a04e96; - --ifm-color-primary-lightest: #b657a9; - --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.3); - - /* Dark mode BharatMLStack colors */ - --bharatml-primary: #8b4582; - --bharatml-primary-hover: #a04e96; - --bharatml-secondary: #1e1e1e; - --bharatml-text: #e3e3e3; /* Light text for dark mode */ - --bharatml-text-light: #b4b4b4; /* Lighter gray for dark mode */ + --ifm-color-primary: #818cf8; + --ifm-color-primary-dark: #6366f1; + --ifm-color-primary-darker: #4f46e5; + --ifm-color-primary-darkest: #4338ca; + --ifm-color-primary-light: #a5b4fc; + --ifm-color-primary-lighter: #c7d2fe; + --ifm-color-primary-lightest: #e0e7ff; + + --ifm-background-color: #0a0e27; + --ifm-background-surface-color: #151932; + --ifm-font-color-base: #e2e8f0; + --ifm-font-color-secondary: #94a3b8; + --ifm-heading-color: #f1f5f9; + --ifm-link-color: #818cf8; + --ifm-link-hover-color: #a5b4fc; + + --ifm-code-background: rgba(255, 255, 255, 0.06); + --docusaurus-highlighted-code-line-bg: rgba(99, 102, 241, 0.15); + + --ifm-card-background-color: rgba(255, 255, 255, 0.03); + --ifm-global-shadow-lw: 0 2px 8px rgba(0, 0, 0, 0.3); + --ifm-global-shadow-md: 0 4px 16px rgba(0, 0, 0, 0.4); + --ifm-global-shadow-tl: 0 8px 32px rgba(0, 0, 0, 0.5); + 
+ --ifm-toc-border-color: rgba(255, 255, 255, 0.06); } -/* Custom BharatMLStack styles */ -.bharatml-hero { - background: linear-gradient(135deg, var(--bharatml-primary) 0%, var(--bharatml-primary-hover) 100%); - color: white; + +/* ======================================== + 2. Global Gradient Orb Background + ======================================== */ + +.gradient-bg-global { + position: fixed; + top: 0; + left: 0; + width: 100%; + height: 100%; + z-index: 0; + pointer-events: none; } -/* Hero button styling - both buttons should have white borders and proper text colors */ -.bharatml-hero .bharatml-button { - background-color: var(--bharatml-primary); - border: 2px solid white !important; - color: white !important; - transition: all 0.3s ease; +.gradient-orb-global { + position: absolute; + border-radius: 50%; + filter: blur(100px); + opacity: 0.25; + animation: globalOrbFloat 25s ease-in-out infinite; } -.bharatml-hero .bharatml-button:hover { - background-color: white !important; - border-color: white !important; - color: var(--bharatml-primary) !important; +[data-theme='light'] .gradient-orb-global { + opacity: 0.10; } -.bharatml-hero .button--outline { - background-color: transparent !important; - border: 2px solid white !important; - color: white !important; - transition: all 0.3s ease; +.orb-global-1 { + width: 600px; + height: 600px; + background: radial-gradient(circle, #6366f1, transparent); + top: -10%; + left: -10%; } -.bharatml-hero .button--outline:hover { - background-color: white !important; - border-color: white !important; - color: var(--bharatml-primary) !important; +.orb-global-2 { + width: 500px; + height: 500px; + background: radial-gradient(circle, #8b5cf6, transparent); + top: 50%; + right: -10%; + animation-delay: 8s; } -/* Dark mode hero buttons */ -[data-theme='dark'] .bharatml-hero .bharatml-button { - background-color: var(--bharatml-primary); - border: 2px solid white !important; - color: white !important; +.orb-global-3 { + width: 
700px; + height: 700px; + background: radial-gradient(circle, #06b6d4, transparent); + bottom: -20%; + left: 30%; + animation-delay: 15s; } -[data-theme='dark'] .bharatml-hero .bharatml-button:hover { - background-color: white !important; - border-color: white !important; - color: var(--bharatml-primary) !important; +@keyframes globalOrbFloat { + 0%, 100% { + transform: translate(0, 0) scale(1); + } + 33% { + transform: translate(60px, -60px) scale(1.1); + } + 66% { + transform: translate(-40px, 40px) scale(0.9); + } } -[data-theme='dark'] .bharatml-hero .button--outline { - background-color: transparent !important; - border: 2px solid white !important; - color: white !important; + +/* ======================================== + 3. Navbar – Glass Morphism + ======================================== */ + +.navbar { + background: rgba(10, 14, 39, 0.8) !important; + backdrop-filter: blur(20px); + -webkit-backdrop-filter: blur(20px); + border-bottom: 1px solid rgba(255, 255, 255, 0.05); + box-shadow: none; + position: sticky; + z-index: 100; +} + +[data-theme='light'] .navbar { + background: rgba(255, 255, 255, 0.85) !important; + border-bottom: 1px solid rgba(0, 0, 0, 0.08); +} + +.navbar__title { + font-weight: 800; + background: linear-gradient(135deg, #6366f1, #8b5cf6, #06b6d4); + background-size: 200% 200%; + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; + animation: navGradientShift 3s ease infinite; +} + +@keyframes navGradientShift { + 0%, 100% { background-position: 0% 50%; } + 50% { background-position: 100% 50%; } +} + +.navbar__link { + font-weight: 500; } -[data-theme='dark'] .bharatml-hero .button--outline:hover { - background-color: white !important; - border-color: white !important; - color: var(--bharatml-primary) !important; +[data-theme='dark'] .navbar__link { + color: #e2e8f0; } -/* General button styling for other parts of the site */ -.bharatml-button { - background-color: var(--bharatml-primary); - 
border-color: var(--bharatml-primary); - transition: all 0.3s ease; +[data-theme='dark'] .navbar__link:hover, +[data-theme='dark'] .navbar__link--active { + color: #818cf8; +} + +.navbar__toggle { + color: var(--ifm-font-color-base); +} + +/* Navbar sidebar (mobile) */ +.navbar-sidebar { + background: var(--ifm-background-color); +} + + +/* ======================================== + 4. Footer – Dark Theme + ======================================== */ + +.footer { + background: #151932 !important; + border-top: 1px solid rgba(255, 255, 255, 0.05); +} + +[data-theme='light'] .footer { + background: #f1f5f9 !important; + border-top: 1px solid rgba(0, 0, 0, 0.08); +} + +.footer__title { + color: #e2e8f0; + font-weight: 700; +} + +[data-theme='light'] .footer__title { + color: #1e293b; +} + +.footer__link-item { + color: #94a3b8; + transition: color 0.3s; +} + +.footer__link-item:hover { + color: #818cf8; + text-decoration: none; +} + +[data-theme='light'] .footer__link-item { + color: #64748b; +} + +[data-theme='light'] .footer__link-item:hover { + color: #6366f1; +} + +.footer__copyright { + color: #64748b; +} + + +/* ======================================== + 5. 
Sidebar – Glass Effect + ======================================== */ + +[data-theme='dark'] .theme-doc-sidebar-container { + border-right: 1px solid rgba(255, 255, 255, 0.05) !important; } -.bharatml-button:hover { - background-color: var(--bharatml-primary-hover); - border-color: var(--bharatml-primary-hover); - color: white; +[data-theme='dark'] .menu { + background: transparent; } -.bharatml-card { - border: 1px solid rgba(69, 8, 57, 0.1); +[data-theme='dark'] .menu__link { + color: #cbd5e1; border-radius: 8px; - padding: 2rem; - transition: all 0.3s ease; - background: white; + transition: all 0.2s; +} + +[data-theme='dark'] .menu__link:hover { + background: rgba(99, 102, 241, 0.1); + color: #e2e8f0; +} + +[data-theme='dark'] .menu__link--active:not(.menu__link--sublist) { + background: rgba(99, 102, 241, 0.15); + color: #818cf8; + font-weight: 600; +} + +[data-theme='dark'] .menu__list-item-collapsible:hover { + background: rgba(99, 102, 241, 0.08); +} + +[data-theme='dark'] .theme-doc-sidebar-item-category > .menu__list-item-collapsible > .menu__link { + color: #e2e8f0; + font-weight: 600; +} + + +/* ======================================== + 6. 
Doc / Blog Content + ======================================== */ + +/* Ensure proper z-index for content above gradient orbs */ +[class*='docMainContainer'], +[class*='mainWrapper'], +.main-wrapper { + position: relative; + z-index: 1; +} + +/* Markdown content */ +.markdown h1, +.markdown h2, +.markdown h3, +.markdown h4, +.markdown h5, +.markdown h6 { + color: var(--ifm-heading-color); +} + +/* Tables */ +[data-theme='dark'] table { + border-color: rgba(255, 255, 255, 0.08); +} + +[data-theme='dark'] table thead tr { + background: rgba(255, 255, 255, 0.04); + border-bottom: 1px solid rgba(255, 255, 255, 0.08); +} + +[data-theme='dark'] table tbody tr { + border-bottom: 1px solid rgba(255, 255, 255, 0.04); +} + +[data-theme='dark'] table tbody tr:nth-child(2n) { + background: rgba(255, 255, 255, 0.02); +} + +[data-theme='dark'] th, +[data-theme='dark'] td { + border-color: rgba(255, 255, 255, 0.06); +} + +/* Blockquotes */ +[data-theme='dark'] blockquote { + border-left-color: #818cf8; + background: rgba(99, 102, 241, 0.05); + color: #cbd5e1; +} + +/* Horizontal rules */ +[data-theme='dark'] hr { + border-color: rgba(255, 255, 255, 0.06); +} + + +/* ======================================== + 7. Code Blocks + ======================================== */ + +[data-theme='dark'] .prism-code { + background: rgba(255, 255, 255, 0.04) !important; + border: 1px solid rgba(255, 255, 255, 0.06); +} + +[data-theme='dark'] code { + background: rgba(255, 255, 255, 0.06); + border: 1px solid rgba(255, 255, 255, 0.08); + color: #e2e8f0; +} + +[data-theme='dark'] a code { + color: var(--ifm-link-color); +} + +/* Code block title bar */ +[data-theme='dark'] .codeBlockTitle_node_modules-\@docusaurus-theme-classic-lib-theme-CodeBlock-Content-styles-module { + background: rgba(255, 255, 255, 0.06) !important; + border-bottom: 1px solid rgba(255, 255, 255, 0.06); +} + + +/* ======================================== + 8. 
Admonitions + ======================================== */ + +[data-theme='dark'] .alert { + background: rgba(255, 255, 255, 0.03); + border: 1px solid rgba(255, 255, 255, 0.06); + color: #e2e8f0; +} + +[data-theme='dark'] .alert--info { + border-left: 4px solid #06b6d4; + background: rgba(6, 182, 212, 0.06); +} + +[data-theme='dark'] .alert--warning { + border-left: 4px solid #f59e0b; + background: rgba(245, 158, 11, 0.06); +} + +[data-theme='dark'] .alert--danger { + border-left: 4px solid #ef4444; + background: rgba(239, 68, 68, 0.06); } -.bharatml-card:hover { - border-color: var(--bharatml-primary); - box-shadow: 0 4px 20px rgba(69, 8, 57, 0.1); - transform: translateY(-2px); +[data-theme='dark'] .alert--success { + border-left: 4px solid #10b981; + background: rgba(16, 185, 129, 0.06); } -.bharatml-icon { - width: 64px; - height: 64px; - background: linear-gradient(135deg, var(--bharatml-primary), var(--bharatml-primary-hover)); +[data-theme='dark'] .alert--secondary { + border-left: 4px solid #818cf8; + background: rgba(99, 102, 241, 0.06); +} + +[data-theme='dark'] .admonitionHeading_node_modules-\@docusaurus-theme-classic-lib-theme-Admonition-Layout-styles-module { + color: inherit; +} + + +/* ======================================== + 9. Table of Contents (right sidebar) + ======================================== */ + +[data-theme='dark'] .table-of-contents__link { + color: #94a3b8; +} + +[data-theme='dark'] .table-of-contents__link:hover, +[data-theme='dark'] .table-of-contents__link--active { + color: #818cf8; +} + +[data-theme='dark'] .table-of-contents { + border-left: 1px solid rgba(255, 255, 255, 0.06); +} + + +/* ======================================== + 10. 
Pagination / Doc navigation + ======================================== */ + +[data-theme='dark'] .pagination-nav__link { + background: rgba(255, 255, 255, 0.03); + border: 1px solid rgba(255, 255, 255, 0.08); border-radius: 12px; - display: flex; - align-items: center; - justify-content: center; - margin: 0 auto 1rem; - font-size: 1.5rem; - color: white; + transition: all 0.3s; +} + +[data-theme='dark'] .pagination-nav__link:hover { + border-color: rgba(99, 102, 241, 0.3); + background: rgba(99, 102, 241, 0.06); +} + +[data-theme='dark'] .pagination-nav__sublabel { + color: #94a3b8; +} + +[data-theme='dark'] .pagination-nav__label { + color: #e2e8f0; +} + + +/* ======================================== + 11. Blog-specific + ======================================== */ + +[data-theme='dark'] .blog-post-page article header h1 { + color: #f1f5f9; +} + +[data-theme='dark'] article .avatar__name a { + color: #818cf8; +} + +[data-theme='dark'] .blog-tags a { + background: rgba(99, 102, 241, 0.1); + border: 1px solid rgba(99, 102, 241, 0.2); + color: #818cf8; +} + +[data-theme='dark'] .blog-tags a:hover { + background: rgba(99, 102, 241, 0.2); + border-color: rgba(99, 102, 241, 0.4); + text-decoration: none; +} + + +/* ======================================== + 12. Search and misc inputs + ======================================== */ + +[data-theme='dark'] .navbar__search-input { + background: rgba(255, 255, 255, 0.05); + border: 1px solid rgba(255, 255, 255, 0.1); + color: #e2e8f0; +} + +[data-theme='dark'] .navbar__search-input::placeholder { + color: #64748b; +} + + +/* ======================================== + 13. 
Breadcrumbs + ======================================== */ + +[data-theme='dark'] .breadcrumbs__link { + background: rgba(255, 255, 255, 0.04); + color: #94a3b8; + border-radius: 6px; +} + +[data-theme='dark'] .breadcrumbs__link:hover { + background: rgba(99, 102, 241, 0.1); + color: #e2e8f0; +} + +[data-theme='dark'] .breadcrumbs__item--active .breadcrumbs__link { + background: rgba(99, 102, 241, 0.12); + color: #818cf8; +} + + +/* ======================================== + 14. Tabs + ======================================== */ + +[data-theme='dark'] .tabs__item { + color: #94a3b8; + border-bottom-color: transparent; +} + +[data-theme='dark'] .tabs__item:hover { + color: #e2e8f0; +} + +[data-theme='dark'] .tabs__item--active { + color: #818cf8; + border-bottom-color: #818cf8; +} + + +/* ======================================== + 15. Scrollbar (dark mode) + ======================================== */ + +[data-theme='dark'] ::-webkit-scrollbar { + width: 8px; + height: 8px; +} + +[data-theme='dark'] ::-webkit-scrollbar-track { + background: transparent; +} + +[data-theme='dark'] ::-webkit-scrollbar-thumb { + background: rgba(255, 255, 255, 0.12); + border-radius: 4px; +} + +[data-theme='dark'] ::-webkit-scrollbar-thumb:hover { + background: rgba(255, 255, 255, 0.2); +} + + +/* ======================================== + 16. Version / Dropdown badges + ======================================== */ + +[data-theme='dark'] .dropdown__menu { + background: #151932; + border: 1px solid rgba(255, 255, 255, 0.08); +} + +[data-theme='dark'] .dropdown__link { + color: #cbd5e1; +} + +[data-theme='dark'] .dropdown__link:hover { + background: rgba(99, 102, 241, 0.1); + color: #e2e8f0; +} + +[data-theme='dark'] .dropdown__link--active { + color: #818cf8; + background: rgba(99, 102, 241, 0.12); +} + + +/* ======================================== + 17. 
Homepage Isolation + (hide Docusaurus navbar/footer on homepage) + ======================================== */ + +html.homepage-active .navbar { + display: none !important; +} + +html.homepage-active .footer { + display: none !important; +} + +html.homepage-active main { + margin-top: 0; +} + +html.homepage-active [class*='docMainContainer'], +html.homepage-active [class*='mainWrapper'] { + padding-top: 0; +} + + +/* ======================================== + 18. Light mode refinements + ======================================== */ + +[data-theme='light'] .theme-doc-sidebar-container { + border-right: 1px solid rgba(0, 0, 0, 0.06); +} + +[data-theme='light'] .menu__link--active:not(.menu__link--sublist) { + background: rgba(99, 102, 241, 0.08); + color: #6366f1; + font-weight: 600; +} + +[data-theme='light'] .menu__link:hover { + background: rgba(99, 102, 241, 0.05); +} + +[data-theme='light'] .pagination-nav__link { + border-radius: 12px; + transition: all 0.3s; +} + +[data-theme='light'] .pagination-nav__link:hover { + border-color: rgba(99, 102, 241, 0.3); + box-shadow: 0 4px 16px rgba(99, 102, 241, 0.08); +} + +[data-theme='light'] blockquote { + border-left-color: #6366f1; } diff --git a/docs-src/src/pages/index.js b/docs-src/src/pages/index.js index c58e5f55..38586550 100644 --- a/docs-src/src/pages/index.js +++ b/docs-src/src/pages/index.js @@ -1,202 +1,424 @@ -import clsx from 'clsx'; -import Link from '@docusaurus/Link'; +import React, { useEffect, useLayoutEffect, useRef } from 'react'; +import Layout from '@theme/Layout'; import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; import useBaseUrl from '@docusaurus/useBaseUrl'; -import Layout from '@theme/Layout'; -import { OnlineFeatureStoreFeatures, TruffleboxUIFeatures, SDKsFeatures } from '@site/src/components/HomepageFeatures'; - -import Heading from '@theme/Heading'; import styles from './index.module.css'; -function HomepageHeader() { - const {siteConfig} = useDocusaurusContext(); +// ─── 
Data ────────────────────────────────────────────── + +const BARRIERS = [ + { + icon: '\u{1F9E0}', + title: 'Focus on building intelligence, not infrastructure', + questions: [ + 'Does every model deployment require a full-stack integration effort?', + 'Do engineers have to rebuild feature retrieval, endpoint integrations, and logging for each new model?', + 'Does changing a simple expression like 0.2\u00D7s\u2081 + 0.8\u00D7s\u2082 to 0.3\u00D7s\u2081 + 0.7\u00D7s\u2082 really need code reviews and redeployments?', + 'Why does deploying intelligence require the devops team to provision infra?', + ], + answer: + 'Machine learning teams should be iterating on models, not systems. Yet today, infrastructure complexity turns simple improvements into weeks of engineering effort, slowing experimentation and innovation.', + }, + { + icon: '\u{1F4B0}', + title: 'Built for scale without exponential cost growth', + questions: [ + 'Do your infrastructure costs scale faster than your ML impact?', + 'Are you recomputing the same features, reloading the same data, and moving the same bytes across systems repeatedly?', + 'Are expensive GPUs and compute sitting underutilized while workloads wait on data or inefficient pipelines?', + 'Why does scaling ML often mean scaling cost linearly\u2014or worse?', + ], + answer: + 'A modern ML platform should eliminate redundant computation, reuse features intelligently, and optimize data access across memory, NVMe, and object storage. 
Compute should be pooled, scheduled efficiently, and fully utilized\u2014ensuring that scale drives impact, not runaway infrastructure costs.', + }, + { + icon: '\u{1F30D}', + title: 'Freedom to deploy anywhere, without lock-in', + questions: [ + 'Are your models tied to a single cloud, making migration costly and complex?', + 'Does adopting managed services today limit your ability to optimize cost or move infrastructure tomorrow?', + 'Can you deploy the same ML stack across public cloud, private cloud, or sovereign environments without redesigning everything?', + 'Why should infrastructure choices dictate the future of your ML systems?', + ], + answer: + 'A modern ML platform should be built on open standards and cloud-neutral abstractions, allowing you to deploy anywhere\u2014public cloud, private infrastructure, or sovereign environments. This ensures complete control over your data, freedom from vendor lock-in, and the ability to optimize for cost, performance, and compliance without architectural constraints.', + }, +]; + +const COMPONENTS = [ + { + icon: '\u{26A1}', + title: 'Online Feature Store', + description: + 'BharatMLStack Online Feature Store delivers sub-10ms, high-throughput access to machine learning features for real-time inference. It seamlessly ingests batch and streaming data, validates schemas, and persists compact, versioned feature groups optimized for low latency and efficiency. With scalable storage backends, gRPC APIs, and binary-optimized formats, it ensures consistent, reliable feature serving across ML pipelines.', + cta: '/online-feature-store/v1.0.0', + }, + { + icon: '\u{1F500}', + title: 'Inferflow', + description: + "Inferflow is BharatMLStack's intelligent inference gateway that dynamically retrieves and assembles features required by ML models using a graph-based configuration called Inferpipes. 
It automatically resolves entity relationships, fetches features from the Online Feature Store, and constructs feature vectors without custom code.", + cta: '/inferflow/v1.0.0', + }, + { + icon: '\u{1F50D}', + title: 'Skye', + description: + 'Skye enables fast similarity retrieval by representing data as vectors and querying nearest matches in high-dimensional space. It supports pluggable vector databases, ensuring flexibility across infrastructure. The system provides tenant-level index isolation while allowing single embedding ingestion even when shared across tenants, reducing redundancy.', + cta: '/skye/v1.0.0', + }, + { + icon: '\u{1F9EE}', + title: 'Numerix', + description: + 'Numerix is a high-performance compute engine designed for ultra-fast element-wise matrix operations. Built in Rust and accelerated using SIMD, it delivers exceptional efficiency and predictable performance. Optimized for real-time inference workloads, it achieves strict sub-5ms p99 latency on matrices up to 1000\u00D710.', + cta: '/numerix/v1.0.0', + }, + { + icon: '\u{1F680}', + title: 'Predator', + description: + 'Predator streamlines infrastructure and model lifecycle management. It enables the creation of deployables with specific Triton Server versions and supports seamless model rollouts. 
Leveraging Helm charts and Argo CD, Predator automates Kubernetes-based deployments while integrating with KEDA for auto-scaling and performance tuning.', + cta: '/predator/v1.0.0', + }, +]; + +const STATS = [ + { value: '4.5M+', label: 'Daily Orders', description: 'Daily orders processed via ML pipelines' }, + { value: '2.4M', label: 'QPS on FS', description: 'QPS on Feature Store with batch size of 100 id lookups' }, + { value: '1M+', label: 'QPS Inference', description: 'QPS on Model Inference' }, + { value: '500K', label: 'QPS Embedding', description: 'QPS Embedding Search' }, +]; + +const DEMO_VIDEOS = [ + { + title: 'Embedding Platform', + description: 'See how Skye powers real-time similarity search and embedding retrieval at scale.', + url: 'https://videos.meesho.com/reels/embedding_platform.mp4', + }, + { + title: 'Feature Store', + description: 'Watch the Online Feature Store deliver sub-10ms feature serving for real-time inference.', + url: 'https://videos.meesho.com/reels/feature_store.mp4', + }, + { + title: 'Numerix', + description: 'Explore ultra-fast matrix operations powered by Rust and SIMD acceleration.', + url: 'https://videos.meesho.com/reels/numerix.mp4', + }, + { + title: 'Predator', + description: 'Automated Kubernetes-based model deployment with Helm, Argo CD, and KEDA.', + url: 'https://videos.meesho.com/reels/predator.mp4', + }, + { + title: 'Inferflow', + description: 'Graph-based feature assembly and intelligent inference gateway in action.', + url: 'https://videos.meesho.com/reels/inferflow.mp4', + }, +]; + +const BLOG_POSTS = [ + { + title: "Building Meesho's ML Platform: From Chaos to Cutting-Edge (Part 1)", + category: 'ML Platform', + icon: '\u{1F680}', + link: '/blog/post-one', + }, + { + title: "Building Meesho's ML Platform: Lessons from the First-Gen System (Part 2)", + category: 'ML Platform', + icon: '\u{1F9E9}', + link: '/blog/post-two', + }, + { + title: 'Cracking the Code: Scaling Model Inference & Real-Time Embedding 
Search', + category: 'Inference', + icon: '\u{26A1}', + link: '/blog/post-three', + }, + { + title: 'Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving', + category: 'LLM', + icon: '\u{1F9E0}', + link: '/blog/post-four', + }, + { + title: 'LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale', + category: 'Optimization', + icon: '\u{1F52C}', + link: '/blog/post-five', + }, +]; + +// ─── Components ──────────────────────────────────────── + +function CustomNav() { + const docsUrl = useBaseUrl('/'); + const blogUrl = useBaseUrl('/blog'); return ( -
-
-
- BharatMLStack Logo -
- - Welcome to {siteConfig.title} - -

+

+ ); +} + +function HeroSection() { + const getStartedUrl = useBaseUrl('/category/online-feature-store'); + return ( +
+
+
Open-source, scalable stack for enterprise ML
+

Build production ML pipelines faster

+

Open source, end-to-end ML infrastructure stack built for scale, speed, and simplicity. + Integrate, deploy, and manage robust ML workflows with full reliability and control.

- - +
+

Adopted by data teams building at scale

-
+
+ ML Infrastructure +
+ ); } -function OnlineFeatureStoreAbout() { +function BarriersSection() { return ( -
-
-
-
- Built for India's Scale -

- BharatMLStack is a comprehensive, production-ready machine learning infrastructure - platform designed to democratize ML capabilities across India and beyond. Our mission - is to provide a robust, scalable, and accessible ML stack that empowers organizations - to build, deploy, and manage machine learning solutions at massive scale. -

- - Explore Online Feature Store → - -
-
-
-

🏆 Key Achievements

-
    -
  • ✅ Sub-10ms P99 latency for real-time inference
  • -
  • ✅ 1M+ RPS tested with 100 IDs per request
  • -
  • ✅ PSDB format outperforms Proto3 & Arrow
  • -
  • ✅ Multi-database: Scylla, Dragonfly, Redis
  • -
  • ✅ Production-ready with comprehensive monitoring
  • +
    +
    +
    +

    Why BharatMLStack

    +

    The Real Barriers to Scaling Machine Learning

    +

    + ML teams spend more time fighting infrastructure than building intelligence. + BharatMLStack removes those barriers. +

    +
    +
    + {BARRIERS.map((barrier, idx) => ( +
    +
    {barrier.icon}
    +

    {barrier.title}

    +
      + {barrier.questions.map((q, i) => ( +
    • {q}
    • + ))}
    +

    {barrier.answer}

    -
    + ))}
); } -function TruffleboxAbout() { +function ComponentsSection() { + const cardsRef = useRef([]); + const baseUrl = useBaseUrl('/'); + + useEffect(() => { + const observer = new IntersectionObserver( + (entries) => { + entries.forEach((entry) => { + if (entry.isIntersecting) { + entry.target.classList.add(styles.componentCardVisible); + } + }); + }, + { threshold: 0.1, rootMargin: '0px 0px -80px 0px' } + ); + + cardsRef.current.forEach((card) => { + if (card) observer.observe(card); + }); + + return () => observer.disconnect(); + }, []); + return ( -
-
-
-
- Modern MLOps Management -

- Trufflebox UI provides a comprehensive, modern web interface for managing your entire - ML infrastructure. Built with cutting-edge web technologies, it delivers an intuitive - experience for feature management, user administration, and operational oversight. - Streamline your MLOps workflows with enterprise-grade UI components. -

- - Explore Trufflebox UI → - -
-
-
-

🎨 UI Features

-
    -
  • ✅ Comprehensive feature catalog & discovery
  • -
  • ✅ Role-based access control & user management
  • -
  • ✅ Job, Store, Admin Ops management
  • -
  • ✅ Approval flow for everything
  • -
  • ✅ Responsive design for desktop & mobile
  • -
+
+
+
+

Platform Components

+

BharatMLStack Components

+

+ Purpose-built components for every stage of the ML lifecycle, from feature + serving to model deployment. +

+
+
+ {COMPONENTS.map((comp, idx) => ( +
(cardsRef.current[idx] = el)} + > +
{comp.icon}
+
+

{comp.title}

+

{comp.description}

+ + Learn more → + +
-
+ ))}
); } -function SDKsAbout() { +function StatsSection() { return ( -
-
-
-
- Developer-First Integration -

- Our SDKs are designed with developers in mind, providing idiomatic APIs for Go and Python - that feel natural in your existing codebase. Whether you're building microservices, - data pipelines, or ML applications, our SDKs provide the tools you need for seamless - integration with BharatMLStack's powerful infrastructure. -

- - Explore SDKs → - -
-
-
-

🛠️ Developer Tools

-
    -
  • ✅ Native Go & Python SDKs with type safety
  • -
  • ✅ High-performance gRPC
  • -
  • ✅ Apache Spark integration for publishing features
  • -
+
+
+
+

Proven at scale

+

Scaling Numbers

+
+
+ {STATS.map((stat, idx) => ( +
+

{stat.label}

+
{stat.value}
+

{stat.description}

-
+ ))}
); } -function NumerixAbout() { +function DemoVideosSection() { return ( -
-
-
-
- Numerix -

- Numerix is a mathematical compute engine for BharatML Stack. It is used to perform mathematical operations on matrices and vectors. -

- - Explore Numerix → - -
-
-
-

🛠️ Numerix Features

-
    -
  • ✅ Postfix expression evaluation
  • -
  • ✅ Vectorized math operations
  • -
  • ✅ Typed evaluation
  • -
  • ✅ Compiler-assisted SIMD
  • -
  • ✅ ARM & AMD support
  • -
  • ✅ Multi-arch builds
  • -
  • ✅ Deterministic runtime
  • - -
+
+
+
+

See it in action

+

Demo Videos

+

+ Watch short demos of each BharatMLStack component in action. +

+
+
+ {DEMO_VIDEOS.map((video, idx) => ( +
+
+ +
+
+

{video.title}

+

{video.description}

+
+ ))} +
+
+
+ ); +} + +function BlogSection() { + const baseUrl = useBaseUrl('/'); + return ( +
+
+
+

From our blog

+

View Our Blogs

+

+ Technical articles, architecture deep-dives, and the story behind BharatMLStack. +

+
+ +
+
+ ); +} + +function CTASection() { + const getStartedUrl = useBaseUrl('/category/online-feature-store'); + return ( +
+
+
+

Deploy ML models with confidence

+

+ Comprehensive stack for business-ready ML. Integrates seamlessly with enterprise + systems. Robust security and regulatory compliance. +

+
@@ -204,22 +426,96 @@ function NumerixAbout() { ); } +function CustomFooter() { + const docsUrl = useBaseUrl('/'); + const blogUrl = useBaseUrl('/blog'); + return ( + + ); +} + +// ─── Page ────────────────────────────────────────────── + export default function Home() { - const {siteConfig} = useDocusaurusContext(); + const { siteConfig } = useDocusaurusContext(); + + // Hide Docusaurus navbar/footer on homepage (client-side, before paint) + useLayoutEffect(() => { + document.documentElement.classList.add('homepage-active'); + return () => { + document.documentElement.classList.remove('homepage-active'); + }; + }, []); + return ( - -
- - - - - - - -
+ description="Open source, end-to-end ML infrastructure stack built for scale, speed, and simplicity." + > + {/* Inline style ensures Docusaurus navbar/footer are hidden during SSR and before JS hydration */} + +
+ + + + + + + + + +
); } diff --git a/docs-src/src/pages/index.module.css b/docs-src/src/pages/index.module.css index 30770d52..abcb7794 100644 --- a/docs-src/src/pages/index.module.css +++ b/docs-src/src/pages/index.module.css @@ -1,144 +1,977 @@ /** - * CSS files with the .module.css suffix will be treated as CSS modules - * and scoped locally. + * Homepage CSS Module + * Dark-themed design (primary), with light mode variant. + * Based on reference HTML design for BharatMLStack. */ -.heroBanner { - padding: 4rem 0; - text-align: center; - position: relative; - overflow: hidden; +/* ======================================== + CSS Variables (scoped via data-theme) + ======================================== */ + +:root { + --hp-primary: #6366f1; + --hp-primary-dark: #4f46e5; + --hp-secondary: #8b5cf6; + --hp-accent: #06b6d4; + --hp-success: #10b981; + --hp-dark: #0a0e27; + --hp-dark-light: #151932; + --hp-text: #e2e8f0; + --hp-text-muted: #94a3b8; + --hp-bg-card: rgba(255, 255, 255, 0.03); + --hp-bg-page: #0a0e27; } -@media screen and (max-width: 996px) { - .heroBanner { - padding: 2rem; - } +[data-theme='light'] { + --hp-dark: #f8fafc; + --hp-dark-light: #f1f5f9; + --hp-text: #1e293b; + --hp-text-muted: #64748b; + --hp-bg-card: rgba(0, 0, 0, 0.02); + --hp-bg-page: #f8fafc; } -.logoContainer { - margin-bottom: 2rem; +/* ======================================== + Page wrapper + ======================================== */ + +.homepageWrapper { + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', sans-serif; + background: var(--hp-bg-page); + color: var(--hp-text); + line-height: 1.6; + overflow-x: hidden; +} + +/* ======================================== + Custom Navigation + ======================================== */ + +.customNav { + position: fixed; + top: 0; + width: 100%; + background: rgba(10, 14, 39, 0.8); + backdrop-filter: blur(20px); + border-bottom: 1px solid rgba(255, 255, 255, 0.05); + z-index: 1000; + padding: 1.2rem 0; + transition: transform 0.3s ease; 
+} + +[data-theme='light'] .customNav { + background: rgba(255, 255, 255, 0.85); + border-bottom: 1px solid rgba(0, 0, 0, 0.08); +} + +.navContainer { + max-width: 1400px; + margin: 0 auto; + padding: 0 2rem; display: flex; - justify-content: center; + justify-content: space-between; align-items: center; } -.heroLogo { - width: 180px; - height: 180px; - filter: drop-shadow(0 4px 8px rgba(0, 0, 0, 0.1)); - transition: transform 0.3s ease; +.logo { + font-size: 1.6rem; + font-weight: 800; + background: linear-gradient(135deg, #6366f1, #8b5cf6, #06b6d4); + background-size: 200% 200%; + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; + animation: hpGradientShift 3s ease infinite; + text-decoration: none; +} + +@keyframes hpGradientShift { + 0%, 100% { background-position: 0% 50%; } + 50% { background-position: 100% 50%; } +} + +.navLinks { + display: flex; + gap: 2.5rem; + align-items: center; +} + +.navLink { + color: var(--hp-text); + text-decoration: none; + transition: color 0.3s; + font-weight: 500; +} + +.navLink:hover { + color: var(--hp-primary); + text-decoration: none; +} + +/* ======================================== + Buttons + ======================================== */ + +.btn { + padding: 0.75rem 2rem; + border-radius: 50px; + text-decoration: none; + font-weight: 600; + transition: all 0.4s cubic-bezier(0.175, 0.885, 0.32, 1.275); + display: inline-block; + cursor: pointer; + border: none; + font-size: 1rem; +} + +.btn:hover { + text-decoration: none; +} + +.btnPrimary { + background: linear-gradient(135deg, #6366f1, #8b5cf6); + color: white; + box-shadow: 0 10px 30px rgba(99, 102, 241, 0.3); +} + +.btnPrimary:hover { + transform: translateY(-3px); + box-shadow: 0 15px 40px rgba(99, 102, 241, 0.5); + color: white; +} + +.btnSecondary { + background: rgba(255, 255, 255, 0.05); + color: var(--hp-text); + border: 2px solid rgba(99, 102, 241, 0.5); +} + +[data-theme='light'] .btnSecondary { + background: 
rgba(99, 102, 241, 0.05); + border-color: rgba(99, 102, 241, 0.4); } -.heroLogo:hover { - transform: scale(1.05); +.btnSecondary:hover { + background: rgba(99, 102, 241, 0.2); + border-color: var(--hp-primary); + transform: translateY(-3px); + color: var(--hp-text); } -@media screen and (max-width: 768px) { - .heroLogo { - width: 120px; - height: 120px; +.btnWhite { + background: white; + color: var(--hp-primary); +} + +.btnWhite:hover { + background: #f8fafc; + transform: translateY(-3px) scale(1.05); + color: var(--hp-primary); +} + +.btnOutlineWhite { + background: transparent; + border: 2px solid white; + color: white; +} + +.btnOutlineWhite:hover { + background: rgba(255, 255, 255, 0.15); + color: white; + transform: translateY(-3px); +} + +/* ======================================== + Hero Section + ======================================== */ + +.hero { + min-height: 100vh; + display: grid; + grid-template-columns: 1fr 1fr; + gap: 4rem; + align-items: center; + padding: 10rem 2rem 5rem; + max-width: 1400px; + margin: 0 auto; + position: relative; + z-index: 1; +} + +.heroContent { + animation: hpFadeInUp 1s ease-out; +} + +@keyframes hpFadeInUp { + from { + opacity: 0; + transform: translateY(40px); } - - .logoContainer { - margin-bottom: 1.5rem; + to { + opacity: 1; + transform: translateY(0); } } -.buttons { - display: flex; - align-items: center; - justify-content: center; - gap: 1rem; +.heroBadge { + display: inline-block; + padding: 0.5rem 1.5rem; + background: rgba(99, 102, 241, 0.1); + border: 1px solid rgba(99, 102, 241, 0.3); + border-radius: 50px; + color: var(--hp-primary); + font-size: 0.9rem; + font-weight: 600; margin-bottom: 2rem; + backdrop-filter: blur(10px); } -@media screen and (max-width: 768px) { - .buttons { - flex-direction: column; - gap: 0.5rem; - } +.heroTitle { + font-size: 4.5rem; + font-weight: 900; + margin-bottom: 1.5rem; + line-height: 1.1; + background: linear-gradient(135deg, #fff 0%, #a5b4fc 100%); + -webkit-background-clip: 
text; + -webkit-text-fill-color: transparent; + background-clip: text; +} + +[data-theme='light'] .heroTitle { + background: linear-gradient(135deg, #1e293b 0%, #6366f1 100%); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; +} + +.heroSubtitle { + font-size: 1.25rem; + color: var(--hp-text-muted); + margin-bottom: 2.5rem; + line-height: 1.8; } -.statsContainer { +.heroButtons { display: flex; - justify-content: center; + gap: 1.5rem; + flex-wrap: wrap; +} + +.heroImage { + position: relative; + animation: hpFadeInUp 1s ease-out 0.3s both; +} + +.heroImage img { + width: 100%; + border-radius: 20px; + box-shadow: 0 40px 80px rgba(0, 0, 0, 0.5); +} + +[data-theme='light'] .heroImage img { + box-shadow: 0 40px 80px rgba(0, 0, 0, 0.15); +} + +.adoptionBadge { + text-align: center; + margin-top: 3rem; + animation: hpFadeInUp 1s ease-out 0.6s both; +} + +.adoptionBadge p { + color: var(--hp-text-muted); + font-size: 0.95rem; +} + +/* ======================================== + Section (generic) + ======================================== */ + +.section { + padding: 8rem 2rem; + position: relative; + z-index: 1; +} + +.container { + max-width: 1400px; + margin: 0 auto; +} + +.sectionHeader { + text-align: center; + margin-bottom: 5rem; +} + +.sectionSubtitle { + font-size: 0.95rem; + color: var(--hp-primary); + font-weight: 700; + text-transform: uppercase; + letter-spacing: 2px; + margin-bottom: 1rem; +} + +.sectionTitle { + font-size: 3.5rem; + font-weight: 900; + margin-bottom: 1.5rem; + background: linear-gradient(135deg, #fff, #a5b4fc); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; +} + +[data-theme='light'] .sectionTitle { + background: linear-gradient(135deg, #1e293b, #6366f1); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; +} + +.sectionDescription { + font-size: 1.2rem; + color: var(--hp-text-muted); + max-width: 
800px; + margin: 0 auto; +} + +/* ======================================== + Barriers Section (3-panel) + ======================================== */ + +.barriersGrid { + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: 2.5rem; + margin-top: 4rem; +} + +.barrierCard { + background: var(--hp-bg-card); + backdrop-filter: blur(20px); + border: 1px solid rgba(255, 255, 255, 0.08); + border-radius: 24px; + padding: 2.5rem; + transition: all 0.4s; +} + +[data-theme='light'] .barrierCard { + background: white; + border-color: rgba(0, 0, 0, 0.08); + box-shadow: 0 4px 20px rgba(0, 0, 0, 0.05); +} + +.barrierCard:hover { + transform: translateY(-8px); + border-color: rgba(99, 102, 241, 0.3); + box-shadow: 0 20px 50px rgba(0, 0, 0, 0.4); +} + +[data-theme='light'] .barrierCard:hover { + box-shadow: 0 20px 50px rgba(99, 102, 241, 0.12); + border-color: rgba(99, 102, 241, 0.3); +} + +.barrierIcon { + font-size: 2.5rem; + margin-bottom: 1.5rem; +} + +.barrierCard h3 { + font-size: 1.4rem; + font-weight: 700; + margin-bottom: 1rem; + color: var(--hp-text); +} + +.barrierCard p { + color: var(--hp-text-muted); + line-height: 1.8; + font-size: 0.95rem; +} + +.barrierQuestions { + list-style: none; + padding: 0; + margin: 1rem 0; +} + +.barrierQuestions li { + color: var(--hp-text-muted); + padding: 0.4rem 0; + font-size: 0.92rem; + line-height: 1.6; + position: relative; + padding-left: 1.2rem; +} + +.barrierQuestions li::before { + content: '?'; + position: absolute; + left: 0; + color: var(--hp-primary); + font-weight: 700; +} + +.barrierAnswer { + margin-top: 1rem; + color: var(--hp-text-muted); + font-size: 0.92rem; + line-height: 1.8; + border-top: 1px solid rgba(255, 255, 255, 0.06); + padding-top: 1rem; +} + +[data-theme='light'] .barrierAnswer { + border-top-color: rgba(0, 0, 0, 0.06); +} + +/* ======================================== + Component Cards (5 cards) + ======================================== */ + +.componentsGrid { + display: grid; + 
grid-template-columns: repeat(3, 1fr); gap: 3rem; - margin-top: 2rem; - opacity: 0.9; + margin-top: 4rem; } -@media screen and (max-width: 768px) { - .statsContainer { - flex-direction: column; - gap: 1rem; - align-items: center; - } +.componentCard { + background: var(--hp-bg-card); + backdrop-filter: blur(20px); + border: 1px solid rgba(255, 255, 255, 0.08); + border-radius: 24px; + overflow: hidden; + transition: all 0.5s cubic-bezier(0.175, 0.885, 0.32, 1.275); + opacity: 0; + transform: translateY(50px); +} + +.componentCardVisible { + opacity: 1; + transform: translateY(0); +} + +[data-theme='light'] .componentCard { + background: white; + border-color: rgba(0, 0, 0, 0.08); + box-shadow: 0 4px 20px rgba(0, 0, 0, 0.05); +} + +.componentCard:hover { + transform: translateY(-10px); + border-color: rgba(99, 102, 241, 0.3); + box-shadow: 0 30px 60px rgba(0, 0, 0, 0.5); +} + +[data-theme='light'] .componentCard:hover { + box-shadow: 0 30px 60px rgba(99, 102, 241, 0.1); +} + +.componentCardVisible:hover { + transform: translateY(-10px); +} + +.componentContent { + padding: 2.5rem; +} + +.componentContent h3 { + font-size: 1.6rem; + margin-bottom: 1rem; + font-weight: 700; + color: var(--hp-text); +} + +.componentContent p { + color: var(--hp-text-muted); + margin-bottom: 1.5rem; + line-height: 1.7; +} + +.componentLink { + color: var(--hp-primary); + text-decoration: none; + font-weight: 600; + display: inline-flex; + align-items: center; + gap: 0.5rem; + transition: gap 0.3s; +} + +.componentLink:hover { + gap: 1rem; + text-decoration: none; + color: var(--hp-primary); } -.statItem { +.componentIcon { + width: 100%; + height: 180px; display: flex; - flex-direction: column; align-items: center; + justify-content: center; + font-size: 4rem; + background: linear-gradient(135deg, rgba(99, 102, 241, 0.1), rgba(139, 92, 246, 0.1)); +} + +[data-theme='light'] .componentIcon { + background: linear-gradient(135deg, rgba(99, 102, 241, 0.06), rgba(139, 92, 246, 0.06)); +} + 
+/* ======================================== + Stats Grid + ======================================== */ + +.statsSection { + background: rgba(0, 0, 0, 0.2); +} + +[data-theme='light'] .statsSection { + background: rgba(99, 102, 241, 0.03); +} + +.statsGrid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); + gap: 2.5rem; + margin-top: 4rem; +} + +.statCard { + background: var(--hp-bg-card); + backdrop-filter: blur(20px); + border: 1px solid rgba(255, 255, 255, 0.08); + border-radius: 20px; + padding: 2.5rem; text-align: center; - color: white; + transition: all 0.4s; } -.statItem strong { - font-size: 1.5rem; +[data-theme='light'] .statCard { + background: white; + border-color: rgba(0, 0, 0, 0.08); + box-shadow: 0 4px 20px rgba(0, 0, 0, 0.05); +} + +.statCard:hover { + transform: translateY(-5px); + border-color: rgba(99, 102, 241, 0.3); +} + +.statLabel { + font-size: 0.9rem; + color: var(--hp-text-muted); + text-transform: uppercase; + letter-spacing: 1.5px; + margin-bottom: 0.5rem; +} + +.statValue { + font-size: 2.5rem; + font-weight: 900; + background: linear-gradient(135deg, #6366f1, #8b5cf6); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; +} + +.statDescription { + color: var(--hp-text-muted); + font-size: 0.95rem; + margin-top: 0.5rem; +} + +/* ======================================== + Demo Videos Grid + ======================================== */ + +.videosGrid { + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: 2.5rem; + margin-top: 4rem; +} + +.videoCard { + background: var(--hp-bg-card); + backdrop-filter: blur(20px); + border: 1px solid rgba(255, 255, 255, 0.08); + border-radius: 24px; + overflow: hidden; + transition: all 0.4s; +} + +[data-theme='light'] .videoCard { + background: white; + border-color: rgba(0, 0, 0, 0.08); + box-shadow: 0 4px 20px rgba(0, 0, 0, 0.05); +} + +.videoCard:hover { + transform: translateY(-8px); + border-color: rgba(99, 102, 
241, 0.3); + box-shadow: 0 20px 50px rgba(0, 0, 0, 0.4); +} + +[data-theme='light'] .videoCard:hover { + box-shadow: 0 20px 50px rgba(99, 102, 241, 0.12); +} + +.videoWrapper { + position: relative; + width: 100%; + aspect-ratio: 16 / 9; + background: #000; + overflow: hidden; +} + +.videoPlayer { + width: 100%; + height: 100%; + object-fit: cover; + display: block; +} + +.videoContent { + padding: 1.5rem 2rem 2rem; +} + +.videoContent h3 { + font-size: 1.3rem; font-weight: 700; - margin-bottom: 0.25rem; + margin-bottom: 0.5rem; + color: var(--hp-text); +} + +.videoContent p { + color: var(--hp-text-muted); + font-size: 0.92rem; + line-height: 1.6; + margin: 0; +} + +/* ======================================== + Blog Grid + ======================================== */ + +.blogGrid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(350px, 1fr)); + gap: 2.5rem; + margin-top: 4rem; +} + +.blogCard { + background: var(--hp-bg-card); + backdrop-filter: blur(20px); + border: 1px solid rgba(255, 255, 255, 0.08); + border-radius: 20px; + overflow: hidden; + transition: all 0.4s; + text-decoration: none; + color: inherit; display: block; } -.statItem span { - font-size: 0.875rem; - opacity: 0.8; - text-transform: uppercase; - letter-spacing: 0.5px; +[data-theme='light'] .blogCard { + background: white; + border-color: rgba(0, 0, 0, 0.08); + box-shadow: 0 4px 20px rgba(0, 0, 0, 0.05); } -.aboutSection { - padding: 4rem 0; - background-color: var(--ifm-background-surface-color); +.blogCard:hover { + transform: translateY(-5px); + border-color: rgba(99, 102, 241, 0.3); + text-decoration: none; + color: inherit; } -.highlightBox { - background: linear-gradient(135deg, #f8f9ff 0%, #e8f0ff 100%); - border: 1px solid rgba(69, 8, 57, 0.1); - border-radius: 12px; +.blogCardIcon { + width: 100%; + height: 160px; + display: flex; + align-items: center; + justify-content: center; + font-size: 3rem; + background: linear-gradient(135deg, rgba(99, 102, 241, 0.15), rgba(6, 
182, 212, 0.15)); +} + +[data-theme='light'] .blogCardIcon { + background: linear-gradient(135deg, rgba(99, 102, 241, 0.08), rgba(6, 182, 212, 0.08)); +} + +.blogContent { padding: 2rem; - height: 100%; } -.highlightBox h3 { - color: var(--bharatml-primary); +.blogCategory { + display: inline-block; + padding: 0.25rem 0.75rem; + background: rgba(99, 102, 241, 0.2); + border-radius: 12px; + font-size: 0.75rem; + color: var(--hp-primary); + font-weight: 700; + text-transform: uppercase; margin-bottom: 1rem; - font-size: 1.25rem; } -.highlightBox ul { +.blogCard h3 { + font-size: 1.3rem; + margin-bottom: 0.75rem; + font-weight: 700; + color: var(--hp-text); +} + +.blogMeta { + display: flex; + align-items: center; + gap: 0.5rem; + color: var(--hp-text-muted); + font-size: 0.85rem; +} + +/* ======================================== + CTA Section + ======================================== */ + +.ctaSection { + background: linear-gradient(135deg, #6366f1, #8b5cf6); + border-radius: 40px; + padding: 6rem 4rem; + text-align: center; + margin: 2rem 0; + position: relative; + overflow: hidden; +} + +.ctaSection::before { + content: ''; + position: absolute; + top: -50%; + left: -50%; + width: 200%; + height: 200%; + background: radial-gradient(circle, rgba(255, 255, 255, 0.1) 0%, transparent 70%); + animation: hpRotate 20s linear infinite; +} + +@keyframes hpRotate { + from { transform: rotate(0deg); } + to { transform: rotate(360deg); } +} + +.ctaTitle { + font-size: 3.5rem; + font-weight: 900; + margin-bottom: 1.5rem; + position: relative; + z-index: 1; + color: white; + background: none; + -webkit-text-fill-color: white; +} + +.ctaDescription { + font-size: 1.3rem; + margin-bottom: 3rem; + position: relative; + z-index: 1; + color: rgba(255, 255, 255, 0.9); +} + +.ctaButtons { + display: flex; + gap: 1.5rem; + justify-content: center; + flex-wrap: wrap; + position: relative; + z-index: 1; +} + +/* ======================================== + Custom Footer + 
======================================== */ + +.customFooter { + background: var(--hp-dark-light); + border-top: 1px solid rgba(255, 255, 255, 0.05); + padding: 5rem 2rem 2rem; + position: relative; + z-index: 1; +} + +[data-theme='light'] .customFooter { + background: #f1f5f9; + border-top-color: rgba(0, 0, 0, 0.08); +} + +.footerContent { + max-width: 1400px; + margin: 0 auto; + display: grid; + grid-template-columns: 2fr 1fr 1fr 1fr; + gap: 4rem; + margin-bottom: 3rem; +} + +.footerSection h4 { + font-size: 1.2rem; + margin-bottom: 1.5rem; + font-weight: 700; + color: var(--hp-text); +} + +.footerSection p { + color: var(--hp-text-muted); + line-height: 1.8; +} + +.footerList { list-style: none; padding: 0; margin: 0; } -.highlightBox li { - padding: 0.5rem 0; - font-size: 0.95rem; - color: var(--bharatml-text); +.footerList li { + margin-bottom: 0.75rem; +} + +.footerList a { + color: var(--hp-text-muted); + text-decoration: none; + transition: all 0.3s; +} + +.footerList a:hover { + color: var(--hp-primary); + text-decoration: none; +} + +.footerBottom { + max-width: 1400px; + margin: 0 auto; + padding-top: 2rem; + border-top: 1px solid rgba(255, 255, 255, 0.05); + display: flex; + justify-content: space-between; + align-items: center; + color: var(--hp-text-muted); + flex-wrap: wrap; + gap: 1rem; +} + +[data-theme='light'] .footerBottom { + border-top-color: rgba(0, 0, 0, 0.08); +} + +.footerLinks { + display: flex; + gap: 2rem; } -.highlightBox li:not(:last-child) { - border-bottom: 1px solid rgba(69, 8, 57, 0.05); +.footerLinks a { + color: var(--hp-text-muted); + text-decoration: none; + transition: color 0.3s; } -/* Dark mode adjustments */ -[data-theme='dark'] .highlightBox { - background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); - border-color: rgba(139, 69, 130, 0.2); +.footerLinks a:hover { + color: var(--hp-primary); + text-decoration: none; } -[data-theme='dark'] .highlightBox li { - color: var(--bharatml-text); +/* 
======================================== + Responsive + ======================================== */ + +@media (max-width: 1024px) { + .hero { + grid-template-columns: 1fr; + text-align: center; + padding-top: 8rem; + } + + .heroButtons { + justify-content: center; + } + + .componentsGrid { + grid-template-columns: 1fr 1fr; + } + + .barriersGrid { + grid-template-columns: 1fr; + } + + .videosGrid { + grid-template-columns: 1fr 1fr; + } + + .footerContent { + grid-template-columns: 1fr 1fr; + } +} + +@media (max-width: 768px) { + .heroTitle { + font-size: 3rem; + } + + .sectionTitle { + font-size: 2.5rem; + } + + .navLinks a:not(.btn):not(.btnPrimary) { + display: none; + } + + .componentsGrid, + .blogGrid, + .videosGrid { + grid-template-columns: 1fr; + } + + .footerContent { + grid-template-columns: 1fr; + } + + .ctaTitle { + font-size: 2.5rem; + } + + .ctaSection { + padding: 4rem 2rem; + border-radius: 20px; + } + + .section { + padding: 4rem 1.5rem; + } + + .hero { + padding: 7rem 1.5rem 3rem; + } +} + +@media (max-width: 480px) { + .heroTitle { + font-size: 2.2rem; + } + + .sectionTitle { + font-size: 2rem; + } + + .statsGrid { + grid-template-columns: 1fr 1fr; + } + + .heroButtons { + flex-direction: column; + align-items: center; + } } diff --git a/docs-src/src/theme/Root.js b/docs-src/src/theme/Root.js new file mode 100644 index 00000000..3ceb9c99 --- /dev/null +++ b/docs-src/src/theme/Root.js @@ -0,0 +1,14 @@ +import React from 'react'; + +export default function Root({ children }) { + return ( + <> +
+
+
+
+
+ {children} + + ); +} diff --git a/docs/404.html b/docs/404.html index a131da18..2f19dcf3 100644 --- a/docs/404.html +++ b/docs/404.html @@ -4,14 +4,14 @@ BharatMLStack - - - + + + -

Page Not Found

We could not find what you were looking for.

Please contact the owner of the site that linked you to the original URL and let them know their link is broken.

+

Page Not Found

We could not find what you were looking for.

Please contact the owner of the site that linked you to the original URL and let them know their link is broken.

\ No newline at end of file diff --git a/docs/assets/css/styles.14b2d0af.css b/docs/assets/css/styles.030f898a.css similarity index 55% rename from docs/assets/css/styles.14b2d0af.css rename to docs/assets/css/styles.030f898a.css index 8bc1333c..df0629cc 100644 --- a/docs/assets/css/styles.14b2d0af.css +++ b/docs/assets/css/styles.030f898a.css @@ -1 +1 @@ -@layer docusaurus.infima,docusaurus.theme-common,docusaurus.theme-classic,docusaurus.core,docusaurus.plugin-debug,docusaurus.theme-mermaid,docusaurus.theme-live-codeblock,docusaurus.theme-search-algolia.docsearch,docusaurus.theme-search-algolia;@layer docusaurus.infima{.col,.container{padding:0 var(--ifm-spacing-horizontal);width:100%}.markdown>h2,.markdown>h3,.markdown>h4,.markdown>h5,.markdown>h6{margin-bottom:calc(var(--ifm-heading-vertical-rhythm-bottom)*var(--ifm-leading))}.markdown li,body{word-wrap:break-word}body,ol ol,ol ul,ul ol,ul ul{margin:0}pre,table{overflow:auto}blockquote,pre{margin:0 0 var(--ifm-spacing-vertical)}.breadcrumbs__link,.button{transition-timing-function:var(--ifm-transition-timing-default)}.button,code{vertical-align:middle}.button--outline.button--active,.button--outline:active,.button--outline:hover,:root{--ifm-button-color:var(--ifm-font-color-base-inverse)}.menu__link:hover,a{transition:color var(--ifm-transition-fast) 
var(--ifm-transition-timing-default)}.navbar--dark,:root{--ifm-navbar-link-hover-color:var(--ifm-color-primary)}.menu,.navbar-sidebar{overflow-x:hidden}:root,html[data-theme=dark]{--ifm-color-emphasis-500:var(--ifm-color-gray-500)}:root{--ifm-color-scheme:light;--ifm-dark-value:10%;--ifm-darker-value:15%;--ifm-darkest-value:30%;--ifm-light-value:15%;--ifm-lighter-value:30%;--ifm-lightest-value:50%;--ifm-contrast-background-value:90%;--ifm-contrast-foreground-value:70%;--ifm-contrast-background-dark-value:70%;--ifm-contrast-foreground-dark-value:90%;--ifm-color-primary:#3578e5;--ifm-color-secondary:#ebedf0;--ifm-color-success:#00a400;--ifm-color-info:#54c7ec;--ifm-color-warning:#ffba00;--ifm-color-danger:#fa383e;--ifm-color-primary-dark:#306cce;--ifm-color-primary-darker:#2d66c3;--ifm-color-primary-darkest:#2554a0;--ifm-color-primary-light:#538ce9;--ifm-color-primary-lighter:#72a1ed;--ifm-color-primary-lightest:#9abcf2;--ifm-color-primary-contrast-background:#ebf2fc;--ifm-color-primary-contrast-foreground:#102445;--ifm-color-secondary-dark:#d4d5d8;--ifm-color-secondary-darker:#c8c9cc;--ifm-color-secondary-darkest:#a4a6a8;--ifm-color-secondary-light:#eef0f2;--ifm-color-secondary-lighter:#f1f2f5;--ifm-color-secondary-lightest:#f5f6f8;--ifm-color-secondary-contrast-background:#fdfdfe;--ifm-color-secondary-contrast-foreground:#474748;--ifm-color-success-dark:#009400;--ifm-color-success-darker:#008b00;--ifm-color-success-darkest:#007300;--ifm-color-success-light:#26b226;--ifm-color-success-lighter:#4dbf4d;--ifm-color-success-lightest:#80d280;--ifm-color-success-contrast-background:#e6f6e6;--ifm-color-success-contrast-foreground:#003100;--ifm-color-info-dark:#4cb3d4;--ifm-color-info-darker:#47a9c9;--ifm-color-info-darkest:#3b8ba5;--ifm-color-info-light:#6ecfef;--ifm-color-info-lighter:#87d8f2;--ifm-color-info-lightest:#aae3f6;--ifm-color-info-contrast-background:#eef9fd;--ifm-color-info-contrast-foreground:#193c47;--ifm-color-warning-dark:#e6a700;--ifm-color-warning-darker
:#d99e00;--ifm-color-warning-darkest:#b38200;--ifm-color-warning-light:#ffc426;--ifm-color-warning-lighter:#ffcf4d;--ifm-color-warning-lightest:#ffdd80;--ifm-color-warning-contrast-background:#fff8e6;--ifm-color-warning-contrast-foreground:#4d3800;--ifm-color-danger-dark:#e13238;--ifm-color-danger-darker:#d53035;--ifm-color-danger-darkest:#af272b;--ifm-color-danger-light:#fb565b;--ifm-color-danger-lighter:#fb7478;--ifm-color-danger-lightest:#fd9c9f;--ifm-color-danger-contrast-background:#ffebec;--ifm-color-danger-contrast-foreground:#4b1113;--ifm-color-white:#fff;--ifm-color-black:#000;--ifm-color-gray-0:var(--ifm-color-white);--ifm-color-gray-100:#f5f6f7;--ifm-color-gray-200:#ebedf0;--ifm-color-gray-300:#dadde1;--ifm-color-gray-400:#ccd0d5;--ifm-color-gray-500:#bec3c9;--ifm-color-gray-600:#8d949e;--ifm-color-gray-700:#606770;--ifm-color-gray-800:#444950;--ifm-color-gray-900:#1c1e21;--ifm-color-gray-1000:var(--ifm-color-black);--ifm-color-emphasis-0:var(--ifm-color-gray-0);--ifm-color-emphasis-100:var(--ifm-color-gray-100);--ifm-color-emphasis-200:var(--ifm-color-gray-200);--ifm-color-emphasis-300:var(--ifm-color-gray-300);--ifm-color-emphasis-400:var(--ifm-color-gray-400);--ifm-color-emphasis-600:var(--ifm-color-gray-600);--ifm-color-emphasis-700:var(--ifm-color-gray-700);--ifm-color-emphasis-800:var(--ifm-color-gray-800);--ifm-color-emphasis-900:var(--ifm-color-gray-900);--ifm-color-emphasis-1000:var(--ifm-color-gray-1000);--ifm-color-content:var(--ifm-color-emphasis-900);--ifm-color-content-inverse:var(--ifm-color-emphasis-0);--ifm-color-content-secondary:#525860;--ifm-background-color:#0000;--ifm-background-surface-color:var(--ifm-color-content-inverse);--ifm-global-border-width:1px;--ifm-global-radius:0.4rem;--ifm-hover-overlay:#0000000d;--ifm-font-color-base:var(--ifm-color-content);--ifm-font-color-base-inverse:var(--ifm-color-content-inverse);--ifm-font-color-secondary:var(--ifm-color-content-secondary);--ifm-font-family-base:system-ui,-apple-system,Segoe 
UI,Roboto,Ubuntu,Cantarell,Noto Sans,sans-serif,BlinkMacSystemFont,"Segoe UI",Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol";--ifm-font-family-monospace:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace;--ifm-font-size-base:100%;--ifm-font-weight-light:300;--ifm-font-weight-normal:400;--ifm-font-weight-semibold:500;--ifm-font-weight-bold:700;--ifm-font-weight-base:var(--ifm-font-weight-normal);--ifm-line-height-base:1.65;--ifm-global-spacing:1rem;--ifm-spacing-vertical:var(--ifm-global-spacing);--ifm-spacing-horizontal:var(--ifm-global-spacing);--ifm-transition-fast:200ms;--ifm-transition-slow:400ms;--ifm-transition-timing-default:cubic-bezier(0.08,0.52,0.52,1);--ifm-global-shadow-lw:0 1px 2px 0 #0000001a;--ifm-global-shadow-md:0 5px 40px #0003;--ifm-global-shadow-tl:0 12px 28px 0 #0003,0 2px 4px 0 #0000001a;--ifm-z-index-dropdown:100;--ifm-z-index-fixed:200;--ifm-z-index-overlay:400;--ifm-container-width:1140px;--ifm-container-width-xl:1320px;--ifm-code-background:#f6f7f8;--ifm-code-border-radius:var(--ifm-global-radius);--ifm-code-font-size:90%;--ifm-code-padding-horizontal:0.1rem;--ifm-code-padding-vertical:0.1rem;--ifm-pre-background:var(--ifm-code-background);--ifm-pre-border-radius:var(--ifm-code-border-radius);--ifm-pre-color:inherit;--ifm-pre-line-height:1.45;--ifm-pre-padding:1rem;--ifm-heading-color:inherit;--ifm-heading-margin-top:0;--ifm-heading-margin-bottom:var(--ifm-spacing-vertical);--ifm-heading-font-family:var(--ifm-font-family-base);--ifm-heading-font-weight:var(--ifm-font-weight-bold);--ifm-heading-line-height:1.25;--ifm-h1-font-size:2rem;--ifm-h2-font-size:1.5rem;--ifm-h3-font-size:1.25rem;--ifm-h4-font-size:1rem;--ifm-h5-font-size:0.875rem;--ifm-h6-font-size:0.85rem;--ifm-image-alignment-padding:1.25rem;--ifm-leading-desktop:1.25;--ifm-leading:calc(var(--ifm-leading-desktop)*1rem);--ifm-list-left-padding:2rem;--ifm-list-margin:1rem;--ifm-list-item-margin:0.25rem;--ifm-list-paragrap
h-margin:1rem;--ifm-table-cell-padding:0.75rem;--ifm-table-background:#0000;--ifm-table-stripe-background:#00000008;--ifm-table-border-width:1px;--ifm-table-border-color:var(--ifm-color-emphasis-300);--ifm-table-head-background:inherit;--ifm-table-head-color:inherit;--ifm-table-head-font-weight:var(--ifm-font-weight-bold);--ifm-table-cell-color:inherit;--ifm-link-color:var(--ifm-color-primary);--ifm-link-decoration:none;--ifm-link-hover-color:var(--ifm-link-color);--ifm-link-hover-decoration:underline;--ifm-paragraph-margin-bottom:var(--ifm-leading);--ifm-blockquote-font-size:var(--ifm-font-size-base);--ifm-blockquote-border-left-width:2px;--ifm-blockquote-padding-horizontal:var(--ifm-spacing-horizontal);--ifm-blockquote-padding-vertical:0;--ifm-blockquote-shadow:none;--ifm-blockquote-color:var(--ifm-color-emphasis-800);--ifm-blockquote-border-color:var(--ifm-color-emphasis-300);--ifm-hr-background-color:var(--ifm-color-emphasis-500);--ifm-hr-height:1px;--ifm-hr-margin-vertical:1.5rem;--ifm-scrollbar-size:7px;--ifm-scrollbar-track-background-color:#f1f1f1;--ifm-scrollbar-thumb-background-color:silver;--ifm-scrollbar-thumb-hover-background-color:#a7a7a7;--ifm-alert-background-color:inherit;--ifm-alert-border-color:inherit;--ifm-alert-border-radius:var(--ifm-global-radius);--ifm-alert-border-width:0px;--ifm-alert-border-left-width:5px;--ifm-alert-color:var(--ifm-font-color-base);--ifm-alert-padding-horizontal:var(--ifm-spacing-horizontal);--ifm-alert-padding-vertical:var(--ifm-spacing-vertical);--ifm-alert-shadow:var(--ifm-global-shadow-lw);--ifm-avatar-intro-margin:1rem;--ifm-avatar-intro-alignment:inherit;--ifm-avatar-photo-size:3rem;--ifm-badge-background-color:inherit;--ifm-badge-border-color:inherit;--ifm-badge-border-radius:var(--ifm-global-radius);--ifm-badge-border-width:var(--ifm-global-border-width);--ifm-badge-color:var(--ifm-color-white);--ifm-badge-padding-horizontal:calc(var(--ifm-spacing-horizontal)*0.5);--ifm-badge-padding-vertical:calc(var(--ifm-spaci
ng-vertical)*0.25);--ifm-breadcrumb-border-radius:1.5rem;--ifm-breadcrumb-spacing:0.5rem;--ifm-breadcrumb-color-active:var(--ifm-color-primary);--ifm-breadcrumb-item-background-active:var(--ifm-hover-overlay);--ifm-breadcrumb-padding-horizontal:0.8rem;--ifm-breadcrumb-padding-vertical:0.4rem;--ifm-breadcrumb-size-multiplier:1;--ifm-breadcrumb-separator:url('data:image/svg+xml;utf8,');--ifm-breadcrumb-separator-filter:none;--ifm-breadcrumb-separator-size:0.5rem;--ifm-breadcrumb-separator-size-multiplier:1.25;--ifm-button-background-color:inherit;--ifm-button-border-color:var(--ifm-button-background-color);--ifm-button-border-width:var(--ifm-global-border-width);--ifm-button-font-weight:var(--ifm-font-weight-bold);--ifm-button-padding-horizontal:1.5rem;--ifm-button-padding-vertical:0.375rem;--ifm-button-size-multiplier:1;--ifm-button-transition-duration:var(--ifm-transition-fast);--ifm-button-border-radius:calc(var(--ifm-global-radius)*var(--ifm-button-size-multiplier));--ifm-button-group-spacing:2px;--ifm-card-background-color:var(--ifm-background-surface-color);--ifm-card-border-radius:calc(var(--ifm-global-radius)*2);--ifm-card-horizontal-spacing:var(--ifm-global-spacing);--ifm-card-vertical-spacing:var(--ifm-global-spacing);--ifm-toc-border-color:var(--ifm-color-emphasis-300);--ifm-toc-link-color:var(--ifm-color-content-secondary);--ifm-toc-padding-vertical:0.5rem;--ifm-toc-padding-horizontal:0.5rem;--ifm-dropdown-background-color:var(--ifm-background-surface-color);--ifm-dropdown-font-weight:var(--ifm-font-weight-semibold);--ifm-dropdown-link-color:var(--ifm-font-color-base);--ifm-dropdown-hover-background-color:var(--ifm-hover-overlay);--ifm-footer-background-color:var(--ifm-color-emphasis-100);--ifm-footer-color:inherit;--ifm-footer-link-color:var(--ifm-color-emphasis-700);--ifm-footer-link-hover-color:var(--ifm-color-primary);--ifm-footer-link-horizontal-spacing:0.5rem;--ifm-footer-padding-horizontal:calc(var(--ifm-spacing-horizontal)*2);--ifm-footer-padding-v
ertical:calc(var(--ifm-spacing-vertical)*2);--ifm-footer-title-color:inherit;--ifm-footer-logo-max-width:min(30rem,90vw);--ifm-hero-background-color:var(--ifm-background-surface-color);--ifm-hero-text-color:var(--ifm-color-emphasis-800);--ifm-menu-color:var(--ifm-color-emphasis-700);--ifm-menu-color-active:var(--ifm-color-primary);--ifm-menu-color-background-active:var(--ifm-hover-overlay);--ifm-menu-color-background-hover:var(--ifm-hover-overlay);--ifm-menu-link-padding-horizontal:0.75rem;--ifm-menu-link-padding-vertical:0.375rem;--ifm-menu-link-sublist-icon:url('data:image/svg+xml;utf8,');--ifm-menu-link-sublist-icon-filter:none;--ifm-navbar-background-color:var(--ifm-background-surface-color);--ifm-navbar-height:3.75rem;--ifm-navbar-item-padding-horizontal:0.75rem;--ifm-navbar-item-padding-vertical:0.25rem;--ifm-navbar-link-color:var(--ifm-font-color-base);--ifm-navbar-link-active-color:var(--ifm-link-color);--ifm-navbar-padding-horizontal:var(--ifm-spacing-horizontal);--ifm-navbar-padding-vertical:calc(var(--ifm-spacing-vertical)*0.5);--ifm-navbar-shadow:var(--ifm-global-shadow-lw);--ifm-navbar-search-input-background-color:var(--ifm-color-emphasis-200);--ifm-navbar-search-input-color:var(--ifm-color-emphasis-800);--ifm-navbar-search-input-placeholder-color:var(--ifm-color-emphasis-500);--ifm-navbar-search-input-icon:url('data:image/svg+xml;utf8,');--ifm-navbar-sidebar-width:83vw;--ifm-pagination-border-radius:var(--ifm-global-radius);--ifm-pagination-color-active:var(--ifm-color-primary);--ifm-pagination-font-size:1rem;--ifm-pagination-item-active-background:var(--ifm-hover-overlay);--ifm-pagination-page-spacing:0.2em;--ifm-pagination-padding-horizontal:calc(var(--ifm-spacing-horizontal)*1);--ifm-pagination-padding-vertical:calc(var(--ifm-spacing-vertical)*0.25);--ifm-pagination-nav-border-radius:var(--ifm-global-radius);--ifm-pagination-nav-color-hover:var(--ifm-color-primary);--ifm-pills-color-active:var(--ifm-color-primary);--ifm-pills-color-background-activ
e:var(--ifm-hover-overlay);--ifm-pills-spacing:0.125rem;--ifm-tabs-color:var(--ifm-font-color-secondary);--ifm-tabs-color-active:var(--ifm-color-primary);--ifm-tabs-color-active-border:var(--ifm-tabs-color-active);--ifm-tabs-padding-horizontal:1rem;--ifm-tabs-padding-vertical:1rem}.badge--danger,.badge--info,.badge--primary,.badge--secondary,.badge--success,.badge--warning{--ifm-badge-border-color:var(--ifm-badge-background-color)}.button--link,.button--outline{--ifm-button-background-color:#0000}*{box-sizing:border-box}html{background-color:var(--ifm-background-color);color:var(--ifm-font-color-base);color-scheme:var(--ifm-color-scheme);font:var(--ifm-font-size-base)/var(--ifm-line-height-base) var(--ifm-font-family-base);-webkit-font-smoothing:antialiased;-webkit-tap-highlight-color:transparent;text-rendering:optimizelegibility;-webkit-text-size-adjust:100%;text-size-adjust:100%}iframe{border:0;color-scheme:auto}.container{margin:0 auto;max-width:var(--ifm-container-width)}.container--fluid{max-width:inherit}.row{display:flex;flex-wrap:wrap;margin:0 calc(var(--ifm-spacing-horizontal)*-1)}.margin-bottom--none,.margin-vert--none,.markdown>:last-child{margin-bottom:0!important}.margin-top--none,.margin-vert--none{margin-top:0!important}.row--no-gutters{margin-left:0;margin-right:0}.margin-horiz--none,.margin-right--none{margin-right:0!important}.row--no-gutters>.col{padding-left:0;padding-right:0}.row--align-top{align-items:flex-start}.row--align-bottom{align-items:flex-end}.row--align-center{align-items:center}.row--align-stretch{align-items:stretch}.row--align-baseline{align-items:baseline}.col{--ifm-col-width:100%;flex:1 0;margin-left:0;max-width:var(--ifm-col-width)}.padding-bottom--none,.padding-vert--none{padding-bottom:0!important}.padding-top--none,.padding-vert--none{padding-top:0!important}.padding-horiz--none,.padding-left--none{padding-left:0!important}.padding-horiz--none,.padding-right--none{padding-right:0!important}.col[class*=col--]{flex:0 0 
var(--ifm-col-width)}.col--1{--ifm-col-width:8.33333%}.col--offset-1{margin-left:8.33333%}.col--2{--ifm-col-width:16.66667%}.col--offset-2{margin-left:16.66667%}.col--3{--ifm-col-width:25%}.col--offset-3{margin-left:25%}.col--4{--ifm-col-width:33.33333%}.col--offset-4{margin-left:33.33333%}.col--5{--ifm-col-width:41.66667%}.col--offset-5{margin-left:41.66667%}.col--6{--ifm-col-width:50%}.col--offset-6{margin-left:50%}.col--7{--ifm-col-width:58.33333%}.col--offset-7{margin-left:58.33333%}.col--8{--ifm-col-width:66.66667%}.col--offset-8{margin-left:66.66667%}.col--9{--ifm-col-width:75%}.col--offset-9{margin-left:75%}.col--10{--ifm-col-width:83.33333%}.col--offset-10{margin-left:83.33333%}.col--11{--ifm-col-width:91.66667%}.col--offset-11{margin-left:91.66667%}.col--12{--ifm-col-width:100%}.col--offset-12{margin-left:100%}.margin-horiz--none,.margin-left--none{margin-left:0!important}.margin--none{margin:0!important}.margin-bottom--xs,.margin-vert--xs{margin-bottom:.25rem!important}.margin-top--xs,.margin-vert--xs{margin-top:.25rem!important}.margin-horiz--xs,.margin-left--xs{margin-left:.25rem!important}.margin-horiz--xs,.margin-right--xs{margin-right:.25rem!important}.margin--xs{margin:.25rem!important}.margin-bottom--sm,.margin-vert--sm{margin-bottom:.5rem!important}.margin-top--sm,.margin-vert--sm{margin-top:.5rem!important}.margin-horiz--sm,.margin-left--sm{margin-left:.5rem!important}.margin-horiz--sm,.margin-right--sm{margin-right:.5rem!important}.margin--sm{margin:.5rem!important}.margin-bottom--md,.margin-vert--md{margin-bottom:1rem!important}.margin-top--md,.margin-vert--md{margin-top:1rem!important}.margin-horiz--md,.margin-left--md{margin-left:1rem!important}.margin-horiz--md,.margin-right--md{margin-right:1rem!important}.margin--md{margin:1rem!important}.margin-bottom--lg,.margin-vert--lg{margin-bottom:2rem!important}.margin-top--lg,.margin-vert--lg{margin-top:2rem!important}.margin-horiz--lg,.margin-left--lg{margin-left:2rem!important}.margin-horiz--lg,.m
argin-right--lg{margin-right:2rem!important}.margin--lg{margin:2rem!important}.margin-bottom--xl,.margin-vert--xl{margin-bottom:5rem!important}.margin-top--xl,.margin-vert--xl{margin-top:5rem!important}.margin-horiz--xl,.margin-left--xl{margin-left:5rem!important}.margin-horiz--xl,.margin-right--xl{margin-right:5rem!important}.margin--xl{margin:5rem!important}.padding--none{padding:0!important}.padding-bottom--xs,.padding-vert--xs{padding-bottom:.25rem!important}.padding-top--xs,.padding-vert--xs{padding-top:.25rem!important}.padding-horiz--xs,.padding-left--xs{padding-left:.25rem!important}.padding-horiz--xs,.padding-right--xs{padding-right:.25rem!important}.padding--xs{padding:.25rem!important}.padding-bottom--sm,.padding-vert--sm{padding-bottom:.5rem!important}.padding-top--sm,.padding-vert--sm{padding-top:.5rem!important}.padding-horiz--sm,.padding-left--sm{padding-left:.5rem!important}.padding-horiz--sm,.padding-right--sm{padding-right:.5rem!important}.padding--sm{padding:.5rem!important}.padding-bottom--md,.padding-vert--md{padding-bottom:1rem!important}.padding-top--md,.padding-vert--md{padding-top:1rem!important}.padding-horiz--md,.padding-left--md{padding-left:1rem!important}.padding-horiz--md,.padding-right--md{padding-right:1rem!important}.padding--md{padding:1rem!important}.padding-bottom--lg,.padding-vert--lg{padding-bottom:2rem!important}.padding-top--lg,.padding-vert--lg{padding-top:2rem!important}.padding-horiz--lg,.padding-left--lg{padding-left:2rem!important}.padding-horiz--lg,.padding-right--lg{padding-right:2rem!important}.padding--lg{padding:2rem!important}.padding-bottom--xl,.padding-vert--xl{padding-bottom:5rem!important}.padding-top--xl,.padding-vert--xl{padding-top:5rem!important}.padding-horiz--xl,.padding-left--xl{padding-left:5rem!important}.padding-horiz--xl,.padding-right--xl{padding-right:5rem!important}.padding--xl{padding:5rem!important}code{background-color:var(--ifm-code-background);border:.1rem solid 
#0000001a;border-radius:var(--ifm-code-border-radius);font-family:var(--ifm-font-family-monospace);font-size:var(--ifm-code-font-size);padding:var(--ifm-code-padding-vertical) var(--ifm-code-padding-horizontal)}a code{color:inherit}pre{background-color:var(--ifm-pre-background);border-radius:var(--ifm-pre-border-radius);color:var(--ifm-pre-color);font:var(--ifm-code-font-size)/var(--ifm-pre-line-height) var(--ifm-font-family-monospace);padding:var(--ifm-pre-padding)}pre code{background-color:initial;border:none;font-size:100%;line-height:inherit;padding:0}kbd{background-color:var(--ifm-color-emphasis-0);border:1px solid var(--ifm-color-emphasis-400);border-radius:.2rem;box-shadow:inset 0 -1px 0 var(--ifm-color-emphasis-400);color:var(--ifm-color-emphasis-800);font:80% var(--ifm-font-family-monospace);padding:.15rem .3rem}h1,h2,h3,h4,h5,h6{color:var(--ifm-heading-color);font-family:var(--ifm-heading-font-family);font-weight:var(--ifm-heading-font-weight);line-height:var(--ifm-heading-line-height);margin:var(--ifm-heading-margin-top) 0 var(--ifm-heading-margin-bottom) 0}h1{font-size:var(--ifm-h1-font-size)}h2{font-size:var(--ifm-h2-font-size)}h3{font-size:var(--ifm-h3-font-size)}h4{font-size:var(--ifm-h4-font-size)}h5{font-size:var(--ifm-h5-font-size)}h6{font-size:var(--ifm-h6-font-size)}img{max-width:100%}img[align=right]{padding-left:var(--image-alignment-padding)}img[align=left]{padding-right:var(--image-alignment-padding)}.markdown{--ifm-h1-vertical-rhythm-top:3;--ifm-h2-vertical-rhythm-top:2;--ifm-h3-vertical-rhythm-top:1.5;--ifm-heading-vertical-rhythm-top:1.25;--ifm-h1-vertical-rhythm-bottom:1.25;--ifm-heading-vertical-rhythm-bottom:1}.markdown:after,.markdown:before{content:"";display:table}.markdown:after{clear:both}.markdown 
h1:first-child{--ifm-h1-font-size:3rem;margin-bottom:calc(var(--ifm-h1-vertical-rhythm-bottom)*var(--ifm-leading))}.markdown>h2{--ifm-h2-font-size:2rem;margin-top:calc(var(--ifm-h2-vertical-rhythm-top)*var(--ifm-leading))}.markdown>h3{--ifm-h3-font-size:1.5rem;margin-top:calc(var(--ifm-h3-vertical-rhythm-top)*var(--ifm-leading))}.markdown>h4,.markdown>h5,.markdown>h6{margin-top:calc(var(--ifm-heading-vertical-rhythm-top)*var(--ifm-leading))}.markdown>p,.markdown>pre,.markdown>ul{margin-bottom:var(--ifm-leading)}.markdown li>p{margin-top:var(--ifm-list-paragraph-margin)}.markdown li+li{margin-top:var(--ifm-list-item-margin)}ol,ul{margin:0 0 var(--ifm-list-margin);padding-left:var(--ifm-list-left-padding)}ol ol,ul ol{list-style-type:lower-roman}ol ol ol,ol ul ol,ul ol ol,ul ul ol{list-style-type:lower-alpha}table{border-collapse:collapse;display:block;margin-bottom:var(--ifm-spacing-vertical)}table thead tr{border-bottom:2px solid var(--ifm-table-border-color)}table thead,table tr:nth-child(2n){background-color:var(--ifm-table-stripe-background)}table tr{background-color:var(--ifm-table-background);border-top:var(--ifm-table-border-width) solid var(--ifm-table-border-color)}table td,table th{border:var(--ifm-table-border-width) solid var(--ifm-table-border-color);padding:var(--ifm-table-cell-padding)}table th{background-color:var(--ifm-table-head-background);color:var(--ifm-table-head-color);font-weight:var(--ifm-table-head-font-weight)}table td{color:var(--ifm-table-cell-color)}strong{font-weight:var(--ifm-font-weight-bold)}a{color:var(--ifm-link-color);text-decoration:var(--ifm-link-decoration)}a:hover{color:var(--ifm-link-hover-color);text-decoration:var(--ifm-link-hover-decoration)}.button:hover,.text--no-decoration,.text--no-decoration:hover,a:not([href]){-webkit-text-decoration:none;text-decoration:none}p{margin:0 0 var(--ifm-paragraph-margin-bottom)}blockquote{border-left:var(--ifm-blockquote-border-left-width) solid 
var(--ifm-blockquote-border-color);box-shadow:var(--ifm-blockquote-shadow);color:var(--ifm-blockquote-color);font-size:var(--ifm-blockquote-font-size);padding:var(--ifm-blockquote-padding-vertical) var(--ifm-blockquote-padding-horizontal)}blockquote>:first-child{margin-top:0}blockquote>:last-child{margin-bottom:0}hr{background-color:var(--ifm-hr-background-color);border:0;height:var(--ifm-hr-height);margin:var(--ifm-hr-margin-vertical) 0}.shadow--lw{box-shadow:var(--ifm-global-shadow-lw)!important}.shadow--md{box-shadow:var(--ifm-global-shadow-md)!important}.shadow--tl{box-shadow:var(--ifm-global-shadow-tl)!important}.text--primary{color:var(--ifm-color-primary)}.text--secondary{color:var(--ifm-color-secondary)}.text--success{color:var(--ifm-color-success)}.text--info{color:var(--ifm-color-info)}.text--warning{color:var(--ifm-color-warning)}.text--danger{color:var(--ifm-color-danger)}.text--center{text-align:center}.text--left{text-align:left}.text--justify{text-align:justify}.text--right{text-align:right}.text--capitalize{text-transform:capitalize}.text--lowercase{text-transform:lowercase}.alert__heading,.text--uppercase{text-transform:uppercase}.text--light{font-weight:var(--ifm-font-weight-light)}.text--normal{font-weight:var(--ifm-font-weight-normal)}.text--semibold{font-weight:var(--ifm-font-weight-semibold)}.text--bold{font-weight:var(--ifm-font-weight-bold)}.text--italic{font-style:italic}.text--truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.text--break{word-wrap:break-word!important;word-break:break-word!important}.clean-btn{background:none;border:none;color:inherit;cursor:pointer;font-family:inherit;padding:0}.alert,.alert 
.close{color:var(--ifm-alert-foreground-color)}.clean-list{list-style:none;padding-left:0}.alert--primary{--ifm-alert-background-color:var(--ifm-color-primary-contrast-background);--ifm-alert-background-color-highlight:#3578e526;--ifm-alert-foreground-color:var(--ifm-color-primary-contrast-foreground);--ifm-alert-border-color:var(--ifm-color-primary-dark)}.alert--secondary{--ifm-alert-background-color:var(--ifm-color-secondary-contrast-background);--ifm-alert-background-color-highlight:#ebedf026;--ifm-alert-foreground-color:var(--ifm-color-secondary-contrast-foreground);--ifm-alert-border-color:var(--ifm-color-secondary-dark)}.alert--success{--ifm-alert-background-color:var(--ifm-color-success-contrast-background);--ifm-alert-background-color-highlight:#00a40026;--ifm-alert-foreground-color:var(--ifm-color-success-contrast-foreground);--ifm-alert-border-color:var(--ifm-color-success-dark)}.alert--info{--ifm-alert-background-color:var(--ifm-color-info-contrast-background);--ifm-alert-background-color-highlight:#54c7ec26;--ifm-alert-foreground-color:var(--ifm-color-info-contrast-foreground);--ifm-alert-border-color:var(--ifm-color-info-dark)}.alert--warning{--ifm-alert-background-color:var(--ifm-color-warning-contrast-background);--ifm-alert-background-color-highlight:#ffba0026;--ifm-alert-foreground-color:var(--ifm-color-warning-contrast-foreground);--ifm-alert-border-color:var(--ifm-color-warning-dark)}.alert--danger{--ifm-alert-background-color:var(--ifm-color-danger-contrast-background);--ifm-alert-background-color-highlight:#fa383e26;--ifm-alert-foreground-color:var(--ifm-color-danger-contrast-foreground);--ifm-alert-border-color:var(--ifm-color-danger-dark)}.alert{--ifm-code-background:var(--ifm-alert-background-color-highlight);--ifm-link-color:var(--ifm-alert-foreground-color);--ifm-link-hover-color:var(--ifm-alert-foreground-color);--ifm-link-decoration:underline;--ifm-tabs-color:var(--ifm-alert-foreground-color);--ifm-tabs-color-active:var(--ifm-alert-foregr
ound-color);--ifm-tabs-color-active-border:var(--ifm-alert-border-color);background-color:var(--ifm-alert-background-color);border:var(--ifm-alert-border-width) solid var(--ifm-alert-border-color);border-left-width:var(--ifm-alert-border-left-width);border-radius:var(--ifm-alert-border-radius);box-shadow:var(--ifm-alert-shadow);padding:var(--ifm-alert-padding-vertical) var(--ifm-alert-padding-horizontal)}.alert__heading{align-items:center;display:flex;font:700 var(--ifm-h5-font-size)/var(--ifm-heading-line-height) var(--ifm-heading-font-family);margin-bottom:.5rem}.alert__icon{display:inline-flex;margin-right:.4em}.alert__icon svg{fill:var(--ifm-alert-foreground-color);stroke:var(--ifm-alert-foreground-color);stroke-width:0}.alert .close{margin:calc(var(--ifm-alert-padding-vertical)*-1) calc(var(--ifm-alert-padding-horizontal)*-1) 0 0;opacity:.75}.alert .close:focus,.alert .close:hover{opacity:1}.alert a{text-decoration-color:var(--ifm-alert-border-color)}.alert a:hover{text-decoration-thickness:2px}.avatar{column-gap:var(--ifm-avatar-intro-margin);display:flex}.avatar__photo{border-radius:50%;display:block;height:var(--ifm-avatar-photo-size);overflow:hidden;width:var(--ifm-avatar-photo-size)}.card--full-height,.navbar__logo img{height:100%}.avatar__photo--sm{--ifm-avatar-photo-size:2rem}.avatar__photo--lg{--ifm-avatar-photo-size:4rem}.avatar__photo--xl{--ifm-avatar-photo-size:6rem}.avatar__intro{display:flex;flex:1 1;flex-direction:column;justify-content:center;text-align:var(--ifm-avatar-intro-alignment)}.badge,.breadcrumbs__item,.breadcrumbs__link,.button,.dropdown>.navbar__link:after{display:inline-block}.avatar__name{font:700 var(--ifm-h4-font-size)/var(--ifm-heading-line-height) 
var(--ifm-font-family-base)}.avatar__subtitle{margin-top:.25rem}.avatar--vertical{--ifm-avatar-intro-alignment:center;--ifm-avatar-intro-margin:0.5rem;align-items:center;flex-direction:column}.badge{background-color:var(--ifm-badge-background-color);border:var(--ifm-badge-border-width) solid var(--ifm-badge-border-color);border-radius:var(--ifm-badge-border-radius);color:var(--ifm-badge-color);font-size:75%;font-weight:var(--ifm-font-weight-bold);line-height:1;padding:var(--ifm-badge-padding-vertical) var(--ifm-badge-padding-horizontal)}.badge--primary{--ifm-badge-background-color:var(--ifm-color-primary)}.badge--secondary{--ifm-badge-background-color:var(--ifm-color-secondary);color:var(--ifm-color-black)}.breadcrumbs__link,.button.button--secondary.button--outline:not(.button--active):not(:hover){color:var(--ifm-font-color-base)}.badge--success{--ifm-badge-background-color:var(--ifm-color-success)}.badge--info{--ifm-badge-background-color:var(--ifm-color-info)}.badge--warning{--ifm-badge-background-color:var(--ifm-color-warning)}.badge--danger{--ifm-badge-background-color:var(--ifm-color-danger)}.breadcrumbs{margin-bottom:0;padding-left:0}.breadcrumbs__item:not(:last-child):after{background:var(--ifm-breadcrumb-separator) center;content:" ";display:inline-block;filter:var(--ifm-breadcrumb-separator-filter);height:calc(var(--ifm-breadcrumb-separator-size)*var(--ifm-breadcrumb-size-multiplier)*var(--ifm-breadcrumb-separator-size-multiplier));margin:0 var(--ifm-breadcrumb-spacing);opacity:.5;width:calc(var(--ifm-breadcrumb-separator-size)*var(--ifm-breadcrumb-size-multiplier)*var(--ifm-breadcrumb-separator-size-multiplier))}.breadcrumbs__item--active 
.breadcrumbs__link{background:var(--ifm-breadcrumb-item-background-active);color:var(--ifm-breadcrumb-color-active)}.breadcrumbs__link{border-radius:var(--ifm-breadcrumb-border-radius);font-size:calc(1rem*var(--ifm-breadcrumb-size-multiplier));padding:calc(var(--ifm-breadcrumb-padding-vertical)*var(--ifm-breadcrumb-size-multiplier)) calc(var(--ifm-breadcrumb-padding-horizontal)*var(--ifm-breadcrumb-size-multiplier));transition-duration:var(--ifm-transition-fast);transition-property:background,color}.breadcrumbs__link:any-link:hover,.breadcrumbs__link:link:hover,.breadcrumbs__link:visited:hover,area[href].breadcrumbs__link:hover{background:var(--ifm-breadcrumb-item-background-active);-webkit-text-decoration:none;text-decoration:none}.breadcrumbs--sm{--ifm-breadcrumb-size-multiplier:0.8}.breadcrumbs--lg{--ifm-breadcrumb-size-multiplier:1.2}.button{background-color:var(--ifm-button-background-color);border:var(--ifm-button-border-width) solid var(--ifm-button-border-color);border-radius:var(--ifm-button-border-radius);cursor:pointer;font-size:calc(.875rem*var(--ifm-button-size-multiplier));font-weight:var(--ifm-button-font-weight);line-height:1.5;padding:calc(var(--ifm-button-padding-vertical)*var(--ifm-button-size-multiplier)) 
calc(var(--ifm-button-padding-horizontal)*var(--ifm-button-size-multiplier));text-align:center;transition-duration:var(--ifm-button-transition-duration);transition-property:color,background,border-color;-webkit-user-select:none;user-select:none;white-space:nowrap}.button,.button:hover{color:var(--ifm-button-color)}.button--outline{--ifm-button-color:var(--ifm-button-border-color)}.button--outline:hover{--ifm-button-background-color:var(--ifm-button-border-color)}.button--link{--ifm-button-border-color:#0000;color:var(--ifm-link-color);text-decoration:var(--ifm-link-decoration)}.button--link.button--active,.button--link:active,.button--link:hover{color:var(--ifm-link-hover-color);text-decoration:var(--ifm-link-hover-decoration)}.dropdown__link--active,.dropdown__link:hover,.menu__link:hover,.navbar__brand:hover,.navbar__link--active,.navbar__link:hover,.pagination-nav__link:hover,.pagination__link:hover{-webkit-text-decoration:none;text-decoration:none}.button.disabled,.button:disabled,.button[disabled]{opacity:.65;pointer-events:none}.button--sm{--ifm-button-size-multiplier:0.8}.button--lg{--ifm-button-size-multiplier:1.35}.button--block{display:block;width:100%}.button.button--secondary{color:var(--ifm-color-gray-900)}:where(.button--primary){--ifm-button-background-color:var(--ifm-color-primary);--ifm-button-border-color:var(--ifm-color-primary)}:where(.button--primary):not(.button--outline):hover{--ifm-button-background-color:var(--ifm-color-primary-dark);--ifm-button-border-color:var(--ifm-color-primary-dark)}.button--primary.button--active,.button--primary:active{--ifm-button-background-color:var(--ifm-color-primary-darker);--ifm-button-border-color:var(--ifm-color-primary-darker)}:where(.button--secondary){--ifm-button-background-color:var(--ifm-color-secondary);--ifm-button-border-color:var(--ifm-color-secondary)}:where(.button--secondary):not(.button--outline):hover{--ifm-button-background-color:var(--ifm-color-secondary-dark);--ifm-button-border-color:var(-
-ifm-color-secondary-dark)}.button--secondary.button--active,.button--secondary:active{--ifm-button-background-color:var(--ifm-color-secondary-darker);--ifm-button-border-color:var(--ifm-color-secondary-darker)}:where(.button--success){--ifm-button-background-color:var(--ifm-color-success);--ifm-button-border-color:var(--ifm-color-success)}:where(.button--success):not(.button--outline):hover{--ifm-button-background-color:var(--ifm-color-success-dark);--ifm-button-border-color:var(--ifm-color-success-dark)}.button--success.button--active,.button--success:active{--ifm-button-background-color:var(--ifm-color-success-darker);--ifm-button-border-color:var(--ifm-color-success-darker)}:where(.button--info){--ifm-button-background-color:var(--ifm-color-info);--ifm-button-border-color:var(--ifm-color-info)}:where(.button--info):not(.button--outline):hover{--ifm-button-background-color:var(--ifm-color-info-dark);--ifm-button-border-color:var(--ifm-color-info-dark)}.button--info.button--active,.button--info:active{--ifm-button-background-color:var(--ifm-color-info-darker);--ifm-button-border-color:var(--ifm-color-info-darker)}:where(.button--warning){--ifm-button-background-color:var(--ifm-color-warning);--ifm-button-border-color:var(--ifm-color-warning)}:where(.button--warning):not(.button--outline):hover{--ifm-button-background-color:var(--ifm-color-warning-dark);--ifm-button-border-color:var(--ifm-color-warning-dark)}.button--warning.button--active,.button--warning:active{--ifm-button-background-color:var(--ifm-color-warning-darker);--ifm-button-border-color:var(--ifm-color-warning-darker)}:where(.button--danger){--ifm-button-background-color:var(--ifm-color-danger);--ifm-button-border-color:var(--ifm-color-danger)}:where(.button--danger):not(.button--outline):hover{--ifm-button-background-color:var(--ifm-color-danger-dark);--ifm-button-border-color:var(--ifm-color-danger-dark)}.button--danger.button--active,.button--danger:active{--ifm-button-background-color:var(--ifm-col
or-danger-darker);--ifm-button-border-color:var(--ifm-color-danger-darker)}.button-group{display:inline-flex;gap:var(--ifm-button-group-spacing)}.button-group>.button:not(:first-child){border-bottom-left-radius:0;border-top-left-radius:0}.button-group>.button:not(:last-child){border-bottom-right-radius:0;border-top-right-radius:0}.button-group--block{display:flex;justify-content:stretch}.button-group--block>.button{flex-grow:1}.card{background-color:var(--ifm-card-background-color);border-radius:var(--ifm-card-border-radius);box-shadow:var(--ifm-global-shadow-lw);display:flex;flex-direction:column;overflow:hidden}.card__image{padding-top:var(--ifm-card-vertical-spacing)}.card__image:first-child{padding-top:0}.card__body,.card__footer,.card__header{padding:var(--ifm-card-vertical-spacing) var(--ifm-card-horizontal-spacing)}.card__body:not(:last-child),.card__footer:not(:last-child),.card__header:not(:last-child){padding-bottom:0}.card__body>:last-child,.card__footer>:last-child,.card__header>:last-child{margin-bottom:0}.card__footer{margin-top:auto}.table-of-contents{font-size:.8rem;margin-bottom:0;padding:var(--ifm-toc-padding-vertical) 0}.table-of-contents,.table-of-contents ul{list-style:none;padding-left:var(--ifm-toc-padding-horizontal)}.table-of-contents li{margin:var(--ifm-toc-padding-vertical) var(--ifm-toc-padding-horizontal)}.table-of-contents__left-border{border-left:1px solid var(--ifm-toc-border-color)}.table-of-contents__link{color:var(--ifm-toc-link-color);display:block}.table-of-contents__link--active,.table-of-contents__link--active code,.table-of-contents__link:hover,.table-of-contents__link:hover code{color:var(--ifm-color-primary);-webkit-text-decoration:none;text-decoration:none}.close{color:var(--ifm-color-black);float:right;font-size:1.5rem;font-weight:var(--ifm-font-weight-bold);line-height:1;opacity:.5;padding:1rem;transition:opacity var(--ifm-transition-fast) 
var(--ifm-transition-timing-default)}.close:hover{opacity:.7}.close:focus{opacity:.8}.dropdown{display:inline-flex;font-weight:var(--ifm-dropdown-font-weight);position:relative;vertical-align:top}.dropdown--hoverable:hover .dropdown__menu,.dropdown--show .dropdown__menu{opacity:1;pointer-events:all;transform:translateY(-1px);visibility:visible}.dropdown__menu,.navbar__item.dropdown .navbar__link:not([href]){pointer-events:none}.dropdown--right .dropdown__menu{left:inherit;right:0}.dropdown--nocaret .navbar__link:after{content:none!important}.dropdown__menu{background-color:var(--ifm-dropdown-background-color);border-radius:var(--ifm-global-radius);box-shadow:var(--ifm-global-shadow-md);left:0;list-style:none;max-height:80vh;min-width:10rem;opacity:0;overflow-y:auto;padding:.5rem;position:absolute;top:calc(100% - var(--ifm-navbar-item-padding-vertical) + .3rem);transform:translateY(-.625rem);transition-duration:var(--ifm-transition-fast);transition-property:opacity,transform,visibility;transition-timing-function:var(--ifm-transition-timing-default);visibility:hidden;z-index:var(--ifm-z-index-dropdown)}.menu__caret,.menu__link,.menu__list-item-collapsible{border-radius:.25rem;transition:background var(--ifm-transition-fast) var(--ifm-transition-timing-default)}.dropdown__link{border-radius:.25rem;color:var(--ifm-dropdown-link-color);display:block;font-size:.875rem;margin-top:.2rem;padding:.25rem .5rem;white-space:nowrap}.dropdown__link--active,.dropdown__link:hover{background-color:var(--ifm-dropdown-hover-background-color);color:var(--ifm-dropdown-link-color)}.dropdown__link--active,.dropdown__link--active:hover{--ifm-dropdown-link-color:var(--ifm-link-color)}.dropdown>.navbar__link:after{border-color:currentcolor #0000;border-style:solid;border-width:.4em .4em 
0;content:"";margin-left:.3em;position:relative;top:2px;transform:translateY(-50%)}.footer{background-color:var(--ifm-footer-background-color);color:var(--ifm-footer-color);padding:var(--ifm-footer-padding-vertical) var(--ifm-footer-padding-horizontal)}.footer--dark{--ifm-footer-background-color:#303846;--ifm-footer-color:var(--ifm-footer-link-color);--ifm-footer-link-color:var(--ifm-color-secondary);--ifm-footer-title-color:var(--ifm-color-white)}.footer__links{margin-bottom:1rem}.footer__link-item{color:var(--ifm-footer-link-color);line-height:2}.footer__link-item:hover{color:var(--ifm-footer-link-hover-color)}.footer__link-separator{margin:0 var(--ifm-footer-link-horizontal-spacing)}.footer__logo{margin-top:1rem;max-width:var(--ifm-footer-logo-max-width)}.footer__title{color:var(--ifm-footer-title-color);font:700 var(--ifm-h4-font-size)/var(--ifm-heading-line-height) var(--ifm-font-family-base);margin-bottom:var(--ifm-heading-margin-bottom)}.menu,.navbar__link{font-weight:var(--ifm-font-weight-semibold)}.footer__item{margin-top:0}.footer__items{margin-bottom:0}[type=checkbox]{padding:0}.hero{align-items:center;background-color:var(--ifm-hero-background-color);color:var(--ifm-hero-text-color);display:flex;padding:4rem 2rem}.hero--primary{--ifm-hero-background-color:var(--ifm-color-primary);--ifm-hero-text-color:var(--ifm-font-color-base-inverse)}.hero--dark{--ifm-hero-background-color:#303846;--ifm-hero-text-color:var(--ifm-color-white)}.hero__title{font-size:3rem}.hero__subtitle{font-size:1.5rem}.menu__list{list-style:none;margin:0;padding-left:0}.menu__caret,.menu__link{padding:var(--ifm-menu-link-padding-vertical) var(--ifm-menu-link-padding-horizontal)}.menu__list .menu__list{flex:0 0 100%;margin-top:.25rem;padding-left:var(--ifm-menu-link-padding-horizontal)}.menu__list-item:not(:first-child){margin-top:.25rem}.menu__list-item--collapsed .menu__list{height:0;overflow:hidden}.menu__list-item--collapsed .menu__caret:before,.menu__list-item--collapsed 
.menu__link--sublist:after{transform:rotate(90deg)}.menu__list-item-collapsible{display:flex;flex-wrap:wrap;position:relative}.menu__caret:hover,.menu__link:hover,.menu__list-item-collapsible--active,.menu__list-item-collapsible:hover{background:var(--ifm-menu-color-background-hover)}.menu__list-item-collapsible .menu__link--active,.menu__list-item-collapsible .menu__link:hover{background:none!important}.menu__caret,.menu__link{align-items:center;display:flex}.menu__link{color:var(--ifm-menu-color);flex:1;line-height:1.25}.menu__link:hover{color:var(--ifm-menu-color)}.menu__caret:before,.menu__link--sublist-caret:after{content:"";filter:var(--ifm-menu-link-sublist-icon-filter);height:1.25rem;transform:rotate(180deg);transition:transform var(--ifm-transition-fast) linear;width:1.25rem}.menu__link--sublist-caret:after{background:var(--ifm-menu-link-sublist-icon) 50%/2rem 2rem;margin-left:auto;min-width:1.25rem}.menu__link--active,.menu__link--active:hover{color:var(--ifm-menu-color-active)}.navbar__brand,.navbar__link{color:var(--ifm-navbar-link-color)}.menu__link--active:not(.menu__link--sublist){background-color:var(--ifm-menu-color-background-active)}.menu__caret:before{background:var(--ifm-menu-link-sublist-icon) 50%/2rem 2rem}.navbar--dark,html[data-theme=dark]{--ifm-menu-link-sublist-icon-filter:invert(100%) sepia(94%) saturate(17%) hue-rotate(223deg) brightness(104%) contrast(98%)}.navbar{background-color:var(--ifm-navbar-background-color);box-shadow:var(--ifm-navbar-shadow);height:var(--ifm-navbar-height);padding:var(--ifm-navbar-padding-vertical) 
var(--ifm-navbar-padding-horizontal)}.navbar,.navbar>.container,.navbar>.container-fluid{display:flex}.navbar--fixed-top{position:sticky;top:0;z-index:var(--ifm-z-index-fixed)}.navbar-sidebar,.navbar-sidebar__backdrop{bottom:0;left:0;opacity:0;position:fixed;top:0;transition-duration:var(--ifm-transition-fast);transition-timing-function:ease-in-out;visibility:hidden}.navbar__inner{display:flex;flex-wrap:wrap;justify-content:space-between;width:100%}.navbar__brand{align-items:center;display:flex;margin-right:1rem;min-width:0}.navbar__brand:hover{color:var(--ifm-navbar-link-hover-color)}.navbar__title{flex:1 1 auto}.navbar__toggle{display:none;margin-right:.5rem}.navbar__logo{flex:0 0 auto;height:2rem;margin-right:.5rem}.navbar__items{align-items:center;display:flex;flex:1;min-width:0}.navbar__items--center{flex:0 0 auto}.navbar__items--center .navbar__brand{margin:0}.navbar__items--center+.navbar__items--right{flex:1}.navbar__items--right{flex:0 0 auto;justify-content:flex-end}.navbar__items--right>:last-child{padding-right:0}.navbar__item{display:inline-block;padding:var(--ifm-navbar-item-padding-vertical) var(--ifm-navbar-item-padding-horizontal)}.navbar__link--active,.navbar__link:hover{color:var(--ifm-navbar-link-hover-color)}.navbar--dark,.navbar--primary{--ifm-menu-color:var(--ifm-color-gray-300);--ifm-navbar-link-color:var(--ifm-color-gray-100);--ifm-navbar-search-input-background-color:#ffffff1a;--ifm-navbar-search-input-placeholder-color:#ffffff80;color:var(--ifm-color-white)}.navbar--dark{--ifm-navbar-background-color:#242526;--ifm-menu-color-background-active:#ffffff0d;--ifm-navbar-search-input-color:var(--ifm-color-white)}.navbar--primary{--ifm-navbar-background-color:var(--ifm-color-primary);--ifm-navbar-link-hover-color:var(--ifm-color-white);--ifm-menu-color-active:var(--ifm-color-white);--ifm-navbar-search-input-color:var(--ifm-color-emphasis-500)}.navbar__search-input{appearance:none;background:var(--ifm-navbar-search-input-background-color) 
var(--ifm-navbar-search-input-icon) no-repeat .75rem center/1rem 1rem;border:none;border-radius:2rem;color:var(--ifm-navbar-search-input-color);cursor:text;display:inline-block;font-size:1rem;height:2rem;padding:0 .5rem 0 2.25rem;width:12.5rem}.navbar__search-input::placeholder{color:var(--ifm-navbar-search-input-placeholder-color)}.navbar-sidebar{background-color:var(--ifm-navbar-background-color);box-shadow:var(--ifm-global-shadow-md);transform:translate3d(-100%,0,0);transition-property:opacity,visibility,transform;width:var(--ifm-navbar-sidebar-width)}.navbar-sidebar--show .navbar-sidebar,.navbar-sidebar__items{transform:translateZ(0)}.navbar-sidebar--show .navbar-sidebar,.navbar-sidebar--show .navbar-sidebar__backdrop{opacity:1;visibility:visible}.navbar-sidebar__backdrop{background-color:#0009;right:0;transition-property:opacity,visibility}.navbar-sidebar__brand{align-items:center;box-shadow:var(--ifm-navbar-shadow);display:flex;flex:1;height:var(--ifm-navbar-height);padding:var(--ifm-navbar-padding-vertical) var(--ifm-navbar-padding-horizontal)}.navbar-sidebar__items{display:flex;height:calc(100% - var(--ifm-navbar-height));transition:transform var(--ifm-transition-fast) ease-in-out}.navbar-sidebar__items--show-secondary{transform:translate3d(calc((var(--ifm-navbar-sidebar-width))*-1),0,0)}.navbar-sidebar__item{flex-shrink:0;padding:.5rem;width:calc(var(--ifm-navbar-sidebar-width))}.navbar-sidebar__back{background:var(--ifm-menu-color-background-active);font-size:15px;font-weight:var(--ifm-button-font-weight);margin:0 0 .2rem -.5rem;padding:.6rem 1.5rem;position:relative;text-align:left;top:-.5rem;width:calc(100% + 
1rem)}.navbar-sidebar__close{display:flex;margin-left:auto}.pagination{column-gap:var(--ifm-pagination-page-spacing);display:flex;font-size:var(--ifm-pagination-font-size);padding-left:0}.pagination--sm{--ifm-pagination-font-size:0.8rem;--ifm-pagination-padding-horizontal:0.8rem;--ifm-pagination-padding-vertical:0.2rem}.pagination--lg{--ifm-pagination-font-size:1.2rem;--ifm-pagination-padding-horizontal:1.2rem;--ifm-pagination-padding-vertical:0.3rem}.pagination__item{display:inline-flex}.pagination__item>span{padding:var(--ifm-pagination-padding-vertical)}.pagination__item--active .pagination__link{color:var(--ifm-pagination-color-active)}.pagination__item--active .pagination__link,.pagination__item:not(.pagination__item--active):hover .pagination__link{background:var(--ifm-pagination-item-active-background)}.pagination__item--disabled,.pagination__item[disabled]{opacity:.25;pointer-events:none}.pagination__link{border-radius:var(--ifm-pagination-border-radius);color:var(--ifm-font-color-base);display:inline-block;padding:var(--ifm-pagination-padding-vertical) var(--ifm-pagination-padding-horizontal);transition:background var(--ifm-transition-fast) var(--ifm-transition-timing-default)}.pagination-nav{display:grid;grid-gap:var(--ifm-spacing-horizontal);gap:var(--ifm-spacing-horizontal);grid-template-columns:repeat(2,1fr)}.pagination-nav__link{border:1px solid var(--ifm-color-emphasis-300);border-radius:var(--ifm-pagination-nav-border-radius);display:block;height:100%;line-height:var(--ifm-heading-line-height);padding:var(--ifm-global-spacing);transition:border-color var(--ifm-transition-fast) var(--ifm-transition-timing-default)}.pagination-nav__link:hover{border-color:var(--ifm-pagination-nav-color-hover)}.pagination-nav__link--next{grid-column:2/3;text-align:right}.pagination-nav__label{font-size:var(--ifm-h4-font-size);font-weight:var(--ifm-heading-font-weight);word-break:break-word}.pagination-nav__link--prev .pagination-nav__label:before{content:"« 
"}.pagination-nav__link--next .pagination-nav__label:after{content:" »"}.pagination-nav__sublabel{color:var(--ifm-color-content-secondary);font-size:var(--ifm-h5-font-size);font-weight:var(--ifm-font-weight-semibold);margin-bottom:.25rem}.pills__item,.tabs{font-weight:var(--ifm-font-weight-bold)}.pills{display:flex;gap:var(--ifm-pills-spacing);padding-left:0}.pills__item{border-radius:.5rem;cursor:pointer;display:inline-block;padding:.25rem 1rem;transition:background var(--ifm-transition-fast) var(--ifm-transition-timing-default)}.pills__item--active{color:var(--ifm-pills-color-active)}.pills__item--active,.pills__item:not(.pills__item--active):hover{background:var(--ifm-pills-color-background-active)}.pills--block{justify-content:stretch}.pills--block .pills__item{flex-grow:1;text-align:center}.tabs{color:var(--ifm-tabs-color);display:flex;margin-bottom:0;overflow-x:auto;padding-left:0}.tabs__item{border-bottom:3px solid #0000;border-radius:var(--ifm-global-radius);cursor:pointer;display:inline-flex;padding:var(--ifm-tabs-padding-vertical) var(--ifm-tabs-padding-horizontal);transition:background-color var(--ifm-transition-fast) var(--ifm-transition-timing-default)}.tabs__item--active{border-bottom-color:var(--ifm-tabs-color-active-border);border-bottom-left-radius:0;border-bottom-right-radius:0;color:var(--ifm-tabs-color-active)}.tabs__item:hover{background-color:var(--ifm-hover-overlay)}.tabs--block{justify-content:stretch}.tabs--block 
.tabs__item{flex-grow:1;justify-content:center}html[data-theme=dark]{--ifm-color-scheme:dark;--ifm-color-emphasis-0:var(--ifm-color-gray-1000);--ifm-color-emphasis-100:var(--ifm-color-gray-900);--ifm-color-emphasis-200:var(--ifm-color-gray-800);--ifm-color-emphasis-300:var(--ifm-color-gray-700);--ifm-color-emphasis-400:var(--ifm-color-gray-600);--ifm-color-emphasis-600:var(--ifm-color-gray-400);--ifm-color-emphasis-700:var(--ifm-color-gray-300);--ifm-color-emphasis-800:var(--ifm-color-gray-200);--ifm-color-emphasis-900:var(--ifm-color-gray-100);--ifm-color-emphasis-1000:var(--ifm-color-gray-0);--ifm-background-color:#1b1b1d;--ifm-background-surface-color:#242526;--ifm-hover-overlay:#ffffff0d;--ifm-color-content:#e3e3e3;--ifm-color-content-secondary:#fff;--ifm-breadcrumb-separator-filter:invert(64%) sepia(11%) saturate(0%) hue-rotate(149deg) brightness(99%) contrast(95%);--ifm-code-background:#ffffff1a;--ifm-scrollbar-track-background-color:#444;--ifm-scrollbar-thumb-background-color:#686868;--ifm-scrollbar-thumb-hover-background-color:#7a7a7a;--ifm-table-stripe-background:#ffffff12;--ifm-toc-border-color:var(--ifm-color-emphasis-200);--ifm-color-primary-contrast-background:#102445;--ifm-color-primary-contrast-foreground:#ebf2fc;--ifm-color-secondary-contrast-background:#474748;--ifm-color-secondary-contrast-foreground:#fdfdfe;--ifm-color-success-contrast-background:#003100;--ifm-color-success-contrast-foreground:#e6f6e6;--ifm-color-info-contrast-background:#193c47;--ifm-color-info-contrast-foreground:#eef9fd;--ifm-color-warning-contrast-background:#4d3800;--ifm-color-warning-contrast-foreground:#fff8e6;--ifm-color-danger-contrast-background:#4b1113;--ifm-color-danger-contrast-foreground:#ffebec}}.bharatml-hero .bharatml-button:hover,.bharatml-hero .button--outline:hover,[data-theme=dark] .bharatml-hero .bharatml-button:hover,[data-theme=dark] .bharatml-hero 
.button--outline:hover{background-color:#fff!important;border-color:#fff!important;color:var(--bharatml-primary)!important}.bharatml-hero .bharatml-button,.bharatml-hero .button--outline{border:2px solid #fff!important;color:#fff!important;transition:.3s}:root{--ifm-color-primary:#450839;--ifm-color-primary-dark:#3d0732;--ifm-color-primary-darker:#39062f;--ifm-color-primary-darkest:#2f0527;--ifm-color-primary-light:#4d0940;--ifm-color-primary-lighter:#510a43;--ifm-color-primary-lightest:#5d0c4d;--ifm-code-font-size:95%;--docusaurus-highlighted-code-line-bg:#0000001a;--bharatml-primary:#450839;--bharatml-primary-hover:#6a0c59;--bharatml-secondary:#f9f9f9;--bharatml-text:#1c1e21;--bharatml-text-light:#606770}[data-theme=dark]{--ifm-color-primary:#8b4582;--ifm-color-primary-dark:#7d3f75;--ifm-color-primary-darker:#763c6e;--ifm-color-primary-darkest:#62315a;--ifm-color-primary-light:#994b8f;--ifm-color-primary-lighter:#a04e96;--ifm-color-primary-lightest:#b657a9;--docusaurus-highlighted-code-line-bg:#0000004d;--bharatml-primary:#8b4582;--bharatml-primary-hover:#a04e96;--bharatml-secondary:#1e1e1e;--bharatml-text:#e3e3e3;--bharatml-text-light:#b4b4b4}.bharatml-hero{background:linear-gradient(135deg,var(--bharatml-primary) 0,var(--bharatml-primary-hover) 100%);color:#fff}.bharatml-hero .bharatml-button{background-color:var(--bharatml-primary)}.bharatml-hero .button--outline{background-color:initial!important}[data-theme=dark] .bharatml-hero .bharatml-button{background-color:var(--bharatml-primary);border:2px solid #fff!important;color:#fff!important}[data-theme=dark] .bharatml-hero .button--outline{background-color:initial!important;border:2px solid #fff!important;color:#fff!important}.bharatml-button{background-color:var(--bharatml-primary);border-color:var(--bharatml-primary);transition:.3s}.bharatml-button:hover{background-color:var(--bharatml-primary-hover);border-color:var(--bharatml-primary-hover);color:#fff}.bharatml-card{background:#fff;border:1px solid 
#4508391a;border-radius:8px;padding:2rem;transition:.3s}.bharatml-card:hover{border-color:var(--bharatml-primary);box-shadow:0 4px 20px #4508391a;transform:translateY(-2px)}.bharatml-icon{align-items:center;background:linear-gradient(135deg,var(--bharatml-primary),var(--bharatml-primary-hover));border-radius:12px;color:#fff;display:flex;font-size:1.5rem;height:64px;justify-content:center;margin:0 auto 1rem;width:64px}@layer docusaurus.core{#__docusaurus-base-url-issue-banner-container{display:none}}.aboutSection_udvw,.features_t9lD{background-color:var(--ifm-background-surface-color)}.featuresHeader_qR2i,.features_t9lD h3{color:var(--bharatml-primary);margin-bottom:1rem}.features_t9lD{display:block;padding:4rem 0;text-align:center;width:100%}.featureSvg_GfXr{height:200px;width:200px}.featuresHeader_qR2i{font-size:2.5rem;font-weight:700;text-align:center}.featuresSubtitle_VdGe{color:var(--ifm-font-color-base);font-size:1.2rem;opacity:1;text-align:center}.features_t9lD .bharatml-card_xZ6l{height:100%;margin-top:1rem}.features_t9lD .bharatml-icon_XBoJ{margin-bottom:1.5rem}.features_t9lD h3{font-size:1.25rem;font-weight:600}.features_t9lD p{color:var(--ifm-font-color-base)!important;font-size:.95rem;font-weight:400;line-height:1.6;margin:0}.featureDescription_sP1D{color:#1c1e21!important;font-size:.95rem!important;font-weight:400!important;line-height:1.6!important;margin:0!important}[data-theme=dark] .bharatml-card_xZ6l{background:#2a2a2a!important;border-color:#8b45824d;color:#fff}[data-theme=dark] .bharatml-card_xZ6l:hover{background:#333!important;border-color:var(--bharatml-primary);box-shadow:0 4px 20px #8b45824d}[data-theme=dark] .featureDescription_sP1D,[data-theme=dark] .featuresHeader_qR2i,[data-theme=dark] .features_t9lD h3,[data-theme=dark] .features_t9lD p{color:#a04e96!important}[data-theme=dark] .featuresSubtitle_VdGe{color:#e0e0e0!important}.heroBanner_qdFl{overflow:hidden;padding:4rem 
0;position:relative;text-align:center}.logoContainer_xdaK{align-items:center;display:flex;justify-content:center;margin-bottom:2rem}.heroLogo_U6bI{filter:drop-shadow(0 4px 8px rgba(0,0,0,.1));height:180px;transition:transform .3s;width:180px}.heroLogo_U6bI:hover{transform:scale(1.05)}.buttons_AeoN{align-items:center;gap:1rem;margin-bottom:2rem}.buttons_AeoN,.statsContainer_KpvY{display:flex;justify-content:center}.statsContainer_KpvY{gap:3rem;margin-top:2rem;opacity:.9}.statItem_bwiZ{align-items:center;color:#fff;display:flex;flex-direction:column;text-align:center}.statItem_bwiZ strong{display:block;font-size:1.5rem;font-weight:700;margin-bottom:.25rem}.statItem_bwiZ span{font-size:.875rem;letter-spacing:.5px;opacity:.8;text-transform:uppercase}.aboutSection_udvw{padding:4rem 0}.highlightBox_Uhe8{background:linear-gradient(135deg,#f8f9ff,#e8f0ff);border:1px solid #4508391a;border-radius:12px;height:100%;padding:2rem}.highlightBox_Uhe8 h3{color:var(--bharatml-primary);font-size:1.25rem;margin-bottom:1rem}.highlightBox_Uhe8 li,[data-theme=dark] .highlightBox_Uhe8 li{color:var(--bharatml-text)}.highlightBox_Uhe8 ul{list-style:none;margin:0;padding:0}.highlightBox_Uhe8 li{font-size:.95rem;padding:.5rem 0}.highlightBox_Uhe8 li:not(:last-child){border-bottom:1px solid #4508390d}[data-theme=dark] .highlightBox_Uhe8{background:linear-gradient(135deg,#1a1a2e,#16213e);border-color:#8b458233}@layer docusaurus.theme-common{body:not(.navigation-with-keyboard) :not(input):focus{outline:0}.themedComponent_mlkZ{display:none}[data-theme=dark] .themedComponent--dark_xIcU,[data-theme=light] .themedComponent--light_NVdE,html:not([data-theme]) .themedComponent--light_NVdE{display:initial}.errorBoundaryError_a6uf{color:red;white-space:pre-wrap}.errorBoundaryFallback_VBag{color:red;padding:.55rem}.details_lb9f{--docusaurus-details-summary-arrow-size:0.38rem;--docusaurus-details-transition:transform 200ms 
ease;--docusaurus-details-decoration-color:grey}.details_lb9f>summary{cursor:pointer;list-style:none;padding-left:1rem;position:relative}.details_lb9f>summary::-webkit-details-marker{display:none}.details_lb9f>summary:before{border-color:#0000 #0000 #0000 var(--docusaurus-details-decoration-color);border-style:solid;border-width:var(--docusaurus-details-summary-arrow-size);content:"";left:0;position:absolute;top:.45rem;transform:rotate(0);transform-origin:calc(var(--docusaurus-details-summary-arrow-size)/2) 50%;transition:var(--docusaurus-details-transition)}.details_lb9f[data-collapsed=false].isBrowser_bmU9>summary:before,.details_lb9f[open]:not(.isBrowser_bmU9)>summary:before{transform:rotate(90deg)}.collapsibleContent_i85q{border-top:1px solid var(--docusaurus-details-decoration-color);margin-top:1rem;padding-top:1rem}.collapsibleContent_i85q p:last-child,.details_lb9f>summary>p:last-child{margin-bottom:0}}@layer docusaurus.theme-classic{:root{--docusaurus-progress-bar-color:var(--ifm-color-primary);--docusaurus-announcement-bar-height:auto;--docusaurus-collapse-button-bg:#0000;--docusaurus-collapse-button-bg-hover:#0000001a;--doc-sidebar-width:300px;--doc-sidebar-hidden-width:30px;--docusaurus-blog-social-icon-size:1rem;--docusaurus-tag-list-border:var(--ifm-color-emphasis-300)}#nprogress{pointer-events:none}#nprogress .bar{background:var(--docusaurus-progress-bar-color);height:2px;left:0;position:fixed;top:0;width:100%;z-index:1031}#nprogress .peg{box-shadow:0 0 10px var(--docusaurus-progress-bar-color),0 0 5px var(--docusaurus-progress-bar-color);height:100%;opacity:1;position:absolute;right:0;transform:rotate(3deg) translateY(-4px);width:100px}.skipToContent_fXgn{background-color:var(--ifm-background-surface-color);color:var(--ifm-color-emphasis-900);left:100%;padding:calc(var(--ifm-global-spacing)/2) var(--ifm-global-spacing);position:fixed;top:1rem;z-index:calc(var(--ifm-z-index-fixed) + 
1)}.skipToContent_fXgn:focus{box-shadow:var(--ifm-global-shadow-md);left:1rem}.closeButton_CVFx{line-height:0;padding:0}.content_knG7{font-size:85%;padding:5px 0;text-align:center}.content_knG7 a{color:inherit;-webkit-text-decoration:underline;text-decoration:underline}.announcementBar_mb4j{align-items:center;background-color:var(--ifm-color-white);border-bottom:1px solid var(--ifm-color-emphasis-100);color:var(--ifm-color-black);display:flex;height:var(--docusaurus-announcement-bar-height)}.docSidebarContainer_YfHR,.navbarSearchContainer_Bca1:empty,.sidebarLogo_isFc,.toggleIcon_g3eP,html[data-announcement-bar-initially-dismissed=true] .announcementBar_mb4j{display:none}.announcementBarPlaceholder_vyr4{flex:0 0 10px}.announcementBarClose_gvF7{align-self:stretch;flex:0 0 30px}.announcementBarContent_xLdY{flex:1 1 auto}.toggle_vylO{height:2rem;width:2rem}.toggleButton_gllP{-webkit-tap-highlight-color:transparent;align-items:center;border-radius:50%;display:flex;height:100%;justify-content:center;transition:background var(--ifm-transition-fast);width:100%}.toggleButton_gllP:hover{background:var(--ifm-color-emphasis-200)}[data-theme-choice=dark] .darkToggleIcon_wfgR,[data-theme-choice=light] .lightToggleIcon_pyhR,[data-theme-choice=system] .systemToggleIcon_QzmC{display:initial}.toggleButtonDisabled_aARS{cursor:not-allowed}.darkNavbarColorModeToggle_X3D1:hover{background:var(--ifm-color-gray-800)}.backToTopButton_sjWU{background-color:var(--ifm-color-emphasis-200);border-radius:50%;bottom:1.3rem;box-shadow:var(--ifm-global-shadow-lw);height:3rem;opacity:0;position:fixed;right:1.3rem;transform:scale(0);transition:all var(--ifm-transition-fast) var(--ifm-transition-timing-default);visibility:hidden;width:3rem;z-index:calc(var(--ifm-z-index-fixed) - 1)}.backToTopButton_sjWU:after{background-color:var(--ifm-color-emphasis-1000);content:" ";display:inline-block;height:100%;-webkit-mask:var(--ifm-menu-link-sublist-icon) 50%/2rem 2rem 
no-repeat;mask:var(--ifm-menu-link-sublist-icon) 50%/2rem 2rem no-repeat;width:100%}.backToTopButtonShow_xfvO{opacity:1;transform:scale(1);visibility:visible}[data-theme=dark]:root{--docusaurus-collapse-button-bg:#ffffff0d;--docusaurus-collapse-button-bg-hover:#ffffff1a}.collapseSidebarButton_PEFL{display:none;margin:0}.iconExternalLink_nPIU{margin-left:.3rem}.dropdownNavbarItemMobile_J0Sd{cursor:pointer}.iconLanguage_nlXk{margin-right:5px;vertical-align:text-bottom}.navbarHideable_m1mJ{transition:transform var(--ifm-transition-fast) ease}.navbarHidden_jGov{transform:translate3d(0,calc(-100% - 2px),0)}.navbar__items--right>:last-child{padding-right:0}.footerLogoLink_BH7S{opacity:.5;transition:opacity var(--ifm-transition-fast) var(--ifm-transition-timing-default)}.footerLogoLink_BH7S:hover,.hash-link:focus,:hover>.hash-link{opacity:1}.menuExternalLink_NmtK{align-items:center}.docMainContainer_TBSr,.docRoot_UBD9{display:flex;width:100%}.authorSocialIcon_XYv3,.authorSocialLink_owbf{width:var(--docusaurus-blog-social-icon-size)}.docsWrapper_hBAB{display:flex;flex:1 0 auto}.anchorWithStickyNavbar_LWe7{scroll-margin-top:calc(var(--ifm-navbar-height) + .5rem)}.anchorWithHideOnScrollNavbar_WYt5{scroll-margin-top:.5rem}.hash-link{opacity:0;padding-left:.5rem;transition:opacity var(--ifm-transition-fast);-webkit-user-select:none;user-select:none}.hash-link:before{content:"#"}.docCardListItem_W1sv>*,body,html{height:100%}.mainWrapper_z2l0{display:flex;flex:1 0 auto;flex-direction:column}.docusaurus-mt-lg{margin-top:3rem}#__docusaurus{display:flex;flex-direction:column;min-height:100%}.sidebar_re4s{max-height:calc(100vh - var(--ifm-navbar-height) - 2rem);overflow-y:auto;position:sticky;top:calc(var(--ifm-navbar-height) + 
2rem)}.authorSocials_rSDt,.authorTitle_nd0D{overflow:hidden;-webkit-box-orient:vertical}.sidebarItemTitle_pO2u{font-size:var(--ifm-h3-font-size);font-weight:var(--ifm-font-weight-bold)}.container_mt6G,.sidebarItemList_Yudw{font-size:.9rem}.sidebarItem__DBe{margin-top:.7rem}.sidebarItemLink_mo7H{color:var(--ifm-font-color-base);display:block}.sidebarItemLink_mo7H:hover{-webkit-text-decoration:none;text-decoration:none}.sidebarItemLinkActive_I1ZP{color:var(--ifm-color-primary)!important}.yearGroupHeading_rMGB{margin-bottom:.4rem;margin-top:1.6rem}.yearGroupHeading_QT03{margin:1rem .75rem .5rem}.cardContainer_fWXF{--ifm-link-color:var(--ifm-color-emphasis-800);--ifm-link-hover-color:var(--ifm-color-emphasis-700);--ifm-link-hover-decoration:none;border:1px solid var(--ifm-color-emphasis-200);box-shadow:0 1.5px 3px 0 #00000026;transition:all var(--ifm-transition-fast) ease;transition-property:border,box-shadow}.cardContainer_fWXF:hover{border-color:var(--ifm-color-primary);box-shadow:0 3px 6px 0 #0003}.admonitionContent_BuS1>:last-child,.cardContainer_fWXF :last-child{margin-bottom:0}.cardTitle_rnsV{font-size:1.2rem}.cardDescription_PWke{font-size:.8rem}.docCardListItem_W1sv{margin-bottom:2rem}.title_f1Hy{font-size:3rem}[data-theme=dark] .githubSvg_Uu4N,[data-theme=dark] .instagramSvg_YC40,[data-theme=dark] .threadsSvg_PTXY,[data-theme=dark] .xSvg_y3PF{fill:var(--light)}[data-theme=light] .githubSvg_Uu4N,[data-theme=light] .instagramSvg_YC40,[data-theme=light] .threadsSvg_PTXY,[data-theme=light] .xSvg_y3PF{fill:var(--dark)}.authorSocials_rSDt{align-items:center;display:flex;flex-wrap:wrap;line-clamp:1;-webkit-line-clamp:1}.authorSocialLink_owbf,.authorSocials_rSDt{height:var(--docusaurus-blog-social-icon-size);line-height:0}.authorSocialLink_owbf{margin-right:.4rem}.authorSocialIcon_XYv3{height:var(--docusaurus-blog-social-icon-size)}.authorImage_XqGP{--ifm-avatar-photo-size:3.6rem}.author-as-h1_n9oJ .authorImage_XqGP{--ifm-avatar-photo-size:7rem}.author-as-h2_gXvM 
.authorImage_XqGP{--ifm-avatar-photo-size:5.4rem}.authorDetails_lV9A{align-items:flex-start;display:flex;flex-direction:column;justify-content:space-around}.authorName_yefp{display:flex;flex-direction:row;font-size:1.1rem;line-height:1.1rem}.author-as-h1_n9oJ .authorName_yefp{display:inline;font-size:2.4rem;line-height:2.4rem}.author-as-h2_gXvM .authorName_yefp{display:inline;font-size:1.4rem;line-height:1.4rem}.authorTitle_nd0D{display:-webkit-box;font-size:.8rem;line-height:1rem;line-clamp:1;-webkit-line-clamp:1}.author-as-h1_n9oJ .authorTitle_nd0D{font-size:1.2rem;line-height:1.6rem}.author-as-h2_gXvM .authorTitle_nd0D{font-size:1rem;line-height:1.3rem}.authorBlogPostCount_iiJ5{background:var(--ifm-color-secondary);border-radius:var(--ifm-global-radius);color:var(--ifm-color-black);font-size:.8rem;line-height:1.2;margin-left:.3rem;padding:.1rem .4rem}.authorListItem_n3yI{list-style-type:none;margin-bottom:2rem}.authorCol_Hf19{max-width:inherit!important}.imageOnlyAuthorRow_pa_O{display:flex;flex-flow:row wrap}.imageOnlyAuthorCol_G86a{margin-left:.3rem;margin-right:.3rem}.codeBlockContainer_Ckt0{background:var(--prism-background-color);border-radius:var(--ifm-code-border-radius);box-shadow:var(--ifm-global-shadow-lw);color:var(--prism-color);margin-bottom:var(--ifm-leading)}.codeBlock_bY9V{--ifm-pre-background:var(--prism-background-color);margin:0;padding:0}.codeBlockStandalone_MEMb{padding:0}.codeBlockLines_e6Vv{float:left;font:inherit;min-width:100%;padding:var(--ifm-pre-padding)}.codeBlockLinesWithNumbering_o6Pm{display:table;padding:var(--ifm-pre-padding) 0}:where(:root){--docusaurus-highlighted-code-line-bg:#484d5b}:where([data-theme=dark]){--docusaurus-highlighted-code-line-bg:#646464}.theme-code-block-highlighted-line{background-color:var(--docusaurus-highlighted-code-line-bg);display:block;margin:0 calc(var(--ifm-pre-padding)*-1);padding:0 
var(--ifm-pre-padding)}.codeLine_lJS_{counter-increment:a;display:table-row}.codeLineNumber_Tfdd{background:var(--ifm-pre-background);display:table-cell;left:0;overflow-wrap:normal;padding:0 var(--ifm-pre-padding);position:sticky;text-align:right;width:1%}.codeLineNumber_Tfdd:before{content:counter(a);opacity:.4}.theme-code-block-highlighted-line .codeLineNumber_Tfdd:before{opacity:.8}.codeLineContent_feaV{padding-right:var(--ifm-pre-padding)}.theme-code-block:hover .copyButtonCopied_Vdqa{opacity:1!important}.copyButtonIcons_IEyt{height:1.125rem;position:relative;width:1.125rem}.copyButtonIcon_TrPX,.copyButtonSuccessIcon_cVMy{left:0;position:absolute;top:0;fill:currentColor;height:inherit;opacity:inherit;transition:all var(--ifm-transition-fast) ease;width:inherit}.copyButtonSuccessIcon_cVMy{color:#00d600;left:50%;opacity:0;top:50%;transform:translate(-50%,-50%) scale(.33)}.copyButtonCopied_Vdqa .copyButtonIcon_TrPX{opacity:0;transform:scale(.33)}.copyButtonCopied_Vdqa .copyButtonSuccessIcon_cVMy{opacity:1;transform:translate(-50%,-50%) scale(1);transition-delay:75ms}.wordWrapButtonIcon_b1P5{height:1.2rem;width:1.2rem}.wordWrapButtonEnabled_uzNF .wordWrapButtonIcon_b1P5{color:var(--ifm-color-primary)}.buttonGroup_M5ko{column-gap:.2rem;display:flex;position:absolute;right:calc(var(--ifm-pre-padding)/2);top:calc(var(--ifm-pre-padding)/2)}.buttonGroup_M5ko button{align-items:center;background:var(--prism-background-color);border:1px solid var(--ifm-color-emphasis-300);border-radius:var(--ifm-global-radius);color:var(--prism-color);display:flex;line-height:0;opacity:0;padding:.4rem;transition:opacity var(--ifm-transition-fast) ease-in-out}.buttonGroup_M5ko button:focus-visible,.buttonGroup_M5ko button:hover{opacity:1!important}.theme-code-block:hover .buttonGroup_M5ko button{opacity:.4}.tag_zVej{border:1px solid var(--docusaurus-tag-list-border);transition:border 
var(--ifm-transition-fast)}.tag_zVej:hover{--docusaurus-tag-list-border:var(--ifm-link-color);-webkit-text-decoration:none;text-decoration:none}.tagRegular_sFm0{border-radius:var(--ifm-global-radius);font-size:90%;padding:.2rem .5rem .3rem}.tagWithCount_h2kH{align-items:center;border-left:0;display:flex;padding:0 .5rem 0 1rem;position:relative}.tagWithCount_h2kH:after,.tagWithCount_h2kH:before{border:1px solid var(--docusaurus-tag-list-border);content:"";position:absolute;top:50%;transition:inherit}.tagWithCount_h2kH:before{border-bottom:0;border-right:0;height:1.18rem;right:100%;transform:translate(50%,-50%) rotate(-45deg);width:1.18rem}.tagWithCount_h2kH:after{border-radius:50%;height:.5rem;left:0;transform:translateY(-50%);width:.5rem}.tagWithCount_h2kH span{background:var(--ifm-color-secondary);border-radius:var(--ifm-global-radius);color:var(--ifm-color-black);font-size:.7rem;line-height:1.2;margin-left:.3rem;padding:.1rem .4rem}.tag_Nnez{display:inline-block;margin:.5rem .5rem 0 1rem}.codeBlockContent_QJqH{border-radius:inherit;direction:ltr;position:relative}.codeBlockTitle_OeMC{border-bottom:1px solid var(--ifm-color-emphasis-300);border-top-left-radius:inherit;border-top-right-radius:inherit;font-size:var(--ifm-code-font-size);font-weight:500;padding:.75rem var(--ifm-pre-padding)}.codeBlockTitle_OeMC+.codeBlockContent_QJqH .codeBlock_a8dz{border-top-left-radius:0;border-top-right-radius:0}.tags_jXut{display:inline}.tag_QGVx{display:inline-block;margin:0 .4rem .5rem 0}.iconEdit_Z9Sw{margin-right:.3em;vertical-align:sub}.lastUpdated_JAkA{font-size:smaller;font-style:italic;margin-top:.2rem}.tocCollapsibleButton_TO0P{align-items:center;display:flex;font-size:inherit;justify-content:space-between;padding:.4rem .8rem;width:100%}.tocCollapsibleButton_TO0P:after{background:var(--ifm-menu-link-sublist-icon) 50% 50%/2rem 2rem no-repeat;content:"";filter:var(--ifm-menu-link-sublist-icon-filter);height:1.25rem;transform:rotate(180deg);transition:transform 
var(--ifm-transition-fast);width:1.25rem}.tocCollapsibleButtonExpanded_MG3E:after,.tocCollapsibleExpanded_sAul{transform:none}.tocCollapsible_ETCw{background-color:var(--ifm-menu-color-background-active);border-radius:var(--ifm-global-radius);margin:1rem 0}.tocCollapsibleContent_vkbj>ul{border-left:none;border-top:1px solid var(--ifm-color-emphasis-300);font-size:15px;padding:.2rem 0}.tocCollapsibleContent_vkbj ul li{margin:.4rem .8rem}.tocCollapsibleContent_vkbj a{display:block}.details_b_Ee{--docusaurus-details-decoration-color:var(--ifm-alert-border-color);--docusaurus-details-transition:transform var(--ifm-transition-fast) ease;border:1px solid var(--ifm-alert-border-color);margin:0 0 var(--ifm-spacing-vertical)}.containsTaskList_mC6p{list-style:none}:not(.containsTaskList_mC6p>li)>.containsTaskList_mC6p{padding-left:0}.img_ev3q{height:auto}.tableOfContents_bqdL{max-height:calc(100vh - var(--ifm-navbar-height) - 2rem);overflow-y:auto;position:sticky;top:calc(var(--ifm-navbar-height) + 1rem)}.admonition_xJq3{margin-bottom:1em}.admonitionHeading_Gvgb{font:var(--ifm-heading-font-weight) var(--ifm-h5-font-size)/var(--ifm-heading-line-height) var(--ifm-heading-font-family);text-transform:uppercase}.admonitionHeading_Gvgb:not(:last-child){margin-bottom:.3rem}.admonitionHeading_Gvgb code{text-transform:none}.admonitionIcon_Rf37{display:inline-block;margin-right:.4em;vertical-align:middle}.admonitionIcon_Rf37 svg{display:inline-block;height:1.6em;width:1.6em;fill:var(--ifm-alert-foreground-color)}.breadcrumbHomeIcon_YNFT{height:1.1rem;position:relative;top:1px;vertical-align:top;width:1.1rem}.breadcrumbsContainer_Z_bl{--ifm-breadcrumb-size-multiplier:0.8;margin-bottom:.8rem}.title_kItE{--ifm-h1-font-size:3rem;margin-bottom:calc(var(--ifm-leading)*1.25)}.docItemContainer_Djhp article>:first-child,.docItemContainer_Djhp header+*{margin-top:0}.mdxPageWrapper_j9I6{justify-content:center}}@media 
(min-width:997px){.collapseSidebarButton_PEFL,.expandButton_TmdG{background-color:var(--docusaurus-collapse-button-bg)}:root{--docusaurus-announcement-bar-height:30px}.announcementBarClose_gvF7,.announcementBarPlaceholder_vyr4{flex-basis:50px}.collapseSidebarButton_PEFL{border:1px solid var(--ifm-toc-border-color);border-radius:0;bottom:0;display:block!important;height:40px;position:sticky}.collapseSidebarButtonIcon_kv0_{margin-top:4px;transform:rotate(180deg)}.expandButtonIcon_i1dp,[dir=rtl] .collapseSidebarButtonIcon_kv0_{transform:rotate(0)}.collapseSidebarButton_PEFL:focus,.collapseSidebarButton_PEFL:hover,.expandButton_TmdG:focus,.expandButton_TmdG:hover{background-color:var(--docusaurus-collapse-button-bg-hover)}.navbarSearchContainer_Bca1{padding:var(--ifm-navbar-item-padding-vertical) var(--ifm-navbar-item-padding-horizontal)}.menuHtmlItem_M9Kj{padding:var(--ifm-menu-link-padding-vertical) var(--ifm-menu-link-padding-horizontal)}.menu_SIkG{flex-grow:1;padding:.5rem}@supports (scrollbar-gutter:stable){.menu_SIkG{padding:.5rem 0 .5rem .5rem;scrollbar-gutter:stable}}.menuWithAnnouncementBar_GW3s{margin-bottom:var(--docusaurus-announcement-bar-height)}.sidebar_njMd{display:flex;flex-direction:column;height:100%;padding-top:var(--ifm-navbar-height);width:var(--doc-sidebar-width)}.sidebarWithHideableNavbar_wUlq{padding-top:0}.sidebarHidden_VK0M{opacity:0;visibility:hidden}.sidebarLogo_isFc{align-items:center;color:inherit!important;display:flex!important;margin:0 var(--ifm-navbar-padding-horizontal);max-height:var(--ifm-navbar-height);min-height:var(--ifm-navbar-height);-webkit-text-decoration:none!important;text-decoration:none!important}.sidebarLogo_isFc img{height:2rem;margin-right:.5rem}.expandButton_TmdG{align-items:center;display:flex;height:100%;justify-content:center;position:absolute;right:0;top:0;transition:background-color var(--ifm-transition-fast) ease;width:100%}[dir=rtl] 
.expandButtonIcon_i1dp{transform:rotate(180deg)}.docSidebarContainer_YfHR{border-right:1px solid var(--ifm-toc-border-color);clip-path:inset(0);display:block;margin-top:calc(var(--ifm-navbar-height)*-1);transition:width var(--ifm-transition-fast) ease;width:var(--doc-sidebar-width);will-change:width}.docSidebarContainerHidden_DPk8{cursor:pointer;width:var(--doc-sidebar-hidden-width)}.sidebarViewport_aRkj{height:100%;max-height:100vh;position:sticky;top:0}.docMainContainer_TBSr{flex-grow:1;max-width:calc(100% - var(--doc-sidebar-width))}.docMainContainerEnhanced_lQrH{max-width:calc(100% - var(--doc-sidebar-hidden-width))}.docItemWrapperEnhanced_JWYK{max-width:calc(var(--ifm-container-width) + var(--doc-sidebar-width))!important}.lastUpdated_JAkA{text-align:right}.tocMobile_ITEo{display:none}.docItemCol_VOVn,.generatedIndexPage_vN6x{max-width:75%!important}}@media (min-width:1440px){.container{max-width:var(--ifm-container-width-xl)}}@media (max-width:996px){.col{--ifm-col-width:100%;flex-basis:var(--ifm-col-width);margin-left:0}.footer{--ifm-footer-padding-horizontal:0}.colorModeToggle_DEke,.footer__link-separator,.navbar__item,.sidebar_re4s,.tableOfContents_bqdL{display:none}.footer__col{margin-bottom:calc(var(--ifm-spacing-vertical)*3)}.footer__link-item{display:block;width:max-content}.hero{padding-left:0;padding-right:0}.navbar>.container,.navbar>.container-fluid{padding:0}.navbar__toggle{display:inherit}.navbar__search-input{width:9rem}.pills--block,.tabs--block{flex-direction:column}.navbarSearchContainer_Bca1{position:absolute;right:var(--ifm-navbar-padding-horizontal)}.docItemContainer_F8PC{padding:0 .3rem}}@media screen and (max-width:996px){.features_t9lD .bharatml-card_xZ6l{margin-bottom:2rem}.featuresHeader_qR2i{font-size:2rem}.featuresSubtitle_VdGe{font-size:1rem}.heroBanner_qdFl{padding:2rem}}@media screen and 
(max-width:768px){.heroLogo_U6bI{height:120px;width:120px}.logoContainer_xdaK{margin-bottom:1.5rem}.buttons_AeoN{flex-direction:column;gap:.5rem}.statsContainer_KpvY{align-items:center;flex-direction:column;gap:1rem}}@media (max-width:576px){.markdown h1:first-child{--ifm-h1-font-size:2rem}.markdown>h2{--ifm-h2-font-size:1.5rem}.markdown>h3{--ifm-h3-font-size:1.25rem}.title_f1Hy{font-size:2rem}}@media (hover:hover){.backToTopButton_sjWU:hover{background-color:var(--ifm-color-emphasis-300)}}@media (pointer:fine){.thin-scrollbar{scrollbar-width:thin}.thin-scrollbar::-webkit-scrollbar{height:var(--ifm-scrollbar-size);width:var(--ifm-scrollbar-size)}.thin-scrollbar::-webkit-scrollbar-track{background:var(--ifm-scrollbar-track-background-color);border-radius:10px}.thin-scrollbar::-webkit-scrollbar-thumb{background:var(--ifm-scrollbar-thumb-background-color);border-radius:10px}.thin-scrollbar::-webkit-scrollbar-thumb:hover{background:var(--ifm-scrollbar-thumb-hover-background-color)}}@media (prefers-reduced-motion:reduce){:root{--ifm-transition-fast:0ms;--ifm-transition-slow:0ms}}@media print{.announcementBar_mb4j,.footer,.menu,.navbar,.pagination-nav,.table-of-contents,.tocMobile_ITEo{display:none}.tabs{page-break-inside:avoid}.codeBlockLines_e6Vv{white-space:pre-wrap}} \ No newline at end of file +@layer docusaurus.infima,docusaurus.theme-common,docusaurus.theme-classic,docusaurus.core,docusaurus.plugin-debug,docusaurus.theme-mermaid,docusaurus.theme-live-codeblock,docusaurus.theme-search-algolia.docsearch,docusaurus.theme-search-algolia;@layer docusaurus.infima{.col,.container{padding:0 var(--ifm-spacing-horizontal);width:100%}.markdown>h2,.markdown>h3,.markdown>h4,.markdown>h5,.markdown>h6{margin-bottom:calc(var(--ifm-heading-vertical-rhythm-bottom)*var(--ifm-leading))}.markdown li,body{word-wrap:break-word}body,ol ol,ol ul,ul ol,ul ul{margin:0}pre,table{overflow:auto}blockquote,pre{margin:0 0 
var(--ifm-spacing-vertical)}.breadcrumbs__link,.button{transition-timing-function:var(--ifm-transition-timing-default)}.button,code{vertical-align:middle}.button--outline.button--active,.button--outline:active,.button--outline:hover,:root{--ifm-button-color:var(--ifm-font-color-base-inverse)}.menu__link:hover,a{transition:color var(--ifm-transition-fast) var(--ifm-transition-timing-default)}.navbar--dark,:root{--ifm-navbar-link-hover-color:var(--ifm-color-primary)}.menu,.navbar-sidebar{overflow-x:hidden}:root,html[data-theme=dark]{--ifm-color-emphasis-500:var(--ifm-color-gray-500)}:root{--ifm-color-scheme:light;--ifm-dark-value:10%;--ifm-darker-value:15%;--ifm-darkest-value:30%;--ifm-light-value:15%;--ifm-lighter-value:30%;--ifm-lightest-value:50%;--ifm-contrast-background-value:90%;--ifm-contrast-foreground-value:70%;--ifm-contrast-background-dark-value:70%;--ifm-contrast-foreground-dark-value:90%;--ifm-color-primary:#3578e5;--ifm-color-secondary:#ebedf0;--ifm-color-success:#00a400;--ifm-color-info:#54c7ec;--ifm-color-warning:#ffba00;--ifm-color-danger:#fa383e;--ifm-color-primary-dark:#306cce;--ifm-color-primary-darker:#2d66c3;--ifm-color-primary-darkest:#2554a0;--ifm-color-primary-light:#538ce9;--ifm-color-primary-lighter:#72a1ed;--ifm-color-primary-lightest:#9abcf2;--ifm-color-primary-contrast-background:#ebf2fc;--ifm-color-primary-contrast-foreground:#102445;--ifm-color-secondary-dark:#d4d5d8;--ifm-color-secondary-darker:#c8c9cc;--ifm-color-secondary-darkest:#a4a6a8;--ifm-color-secondary-light:#eef0f2;--ifm-color-secondary-lighter:#f1f2f5;--ifm-color-secondary-lightest:#f5f6f8;--ifm-color-secondary-contrast-background:#fdfdfe;--ifm-color-secondary-contrast-foreground:#474748;--ifm-color-success-dark:#009400;--ifm-color-success-darker:#008b00;--ifm-color-success-darkest:#007300;--ifm-color-success-light:#26b226;--ifm-color-success-lighter:#4dbf4d;--ifm-color-success-lightest:#80d280;--ifm-color-success-contrast-background:#e6f6e6;--ifm-color-success-contrast-fore
ground:#003100;--ifm-color-info-dark:#4cb3d4;--ifm-color-info-darker:#47a9c9;--ifm-color-info-darkest:#3b8ba5;--ifm-color-info-light:#6ecfef;--ifm-color-info-lighter:#87d8f2;--ifm-color-info-lightest:#aae3f6;--ifm-color-info-contrast-background:#eef9fd;--ifm-color-info-contrast-foreground:#193c47;--ifm-color-warning-dark:#e6a700;--ifm-color-warning-darker:#d99e00;--ifm-color-warning-darkest:#b38200;--ifm-color-warning-light:#ffc426;--ifm-color-warning-lighter:#ffcf4d;--ifm-color-warning-lightest:#ffdd80;--ifm-color-warning-contrast-background:#fff8e6;--ifm-color-warning-contrast-foreground:#4d3800;--ifm-color-danger-dark:#e13238;--ifm-color-danger-darker:#d53035;--ifm-color-danger-darkest:#af272b;--ifm-color-danger-light:#fb565b;--ifm-color-danger-lighter:#fb7478;--ifm-color-danger-lightest:#fd9c9f;--ifm-color-danger-contrast-background:#ffebec;--ifm-color-danger-contrast-foreground:#4b1113;--ifm-color-white:#fff;--ifm-color-black:#000;--ifm-color-gray-0:var(--ifm-color-white);--ifm-color-gray-100:#f5f6f7;--ifm-color-gray-200:#ebedf0;--ifm-color-gray-300:#dadde1;--ifm-color-gray-400:#ccd0d5;--ifm-color-gray-500:#bec3c9;--ifm-color-gray-600:#8d949e;--ifm-color-gray-700:#606770;--ifm-color-gray-800:#444950;--ifm-color-gray-900:#1c1e21;--ifm-color-gray-1000:var(--ifm-color-black);--ifm-color-emphasis-0:var(--ifm-color-gray-0);--ifm-color-emphasis-100:var(--ifm-color-gray-100);--ifm-color-emphasis-200:var(--ifm-color-gray-200);--ifm-color-emphasis-300:var(--ifm-color-gray-300);--ifm-color-emphasis-400:var(--ifm-color-gray-400);--ifm-color-emphasis-600:var(--ifm-color-gray-600);--ifm-color-emphasis-700:var(--ifm-color-gray-700);--ifm-color-emphasis-800:var(--ifm-color-gray-800);--ifm-color-emphasis-900:var(--ifm-color-gray-900);--ifm-color-emphasis-1000:var(--ifm-color-gray-1000);--ifm-color-content:var(--ifm-color-emphasis-900);--ifm-color-content-inverse:var(--ifm-color-emphasis-0);--ifm-color-content-secondary:#525860;--ifm-background-color:#0000;--ifm-background-surf
ace-color:var(--ifm-color-content-inverse);--ifm-global-border-width:1px;--ifm-global-radius:0.4rem;--ifm-hover-overlay:#0000000d;--ifm-font-color-base:var(--ifm-color-content);--ifm-font-color-base-inverse:var(--ifm-color-content-inverse);--ifm-font-color-secondary:var(--ifm-color-content-secondary);--ifm-font-family-base:system-ui,-apple-system,Segoe UI,Roboto,Ubuntu,Cantarell,Noto Sans,sans-serif,BlinkMacSystemFont,"Segoe UI",Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol";--ifm-font-family-monospace:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace;--ifm-font-size-base:100%;--ifm-font-weight-light:300;--ifm-font-weight-normal:400;--ifm-font-weight-semibold:500;--ifm-font-weight-bold:700;--ifm-font-weight-base:var(--ifm-font-weight-normal);--ifm-line-height-base:1.65;--ifm-global-spacing:1rem;--ifm-spacing-vertical:var(--ifm-global-spacing);--ifm-spacing-horizontal:var(--ifm-global-spacing);--ifm-transition-fast:200ms;--ifm-transition-slow:400ms;--ifm-transition-timing-default:cubic-bezier(0.08,0.52,0.52,1);--ifm-global-shadow-lw:0 1px 2px 0 #0000001a;--ifm-global-shadow-md:0 5px 40px #0003;--ifm-global-shadow-tl:0 12px 28px 0 #0003,0 2px 4px 0 
#0000001a;--ifm-z-index-dropdown:100;--ifm-z-index-fixed:200;--ifm-z-index-overlay:400;--ifm-container-width:1140px;--ifm-container-width-xl:1320px;--ifm-code-background:#f6f7f8;--ifm-code-border-radius:var(--ifm-global-radius);--ifm-code-font-size:90%;--ifm-code-padding-horizontal:0.1rem;--ifm-code-padding-vertical:0.1rem;--ifm-pre-background:var(--ifm-code-background);--ifm-pre-border-radius:var(--ifm-code-border-radius);--ifm-pre-color:inherit;--ifm-pre-line-height:1.45;--ifm-pre-padding:1rem;--ifm-heading-color:inherit;--ifm-heading-margin-top:0;--ifm-heading-margin-bottom:var(--ifm-spacing-vertical);--ifm-heading-font-family:var(--ifm-font-family-base);--ifm-heading-font-weight:var(--ifm-font-weight-bold);--ifm-heading-line-height:1.25;--ifm-h1-font-size:2rem;--ifm-h2-font-size:1.5rem;--ifm-h3-font-size:1.25rem;--ifm-h4-font-size:1rem;--ifm-h5-font-size:0.875rem;--ifm-h6-font-size:0.85rem;--ifm-image-alignment-padding:1.25rem;--ifm-leading-desktop:1.25;--ifm-leading:calc(var(--ifm-leading-desktop)*1rem);--ifm-list-left-padding:2rem;--ifm-list-margin:1rem;--ifm-list-item-margin:0.25rem;--ifm-list-paragraph-margin:1rem;--ifm-table-cell-padding:0.75rem;--ifm-table-background:#0000;--ifm-table-stripe-background:#00000008;--ifm-table-border-width:1px;--ifm-table-border-color:var(--ifm-color-emphasis-300);--ifm-table-head-background:inherit;--ifm-table-head-color:inherit;--ifm-table-head-font-weight:var(--ifm-font-weight-bold);--ifm-table-cell-color:inherit;--ifm-link-color:var(--ifm-color-primary);--ifm-link-decoration:none;--ifm-link-hover-color:var(--ifm-link-color);--ifm-link-hover-decoration:underline;--ifm-paragraph-margin-bottom:var(--ifm-leading);--ifm-blockquote-font-size:var(--ifm-font-size-base);--ifm-blockquote-border-left-width:2px;--ifm-blockquote-padding-horizontal:var(--ifm-spacing-horizontal);--ifm-blockquote-padding-vertical:0;--ifm-blockquote-shadow:none;--ifm-blockquote-color:var(--ifm-color-emphasis-800);--ifm-blockquote-border-color:var(--ifm-co
lor-emphasis-300);--ifm-hr-background-color:var(--ifm-color-emphasis-500);--ifm-hr-height:1px;--ifm-hr-margin-vertical:1.5rem;--ifm-scrollbar-size:7px;--ifm-scrollbar-track-background-color:#f1f1f1;--ifm-scrollbar-thumb-background-color:silver;--ifm-scrollbar-thumb-hover-background-color:#a7a7a7;--ifm-alert-background-color:inherit;--ifm-alert-border-color:inherit;--ifm-alert-border-radius:var(--ifm-global-radius);--ifm-alert-border-width:0px;--ifm-alert-border-left-width:5px;--ifm-alert-color:var(--ifm-font-color-base);--ifm-alert-padding-horizontal:var(--ifm-spacing-horizontal);--ifm-alert-padding-vertical:var(--ifm-spacing-vertical);--ifm-alert-shadow:var(--ifm-global-shadow-lw);--ifm-avatar-intro-margin:1rem;--ifm-avatar-intro-alignment:inherit;--ifm-avatar-photo-size:3rem;--ifm-badge-background-color:inherit;--ifm-badge-border-color:inherit;--ifm-badge-border-radius:var(--ifm-global-radius);--ifm-badge-border-width:var(--ifm-global-border-width);--ifm-badge-color:var(--ifm-color-white);--ifm-badge-padding-horizontal:calc(var(--ifm-spacing-horizontal)*0.5);--ifm-badge-padding-vertical:calc(var(--ifm-spacing-vertical)*0.25);--ifm-breadcrumb-border-radius:1.5rem;--ifm-breadcrumb-spacing:0.5rem;--ifm-breadcrumb-color-active:var(--ifm-color-primary);--ifm-breadcrumb-item-background-active:var(--ifm-hover-overlay);--ifm-breadcrumb-padding-horizontal:0.8rem;--ifm-breadcrumb-padding-vertical:0.4rem;--ifm-breadcrumb-size-multiplier:1;--ifm-breadcrumb-separator:url('data:image/svg+xml;utf8,');--ifm-breadcrumb-separator-filter:none;--ifm-breadcrumb-separator-size:0.5rem;--ifm-breadcrumb-separator-size-multiplier:1.25;--ifm-button-background-color:inherit;--ifm-button-border-color:var(--ifm-button-background-color);--ifm-button-border-width:var(--ifm-global-border-width);--ifm-button-font-weight:var(--ifm-font-weight-bold);--ifm-button-padding-horizontal:1.5rem;--ifm-button-padding-vertical:0.375rem;--ifm-button-size-multiplier:1;--ifm-button-transition-duration:var(--ifm-
transition-fast);--ifm-button-border-radius:calc(var(--ifm-global-radius)*var(--ifm-button-size-multiplier));--ifm-button-group-spacing:2px;--ifm-card-background-color:var(--ifm-background-surface-color);--ifm-card-border-radius:calc(var(--ifm-global-radius)*2);--ifm-card-horizontal-spacing:var(--ifm-global-spacing);--ifm-card-vertical-spacing:var(--ifm-global-spacing);--ifm-toc-border-color:var(--ifm-color-emphasis-300);--ifm-toc-link-color:var(--ifm-color-content-secondary);--ifm-toc-padding-vertical:0.5rem;--ifm-toc-padding-horizontal:0.5rem;--ifm-dropdown-background-color:var(--ifm-background-surface-color);--ifm-dropdown-font-weight:var(--ifm-font-weight-semibold);--ifm-dropdown-link-color:var(--ifm-font-color-base);--ifm-dropdown-hover-background-color:var(--ifm-hover-overlay);--ifm-footer-background-color:var(--ifm-color-emphasis-100);--ifm-footer-color:inherit;--ifm-footer-link-color:var(--ifm-color-emphasis-700);--ifm-footer-link-hover-color:var(--ifm-color-primary);--ifm-footer-link-horizontal-spacing:0.5rem;--ifm-footer-padding-horizontal:calc(var(--ifm-spacing-horizontal)*2);--ifm-footer-padding-vertical:calc(var(--ifm-spacing-vertical)*2);--ifm-footer-title-color:inherit;--ifm-footer-logo-max-width:min(30rem,90vw);--ifm-hero-background-color:var(--ifm-background-surface-color);--ifm-hero-text-color:var(--ifm-color-emphasis-800);--ifm-menu-color:var(--ifm-color-emphasis-700);--ifm-menu-color-active:var(--ifm-color-primary);--ifm-menu-color-background-active:var(--ifm-hover-overlay);--ifm-menu-color-background-hover:var(--ifm-hover-overlay);--ifm-menu-link-padding-horizontal:0.75rem;--ifm-menu-link-padding-vertical:0.375rem;--ifm-menu-link-sublist-icon:url('data:image/svg+xml;utf8,');--ifm-menu-link-sublist-icon-filter:none;--ifm-navbar-background-color:var(--ifm-background-surface-color);--ifm-navbar-height:3.75rem;--ifm-navbar-item-padding-horizontal:0.75rem;--ifm-navbar-item-padding-vertical:0.25rem;--ifm-navbar-link-color:var(--ifm-font-color-base);--
ifm-navbar-link-active-color:var(--ifm-link-color);--ifm-navbar-padding-horizontal:var(--ifm-spacing-horizontal);--ifm-navbar-padding-vertical:calc(var(--ifm-spacing-vertical)*0.5);--ifm-navbar-shadow:var(--ifm-global-shadow-lw);--ifm-navbar-search-input-background-color:var(--ifm-color-emphasis-200);--ifm-navbar-search-input-color:var(--ifm-color-emphasis-800);--ifm-navbar-search-input-placeholder-color:var(--ifm-color-emphasis-500);--ifm-navbar-search-input-icon:url('data:image/svg+xml;utf8,');--ifm-navbar-sidebar-width:83vw;--ifm-pagination-border-radius:var(--ifm-global-radius);--ifm-pagination-color-active:var(--ifm-color-primary);--ifm-pagination-font-size:1rem;--ifm-pagination-item-active-background:var(--ifm-hover-overlay);--ifm-pagination-page-spacing:0.2em;--ifm-pagination-padding-horizontal:calc(var(--ifm-spacing-horizontal)*1);--ifm-pagination-padding-vertical:calc(var(--ifm-spacing-vertical)*0.25);--ifm-pagination-nav-border-radius:var(--ifm-global-radius);--ifm-pagination-nav-color-hover:var(--ifm-color-primary);--ifm-pills-color-active:var(--ifm-color-primary);--ifm-pills-color-background-active:var(--ifm-hover-overlay);--ifm-pills-spacing:0.125rem;--ifm-tabs-color:var(--ifm-font-color-secondary);--ifm-tabs-color-active:var(--ifm-color-primary);--ifm-tabs-color-active-border:var(--ifm-tabs-color-active);--ifm-tabs-padding-horizontal:1rem;--ifm-tabs-padding-vertical:1rem}.badge--danger,.badge--info,.badge--primary,.badge--secondary,.badge--success,.badge--warning{--ifm-badge-border-color:var(--ifm-badge-background-color)}.button--link,.button--outline{--ifm-button-background-color:#0000}*{box-sizing:border-box}html{background-color:var(--ifm-background-color);color:var(--ifm-font-color-base);color-scheme:var(--ifm-color-scheme);font:var(--ifm-font-size-base)/var(--ifm-line-height-base) 
var(--ifm-font-family-base);-webkit-font-smoothing:antialiased;-webkit-tap-highlight-color:transparent;text-rendering:optimizelegibility;-webkit-text-size-adjust:100%;text-size-adjust:100%}iframe{border:0;color-scheme:auto}.container{margin:0 auto;max-width:var(--ifm-container-width)}.container--fluid{max-width:inherit}.row{display:flex;flex-wrap:wrap;margin:0 calc(var(--ifm-spacing-horizontal)*-1)}.margin-bottom--none,.margin-vert--none,.markdown>:last-child{margin-bottom:0!important}.margin-top--none,.margin-vert--none{margin-top:0!important}.row--no-gutters{margin-left:0;margin-right:0}.margin-horiz--none,.margin-right--none{margin-right:0!important}.row--no-gutters>.col{padding-left:0;padding-right:0}.row--align-top{align-items:flex-start}.row--align-bottom{align-items:flex-end}.row--align-center{align-items:center}.row--align-stretch{align-items:stretch}.row--align-baseline{align-items:baseline}.col{--ifm-col-width:100%;flex:1 0;margin-left:0;max-width:var(--ifm-col-width)}.padding-bottom--none,.padding-vert--none{padding-bottom:0!important}.padding-top--none,.padding-vert--none{padding-top:0!important}.padding-horiz--none,.padding-left--none{padding-left:0!important}.padding-horiz--none,.padding-right--none{padding-right:0!important}.col[class*=col--]{flex:0 0 
var(--ifm-col-width)}.col--1{--ifm-col-width:8.33333%}.col--offset-1{margin-left:8.33333%}.col--2{--ifm-col-width:16.66667%}.col--offset-2{margin-left:16.66667%}.col--3{--ifm-col-width:25%}.col--offset-3{margin-left:25%}.col--4{--ifm-col-width:33.33333%}.col--offset-4{margin-left:33.33333%}.col--5{--ifm-col-width:41.66667%}.col--offset-5{margin-left:41.66667%}.col--6{--ifm-col-width:50%}.col--offset-6{margin-left:50%}.col--7{--ifm-col-width:58.33333%}.col--offset-7{margin-left:58.33333%}.col--8{--ifm-col-width:66.66667%}.col--offset-8{margin-left:66.66667%}.col--9{--ifm-col-width:75%}.col--offset-9{margin-left:75%}.col--10{--ifm-col-width:83.33333%}.col--offset-10{margin-left:83.33333%}.col--11{--ifm-col-width:91.66667%}.col--offset-11{margin-left:91.66667%}.col--12{--ifm-col-width:100%}.col--offset-12{margin-left:100%}.margin-horiz--none,.margin-left--none{margin-left:0!important}.margin--none{margin:0!important}.margin-bottom--xs,.margin-vert--xs{margin-bottom:.25rem!important}.margin-top--xs,.margin-vert--xs{margin-top:.25rem!important}.margin-horiz--xs,.margin-left--xs{margin-left:.25rem!important}.margin-horiz--xs,.margin-right--xs{margin-right:.25rem!important}.margin--xs{margin:.25rem!important}.margin-bottom--sm,.margin-vert--sm{margin-bottom:.5rem!important}.margin-top--sm,.margin-vert--sm{margin-top:.5rem!important}.margin-horiz--sm,.margin-left--sm{margin-left:.5rem!important}.margin-horiz--sm,.margin-right--sm{margin-right:.5rem!important}.margin--sm{margin:.5rem!important}.margin-bottom--md,.margin-vert--md{margin-bottom:1rem!important}.margin-top--md,.margin-vert--md{margin-top:1rem!important}.margin-horiz--md,.margin-left--md{margin-left:1rem!important}.margin-horiz--md,.margin-right--md{margin-right:1rem!important}.margin--md{margin:1rem!important}.margin-bottom--lg,.margin-vert--lg{margin-bottom:2rem!important}.margin-top--lg,.margin-vert--lg{margin-top:2rem!important}.margin-horiz--lg,.margin-left--lg{margin-left:2rem!important}.margin-horiz--lg,.m
argin-right--lg{margin-right:2rem!important}.margin--lg{margin:2rem!important}.margin-bottom--xl,.margin-vert--xl{margin-bottom:5rem!important}.margin-top--xl,.margin-vert--xl{margin-top:5rem!important}.margin-horiz--xl,.margin-left--xl{margin-left:5rem!important}.margin-horiz--xl,.margin-right--xl{margin-right:5rem!important}.margin--xl{margin:5rem!important}.padding--none{padding:0!important}.padding-bottom--xs,.padding-vert--xs{padding-bottom:.25rem!important}.padding-top--xs,.padding-vert--xs{padding-top:.25rem!important}.padding-horiz--xs,.padding-left--xs{padding-left:.25rem!important}.padding-horiz--xs,.padding-right--xs{padding-right:.25rem!important}.padding--xs{padding:.25rem!important}.padding-bottom--sm,.padding-vert--sm{padding-bottom:.5rem!important}.padding-top--sm,.padding-vert--sm{padding-top:.5rem!important}.padding-horiz--sm,.padding-left--sm{padding-left:.5rem!important}.padding-horiz--sm,.padding-right--sm{padding-right:.5rem!important}.padding--sm{padding:.5rem!important}.padding-bottom--md,.padding-vert--md{padding-bottom:1rem!important}.padding-top--md,.padding-vert--md{padding-top:1rem!important}.padding-horiz--md,.padding-left--md{padding-left:1rem!important}.padding-horiz--md,.padding-right--md{padding-right:1rem!important}.padding--md{padding:1rem!important}.padding-bottom--lg,.padding-vert--lg{padding-bottom:2rem!important}.padding-top--lg,.padding-vert--lg{padding-top:2rem!important}.padding-horiz--lg,.padding-left--lg{padding-left:2rem!important}.padding-horiz--lg,.padding-right--lg{padding-right:2rem!important}.padding--lg{padding:2rem!important}.padding-bottom--xl,.padding-vert--xl{padding-bottom:5rem!important}.padding-top--xl,.padding-vert--xl{padding-top:5rem!important}.padding-horiz--xl,.padding-left--xl{padding-left:5rem!important}.padding-horiz--xl,.padding-right--xl{padding-right:5rem!important}.padding--xl{padding:5rem!important}code{background-color:var(--ifm-code-background);border:.1rem solid 
#0000001a;border-radius:var(--ifm-code-border-radius);font-family:var(--ifm-font-family-monospace);font-size:var(--ifm-code-font-size);padding:var(--ifm-code-padding-vertical) var(--ifm-code-padding-horizontal)}a code{color:inherit}pre{background-color:var(--ifm-pre-background);border-radius:var(--ifm-pre-border-radius);color:var(--ifm-pre-color);font:var(--ifm-code-font-size)/var(--ifm-pre-line-height) var(--ifm-font-family-monospace);padding:var(--ifm-pre-padding)}pre code{background-color:initial;border:none;font-size:100%;line-height:inherit;padding:0}kbd{background-color:var(--ifm-color-emphasis-0);border:1px solid var(--ifm-color-emphasis-400);border-radius:.2rem;box-shadow:inset 0 -1px 0 var(--ifm-color-emphasis-400);color:var(--ifm-color-emphasis-800);font:80% var(--ifm-font-family-monospace);padding:.15rem .3rem}h1,h2,h3,h4,h5,h6{color:var(--ifm-heading-color);font-family:var(--ifm-heading-font-family);font-weight:var(--ifm-heading-font-weight);line-height:var(--ifm-heading-line-height);margin:var(--ifm-heading-margin-top) 0 var(--ifm-heading-margin-bottom) 0}h1{font-size:var(--ifm-h1-font-size)}h2{font-size:var(--ifm-h2-font-size)}h3{font-size:var(--ifm-h3-font-size)}h4{font-size:var(--ifm-h4-font-size)}h5{font-size:var(--ifm-h5-font-size)}h6{font-size:var(--ifm-h6-font-size)}img{max-width:100%}img[align=right]{padding-left:var(--image-alignment-padding)}img[align=left]{padding-right:var(--image-alignment-padding)}.markdown{--ifm-h1-vertical-rhythm-top:3;--ifm-h2-vertical-rhythm-top:2;--ifm-h3-vertical-rhythm-top:1.5;--ifm-heading-vertical-rhythm-top:1.25;--ifm-h1-vertical-rhythm-bottom:1.25;--ifm-heading-vertical-rhythm-bottom:1}.markdown:after,.markdown:before{content:"";display:table}.markdown:after{clear:both}.markdown 
h1:first-child{--ifm-h1-font-size:3rem;margin-bottom:calc(var(--ifm-h1-vertical-rhythm-bottom)*var(--ifm-leading))}.markdown>h2{--ifm-h2-font-size:2rem;margin-top:calc(var(--ifm-h2-vertical-rhythm-top)*var(--ifm-leading))}.markdown>h3{--ifm-h3-font-size:1.5rem;margin-top:calc(var(--ifm-h3-vertical-rhythm-top)*var(--ifm-leading))}.markdown>h4,.markdown>h5,.markdown>h6{margin-top:calc(var(--ifm-heading-vertical-rhythm-top)*var(--ifm-leading))}.markdown>p,.markdown>pre,.markdown>ul{margin-bottom:var(--ifm-leading)}.markdown li>p{margin-top:var(--ifm-list-paragraph-margin)}.markdown li+li{margin-top:var(--ifm-list-item-margin)}ol,ul{margin:0 0 var(--ifm-list-margin);padding-left:var(--ifm-list-left-padding)}ol ol,ul ol{list-style-type:lower-roman}ol ol ol,ol ul ol,ul ol ol,ul ul ol{list-style-type:lower-alpha}table{border-collapse:collapse;display:block;margin-bottom:var(--ifm-spacing-vertical)}table thead tr{border-bottom:2px solid var(--ifm-table-border-color)}table thead,table tr:nth-child(2n){background-color:var(--ifm-table-stripe-background)}table tr{background-color:var(--ifm-table-background);border-top:var(--ifm-table-border-width) solid var(--ifm-table-border-color)}table td,table th{border:var(--ifm-table-border-width) solid var(--ifm-table-border-color);padding:var(--ifm-table-cell-padding)}table th{background-color:var(--ifm-table-head-background);color:var(--ifm-table-head-color);font-weight:var(--ifm-table-head-font-weight)}table td{color:var(--ifm-table-cell-color)}strong{font-weight:var(--ifm-font-weight-bold)}a{color:var(--ifm-link-color);text-decoration:var(--ifm-link-decoration)}a:hover{color:var(--ifm-link-hover-color);text-decoration:var(--ifm-link-hover-decoration)}.button:hover,.text--no-decoration,.text--no-decoration:hover,a:not([href]){-webkit-text-decoration:none;text-decoration:none}p{margin:0 0 var(--ifm-paragraph-margin-bottom)}blockquote{border-left:var(--ifm-blockquote-border-left-width) solid 
var(--ifm-blockquote-border-color);box-shadow:var(--ifm-blockquote-shadow);color:var(--ifm-blockquote-color);font-size:var(--ifm-blockquote-font-size);padding:var(--ifm-blockquote-padding-vertical) var(--ifm-blockquote-padding-horizontal)}blockquote>:first-child{margin-top:0}blockquote>:last-child{margin-bottom:0}hr{background-color:var(--ifm-hr-background-color);border:0;height:var(--ifm-hr-height);margin:var(--ifm-hr-margin-vertical) 0}.shadow--lw{box-shadow:var(--ifm-global-shadow-lw)!important}.shadow--md{box-shadow:var(--ifm-global-shadow-md)!important}.shadow--tl{box-shadow:var(--ifm-global-shadow-tl)!important}.text--primary{color:var(--ifm-color-primary)}.text--secondary{color:var(--ifm-color-secondary)}.text--success{color:var(--ifm-color-success)}.text--info{color:var(--ifm-color-info)}.text--warning{color:var(--ifm-color-warning)}.text--danger{color:var(--ifm-color-danger)}.text--center{text-align:center}.text--left{text-align:left}.text--justify{text-align:justify}.text--right{text-align:right}.text--capitalize{text-transform:capitalize}.text--lowercase{text-transform:lowercase}.alert__heading,.text--uppercase{text-transform:uppercase}.text--light{font-weight:var(--ifm-font-weight-light)}.text--normal{font-weight:var(--ifm-font-weight-normal)}.text--semibold{font-weight:var(--ifm-font-weight-semibold)}.text--bold{font-weight:var(--ifm-font-weight-bold)}.text--italic{font-style:italic}.text--truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.text--break{word-wrap:break-word!important;word-break:break-word!important}.clean-btn{background:none;border:none;color:inherit;cursor:pointer;font-family:inherit;padding:0}.alert,.alert 
.close{color:var(--ifm-alert-foreground-color)}.clean-list{list-style:none;padding-left:0}.alert--primary{--ifm-alert-background-color:var(--ifm-color-primary-contrast-background);--ifm-alert-background-color-highlight:#3578e526;--ifm-alert-foreground-color:var(--ifm-color-primary-contrast-foreground);--ifm-alert-border-color:var(--ifm-color-primary-dark)}.alert--secondary{--ifm-alert-background-color:var(--ifm-color-secondary-contrast-background);--ifm-alert-background-color-highlight:#ebedf026;--ifm-alert-foreground-color:var(--ifm-color-secondary-contrast-foreground);--ifm-alert-border-color:var(--ifm-color-secondary-dark)}.alert--success{--ifm-alert-background-color:var(--ifm-color-success-contrast-background);--ifm-alert-background-color-highlight:#00a40026;--ifm-alert-foreground-color:var(--ifm-color-success-contrast-foreground);--ifm-alert-border-color:var(--ifm-color-success-dark)}.alert--info{--ifm-alert-background-color:var(--ifm-color-info-contrast-background);--ifm-alert-background-color-highlight:#54c7ec26;--ifm-alert-foreground-color:var(--ifm-color-info-contrast-foreground);--ifm-alert-border-color:var(--ifm-color-info-dark)}.alert--warning{--ifm-alert-background-color:var(--ifm-color-warning-contrast-background);--ifm-alert-background-color-highlight:#ffba0026;--ifm-alert-foreground-color:var(--ifm-color-warning-contrast-foreground);--ifm-alert-border-color:var(--ifm-color-warning-dark)}.alert--danger{--ifm-alert-background-color:var(--ifm-color-danger-contrast-background);--ifm-alert-background-color-highlight:#fa383e26;--ifm-alert-foreground-color:var(--ifm-color-danger-contrast-foreground);--ifm-alert-border-color:var(--ifm-color-danger-dark)}.alert{--ifm-code-background:var(--ifm-alert-background-color-highlight);--ifm-link-color:var(--ifm-alert-foreground-color);--ifm-link-hover-color:var(--ifm-alert-foreground-color);--ifm-link-decoration:underline;--ifm-tabs-color:var(--ifm-alert-foreground-color);--ifm-tabs-color-active:var(--ifm-alert-foregr
ound-color);--ifm-tabs-color-active-border:var(--ifm-alert-border-color);background-color:var(--ifm-alert-background-color);border:var(--ifm-alert-border-width) solid var(--ifm-alert-border-color);border-left-width:var(--ifm-alert-border-left-width);border-radius:var(--ifm-alert-border-radius);box-shadow:var(--ifm-alert-shadow);padding:var(--ifm-alert-padding-vertical) var(--ifm-alert-padding-horizontal)}.alert__heading{align-items:center;display:flex;font:700 var(--ifm-h5-font-size)/var(--ifm-heading-line-height) var(--ifm-heading-font-family);margin-bottom:.5rem}.alert__icon{display:inline-flex;margin-right:.4em}.alert__icon svg{fill:var(--ifm-alert-foreground-color);stroke:var(--ifm-alert-foreground-color);stroke-width:0}.alert .close{margin:calc(var(--ifm-alert-padding-vertical)*-1) calc(var(--ifm-alert-padding-horizontal)*-1) 0 0;opacity:.75}.alert .close:focus,.alert .close:hover{opacity:1}.alert a{text-decoration-color:var(--ifm-alert-border-color)}.alert a:hover{text-decoration-thickness:2px}.avatar{column-gap:var(--ifm-avatar-intro-margin);display:flex}.avatar__photo{border-radius:50%;display:block;height:var(--ifm-avatar-photo-size);overflow:hidden;width:var(--ifm-avatar-photo-size)}.card--full-height,.navbar__logo img{height:100%}.avatar__photo--sm{--ifm-avatar-photo-size:2rem}.avatar__photo--lg{--ifm-avatar-photo-size:4rem}.avatar__photo--xl{--ifm-avatar-photo-size:6rem}.avatar__intro{display:flex;flex:1 1;flex-direction:column;justify-content:center;text-align:var(--ifm-avatar-intro-alignment)}.badge,.breadcrumbs__item,.breadcrumbs__link,.button,.dropdown>.navbar__link:after{display:inline-block}.avatar__name{font:700 var(--ifm-h4-font-size)/var(--ifm-heading-line-height) 
var(--ifm-font-family-base)}.avatar__subtitle{margin-top:.25rem}.avatar--vertical{--ifm-avatar-intro-alignment:center;--ifm-avatar-intro-margin:0.5rem;align-items:center;flex-direction:column}.badge{background-color:var(--ifm-badge-background-color);border:var(--ifm-badge-border-width) solid var(--ifm-badge-border-color);border-radius:var(--ifm-badge-border-radius);color:var(--ifm-badge-color);font-size:75%;font-weight:var(--ifm-font-weight-bold);line-height:1;padding:var(--ifm-badge-padding-vertical) var(--ifm-badge-padding-horizontal)}.badge--primary{--ifm-badge-background-color:var(--ifm-color-primary)}.badge--secondary{--ifm-badge-background-color:var(--ifm-color-secondary);color:var(--ifm-color-black)}.breadcrumbs__link,.button.button--secondary.button--outline:not(.button--active):not(:hover){color:var(--ifm-font-color-base)}.badge--success{--ifm-badge-background-color:var(--ifm-color-success)}.badge--info{--ifm-badge-background-color:var(--ifm-color-info)}.badge--warning{--ifm-badge-background-color:var(--ifm-color-warning)}.badge--danger{--ifm-badge-background-color:var(--ifm-color-danger)}.breadcrumbs{margin-bottom:0;padding-left:0}.breadcrumbs__item:not(:last-child):after{background:var(--ifm-breadcrumb-separator) center;content:" ";display:inline-block;filter:var(--ifm-breadcrumb-separator-filter);height:calc(var(--ifm-breadcrumb-separator-size)*var(--ifm-breadcrumb-size-multiplier)*var(--ifm-breadcrumb-separator-size-multiplier));margin:0 var(--ifm-breadcrumb-spacing);opacity:.5;width:calc(var(--ifm-breadcrumb-separator-size)*var(--ifm-breadcrumb-size-multiplier)*var(--ifm-breadcrumb-separator-size-multiplier))}.breadcrumbs__item--active 
.breadcrumbs__link{background:var(--ifm-breadcrumb-item-background-active);color:var(--ifm-breadcrumb-color-active)}.breadcrumbs__link{border-radius:var(--ifm-breadcrumb-border-radius);font-size:calc(1rem*var(--ifm-breadcrumb-size-multiplier));padding:calc(var(--ifm-breadcrumb-padding-vertical)*var(--ifm-breadcrumb-size-multiplier)) calc(var(--ifm-breadcrumb-padding-horizontal)*var(--ifm-breadcrumb-size-multiplier));transition-duration:var(--ifm-transition-fast);transition-property:background,color}.breadcrumbs__link:any-link:hover,.breadcrumbs__link:link:hover,.breadcrumbs__link:visited:hover,area[href].breadcrumbs__link:hover{background:var(--ifm-breadcrumb-item-background-active);-webkit-text-decoration:none;text-decoration:none}.breadcrumbs--sm{--ifm-breadcrumb-size-multiplier:0.8}.breadcrumbs--lg{--ifm-breadcrumb-size-multiplier:1.2}.button{background-color:var(--ifm-button-background-color);border:var(--ifm-button-border-width) solid var(--ifm-button-border-color);border-radius:var(--ifm-button-border-radius);cursor:pointer;font-size:calc(.875rem*var(--ifm-button-size-multiplier));font-weight:var(--ifm-button-font-weight);line-height:1.5;padding:calc(var(--ifm-button-padding-vertical)*var(--ifm-button-size-multiplier)) 
calc(var(--ifm-button-padding-horizontal)*var(--ifm-button-size-multiplier));text-align:center;transition-duration:var(--ifm-button-transition-duration);transition-property:color,background,border-color;-webkit-user-select:none;user-select:none;white-space:nowrap}.button,.button:hover{color:var(--ifm-button-color)}.button--outline{--ifm-button-color:var(--ifm-button-border-color)}.button--outline:hover{--ifm-button-background-color:var(--ifm-button-border-color)}.button--link{--ifm-button-border-color:#0000;color:var(--ifm-link-color);text-decoration:var(--ifm-link-decoration)}.button--link.button--active,.button--link:active,.button--link:hover{color:var(--ifm-link-hover-color);text-decoration:var(--ifm-link-hover-decoration)}.dropdown__link--active,.dropdown__link:hover,.menu__link:hover,.navbar__brand:hover,.navbar__link--active,.navbar__link:hover,.pagination-nav__link:hover,.pagination__link:hover{-webkit-text-decoration:none;text-decoration:none}.button.disabled,.button:disabled,.button[disabled]{opacity:.65;pointer-events:none}.button--sm{--ifm-button-size-multiplier:0.8}.button--lg{--ifm-button-size-multiplier:1.35}.button--block{display:block;width:100%}.button.button--secondary{color:var(--ifm-color-gray-900)}:where(.button--primary){--ifm-button-background-color:var(--ifm-color-primary);--ifm-button-border-color:var(--ifm-color-primary)}:where(.button--primary):not(.button--outline):hover{--ifm-button-background-color:var(--ifm-color-primary-dark);--ifm-button-border-color:var(--ifm-color-primary-dark)}.button--primary.button--active,.button--primary:active{--ifm-button-background-color:var(--ifm-color-primary-darker);--ifm-button-border-color:var(--ifm-color-primary-darker)}:where(.button--secondary){--ifm-button-background-color:var(--ifm-color-secondary);--ifm-button-border-color:var(--ifm-color-secondary)}:where(.button--secondary):not(.button--outline):hover{--ifm-button-background-color:var(--ifm-color-secondary-dark);--ifm-button-border-color:var(-
-ifm-color-secondary-dark)}.button--secondary.button--active,.button--secondary:active{--ifm-button-background-color:var(--ifm-color-secondary-darker);--ifm-button-border-color:var(--ifm-color-secondary-darker)}:where(.button--success){--ifm-button-background-color:var(--ifm-color-success);--ifm-button-border-color:var(--ifm-color-success)}:where(.button--success):not(.button--outline):hover{--ifm-button-background-color:var(--ifm-color-success-dark);--ifm-button-border-color:var(--ifm-color-success-dark)}.button--success.button--active,.button--success:active{--ifm-button-background-color:var(--ifm-color-success-darker);--ifm-button-border-color:var(--ifm-color-success-darker)}:where(.button--info){--ifm-button-background-color:var(--ifm-color-info);--ifm-button-border-color:var(--ifm-color-info)}:where(.button--info):not(.button--outline):hover{--ifm-button-background-color:var(--ifm-color-info-dark);--ifm-button-border-color:var(--ifm-color-info-dark)}.button--info.button--active,.button--info:active{--ifm-button-background-color:var(--ifm-color-info-darker);--ifm-button-border-color:var(--ifm-color-info-darker)}:where(.button--warning){--ifm-button-background-color:var(--ifm-color-warning);--ifm-button-border-color:var(--ifm-color-warning)}:where(.button--warning):not(.button--outline):hover{--ifm-button-background-color:var(--ifm-color-warning-dark);--ifm-button-border-color:var(--ifm-color-warning-dark)}.button--warning.button--active,.button--warning:active{--ifm-button-background-color:var(--ifm-color-warning-darker);--ifm-button-border-color:var(--ifm-color-warning-darker)}:where(.button--danger){--ifm-button-background-color:var(--ifm-color-danger);--ifm-button-border-color:var(--ifm-color-danger)}:where(.button--danger):not(.button--outline):hover{--ifm-button-background-color:var(--ifm-color-danger-dark);--ifm-button-border-color:var(--ifm-color-danger-dark)}.button--danger.button--active,.button--danger:active{--ifm-button-background-color:var(--ifm-col
or-danger-darker);--ifm-button-border-color:var(--ifm-color-danger-darker)}.button-group{display:inline-flex;gap:var(--ifm-button-group-spacing)}.button-group>.button:not(:first-child){border-bottom-left-radius:0;border-top-left-radius:0}.button-group>.button:not(:last-child){border-bottom-right-radius:0;border-top-right-radius:0}.button-group--block{display:flex;justify-content:stretch}.button-group--block>.button{flex-grow:1}.card{background-color:var(--ifm-card-background-color);border-radius:var(--ifm-card-border-radius);box-shadow:var(--ifm-global-shadow-lw);display:flex;flex-direction:column;overflow:hidden}.card__image{padding-top:var(--ifm-card-vertical-spacing)}.card__image:first-child{padding-top:0}.card__body,.card__footer,.card__header{padding:var(--ifm-card-vertical-spacing) var(--ifm-card-horizontal-spacing)}.card__body:not(:last-child),.card__footer:not(:last-child),.card__header:not(:last-child){padding-bottom:0}.card__body>:last-child,.card__footer>:last-child,.card__header>:last-child{margin-bottom:0}.card__footer{margin-top:auto}.table-of-contents{font-size:.8rem;margin-bottom:0;padding:var(--ifm-toc-padding-vertical) 0}.table-of-contents,.table-of-contents ul{list-style:none;padding-left:var(--ifm-toc-padding-horizontal)}.table-of-contents li{margin:var(--ifm-toc-padding-vertical) var(--ifm-toc-padding-horizontal)}.table-of-contents__left-border{border-left:1px solid var(--ifm-toc-border-color)}.table-of-contents__link{color:var(--ifm-toc-link-color);display:block}.table-of-contents__link--active,.table-of-contents__link--active code,.table-of-contents__link:hover,.table-of-contents__link:hover code{color:var(--ifm-color-primary);-webkit-text-decoration:none;text-decoration:none}.close{color:var(--ifm-color-black);float:right;font-size:1.5rem;font-weight:var(--ifm-font-weight-bold);line-height:1;opacity:.5;padding:1rem;transition:opacity var(--ifm-transition-fast) 
var(--ifm-transition-timing-default)}.close:hover{opacity:.7}.close:focus{opacity:.8}.dropdown{display:inline-flex;font-weight:var(--ifm-dropdown-font-weight);position:relative;vertical-align:top}.dropdown--hoverable:hover .dropdown__menu,.dropdown--show .dropdown__menu{opacity:1;pointer-events:all;transform:translateY(-1px);visibility:visible}.dropdown__menu,.navbar__item.dropdown .navbar__link:not([href]){pointer-events:none}.dropdown--right .dropdown__menu{left:inherit;right:0}.dropdown--nocaret .navbar__link:after{content:none!important}.dropdown__menu{background-color:var(--ifm-dropdown-background-color);border-radius:var(--ifm-global-radius);box-shadow:var(--ifm-global-shadow-md);left:0;list-style:none;max-height:80vh;min-width:10rem;opacity:0;overflow-y:auto;padding:.5rem;position:absolute;top:calc(100% - var(--ifm-navbar-item-padding-vertical) + .3rem);transform:translateY(-.625rem);transition-duration:var(--ifm-transition-fast);transition-property:opacity,transform,visibility;transition-timing-function:var(--ifm-transition-timing-default);visibility:hidden;z-index:var(--ifm-z-index-dropdown)}.menu__caret,.menu__link,.menu__list-item-collapsible{border-radius:.25rem;transition:background var(--ifm-transition-fast) var(--ifm-transition-timing-default)}.dropdown__link{border-radius:.25rem;color:var(--ifm-dropdown-link-color);display:block;font-size:.875rem;margin-top:.2rem;padding:.25rem .5rem;white-space:nowrap}.dropdown__link--active,.dropdown__link:hover{background-color:var(--ifm-dropdown-hover-background-color);color:var(--ifm-dropdown-link-color)}.dropdown__link--active,.dropdown__link--active:hover{--ifm-dropdown-link-color:var(--ifm-link-color)}.dropdown>.navbar__link:after{border-color:currentcolor #0000;border-style:solid;border-width:.4em .4em 
0;content:"";margin-left:.3em;position:relative;top:2px;transform:translateY(-50%)}.footer{background-color:var(--ifm-footer-background-color);color:var(--ifm-footer-color);padding:var(--ifm-footer-padding-vertical) var(--ifm-footer-padding-horizontal)}.footer--dark{--ifm-footer-background-color:#303846;--ifm-footer-color:var(--ifm-footer-link-color);--ifm-footer-link-color:var(--ifm-color-secondary);--ifm-footer-title-color:var(--ifm-color-white)}.footer__links{margin-bottom:1rem}.footer__link-item{color:var(--ifm-footer-link-color);line-height:2}.footer__link-item:hover{color:var(--ifm-footer-link-hover-color)}.footer__link-separator{margin:0 var(--ifm-footer-link-horizontal-spacing)}.footer__logo{margin-top:1rem;max-width:var(--ifm-footer-logo-max-width)}.footer__title{color:var(--ifm-footer-title-color);font:700 var(--ifm-h4-font-size)/var(--ifm-heading-line-height) var(--ifm-font-family-base);margin-bottom:var(--ifm-heading-margin-bottom)}.menu,.navbar__link{font-weight:var(--ifm-font-weight-semibold)}.footer__item{margin-top:0}.footer__items{margin-bottom:0}[type=checkbox]{padding:0}.hero{align-items:center;background-color:var(--ifm-hero-background-color);color:var(--ifm-hero-text-color);display:flex;padding:4rem 2rem}.hero--primary{--ifm-hero-background-color:var(--ifm-color-primary);--ifm-hero-text-color:var(--ifm-font-color-base-inverse)}.hero--dark{--ifm-hero-background-color:#303846;--ifm-hero-text-color:var(--ifm-color-white)}.hero__title{font-size:3rem}.hero__subtitle{font-size:1.5rem}.menu__list{list-style:none;margin:0;padding-left:0}.menu__caret,.menu__link{padding:var(--ifm-menu-link-padding-vertical) var(--ifm-menu-link-padding-horizontal)}.menu__list .menu__list{flex:0 0 100%;margin-top:.25rem;padding-left:var(--ifm-menu-link-padding-horizontal)}.menu__list-item:not(:first-child){margin-top:.25rem}.menu__list-item--collapsed .menu__list{height:0;overflow:hidden}.menu__list-item--collapsed .menu__caret:before,.menu__list-item--collapsed 
.menu__link--sublist:after{transform:rotate(90deg)}.menu__list-item-collapsible{display:flex;flex-wrap:wrap;position:relative}.menu__caret:hover,.menu__link:hover,.menu__list-item-collapsible--active,.menu__list-item-collapsible:hover{background:var(--ifm-menu-color-background-hover)}.menu__list-item-collapsible .menu__link--active,.menu__list-item-collapsible .menu__link:hover{background:none!important}.menu__caret,.menu__link{align-items:center;display:flex}.menu__link{color:var(--ifm-menu-color);flex:1;line-height:1.25}.menu__link:hover{color:var(--ifm-menu-color)}.menu__caret:before,.menu__link--sublist-caret:after{content:"";filter:var(--ifm-menu-link-sublist-icon-filter);height:1.25rem;transform:rotate(180deg);transition:transform var(--ifm-transition-fast) linear;width:1.25rem}.menu__link--sublist-caret:after{background:var(--ifm-menu-link-sublist-icon) 50%/2rem 2rem;margin-left:auto;min-width:1.25rem}.menu__link--active,.menu__link--active:hover{color:var(--ifm-menu-color-active)}.navbar__brand,.navbar__link{color:var(--ifm-navbar-link-color)}.menu__link--active:not(.menu__link--sublist){background-color:var(--ifm-menu-color-background-active)}.menu__caret:before{background:var(--ifm-menu-link-sublist-icon) 50%/2rem 2rem}.navbar--dark,html[data-theme=dark]{--ifm-menu-link-sublist-icon-filter:invert(100%) sepia(94%) saturate(17%) hue-rotate(223deg) brightness(104%) contrast(98%)}.navbar{background-color:var(--ifm-navbar-background-color);box-shadow:var(--ifm-navbar-shadow);height:var(--ifm-navbar-height);padding:var(--ifm-navbar-padding-vertical) 
var(--ifm-navbar-padding-horizontal)}.navbar,.navbar>.container,.navbar>.container-fluid{display:flex}.navbar--fixed-top{position:sticky;top:0;z-index:var(--ifm-z-index-fixed)}.navbar-sidebar,.navbar-sidebar__backdrop{bottom:0;left:0;opacity:0;position:fixed;top:0;transition-duration:var(--ifm-transition-fast);transition-timing-function:ease-in-out;visibility:hidden}.navbar__inner{display:flex;flex-wrap:wrap;justify-content:space-between;width:100%}.navbar__brand{align-items:center;display:flex;margin-right:1rem;min-width:0}.navbar__brand:hover{color:var(--ifm-navbar-link-hover-color)}.navbar__title{flex:1 1 auto}.navbar__toggle{display:none;margin-right:.5rem}.navbar__logo{flex:0 0 auto;height:2rem;margin-right:.5rem}.navbar__items{align-items:center;display:flex;flex:1;min-width:0}.navbar__items--center{flex:0 0 auto}.navbar__items--center .navbar__brand{margin:0}.navbar__items--center+.navbar__items--right{flex:1}.navbar__items--right{flex:0 0 auto;justify-content:flex-end}.navbar__items--right>:last-child{padding-right:0}.navbar__item{display:inline-block;padding:var(--ifm-navbar-item-padding-vertical) var(--ifm-navbar-item-padding-horizontal)}.navbar__link--active,.navbar__link:hover{color:var(--ifm-navbar-link-hover-color)}.navbar--dark,.navbar--primary{--ifm-menu-color:var(--ifm-color-gray-300);--ifm-navbar-link-color:var(--ifm-color-gray-100);--ifm-navbar-search-input-background-color:#ffffff1a;--ifm-navbar-search-input-placeholder-color:#ffffff80;color:var(--ifm-color-white)}.navbar--dark{--ifm-navbar-background-color:#242526;--ifm-menu-color-background-active:#ffffff0d;--ifm-navbar-search-input-color:var(--ifm-color-white)}.navbar--primary{--ifm-navbar-background-color:var(--ifm-color-primary);--ifm-navbar-link-hover-color:var(--ifm-color-white);--ifm-menu-color-active:var(--ifm-color-white);--ifm-navbar-search-input-color:var(--ifm-color-emphasis-500)}.navbar__search-input{appearance:none;background:var(--ifm-navbar-search-input-background-color) 
var(--ifm-navbar-search-input-icon) no-repeat .75rem center/1rem 1rem;border:none;border-radius:2rem;color:var(--ifm-navbar-search-input-color);cursor:text;display:inline-block;font-size:1rem;height:2rem;padding:0 .5rem 0 2.25rem;width:12.5rem}.navbar__search-input::placeholder{color:var(--ifm-navbar-search-input-placeholder-color)}.navbar-sidebar{background-color:var(--ifm-navbar-background-color);box-shadow:var(--ifm-global-shadow-md);transform:translate3d(-100%,0,0);transition-property:opacity,visibility,transform;width:var(--ifm-navbar-sidebar-width)}.navbar-sidebar--show .navbar-sidebar,.navbar-sidebar__items{transform:translateZ(0)}.navbar-sidebar--show .navbar-sidebar,.navbar-sidebar--show .navbar-sidebar__backdrop{opacity:1;visibility:visible}.navbar-sidebar__backdrop{background-color:#0009;right:0;transition-property:opacity,visibility}.navbar-sidebar__brand{align-items:center;box-shadow:var(--ifm-navbar-shadow);display:flex;flex:1;height:var(--ifm-navbar-height);padding:var(--ifm-navbar-padding-vertical) var(--ifm-navbar-padding-horizontal)}.navbar-sidebar__items{display:flex;height:calc(100% - var(--ifm-navbar-height));transition:transform var(--ifm-transition-fast) ease-in-out}.navbar-sidebar__items--show-secondary{transform:translate3d(calc((var(--ifm-navbar-sidebar-width))*-1),0,0)}.navbar-sidebar__item{flex-shrink:0;padding:.5rem;width:calc(var(--ifm-navbar-sidebar-width))}.navbar-sidebar__back{background:var(--ifm-menu-color-background-active);font-size:15px;font-weight:var(--ifm-button-font-weight);margin:0 0 .2rem -.5rem;padding:.6rem 1.5rem;position:relative;text-align:left;top:-.5rem;width:calc(100% + 
1rem)}.navbar-sidebar__close{display:flex;margin-left:auto}.pagination{column-gap:var(--ifm-pagination-page-spacing);display:flex;font-size:var(--ifm-pagination-font-size);padding-left:0}.pagination--sm{--ifm-pagination-font-size:0.8rem;--ifm-pagination-padding-horizontal:0.8rem;--ifm-pagination-padding-vertical:0.2rem}.pagination--lg{--ifm-pagination-font-size:1.2rem;--ifm-pagination-padding-horizontal:1.2rem;--ifm-pagination-padding-vertical:0.3rem}.pagination__item{display:inline-flex}.pagination__item>span{padding:var(--ifm-pagination-padding-vertical)}.pagination__item--active .pagination__link{color:var(--ifm-pagination-color-active)}.pagination__item--active .pagination__link,.pagination__item:not(.pagination__item--active):hover .pagination__link{background:var(--ifm-pagination-item-active-background)}.pagination__item--disabled,.pagination__item[disabled]{opacity:.25;pointer-events:none}.pagination__link{border-radius:var(--ifm-pagination-border-radius);color:var(--ifm-font-color-base);display:inline-block;padding:var(--ifm-pagination-padding-vertical) var(--ifm-pagination-padding-horizontal);transition:background var(--ifm-transition-fast) var(--ifm-transition-timing-default)}.pagination-nav{display:grid;grid-gap:var(--ifm-spacing-horizontal);gap:var(--ifm-spacing-horizontal);grid-template-columns:repeat(2,1fr)}.pagination-nav__link{border:1px solid var(--ifm-color-emphasis-300);border-radius:var(--ifm-pagination-nav-border-radius);display:block;height:100%;line-height:var(--ifm-heading-line-height);padding:var(--ifm-global-spacing);transition:border-color var(--ifm-transition-fast) var(--ifm-transition-timing-default)}.pagination-nav__link:hover{border-color:var(--ifm-pagination-nav-color-hover)}.pagination-nav__link--next{grid-column:2/3;text-align:right}.pagination-nav__label{font-size:var(--ifm-h4-font-size);font-weight:var(--ifm-heading-font-weight);word-break:break-word}.pagination-nav__link--prev .pagination-nav__label:before{content:"« 
"}.pagination-nav__link--next .pagination-nav__label:after{content:" »"}.pagination-nav__sublabel{color:var(--ifm-color-content-secondary);font-size:var(--ifm-h5-font-size);font-weight:var(--ifm-font-weight-semibold);margin-bottom:.25rem}.pills__item,.tabs{font-weight:var(--ifm-font-weight-bold)}.pills{display:flex;gap:var(--ifm-pills-spacing);padding-left:0}.pills__item{border-radius:.5rem;cursor:pointer;display:inline-block;padding:.25rem 1rem;transition:background var(--ifm-transition-fast) var(--ifm-transition-timing-default)}.pills__item--active{color:var(--ifm-pills-color-active)}.pills__item--active,.pills__item:not(.pills__item--active):hover{background:var(--ifm-pills-color-background-active)}.pills--block{justify-content:stretch}.pills--block .pills__item{flex-grow:1;text-align:center}.tabs{color:var(--ifm-tabs-color);display:flex;margin-bottom:0;overflow-x:auto;padding-left:0}.tabs__item{border-bottom:3px solid #0000;border-radius:var(--ifm-global-radius);cursor:pointer;display:inline-flex;padding:var(--ifm-tabs-padding-vertical) var(--ifm-tabs-padding-horizontal);transition:background-color var(--ifm-transition-fast) var(--ifm-transition-timing-default)}.tabs__item--active{border-bottom-color:var(--ifm-tabs-color-active-border);border-bottom-left-radius:0;border-bottom-right-radius:0;color:var(--ifm-tabs-color-active)}.tabs__item:hover{background-color:var(--ifm-hover-overlay)}.tabs--block{justify-content:stretch}.tabs--block 
.tabs__item{flex-grow:1;justify-content:center}html[data-theme=dark]{--ifm-color-scheme:dark;--ifm-color-emphasis-0:var(--ifm-color-gray-1000);--ifm-color-emphasis-100:var(--ifm-color-gray-900);--ifm-color-emphasis-200:var(--ifm-color-gray-800);--ifm-color-emphasis-300:var(--ifm-color-gray-700);--ifm-color-emphasis-400:var(--ifm-color-gray-600);--ifm-color-emphasis-600:var(--ifm-color-gray-400);--ifm-color-emphasis-700:var(--ifm-color-gray-300);--ifm-color-emphasis-800:var(--ifm-color-gray-200);--ifm-color-emphasis-900:var(--ifm-color-gray-100);--ifm-color-emphasis-1000:var(--ifm-color-gray-0);--ifm-background-color:#1b1b1d;--ifm-background-surface-color:#242526;--ifm-hover-overlay:#ffffff0d;--ifm-color-content:#e3e3e3;--ifm-color-content-secondary:#fff;--ifm-breadcrumb-separator-filter:invert(64%) sepia(11%) saturate(0%) hue-rotate(149deg) brightness(99%) contrast(95%);--ifm-code-background:#ffffff1a;--ifm-scrollbar-track-background-color:#444;--ifm-scrollbar-thumb-background-color:#686868;--ifm-scrollbar-thumb-hover-background-color:#7a7a7a;--ifm-table-stripe-background:#ffffff12;--ifm-toc-border-color:var(--ifm-color-emphasis-200);--ifm-color-primary-contrast-background:#102445;--ifm-color-primary-contrast-foreground:#ebf2fc;--ifm-color-secondary-contrast-background:#474748;--ifm-color-secondary-contrast-foreground:#fdfdfe;--ifm-color-success-contrast-background:#003100;--ifm-color-success-contrast-foreground:#e6f6e6;--ifm-color-info-contrast-background:#193c47;--ifm-color-info-contrast-foreground:#eef9fd;--ifm-color-warning-contrast-background:#4d3800;--ifm-color-warning-contrast-foreground:#fff8e6;--ifm-color-danger-contrast-background:#4b1113;--ifm-color-danger-contrast-foreground:#ffebec}}:root{--ifm-color-primary:#6366f1;--ifm-color-primary-dark:#4f46e5;--ifm-color-primary-darker:#4338ca;--ifm-color-primary-darkest:#3730a3;--ifm-color-primary-light:#818cf8;--ifm-color-primary-lighter:#a5b4fc;--ifm-color-primary-lightest:#c7d2fe;--ifm-background-color:#f8fafc
;--ifm-background-surface-color:#fff;--ifm-font-color-base:#1e293b;--ifm-font-color-secondary:#64748b;--ifm-heading-color:#0f172a;--ifm-link-color:#6366f1;--ifm-link-hover-color:#4f46e5;--ifm-code-font-size:95%;--ifm-code-background:#f1f5f9;--ifm-code-border-radius:6px;--ifm-code-padding-horizontal:0.4rem;--ifm-code-padding-vertical:0.15rem;--docusaurus-highlighted-code-line-bg:#6366f114;--ifm-card-background-color:#fff;--ifm-global-shadow-lw:0 2px 8px #0000000f;--ifm-global-shadow-md:0 4px 16px #00000014;--ifm-global-shadow-tl:0 8px 32px #0000001a;--ifm-global-radius:8px;--ifm-toc-border-color:#00000014;--ifm-navbar-height:3.75rem;--hp-primary:#6366f1;--hp-primary-dark:#4f46e5;--hp-secondary:#8b5cf6;--hp-accent:#06b6d4;--hp-success:#10b981;--hp-dark:#0a0e27;--hp-dark-light:#151932;--hp-text:#e2e8f0;--hp-text-muted:#94a3b8;--hp-bg-card:#ffffff08;--hp-bg-page:#0a0e27}[data-theme=dark]{--ifm-color-primary:#818cf8;--ifm-color-primary-dark:#6366f1;--ifm-color-primary-darker:#4f46e5;--ifm-color-primary-darkest:#4338ca;--ifm-color-primary-light:#a5b4fc;--ifm-color-primary-lighter:#c7d2fe;--ifm-color-primary-lightest:#e0e7ff;--ifm-background-color:#0a0e27;--ifm-background-surface-color:#151932;--ifm-font-color-base:#e2e8f0;--ifm-font-color-secondary:#94a3b8;--ifm-heading-color:#f1f5f9;--ifm-link-color:#818cf8;--ifm-link-hover-color:#a5b4fc;--ifm-code-background:#ffffff0f;--docusaurus-highlighted-code-line-bg:#6366f126;--ifm-card-background-color:#ffffff08;--ifm-global-shadow-lw:0 2px 8px #0000004d;--ifm-global-shadow-md:0 4px 16px #0006;--ifm-global-shadow-tl:0 8px 32px #00000080;--ifm-toc-border-color:#ffffff0f}.gradient-bg-global{height:100%;left:0;pointer-events:none;position:fixed;top:0;width:100%;z-index:0}.gradient-orb-global{animation:25s ease-in-out infinite a;border-radius:50%;filter:blur(100px);opacity:.25;position:absolute}[data-theme=light] 
.gradient-orb-global{opacity:.1}.orb-global-1{background:radial-gradient(circle,#6366f1,#0000);height:600px;left:-10%;top:-10%;width:600px}.orb-global-2{animation-delay:8s;background:radial-gradient(circle,#8b5cf6,#0000);height:500px;right:-10%;top:50%;width:500px}.orb-global-3{animation-delay:15s;background:radial-gradient(circle,#06b6d4,#0000);bottom:-20%;height:700px;left:30%;width:700px}.logo_Ukns,.navbar__title{animation:3s infinite b;-webkit-text-fill-color:#0000}@keyframes a{0%,to{transform:translate(0) scale(1)}33%{transform:translate(60px,-60px) scale(1.1)}66%{transform:translate(-40px,40px) scale(.9)}}.navbar{backdrop-filter:blur(20px);-webkit-backdrop-filter:blur(20px);background:#0a0e27cc!important;border-bottom:1px solid #ffffff0d;box-shadow:none;position:sticky;z-index:100}[data-theme=light] .navbar{background:#ffffffd9!important;border-bottom:1px solid #00000014}.navbar__title{background:linear-gradient(135deg,#6366f1,#8b5cf6,#06b6d4);-webkit-background-clip:text;background-size:200% 200%;font-weight:800;background-clip:text}.navbar__link{font-weight:500}[data-theme=dark] .navbar__link,[data-theme=dark] .pagination-nav__label{color:#e2e8f0}[data-theme=dark] .navbar__link--active,[data-theme=dark] .navbar__link:hover{color:#818cf8}.navbar__toggle{color:var(--ifm-font-color-base)}.navbar-sidebar{background:var(--ifm-background-color)}.footer{background:#151932!important;border-top:1px solid #ffffff0d}[data-theme=light] .footer{background:#f1f5f9!important;border-top:1px solid #00000014}.footer__title{color:#e2e8f0;font-weight:700}[data-theme=light] .footer__title{color:#1e293b}.footer__link-item{color:#94a3b8;transition:color .3s}.footer__link-item:hover{color:#818cf8;-webkit-text-decoration:none;text-decoration:none}.footer__copyright,[data-theme=light] .footer__link-item{color:#64748b}[data-theme=light] .footer__link-item:hover{color:#6366f1}[data-theme=dark] .theme-doc-sidebar-container{border-right:1px solid #ffffff0d!important}[data-theme=dark] 
.menu{background:#0000}[data-theme=dark] .menu__link{border-radius:8px;color:#cbd5e1;transition:.2s}[data-theme=dark] .menu__link:hover{background:#6366f11a;color:#e2e8f0}[data-theme=dark] .menu__link--active:not(.menu__link--sublist){background:#6366f126;color:#818cf8;font-weight:600}[data-theme=dark] .menu__list-item-collapsible:hover{background:#6366f114}[data-theme=dark] .theme-doc-sidebar-item-category>.menu__list-item-collapsible>.menu__link{color:#e2e8f0;font-weight:600}.main-wrapper,[class*=docMainContainer],[class*=mainWrapper]{position:relative;z-index:1}.markdown h1,.markdown h2,.markdown h3,.markdown h4,.markdown h5,.markdown h6{color:var(--ifm-heading-color)}[data-theme=dark] table{border-color:#ffffff14}[data-theme=dark] table thead tr{background:#ffffff0a;border-bottom:1px solid #ffffff14}[data-theme=dark] table tbody tr{border-bottom:1px solid #ffffff0a}[data-theme=dark] table tbody tr:nth-child(2n){background:#ffffff05}[data-theme=dark] hr,[data-theme=dark] td,[data-theme=dark] th{border-color:#ffffff0f}[data-theme=dark] blockquote{background:#6366f10d;border-left-color:#818cf8;color:#cbd5e1}[data-theme=dark] .prism-code{background:#ffffff0a!important;border:1px solid #ffffff0f}[data-theme=dark] code{background:#ffffff0f;border:1px solid #ffffff14;color:#e2e8f0}[data-theme=dark] a code{color:var(--ifm-link-color)}[data-theme=dark] .codeBlockTitle_node_modules-\@docusaurus-theme-classic-lib-theme-CodeBlock-Content-styles-module{background:#ffffff0f!important;border-bottom:1px solid #ffffff0f}[data-theme=dark] .alert{background:#ffffff08;border:1px solid #ffffff0f;color:#e2e8f0}[data-theme=dark] .alert--info{background:#06b6d40f;border-left:4px solid #06b6d4}[data-theme=dark] .alert--warning{background:#f59e0b0f;border-left:4px solid #f59e0b}[data-theme=dark] .alert--danger{background:#ef44440f;border-left:4px solid #ef4444}[data-theme=dark] .alert--success{background:#10b9810f;border-left:4px solid #10b981}[data-theme=dark] 
.alert--secondary{background:#6366f10f;border-left:4px solid #818cf8}[data-theme=dark] .admonitionHeading_node_modules-\@docusaurus-theme-classic-lib-theme-Admonition-Layout-styles-module{color:inherit}[data-theme=dark] .pagination-nav__sublabel,[data-theme=dark] .table-of-contents__link{color:#94a3b8}[data-theme=dark] .table-of-contents__link--active,[data-theme=dark] .table-of-contents__link:hover,[data-theme=dark] article .avatar__name a{color:#818cf8}[data-theme=dark] .table-of-contents{border-left:1px solid #ffffff0f}[data-theme=dark] .pagination-nav__link{background:#ffffff08;border:1px solid #ffffff14;border-radius:12px;transition:.3s}[data-theme=dark] .pagination-nav__link:hover{background:#6366f10f;border-color:#6366f14d}[data-theme=dark] .blog-post-page article header h1{color:#f1f5f9}[data-theme=dark] .blog-tags a{background:#6366f11a;border:1px solid #6366f133;color:#818cf8}[data-theme=dark] .blog-tags a:hover{background:#6366f133;border-color:#6366f166;-webkit-text-decoration:none;text-decoration:none}[data-theme=dark] .navbar__search-input{background:#ffffff0d;border:1px solid #ffffff1a;color:#e2e8f0}[data-theme=dark] .navbar__search-input::placeholder{color:#64748b}[data-theme=dark] .breadcrumbs__link{background:#ffffff0a;border-radius:6px;color:#94a3b8}[data-theme=dark] .breadcrumbs__link:hover{background:#6366f11a;color:#e2e8f0}[data-theme=dark] .breadcrumbs__item--active .breadcrumbs__link{background:#6366f11f;color:#818cf8}[data-theme=dark] .tabs__item{border-bottom-color:#0000;color:#94a3b8}[data-theme=dark] .tabs__item:hover{color:#e2e8f0}[data-theme=dark] .tabs__item--active{border-bottom-color:#818cf8;color:#818cf8}[data-theme=dark] ::-webkit-scrollbar{height:8px;width:8px}[data-theme=dark] ::-webkit-scrollbar-track{background:#0000}[data-theme=dark] ::-webkit-scrollbar-thumb{background:#ffffff1f;border-radius:4px}[data-theme=dark] ::-webkit-scrollbar-thumb:hover{background:#fff3}[data-theme=dark] .dropdown__menu{background:#151932;border:1px 
solid #ffffff14}[data-theme=dark] .dropdown__link{color:#cbd5e1}[data-theme=dark] .dropdown__link:hover{background:#6366f11a;color:#e2e8f0}[data-theme=dark] .dropdown__link--active{background:#6366f11f;color:#818cf8}html.homepage-active .footer,html.homepage-active .navbar{display:none!important}html.homepage-active main{margin-top:0}html.homepage-active [class*=docMainContainer],html.homepage-active [class*=mainWrapper]{padding-top:0}[data-theme=light] .theme-doc-sidebar-container{border-right:1px solid #0000000f}[data-theme=light] .menu__link--active:not(.menu__link--sublist){background:#6366f114;color:#6366f1;font-weight:600}[data-theme=light] .menu__link:hover{background:#6366f10d}[data-theme=light] .pagination-nav__link{border-radius:12px;transition:.3s}[data-theme=light] .pagination-nav__link:hover{border-color:#6366f14d;box-shadow:0 4px 16px #6366f114}[data-theme=light] blockquote{border-left-color:#6366f1}@layer docusaurus.core{#__docusaurus-base-url-issue-banner-container{display:none}}.btn_bvfa,.btn_bvfa:hover,.componentLink_RzJT,.componentLink_RzJT:hover,.footerLinks_lH9U a,.footerLinks_lH9U a:hover,.footerList_2l2h a,.footerList_2l2h a:hover,.logo_Ukns,.navLink_aQaq,.navLink_aQaq:hover{-webkit-text-decoration:none;text-decoration:none}.hero_aEcG,.navContainer_E5Tz{margin:0 auto;max-width:1400px}[data-theme=light]{--hp-dark:#f8fafc;--hp-dark-light:#f1f5f9;--hp-text:#1e293b;--hp-text-muted:#64748b;--hp-bg-card:#00000005;--hp-bg-page:#f8fafc}.homepageWrapper_H_rv{background:var(--hp-bg-page);color:var(--hp-text);font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,sans-serif;line-height:1.6;overflow-x:hidden}.customNav_xRNg{backdrop-filter:blur(20px);background:#0a0e27cc;border-bottom:1px solid #ffffff0d;padding:1.2rem 0;position:fixed;top:0;transition:transform .3s;width:100%;z-index:1000}.ctaButtons_vsp7,.ctaDescription_HswS,.ctaTitle_arch,.customFooter_Ymmc,.hero_aEcG,.section_Q9Zo{position:relative;z-index:1}[data-theme=light] 
.customNav_xRNg{background:#ffffffd9;border-bottom:1px solid #00000014}.navContainer_E5Tz{align-items:center;display:flex;justify-content:space-between;padding:0 2rem}.logo_Ukns{background:linear-gradient(135deg,#6366f1,#8b5cf6,#06b6d4);-webkit-background-clip:text;background-size:200% 200%;font-size:1.6rem;font-weight:800;background-clip:text}@keyframes b{0%,to{background-position:0 50%}50%{background-position:100% 50%}}.navLinks_FO3Z{align-items:center;display:flex;gap:2.5rem}.navLink_aQaq{color:var(--hp-text);font-weight:500;transition:color .3s}.footerLinks_lH9U a:hover,.footerList_2l2h a:hover,.navLink_aQaq:hover{color:var(--hp-primary)}.btn_bvfa{border:none;border-radius:50px;cursor:pointer;display:inline-block;font-size:1rem;font-weight:600;padding:.75rem 2rem;transition:.4s cubic-bezier(.175,.885,.32,1.275)}.btnPrimary_hBjO{background:linear-gradient(135deg,#6366f1,#8b5cf6);box-shadow:0 10px 30px #6366f14d;color:#fff}.btnPrimary_hBjO:hover{box-shadow:0 15px 40px #6366f180;color:#fff;transform:translateY(-3px)}.btnSecondary_mRVh{background:#ffffff0d;border:2px solid #6366f180;color:var(--hp-text)}[data-theme=light] .btnSecondary_mRVh{background:#6366f10d;border-color:#6366f166}.btnSecondary_mRVh:hover{background:#6366f133;border-color:var(--hp-primary);color:var(--hp-text);transform:translateY(-3px)}.btnWhite_DoE5{background:#fff;color:var(--hp-primary)}.btnWhite_DoE5:hover{background:#f8fafc;color:var(--hp-primary);transform:translateY(-3px) scale(1.05)}.btnOutlineWhite_Kzbe{background:#0000;border:2px solid #fff;color:#fff}.btnOutlineWhite_Kzbe:hover{background:#ffffff26;color:#fff;transform:translateY(-3px)}.hero_aEcG{align-items:center;display:grid;gap:4rem;grid-template-columns:1fr 1fr;min-height:100vh;padding:10rem 2rem 5rem}.heroContent_mKPX{animation:1s ease-out c}@keyframes c{0%{opacity:0;transform:translateY(40px)}to{opacity:1;transform:translateY(0)}}.heroBadge_Z6oq{backdrop-filter:blur(10px);background:#6366f11a;border:1px solid 
#6366f14d;border-radius:50px;color:var(--hp-primary);display:inline-block;font-size:.9rem;font-weight:600;margin-bottom:2rem;padding:.5rem 1.5rem}.heroTitle_qg2I{background:linear-gradient(135deg,#fff,#a5b4fc);-webkit-background-clip:text;font-size:4.5rem;font-weight:900;line-height:1.1;margin-bottom:1.5rem;-webkit-text-fill-color:#0000;background-clip:text}.barrierAnswer_ZtxW,.barrierCard_tMSq p{line-height:1.8;color:var(--hp-text-muted)}[data-theme=light] .heroTitle_qg2I,[data-theme=light] .sectionTitle_Ut5p{background:linear-gradient(135deg,#1e293b,#6366f1);-webkit-background-clip:text;-webkit-text-fill-color:#0000;background-clip:text}.heroSubtitle_jFu1{color:var(--hp-text-muted);font-size:1.25rem;line-height:1.8;margin-bottom:2.5rem}.heroButtons_r52D{display:flex;flex-wrap:wrap;gap:1.5rem}.heroImage_xZN7{animation:1s ease-out .3s both c;position:relative}.heroImage_xZN7 img{border-radius:20px;box-shadow:0 40px 80px #00000080;width:100%}[data-theme=light] .heroImage_xZN7 img{box-shadow:0 40px 80px #00000026}.adoptionBadge_hbYR{animation:1s ease-out .6s both c;margin-top:3rem;text-align:center}.adoptionBadge_hbYR p{color:var(--hp-text-muted);font-size:.95rem}.section_Q9Zo{padding:8rem 2rem}.container_bfhl{margin:0 auto;max-width:1400px}.sectionHeader_Gahl{margin-bottom:5rem;text-align:center}.barrierCard_tMSq h3,.componentContent_xz2v h3,.sectionSubtitle_AZuW{font-weight:700;margin-bottom:1rem}.sectionSubtitle_AZuW{color:var(--hp-primary);font-size:.95rem;letter-spacing:2px;text-transform:uppercase}.sectionTitle_Ut5p{background:linear-gradient(135deg,#fff,#a5b4fc);-webkit-background-clip:text;font-size:3.5rem;font-weight:900;margin-bottom:1.5rem;-webkit-text-fill-color:#0000;background-clip:text}.barrierCard_tMSq,.componentCard_LlUg{backdrop-filter:blur(20px);background:var(--hp-bg-card);border:1px solid #ffffff14}.sectionDescription_cpL1{color:var(--hp-text-muted);font-size:1.2rem;margin:0 
auto;max-width:800px}.barriersGrid_u0Jf,.videosGrid_FXHY{display:grid;gap:2.5rem;grid-template-columns:repeat(3,1fr);margin-top:4rem}.barrierCard_tMSq{border-radius:24px;padding:2.5rem;transition:.4s}[data-theme=light] .barrierCard_tMSq,[data-theme=light] .blogCard_hyds,[data-theme=light] .componentCard_LlUg,[data-theme=light] .statCard_w2S8,[data-theme=light] .videoCard_jGks{background:#fff;border-color:#00000014;box-shadow:0 4px 20px #0000000d}.barrierCard_tMSq:hover,.videoCard_jGks:hover{border-color:#6366f14d;box-shadow:0 20px 50px #0006;transform:translateY(-8px)}.componentCardVisible_hAJc:hover,.componentCard_LlUg:hover{transform:translateY(-10px)}[data-theme=light] .barrierCard_tMSq:hover{border-color:#6366f14d;box-shadow:0 20px 50px #6366f11f}.barrierIcon_HTIA{font-size:2.5rem;margin-bottom:1.5rem}.barrierCard_tMSq h3{color:var(--hp-text);font-size:1.4rem}.barrierCard_tMSq p{font-size:.95rem}.barrierQuestions_jlWA{list-style:none;margin:1rem 0;padding:0}.barrierQuestions_jlWA li{color:var(--hp-text-muted);font-size:.92rem;line-height:1.6;padding:.4rem 0 .4rem 1.2rem;position:relative}.barrierQuestions_jlWA li:before{color:var(--hp-primary);content:"?";font-weight:700;left:0;position:absolute}.barrierAnswer_ZtxW{border-top:1px solid #ffffff0f;font-size:.92rem;margin-top:1rem;padding-top:1rem}.componentContent_xz2v,.statCard_w2S8{padding:2.5rem}[data-theme=light] .barrierAnswer_ZtxW{border-top-color:#0000000f}.componentsGrid_KtT5{display:grid;gap:3rem;grid-template-columns:repeat(3,1fr);margin-top:4rem}.componentCard_LlUg{border-radius:24px;opacity:0;overflow:hidden;transform:translateY(50px);transition:.5s cubic-bezier(.175,.885,.32,1.275)}.componentCardVisible_hAJc{opacity:1;transform:translateY(0)}.componentCard_LlUg:hover{border-color:#6366f14d;box-shadow:0 30px 60px #00000080}[data-theme=light] .componentCard_LlUg:hover{box-shadow:0 30px 60px 
#6366f11a}.blogCard_hyds:hover,.statCard_w2S8:hover{border-color:#6366f14d;transform:translateY(-5px)}.componentContent_xz2v h3{color:var(--hp-text);font-size:1.6rem}.componentContent_xz2v p{color:var(--hp-text-muted);line-height:1.7;margin-bottom:1.5rem}.componentLink_RzJT{align-items:center;display:inline-flex;font-weight:600;gap:.5rem;transition:gap .3s}.blogCard_hyds,.statCard_w2S8,.videoCard_jGks{backdrop-filter:blur(20px);transition:.4s}.componentLink_RzJT,.componentLink_RzJT:hover{color:var(--hp-primary)}.componentLink_RzJT:hover{gap:1rem}.componentIcon_JDYs{align-items:center;background:linear-gradient(135deg,#6366f11a,#8b5cf61a);display:flex;font-size:4rem;height:180px;justify-content:center;width:100%}[data-theme=light] .componentIcon_JDYs{background:linear-gradient(135deg,#6366f10f,#8b5cf60f)}.statsSection_GUBq{background:#0003}[data-theme=light] .statsSection_GUBq{background:#6366f108}.statsGrid_wBRk{display:grid;gap:2.5rem;grid-template-columns:repeat(auto-fit,minmax(250px,1fr));margin-top:4rem}.statCard_w2S8{background:var(--hp-bg-card);border:1px solid #ffffff14;border-radius:20px;text-align:center}.statLabel_I99V{color:var(--hp-text-muted);font-size:.9rem;letter-spacing:1.5px;margin-bottom:.5rem;text-transform:uppercase}.statValue_tB6D{background:linear-gradient(135deg,#6366f1,#8b5cf6);-webkit-background-clip:text;font-size:2.5rem;font-weight:900;-webkit-text-fill-color:#0000;background-clip:text}.statDescription_WIU_{color:var(--hp-text-muted);font-size:.95rem;margin-top:.5rem}.blogCard_hyds,.blogCard_hyds:hover{color:inherit;-webkit-text-decoration:none;text-decoration:none}.videoCard_jGks{background:var(--hp-bg-card);border:1px solid #ffffff14;border-radius:24px;overflow:hidden}[data-theme=light] .videoCard_jGks:hover{box-shadow:0 20px 50px 
#6366f11f}.videoWrapper_XWWU{aspect-ratio:16/9;background:#000;overflow:hidden;position:relative;width:100%}.videoPlayer_Nt7m{display:block;height:100%;object-fit:cover;width:100%}.videoContent_pd0B{padding:1.5rem 2rem 2rem}.videoContent_pd0B h3{color:var(--hp-text);font-size:1.3rem;font-weight:700;margin-bottom:.5rem}.videoContent_pd0B p{color:var(--hp-text-muted);font-size:.92rem;line-height:1.6;margin:0}.blogGrid_Qec3{display:grid;gap:2.5rem;grid-template-columns:repeat(auto-fill,minmax(350px,1fr));margin-top:4rem}.blogCard_hyds{background:var(--hp-bg-card);border:1px solid #ffffff14;border-radius:20px;display:block;overflow:hidden}.blogCardIcon_JPeR{align-items:center;background:linear-gradient(135deg,#6366f126,#06b6d426);display:flex;font-size:3rem;height:160px;justify-content:center;width:100%}[data-theme=light] .blogCardIcon_JPeR{background:linear-gradient(135deg,#6366f114,#06b6d414)}.blogContent_dJxs{padding:2rem}.blogCategory_UY54{background:#6366f133;border-radius:12px;color:var(--hp-primary);display:inline-block;font-size:.75rem;font-weight:700;margin-bottom:1rem;padding:.25rem .75rem;text-transform:uppercase}.blogCard_hyds h3{color:var(--hp-text);font-size:1.3rem;font-weight:700;margin-bottom:.75rem}.blogMeta_skDH{align-items:center;color:var(--hp-text-muted);display:flex;font-size:.85rem;gap:.5rem}.ctaSection_bmsv{background:linear-gradient(135deg,#6366f1,#8b5cf6);border-radius:40px;margin:2rem 0;overflow:hidden;padding:6rem 4rem;position:relative;text-align:center}.ctaSection_bmsv:before{animation:20s linear infinite d;background:radial-gradient(circle,#ffffff1a 0,#0000 70%);content:"";height:200%;left:-50%;position:absolute;top:-50%;width:200%}@keyframes 
d{0%{transform:rotate(0)}to{transform:rotate(1turn)}}.ctaTitle_arch{background:none;color:#fff;font-size:3.5rem;font-weight:900;margin-bottom:1.5rem;-webkit-text-fill-color:#fff}.ctaDescription_HswS{color:#ffffffe6;font-size:1.3rem;margin-bottom:3rem}.ctaButtons_vsp7{display:flex;flex-wrap:wrap;gap:1.5rem;justify-content:center}.customFooter_Ymmc{background:var(--hp-dark-light);border-top:1px solid #ffffff0d;padding:5rem 2rem 2rem}[data-theme=light] .customFooter_Ymmc{background:#f1f5f9;border-top-color:#00000014}.footerContent_obNo{display:grid;gap:4rem;grid-template-columns:2fr 1fr 1fr 1fr;margin:0 auto 3rem;max-width:1400px}.footerSection__c07 h4{color:var(--hp-text);font-size:1.2rem;font-weight:700;margin-bottom:1.5rem}.footerBottom_nS2f,.footerLinks_lH9U a,.footerList_2l2h a,.footerSection__c07 p{color:var(--hp-text-muted)}.footerSection__c07 p{line-height:1.8}.footerList_2l2h{list-style:none;margin:0;padding:0}.footerList_2l2h li{margin-bottom:.75rem}.footerList_2l2h a{transition:.3s}.footerBottom_nS2f{align-items:center;border-top:1px solid #ffffff0d;display:flex;flex-wrap:wrap;gap:1rem;justify-content:space-between;margin:0 auto;max-width:1400px;padding-top:2rem}[data-theme=light] .footerBottom_nS2f{border-top-color:#00000014}.footerLinks_lH9U{display:flex;gap:2rem}.footerLinks_lH9U a{transition:color .3s}@layer docusaurus.theme-common{body:not(.navigation-with-keyboard) :not(input):focus{outline:0}.themedComponent_mlkZ{display:none}[data-theme=dark] .themedComponent--dark_xIcU,[data-theme=light] .themedComponent--light_NVdE,html:not([data-theme]) .themedComponent--light_NVdE{display:initial}.errorBoundaryError_a6uf{color:red;white-space:pre-wrap}.errorBoundaryFallback_VBag{color:red;padding:.55rem}.details_lb9f{--docusaurus-details-summary-arrow-size:0.38rem;--docusaurus-details-transition:transform 200ms 
ease;--docusaurus-details-decoration-color:grey}.details_lb9f>summary{cursor:pointer;list-style:none;padding-left:1rem;position:relative}.details_lb9f>summary::-webkit-details-marker{display:none}.details_lb9f>summary:before{border-color:#0000 #0000 #0000 var(--docusaurus-details-decoration-color);border-style:solid;border-width:var(--docusaurus-details-summary-arrow-size);content:"";left:0;position:absolute;top:.45rem;transform:rotate(0);transform-origin:calc(var(--docusaurus-details-summary-arrow-size)/2) 50%;transition:var(--docusaurus-details-transition)}.details_lb9f[data-collapsed=false].isBrowser_bmU9>summary:before,.details_lb9f[open]:not(.isBrowser_bmU9)>summary:before{transform:rotate(90deg)}.collapsibleContent_i85q{border-top:1px solid var(--docusaurus-details-decoration-color);margin-top:1rem;padding-top:1rem}.collapsibleContent_i85q p:last-child,.details_lb9f>summary>p:last-child{margin-bottom:0}}@layer docusaurus.theme-classic{:root{--docusaurus-progress-bar-color:var(--ifm-color-primary);--docusaurus-announcement-bar-height:auto;--docusaurus-collapse-button-bg:#0000;--docusaurus-collapse-button-bg-hover:#0000001a;--doc-sidebar-width:300px;--doc-sidebar-hidden-width:30px;--docusaurus-blog-social-icon-size:1rem;--docusaurus-tag-list-border:var(--ifm-color-emphasis-300)}#nprogress{pointer-events:none}#nprogress .bar{background:var(--docusaurus-progress-bar-color);height:2px;left:0;position:fixed;top:0;width:100%;z-index:1031}#nprogress .peg{box-shadow:0 0 10px var(--docusaurus-progress-bar-color),0 0 5px var(--docusaurus-progress-bar-color);height:100%;opacity:1;position:absolute;right:0;transform:rotate(3deg) translateY(-4px);width:100px}.skipToContent_fXgn{background-color:var(--ifm-background-surface-color);color:var(--ifm-color-emphasis-900);left:100%;padding:calc(var(--ifm-global-spacing)/2) var(--ifm-global-spacing);position:fixed;top:1rem;z-index:calc(var(--ifm-z-index-fixed) + 
1)}.skipToContent_fXgn:focus{box-shadow:var(--ifm-global-shadow-md);left:1rem}.closeButton_CVFx{line-height:0;padding:0}.content_knG7{font-size:85%;padding:5px 0;text-align:center}.content_knG7 a{color:inherit;-webkit-text-decoration:underline;text-decoration:underline}.announcementBar_mb4j{align-items:center;background-color:var(--ifm-color-white);border-bottom:1px solid var(--ifm-color-emphasis-100);color:var(--ifm-color-black);display:flex;height:var(--docusaurus-announcement-bar-height)}.docSidebarContainer_YfHR,.navbarSearchContainer_Bca1:empty,.sidebarLogo_isFc,.toggleIcon_g3eP,html[data-announcement-bar-initially-dismissed=true] .announcementBar_mb4j{display:none}.announcementBarPlaceholder_vyr4{flex:0 0 10px}.announcementBarClose_gvF7{align-self:stretch;flex:0 0 30px}.announcementBarContent_xLdY{flex:1 1 auto}.toggle_vylO{height:2rem;width:2rem}.toggleButton_gllP{-webkit-tap-highlight-color:transparent;align-items:center;border-radius:50%;display:flex;height:100%;justify-content:center;transition:background var(--ifm-transition-fast);width:100%}.toggleButton_gllP:hover{background:var(--ifm-color-emphasis-200)}[data-theme-choice=dark] .darkToggleIcon_wfgR,[data-theme-choice=light] .lightToggleIcon_pyhR,[data-theme-choice=system] .systemToggleIcon_QzmC{display:initial}.toggleButtonDisabled_aARS{cursor:not-allowed}.darkNavbarColorModeToggle_X3D1:hover{background:var(--ifm-color-gray-800)}.backToTopButton_sjWU{background-color:var(--ifm-color-emphasis-200);border-radius:50%;bottom:1.3rem;box-shadow:var(--ifm-global-shadow-lw);height:3rem;opacity:0;position:fixed;right:1.3rem;transform:scale(0);transition:all var(--ifm-transition-fast) var(--ifm-transition-timing-default);visibility:hidden;width:3rem;z-index:calc(var(--ifm-z-index-fixed) - 1)}.backToTopButton_sjWU:after{background-color:var(--ifm-color-emphasis-1000);content:" ";display:inline-block;height:100%;-webkit-mask:var(--ifm-menu-link-sublist-icon) 50%/2rem 2rem 
no-repeat;mask:var(--ifm-menu-link-sublist-icon) 50%/2rem 2rem no-repeat;width:100%}.backToTopButtonShow_xfvO{opacity:1;transform:scale(1);visibility:visible}[data-theme=dark]:root{--docusaurus-collapse-button-bg:#ffffff0d;--docusaurus-collapse-button-bg-hover:#ffffff1a}.collapseSidebarButton_PEFL{display:none;margin:0}.iconExternalLink_nPIU{margin-left:.3rem}.dropdownNavbarItemMobile_J0Sd{cursor:pointer}.iconLanguage_nlXk{margin-right:5px;vertical-align:text-bottom}.navbarHideable_m1mJ{transition:transform var(--ifm-transition-fast) ease}.navbarHidden_jGov{transform:translate3d(0,calc(-100% - 2px),0)}.navbar__items--right>:last-child{padding-right:0}.footerLogoLink_BH7S{opacity:.5;transition:opacity var(--ifm-transition-fast) var(--ifm-transition-timing-default)}.footerLogoLink_BH7S:hover,.hash-link:focus,:hover>.hash-link{opacity:1}.menuExternalLink_NmtK{align-items:center}.docMainContainer_TBSr,.docRoot_UBD9{display:flex;width:100%}.authorSocialIcon_XYv3,.authorSocialLink_owbf{width:var(--docusaurus-blog-social-icon-size)}.docsWrapper_hBAB{display:flex;flex:1 0 auto}.anchorWithStickyNavbar_LWe7{scroll-margin-top:calc(var(--ifm-navbar-height) + .5rem)}.anchorWithHideOnScrollNavbar_WYt5{scroll-margin-top:.5rem}.hash-link{opacity:0;padding-left:.5rem;transition:opacity var(--ifm-transition-fast);-webkit-user-select:none;user-select:none}.hash-link:before{content:"#"}.docCardListItem_W1sv>*,body,html{height:100%}.mainWrapper_z2l0{display:flex;flex:1 0 auto;flex-direction:column}.docusaurus-mt-lg{margin-top:3rem}#__docusaurus{display:flex;flex-direction:column;min-height:100%}.sidebar_re4s{max-height:calc(100vh - var(--ifm-navbar-height) - 2rem);overflow-y:auto;position:sticky;top:calc(var(--ifm-navbar-height) + 
2rem)}.authorSocials_rSDt,.authorTitle_nd0D{overflow:hidden;-webkit-box-orient:vertical}.sidebarItemTitle_pO2u{font-size:var(--ifm-h3-font-size);font-weight:var(--ifm-font-weight-bold)}.container_mt6G,.sidebarItemList_Yudw{font-size:.9rem}.sidebarItem__DBe{margin-top:.7rem}.sidebarItemLink_mo7H{color:var(--ifm-font-color-base);display:block}.sidebarItemLink_mo7H:hover{-webkit-text-decoration:none;text-decoration:none}.sidebarItemLinkActive_I1ZP{color:var(--ifm-color-primary)!important}.yearGroupHeading_rMGB{margin-bottom:.4rem;margin-top:1.6rem}.yearGroupHeading_QT03{margin:1rem .75rem .5rem}.cardContainer_fWXF{--ifm-link-color:var(--ifm-color-emphasis-800);--ifm-link-hover-color:var(--ifm-color-emphasis-700);--ifm-link-hover-decoration:none;border:1px solid var(--ifm-color-emphasis-200);box-shadow:0 1.5px 3px 0 #00000026;transition:all var(--ifm-transition-fast) ease;transition-property:border,box-shadow}.cardContainer_fWXF:hover{border-color:var(--ifm-color-primary);box-shadow:0 3px 6px 0 #0003}.admonitionContent_BuS1>:last-child,.cardContainer_fWXF :last-child{margin-bottom:0}.cardTitle_rnsV{font-size:1.2rem}.cardDescription_PWke{font-size:.8rem}.docCardListItem_W1sv{margin-bottom:2rem}.title_f1Hy{font-size:3rem}[data-theme=dark] .githubSvg_Uu4N,[data-theme=dark] .instagramSvg_YC40,[data-theme=dark] .threadsSvg_PTXY,[data-theme=dark] .xSvg_y3PF{fill:var(--light)}[data-theme=light] .githubSvg_Uu4N,[data-theme=light] .instagramSvg_YC40,[data-theme=light] .threadsSvg_PTXY,[data-theme=light] .xSvg_y3PF{fill:var(--dark)}.authorSocials_rSDt{align-items:center;display:flex;flex-wrap:wrap;line-clamp:1;-webkit-line-clamp:1}.authorSocialLink_owbf,.authorSocials_rSDt{height:var(--docusaurus-blog-social-icon-size);line-height:0}.authorSocialLink_owbf{margin-right:.4rem}.authorSocialIcon_XYv3{height:var(--docusaurus-blog-social-icon-size)}.authorImage_XqGP{--ifm-avatar-photo-size:3.6rem}.author-as-h1_n9oJ .authorImage_XqGP{--ifm-avatar-photo-size:7rem}.author-as-h2_gXvM 
.authorImage_XqGP{--ifm-avatar-photo-size:5.4rem}.authorDetails_lV9A{align-items:flex-start;display:flex;flex-direction:column;justify-content:space-around}.authorName_yefp{display:flex;flex-direction:row;font-size:1.1rem;line-height:1.1rem}.author-as-h1_n9oJ .authorName_yefp{display:inline;font-size:2.4rem;line-height:2.4rem}.author-as-h2_gXvM .authorName_yefp{display:inline;font-size:1.4rem;line-height:1.4rem}.authorTitle_nd0D{display:-webkit-box;font-size:.8rem;line-height:1rem;line-clamp:1;-webkit-line-clamp:1}.author-as-h1_n9oJ .authorTitle_nd0D{font-size:1.2rem;line-height:1.6rem}.author-as-h2_gXvM .authorTitle_nd0D{font-size:1rem;line-height:1.3rem}.authorBlogPostCount_iiJ5{background:var(--ifm-color-secondary);border-radius:var(--ifm-global-radius);color:var(--ifm-color-black);font-size:.8rem;line-height:1.2;margin-left:.3rem;padding:.1rem .4rem}.authorListItem_n3yI{list-style-type:none;margin-bottom:2rem}.authorCol_Hf19{max-width:inherit!important}.imageOnlyAuthorRow_pa_O{display:flex;flex-flow:row wrap}.imageOnlyAuthorCol_G86a{margin-left:.3rem;margin-right:.3rem}.codeBlockContainer_Ckt0{background:var(--prism-background-color);border-radius:var(--ifm-code-border-radius);box-shadow:var(--ifm-global-shadow-lw);color:var(--prism-color);margin-bottom:var(--ifm-leading)}.codeBlock_bY9V{--ifm-pre-background:var(--prism-background-color);margin:0;padding:0}.codeBlockStandalone_MEMb{padding:0}.codeBlockLines_e6Vv{float:left;font:inherit;min-width:100%;padding:var(--ifm-pre-padding)}.codeBlockLinesWithNumbering_o6Pm{display:table;padding:var(--ifm-pre-padding) 0}:where(:root){--docusaurus-highlighted-code-line-bg:#484d5b}:where([data-theme=dark]){--docusaurus-highlighted-code-line-bg:#646464}.theme-code-block-highlighted-line{background-color:var(--docusaurus-highlighted-code-line-bg);display:block;margin:0 calc(var(--ifm-pre-padding)*-1);padding:0 
var(--ifm-pre-padding)}.codeLine_lJS_{counter-increment:a;display:table-row}.codeLineNumber_Tfdd{background:var(--ifm-pre-background);display:table-cell;left:0;overflow-wrap:normal;padding:0 var(--ifm-pre-padding);position:sticky;text-align:right;width:1%}.codeLineNumber_Tfdd:before{content:counter(a);opacity:.4}.theme-code-block-highlighted-line .codeLineNumber_Tfdd:before{opacity:.8}.codeLineContent_feaV{padding-right:var(--ifm-pre-padding)}.theme-code-block:hover .copyButtonCopied_Vdqa{opacity:1!important}.copyButtonIcons_IEyt{height:1.125rem;position:relative;width:1.125rem}.copyButtonIcon_TrPX,.copyButtonSuccessIcon_cVMy{left:0;position:absolute;top:0;fill:currentColor;height:inherit;opacity:inherit;transition:all var(--ifm-transition-fast) ease;width:inherit}.copyButtonSuccessIcon_cVMy{color:#00d600;left:50%;opacity:0;top:50%;transform:translate(-50%,-50%) scale(.33)}.copyButtonCopied_Vdqa .copyButtonIcon_TrPX{opacity:0;transform:scale(.33)}.copyButtonCopied_Vdqa .copyButtonSuccessIcon_cVMy{opacity:1;transform:translate(-50%,-50%) scale(1);transition-delay:75ms}.wordWrapButtonIcon_b1P5{height:1.2rem;width:1.2rem}.wordWrapButtonEnabled_uzNF .wordWrapButtonIcon_b1P5{color:var(--ifm-color-primary)}.buttonGroup_M5ko{column-gap:.2rem;display:flex;position:absolute;right:calc(var(--ifm-pre-padding)/2);top:calc(var(--ifm-pre-padding)/2)}.buttonGroup_M5ko button{align-items:center;background:var(--prism-background-color);border:1px solid var(--ifm-color-emphasis-300);border-radius:var(--ifm-global-radius);color:var(--prism-color);display:flex;line-height:0;opacity:0;padding:.4rem;transition:opacity var(--ifm-transition-fast) ease-in-out}.buttonGroup_M5ko button:focus-visible,.buttonGroup_M5ko button:hover{opacity:1!important}.theme-code-block:hover .buttonGroup_M5ko button{opacity:.4}.tag_zVej{border:1px solid var(--docusaurus-tag-list-border);transition:border 
var(--ifm-transition-fast)}.tag_zVej:hover{--docusaurus-tag-list-border:var(--ifm-link-color);-webkit-text-decoration:none;text-decoration:none}.tagRegular_sFm0{border-radius:var(--ifm-global-radius);font-size:90%;padding:.2rem .5rem .3rem}.tagWithCount_h2kH{align-items:center;border-left:0;display:flex;padding:0 .5rem 0 1rem;position:relative}.tagWithCount_h2kH:after,.tagWithCount_h2kH:before{border:1px solid var(--docusaurus-tag-list-border);content:"";position:absolute;top:50%;transition:inherit}.tagWithCount_h2kH:before{border-bottom:0;border-right:0;height:1.18rem;right:100%;transform:translate(50%,-50%) rotate(-45deg);width:1.18rem}.tagWithCount_h2kH:after{border-radius:50%;height:.5rem;left:0;transform:translateY(-50%);width:.5rem}.tagWithCount_h2kH span{background:var(--ifm-color-secondary);border-radius:var(--ifm-global-radius);color:var(--ifm-color-black);font-size:.7rem;line-height:1.2;margin-left:.3rem;padding:.1rem .4rem}.tag_Nnez{display:inline-block;margin:.5rem .5rem 0 1rem}.codeBlockContent_QJqH{border-radius:inherit;direction:ltr;position:relative}.codeBlockTitle_OeMC{border-bottom:1px solid var(--ifm-color-emphasis-300);border-top-left-radius:inherit;border-top-right-radius:inherit;font-size:var(--ifm-code-font-size);font-weight:500;padding:.75rem var(--ifm-pre-padding)}.codeBlockTitle_OeMC+.codeBlockContent_QJqH .codeBlock_a8dz{border-top-left-radius:0;border-top-right-radius:0}.tags_jXut{display:inline}.tag_QGVx{display:inline-block;margin:0 .4rem .5rem 0}.iconEdit_Z9Sw{margin-right:.3em;vertical-align:sub}.lastUpdated_JAkA{font-size:smaller;font-style:italic;margin-top:.2rem}.tocCollapsibleButton_TO0P{align-items:center;display:flex;font-size:inherit;justify-content:space-between;padding:.4rem .8rem;width:100%}.tocCollapsibleButton_TO0P:after{background:var(--ifm-menu-link-sublist-icon) 50% 50%/2rem 2rem no-repeat;content:"";filter:var(--ifm-menu-link-sublist-icon-filter);height:1.25rem;transform:rotate(180deg);transition:transform 
var(--ifm-transition-fast);width:1.25rem}.tocCollapsibleButtonExpanded_MG3E:after,.tocCollapsibleExpanded_sAul{transform:none}.tocCollapsible_ETCw{background-color:var(--ifm-menu-color-background-active);border-radius:var(--ifm-global-radius);margin:1rem 0}.tocCollapsibleContent_vkbj>ul{border-left:none;border-top:1px solid var(--ifm-color-emphasis-300);font-size:15px;padding:.2rem 0}.tocCollapsibleContent_vkbj ul li{margin:.4rem .8rem}.tocCollapsibleContent_vkbj a{display:block}.details_b_Ee{--docusaurus-details-decoration-color:var(--ifm-alert-border-color);--docusaurus-details-transition:transform var(--ifm-transition-fast) ease;border:1px solid var(--ifm-alert-border-color);margin:0 0 var(--ifm-spacing-vertical)}.containsTaskList_mC6p{list-style:none}:not(.containsTaskList_mC6p>li)>.containsTaskList_mC6p{padding-left:0}.img_ev3q{height:auto}.tableOfContents_bqdL{max-height:calc(100vh - var(--ifm-navbar-height) - 2rem);overflow-y:auto;position:sticky;top:calc(var(--ifm-navbar-height) + 1rem)}.admonition_xJq3{margin-bottom:1em}.admonitionHeading_Gvgb{font:var(--ifm-heading-font-weight) var(--ifm-h5-font-size)/var(--ifm-heading-line-height) var(--ifm-heading-font-family);text-transform:uppercase}.admonitionHeading_Gvgb:not(:last-child){margin-bottom:.3rem}.admonitionHeading_Gvgb code{text-transform:none}.admonitionIcon_Rf37{display:inline-block;margin-right:.4em;vertical-align:middle}.admonitionIcon_Rf37 svg{display:inline-block;height:1.6em;width:1.6em;fill:var(--ifm-alert-foreground-color)}.breadcrumbHomeIcon_YNFT{height:1.1rem;position:relative;top:1px;vertical-align:top;width:1.1rem}.breadcrumbsContainer_Z_bl{--ifm-breadcrumb-size-multiplier:0.8;margin-bottom:.8rem}.title_kItE{--ifm-h1-font-size:3rem;margin-bottom:calc(var(--ifm-leading)*1.25)}.docItemContainer_Djhp article>:first-child,.docItemContainer_Djhp header+*{margin-top:0}.mdxPageWrapper_j9I6{justify-content:center}}@media 
(min-width:997px){.collapseSidebarButton_PEFL,.expandButton_TmdG{background-color:var(--docusaurus-collapse-button-bg)}:root{--docusaurus-announcement-bar-height:30px}.announcementBarClose_gvF7,.announcementBarPlaceholder_vyr4{flex-basis:50px}.collapseSidebarButton_PEFL{border:1px solid var(--ifm-toc-border-color);border-radius:0;bottom:0;display:block!important;height:40px;position:sticky}.collapseSidebarButtonIcon_kv0_{margin-top:4px;transform:rotate(180deg)}.expandButtonIcon_i1dp,[dir=rtl] .collapseSidebarButtonIcon_kv0_{transform:rotate(0)}.collapseSidebarButton_PEFL:focus,.collapseSidebarButton_PEFL:hover,.expandButton_TmdG:focus,.expandButton_TmdG:hover{background-color:var(--docusaurus-collapse-button-bg-hover)}.navbarSearchContainer_Bca1{padding:var(--ifm-navbar-item-padding-vertical) var(--ifm-navbar-item-padding-horizontal)}.menuHtmlItem_M9Kj{padding:var(--ifm-menu-link-padding-vertical) var(--ifm-menu-link-padding-horizontal)}.menu_SIkG{flex-grow:1;padding:.5rem}@supports (scrollbar-gutter:stable){.menu_SIkG{padding:.5rem 0 .5rem .5rem;scrollbar-gutter:stable}}.menuWithAnnouncementBar_GW3s{margin-bottom:var(--docusaurus-announcement-bar-height)}.sidebar_njMd{display:flex;flex-direction:column;height:100%;padding-top:var(--ifm-navbar-height);width:var(--doc-sidebar-width)}.sidebarWithHideableNavbar_wUlq{padding-top:0}.sidebarHidden_VK0M{opacity:0;visibility:hidden}.sidebarLogo_isFc{align-items:center;color:inherit!important;display:flex!important;margin:0 var(--ifm-navbar-padding-horizontal);max-height:var(--ifm-navbar-height);min-height:var(--ifm-navbar-height);-webkit-text-decoration:none!important;text-decoration:none!important}.sidebarLogo_isFc img{height:2rem;margin-right:.5rem}.expandButton_TmdG{align-items:center;display:flex;height:100%;justify-content:center;position:absolute;right:0;top:0;transition:background-color var(--ifm-transition-fast) ease;width:100%}[dir=rtl] 
.expandButtonIcon_i1dp{transform:rotate(180deg)}.docSidebarContainer_YfHR{border-right:1px solid var(--ifm-toc-border-color);clip-path:inset(0);display:block;margin-top:calc(var(--ifm-navbar-height)*-1);transition:width var(--ifm-transition-fast) ease;width:var(--doc-sidebar-width);will-change:width}.docSidebarContainerHidden_DPk8{cursor:pointer;width:var(--doc-sidebar-hidden-width)}.sidebarViewport_aRkj{height:100%;max-height:100vh;position:sticky;top:0}.docMainContainer_TBSr{flex-grow:1;max-width:calc(100% - var(--doc-sidebar-width))}.docMainContainerEnhanced_lQrH{max-width:calc(100% - var(--doc-sidebar-hidden-width))}.docItemWrapperEnhanced_JWYK{max-width:calc(var(--ifm-container-width) + var(--doc-sidebar-width))!important}.lastUpdated_JAkA{text-align:right}.tocMobile_ITEo{display:none}.docItemCol_VOVn,.generatedIndexPage_vN6x{max-width:75%!important}}@media (min-width:1440px){.container{max-width:var(--ifm-container-width-xl)}}@media (max-width:1024px){.hero_aEcG{grid-template-columns:1fr;padding-top:8rem;text-align:center}.heroButtons_r52D{justify-content:center}.componentsGrid_KtT5,.footerContent_obNo,.videosGrid_FXHY{grid-template-columns:1fr 1fr}.barriersGrid_u0Jf{grid-template-columns:1fr}}@media (max-width:996px){.col{--ifm-col-width:100%;flex-basis:var(--ifm-col-width);margin-left:0}.footer{--ifm-footer-padding-horizontal:0}.colorModeToggle_DEke,.footer__link-separator,.navbar__item,.sidebar_re4s,.tableOfContents_bqdL{display:none}.footer__col{margin-bottom:calc(var(--ifm-spacing-vertical)*3)}.footer__link-item{display:block;width:max-content}.hero{padding-left:0;padding-right:0}.navbar>.container,.navbar>.container-fluid{padding:0}.navbar__toggle{display:inherit}.navbar__search-input{width:9rem}.pills--block,.tabs--block{flex-direction:column}.navbarSearchContainer_Bca1{position:absolute;right:var(--ifm-navbar-padding-horizontal)}.docItemContainer_F8PC{padding:0 .3rem}}@media 
(max-width:768px){.heroTitle_qg2I{font-size:3rem}.ctaTitle_arch,.sectionTitle_Ut5p{font-size:2.5rem}.navLinks_FO3Z a:not(.btn_bvfa):not(.btnPrimary_hBjO){display:none}.blogGrid_Qec3,.componentsGrid_KtT5,.footerContent_obNo,.videosGrid_FXHY{grid-template-columns:1fr}.ctaSection_bmsv{border-radius:20px;padding:4rem 2rem}.section_Q9Zo{padding:4rem 1.5rem}.hero_aEcG{padding:7rem 1.5rem 3rem}}@media (max-width:576px){.markdown h1:first-child{--ifm-h1-font-size:2rem}.markdown>h2{--ifm-h2-font-size:1.5rem}.markdown>h3{--ifm-h3-font-size:1.25rem}.title_f1Hy{font-size:2rem}}@media (max-width:480px){.heroTitle_qg2I{font-size:2.2rem}.sectionTitle_Ut5p{font-size:2rem}.statsGrid_wBRk{grid-template-columns:1fr 1fr}.heroButtons_r52D{align-items:center;flex-direction:column}}@media (hover:hover){.backToTopButton_sjWU:hover{background-color:var(--ifm-color-emphasis-300)}}@media (pointer:fine){.thin-scrollbar{scrollbar-width:thin}.thin-scrollbar::-webkit-scrollbar{height:var(--ifm-scrollbar-size);width:var(--ifm-scrollbar-size)}.thin-scrollbar::-webkit-scrollbar-track{background:var(--ifm-scrollbar-track-background-color);border-radius:10px}.thin-scrollbar::-webkit-scrollbar-thumb{background:var(--ifm-scrollbar-thumb-background-color);border-radius:10px}.thin-scrollbar::-webkit-scrollbar-thumb:hover{background:var(--ifm-scrollbar-thumb-hover-background-color)}}@media (prefers-reduced-motion:reduce){:root{--ifm-transition-fast:0ms;--ifm-transition-slow:0ms}}@media print{.announcementBar_mb4j,.footer,.menu,.navbar,.pagination-nav,.table-of-contents,.tocMobile_ITEo{display:none}.tabs{page-break-inside:avoid}.codeBlockLines_e6Vv{white-space:pre-wrap}} \ No newline at end of file diff --git a/docs/assets/js/3aeb33c7.dbef3914.js b/docs/assets/js/3aeb33c7.dbef3914.js new file mode 100644 index 00000000..74500548 --- /dev/null +++ b/docs/assets/js/3aeb33c7.dbef3914.js @@ -0,0 +1 @@ +"use 
strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[974],{5969:e=>{e.exports=JSON.parse('{"permalink":"/BharatMLStack/blog/post-five","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-five/index.md","source":"@site/blog/bharatmlstack-history/post-five/index.md","title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","description":"BharatMLStack","date":"2025-06-02T00:00:00.000Z","tags":[{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"vllm","permalink":"/BharatMLStack/blog/tags/vllm"},{"inline":true,"label":"tensorrt-llm","permalink":"/BharatMLStack/blog/tags/tensorrt-llm"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":4.93,"hasTruncateMarker":false,"authors":[{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null}],"frontMatter":{"slug":"post-five","title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","authors":["jaya"],"date":"2025-6-2","tags":["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"nextItem":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/post-four"}}')},7309:(e,t,i)=>{i.r(t),i.d(t,{assets:()=>h,contentTitle:()=>d,default:()=>o,frontMatter:()=>r,metadata:()=>n,toc:()=>c});var n=i(5969),s=i(4848),l=i(8453);const r={slug:"post-five",title:"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at 
Scale",authors:["jaya"],date:"2025-6-2",tags:["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},d=void 0,h={authorsImageUrls:[void 0]},c=[{value:"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale",id:"llm-inference-optimization-techniques-engineering-sub-second-latency-at-scale",level:2},{value:"1. Advanced Memory Management: Paged & Prefix KV Caching",id:"1-advanced-memory-management-paged--prefix-kv-caching",level:2},{value:"Paged KV caching",id:"paged-kv-caching",level:3},{value:"KV cache quantization",id:"kv-cache-quantization",level:3},{value:"Prefix caching (the "voice bot" optimizer)",id:"prefix-caching-the-voice-bot-optimizer",level:3},{value:"2. Aggressive Quantization (INT4 AWQ & FP8)",id:"2-aggressive-quantization-int4-awq--fp8",level:2},{value:"INT4 AWQ (Activation-aware Weight Quantization)",id:"int4-awq-activation-aware-weight-quantization",level:3},{value:"FP8 precision",id:"fp8-precision",level:3},{value:"3. Kernel Fusion & Custom Plugins",id:"3-kernel-fusion--custom-plugins",level:2},{value:"4. Inflight (Continuous) Batching",id:"4-inflight-continuous-batching",level:2},{value:"5. Parallelism Strategies: Scaling Beyond One GPU",id:"5-parallelism-strategies-scaling-beyond-one-gpu",level:2},{value:"6. 
Speculative Decoding",id:"6-speculative-decoding",level:2},{value:"Few Benchmarks",id:"few-benchmarks",level:2},{value:"Search query rewriting",id:"search-query-rewriting",level:3},{value:"Voice bot query",id:"voice-bot-query",level:3},{value:"Conclusion",id:"conclusion",level:2}];function a(e){const t={h2:"h2",h3:"h3",img:"img",li:"li",p:"p",strong:"strong",table:"table",tbody:"tbody",td:"td",th:"th",thead:"thead",tr:"tr",ul:"ul",...(0,l.R)(),...e.components};return(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)(t.p,{children:(0,s.jsx)(t.img,{alt:"BharatMLStack",src:i(9200).A+"",width:"1396",height:"460"})}),"\n",(0,s.jsx)(t.h2,{id:"llm-inference-optimization-techniques-engineering-sub-second-latency-at-scale",children:"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale"}),"\n",(0,s.jsx)(t.p,{children:"Raw execution of Large Language Models is inherently expensive and memory-intensive. To achieve sub-second latency and high throughput, we implement a multi-layered optimization strategy that targets the entire inference stack\u2014from memory management to kernel execution."}),"\n",(0,s.jsx)(t.h2,{id:"1-advanced-memory-management-paged--prefix-kv-caching",children:"1. Advanced Memory Management: Paged & Prefix KV Caching"}),"\n",(0,s.jsx)(t.p,{children:"The most significant bottleneck in LLM inference is not always compute, but memory bandwidth\u2014specifically managing the Key-Value (KV) cache."}),"\n",(0,s.jsx)(t.h3,{id:"paged-kv-caching",children:"Paged KV caching"}),"\n",(0,s.jsxs)(t.p,{children:["Standard caching suffers from fragmentation. We use ",(0,s.jsx)(t.strong,{children:"Paged KV caching"}),", which operates similarly to an operating system's virtual memory: the KV cache is divided into non-contiguous blocks. 
This lets us serve larger batch sizes without running out of memory."]}),"\n",(0,s.jsx)(t.h3,{id:"kv-cache-quantization",children:"KV cache quantization"}),"\n",(0,s.jsxs)(t.p,{children:["To further maximize available memory, we implement ",(0,s.jsx)(t.strong,{children:"KV cache quantization"})," (e.g., FP8). By compressing stored attention keys and values from 16-bit to 8-bit, we nearly double the effective context window capacity of the GPU, allowing longer conversations or larger batches without materially degrading quality."]}),"\n",(0,s.jsx)(t.h3,{id:"prefix-caching-the-voice-bot-optimizer",children:'Prefix caching (the "voice bot" optimizer)'}),"\n",(0,s.jsxs)(t.p,{children:['For use cases like GenAI voice bots where the system prompt (e.g., "You are a helpful assistant...") is static across thousands of requests, we enable ',(0,s.jsx)(t.strong,{children:"prefix caching"}),"."]}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Impact"}),": By reusing pre-computed KV states for common prefixes, we achieve a cache hit rate of ~90%. This reduces ",(0,s.jsx)(t.strong,{children:"Time To First Token (TTFT)"})," by skipping redundant computation of the system prompt."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"2-aggressive-quantization-int4-awq--fp8",children:"2. Aggressive Quantization (INT4 AWQ & FP8)"}),"\n",(0,s.jsx)(t.p,{children:"Running models in their native 16-bit precision (BF16) restricts maximum batch size and throughput. We use quantization to shrink model weights without sacrificing accuracy."}),"\n",(0,s.jsx)(t.h3,{id:"int4-awq-activation-aware-weight-quantization",children:"INT4 AWQ (Activation-aware Weight Quantization)"}),"\n",(0,s.jsxs)(t.p,{children:["For the Llama 3 family, we use ",(0,s.jsx)(t.strong,{children:"AWQ"})," to compress weights to 4 bits. 
This reduces model size by ~75%, allowing larger models to fit into L4 GPU memory and significantly improving token generation speed."]}),"\n",(0,s.jsx)(t.h3,{id:"fp8-precision",children:"FP8 precision"}),"\n",(0,s.jsxs)(t.p,{children:["For NVIDIA Hopper (H100) architectures, we are exploring ",(0,s.jsx)(t.strong,{children:"FP8 quantization"}),", leveraging native FP8 tensor cores to accelerate matrix multiplications while maintaining a higher dynamic range than integer quantization."]}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Verification"}),": We validate quantized models by comparing dot-product similarity of embeddings against the FP16 baseline, consistently achieving ",(0,s.jsx)(t.strong,{children:">99% similarity"}),"."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"3-kernel-fusion--custom-plugins",children:"3. Kernel Fusion & Custom Plugins"}),"\n",(0,s.jsx)(t.p,{children:"To minimize overhead from launching thousands of small GPU operations, we fuse them into monolithic kernels using NVIDIA TensorRT plugins."}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Flash attention & FMHA"}),": We enable ",(0,s.jsx)(t.strong,{children:"Fused Multi-Head Attention (FMHA)"})," combined with flash attention to reduce memory reads/writes."]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"GEMM plugins"}),": We use specialized ",(0,s.jsx)(t.strong,{children:"GEMM"})," plugins to accelerate transformer linear layers."]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Removing input padding"}),": Instead of padding short sequences to match the longest, we remove input padding so the GPU processes only valid tokens."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"4-inflight-continuous-batching",children:"4. 
Inflight (Continuous) Batching"}),"\n",(0,s.jsx)(t.p,{children:"Traditional static batching waits for all requests in a batch to finish before returning results\u2014so one long response delays everyone else."}),"\n",(0,s.jsxs)(t.p,{children:["We implement ",(0,s.jsx)(t.strong,{children:"inflight batching"}),": as soon as one request completes, its slot is freed and filled by a new request from the queue. This keeps GPUs saturated and decouples latency of short queries from long ones."]}),"\n",(0,s.jsx)(t.h2,{id:"5-parallelism-strategies-scaling-beyond-one-gpu",children:"5. Parallelism Strategies: Scaling Beyond One GPU"}),"\n",(0,s.jsx)(t.p,{children:"For large models (e.g., 70B+ parameters) that cannot fit into the VRAM of a single GPU, we use parallelism strategies."}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Tensor parallelism (TP)"}),": Split weight matrices across multiple GPUs (e.g., 4\xd7 L4 or 8\xd7 A100). Each GPU computes a shard and outputs are reduced at every layer."]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Pipeline parallelism (PP)"}),": Split model layers across GPUs to pipeline compute (e.g., while one GPU computes later layers for Request A, another starts early layers for Request B)."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"6-speculative-decoding",children:"6. Speculative Decoding"}),"\n",(0,s.jsxs)(t.p,{children:["To reduce inter-token latency (ITL), we explore ",(0,s.jsx)(t.strong,{children:"speculative decoding"}),"."]}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Mechanism"}),': A smaller, faster "draft" model speculatively generates a short token sequence (e.g., 5 tokens).']}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Verification"}),": The larger target model verifies those tokens in one parallel forward pass. 
If correct, we effectively generate multiple tokens per large-model step; if not, we discard and regenerate. This is effective for predictable text, improving perceived generation speed."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"few-benchmarks",children:"Few Benchmarks"}),"\n",(0,s.jsx)(t.p,{children:"Below are a couple of representative use cases and performance numbers."}),"\n",(0,s.jsx)(t.h3,{id:"search-query-rewriting",children:"Search query rewriting"}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"LLM"}),": Fine-tuned llama-3.2-1B"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Input & output token length"}),": ~10\u201320"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Response type"}),": Non-streaming"]}),"\n"]}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{children:"Inference runtime"}),(0,s.jsx)(t.th,{children:"Hardware"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Max requests/sec"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Max p99 latency"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{children:"4 \xd7 L4 GPUs (multi-GPU)"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1000"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"95 ms"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{children:"1 \xd7 A100 40 GB GPU"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1000"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"69 ms"})]})]})]}),"\n",(0,s.jsx)(t.h3,{id:"voice-bot-query",children:"Voice bot query"}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"LLM"}),": Llama-3.1-8B"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Input token length"}),": 
~1900\u20132000"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Output token length"}),": ~200"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Response type"}),": Streaming"]}),"\n"]}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{children:"Inference runtime"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Concurrency"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"p99 TTFT (ms)"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"p99 ITL (ms)"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Token throughput (tokens/sec)"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Request throughput (req/sec)"}),(0,s.jsx)(t.th,{children:"Hardware"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"36.27"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"22.78"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"45.66"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.23"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"49.81"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"23.21"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"89.37"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.45"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"55.33"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"36.62"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"153.39"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.78"}),(0,s.jsx)(t.td,{ch
ildren:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"66.5"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"39.11"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"279.88"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1.47"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"16"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"131.8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"30.39"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"547.8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2.77"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"32"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"277.22"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"48.02"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"925.7"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4.78"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"64"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"498.52"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"71.62"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,164.40"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"6.2"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"128"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"677.31"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"120.37"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,445.18"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},ch
ildren:"7.69"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"256"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,926.31"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"216.88"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,600.81"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"8.52"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"21.17"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"9.24"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"130.05"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.68"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"25.78"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"9.21"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"264.5"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1.35"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"28.52"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"10.99"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"437.69"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2.27"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"34.4"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"12.61"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"760.49"}),(0,s.jsx)
(t.td,{style:{textAlign:"right"},children:"3.96"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"16"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"68.03"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"14.32"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,343.80"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"7.01"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"32"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"185.96"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"16.82"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2,287.30"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"11.92"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"64"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"136.87"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"21.17"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"3,625.22"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"18.89"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"128"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"463.78"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"34.15"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4,456.51"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"23.24"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"256"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"890.12"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"59.18"}),(0,s.jsx)(t.td
,{style:{textAlign:"right"},children:"5,188.24"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"27.05"}),(0,s.jsx)(t.td,{children:"A100"})]})]})]}),"\n",(0,s.jsx)(t.h2,{id:"conclusion",children:"Conclusion"}),"\n",(0,s.jsx)(t.p,{children:"High-performance LLM inference is fundamentally a systems engineering problem: memory efficiency, kernel execution, batching strategy, and parallelism determine real-world latency and throughput. Techniques such as paged KV caching, aggressive quantization, kernel fusion, and inflight batching improve GPU utilization while reducing latency and memory pressure."}),"\n",(0,s.jsx)(t.p,{children:"These optimizations enable the platform to deliver sub-second responses, sustain high concurrency, and efficiently serve both lightweight and long-context workloads. By continuously optimizing across the full inference stack, we keep LLM serving scalable, cost-efficient, and production-ready for real-time AI applications."})]})}function o(e={}){const{wrapper:t}={...(0,l.R)(),...e.components};return t?(0,s.jsx)(t,{...e,children:(0,s.jsx)(a,{...e})}):a(e)}},8453:(e,t,i)=>{i.d(t,{R:()=>r,x:()=>d});var n=i(6540);const s={},l=n.createContext(s);function r(e){const t=n.useContext(l);return n.useMemo(function(){return"function"==typeof e?e(t):{...t,...e}},[t,e])}function d(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(s):e.components||s:r(e.components),n.createElement(l.Provider,{value:t},e.children)}},9200:(e,t,i)=>{i.d(t,{A:()=>n});const n=i.p+"assets/images/bms-7399e8796d2cd24617c432518ce3f312.png"}}]); \ No newline at end of file diff --git a/docs/assets/js/428aafcc.1b6a0a9c.js b/docs/assets/js/428aafcc.1b6a0a9c.js new file mode 100644 index 00000000..7982c88b --- /dev/null +++ b/docs/assets/js/428aafcc.1b6a0a9c.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[5503],{702:(e,n,i)=>{i.d(n,{A:()=>t});const 
t=i.p+"assets/images/vss-c482f6eac4c68b3219e4c562a6b717ec.png"},788:e=>{e.exports=JSON.parse('{"permalink":"/BharatMLStack/blog/post-three","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-three/index.md","source":"@site/blog/bharatmlstack-history/post-three/index.md","title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","description":"BharatMLStack","date":"2024-05-21T00:00:00.000Z","tags":[{"inline":true,"label":"model-inference","permalink":"/BharatMLStack/blog/tags/model-inference"},{"inline":true,"label":"embedding-search","permalink":"/BharatMLStack/blog/tags/embedding-search"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":3.6,"hasTruncateMarker":false,"authors":[{"name":"Aditya Kumar","title":"Lead Software Engineer @ Meesho","url":"https://github.com/Adit2607","imageURL":"https://github.com/Adit2607.png","key":"aditya","page":null},{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null},{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"slug":"post-three","title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","authors":["aditya","jaya","adarsha"],"date":"2024-05-21T00:00:00.000Z","tags":["model-inference","embedding-search","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/post-four"},"nextItem":{"title":"Building Meesho\u2019s ML 
Platform: Lessons from the First-Gen System (Part 2)","permalink":"/BharatMLStack/blog/post-two"}}')},6e3:(e,n,i)=>{i.d(n,{A:()=>t});const t=i.p+"assets/images/bms-7399e8796d2cd24617c432518ce3f312.png"},7999:(e,n,i)=>{i.r(n),i.d(n,{assets:()=>o,contentTitle:()=>l,default:()=>h,frontMatter:()=>s,metadata:()=>t,toc:()=>d});var t=i(788),a=i(4848),r=i(8453);const s={slug:"post-three",title:"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search",authors:["aditya","jaya","adarsha"],date:new Date("2024-05-21T00:00:00.000Z"),tags:["model-inference","embedding-search","mlplatform","meesho","bharatmlstack"]},l=void 0,o={authorsImageUrls:[void 0,void 0,void 0]},d=[{value:"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search",id:"cracking-the-code-scaling-model-inference--real-time-embedding-search",level:2},{value:"Breaking Free from the Scalability Ceiling",id:"breaking-free-from-the-scalability-ceiling",level:2},{value:"The Model Serving Bottleneck\u2014A Wake-Up Call",id:"the-model-serving-bottlenecka-wake-up-call",level:3},{value:"Scaling Triton on GKE",id:"scaling-triton-on-gke",level:3},{value:"Fixing the Cold Start Problem",id:"fixing-the-cold-start-problem",level:3},{value:"Embedding Search: The Last Piece of the Puzzle",id:"embedding-search-the-last-piece-of-the-puzzle",level:2},{value:"Choosing the Right Vector Database",id:"choosing-the-right-vector-database",level:3},{value:"Embedding Freshness & Real-Time Updates",id:"embedding-freshness--real-time-updates",level:3},{value:"Final Takeaways: Scaling Smartly for Real-Time ML",id:"final-takeaways-scaling-smartly-for-real-time-ml",level:2}];function c(e){const 
n={h2:"h2",h3:"h3",img:"img",li:"li",p:"p",ul:"ul",...(0,r.R)(),...e.components};return(0,a.jsxs)(a.Fragment,{children:[(0,a.jsx)(n.p,{children:(0,a.jsx)(n.img,{alt:"BharatMLStack",src:i(6e3).A+"",width:"1396",height:"460"})}),"\n",(0,a.jsx)(n.h2,{id:"cracking-the-code-scaling-model-inference--real-time-embedding-search",children:"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search"}),"\n",(0,a.jsx)(n.p,{children:"By mid-2023, we had transformed our ML stack\u2014building a real-time feature store, optimizing model retrieval, and fine-tuning ranking. But two critical gaps remained:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\udd39 Scaling model inference without hitting infrastructure roadblocks"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\udd39 Moving embedding search from batch to real-time for candidate generation"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"Here\u2019s how we tackled these last-mile challenges, broke free from infrastructure constraints, and built a cost-efficient, high-performance system."}),"\n",(0,a.jsx)(n.h2,{id:"breaking-free-from-the-scalability-ceiling",children:"Breaking Free from the Scalability Ceiling"}),"\n",(0,a.jsx)(n.h3,{id:"the-model-serving-bottlenecka-wake-up-call",children:"The Model Serving Bottleneck\u2014A Wake-Up Call"}),"\n",(0,a.jsx)(n.p,{children:"July 2023. With just months left for the Mega Blockbuster Sale (MBS), we noticed a serious issue\u2014scaling our model-serving infrastructure was taking 10\u201315 minutes. 
In real-time ML, that\u2019s an eternity.\nIn one of our war rooms, we ran a quick experiment:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 We deployed an XGBoost model on a self-hosted Triton Inference Server running on a 16-core machine."}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Fired requests and compared the outputs with our existing cloud-hosted setup."}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 The results matched\u2014perfectly."}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:'That moment changed everything. We prepped a backup Triton setup on EKS, just in case our cloud provider couldn\'t allocate enough compute resources in time. Luckily, they did\u2014but the seed was planted.\nThen in October, just two weeks before MBS, we got an alarming response from our infrastructure team:\n"Node availability may be an issue."\nWith no time to waste, we moved 30% of real-time ML traffic to our self-hosted Triton cluster. The results?'}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\u2705 p99 latency dropped from 90\u2013100ms to 30\u201340ms"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Triton handled significantly higher throughput on fewer resources"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 No model changes were needed"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"MBS ran without a hitch, proving that self-hosted inference was the way forward."}),"\n",(0,a.jsx)(n.h3,{id:"scaling-triton-on-gke",children:"Scaling Triton on GKE"}),"\n",(0,a.jsx)(n.p,{children:"This left us with two choices:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"1\ufe0f\u20e3 Port models to a managed cloud inference service, investing time in learning a new deployment stack"}),"\n",(0,a.jsx)(n.li,{children:"2\ufe0f\u20e3 Scale our existing Triton setup on GKE, optimizing for cost and performance"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"We went with Option 2\u2014and it slashed inference costs to 35% of what we previously paid, while 
giving us full control over scaling and optimizations."}),"\n",(0,a.jsx)(n.h3,{id:"fixing-the-cold-start-problem",children:"Fixing the Cold Start Problem"}),"\n",(0,a.jsx)(n.p,{children:"As we onboarded more deep learning (DL) models, we hit a new bottleneck, new inference pods took 7\u20139 minutes to spin up."}),"\n",(0,a.jsx)(n.p,{children:"After profiling, we found the culprits:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"Triton\u2019s base image\u2014a massive 5GB"}),"\n",(0,a.jsx)(n.li,{children:"Model binaries\u2014often 1GB+"}),"\n",(0,a.jsx)(n.li,{children:"Startup delay\u2014mostly due to downloading and initializing these assets"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"To fix this, we built a lightweight Triton image, stripping unused components and shrinking the size to 900MB. This cut cold start times drastically, making auto-scaling faster and smoother."}),"\n",(0,a.jsx)(n.h2,{id:"embedding-search-the-last-piece-of-the-puzzle",children:"Embedding Search: The Last Piece of the Puzzle"}),"\n",(0,a.jsx)(n.p,{children:"By mid-2023, most of our ML stack had gone real-time\u2014except for Candidate Generation (CG), which still ran in batch mode. 
To truly power real-time recommendations, we needed an online embedding search system."}),"\n",(0,a.jsx)(n.h3,{id:"choosing-the-right-vector-database",children:"Choosing the Right Vector Database"}),"\n",(0,a.jsx)(n.p,{children:"We benchmarked three production-ready vector DBs across key parameters:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"Milvus"}),"\n",(0,a.jsx)(n.li,{children:"Qdrant"}),"\n",(0,a.jsx)(n.li,{children:"Weaviate"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"After extensive POCs, Qdrant stood out for its:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\u2705 Blazing-fast search latency on high-dimensional vectors"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Efficient memory usage, crucial for in-memory workloads"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Support for upserts and soft deletes, vital for Ads use cases"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 gRPC + REST APIs, making integration seamless"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Powerful filtering, allowing fine-tuned retrieval (e.g., filtering Ads by category, active status, etc.)"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"At its core, Qdrant uses HNSW indexing, delivering both high recall and low-latency nearest-neighbor search\u2014a perfect fit for our needs."}),"\n",(0,a.jsx)(n.h3,{id:"embedding-freshness--real-time-updates",children:"Embedding Freshness & Real-Time Updates"}),"\n",(0,a.jsx)(n.p,{children:"To ensure embeddings stayed up to date, we built a dual ingestion pipeline:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\udccc Daily Refresh: A bulk pipeline updated embeddings overnight"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\udccc Real-Time Updates: Ads events triggered immediate upserts/deletes"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:'This setup powered real-time "Similar Products" recommendations on the product page and became the foundation for Ads Candidate Generation, ensuring the right ads surfaced in 
milliseconds.'}),"\n",(0,a.jsx)(n.p,{children:(0,a.jsx)(n.img,{alt:"Skye",src:i(702).A+"",width:"1260",height:"644"})}),"\n",(0,a.jsx)(n.h2,{id:"final-takeaways-scaling-smartly-for-real-time-ml",children:"Final Takeaways: Scaling Smartly for Real-Time ML"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Self-hosted inference on Triton gave us lower cost, faster scaling, and better performance than managed services"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Building a custom Triton image reduced cold starts, improving responsiveness"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Qdrant-based embedding search enabled real-time personalization at scale"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Real-time updates for embeddings unlocked dynamic, up-to-date recommendations"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"By early 2024, Meesho\u2019s ML stack had evolved into a fully real-time, scalable, and cost-efficient system, setting the foundation for even bigger leaps ahead."})]})}function h(e={}){const{wrapper:n}={...(0,r.R)(),...e.components};return n?(0,a.jsx)(n,{...e,children:(0,a.jsx)(c,{...e})}):c(e)}},8453:(e,n,i)=>{i.d(n,{R:()=>s,x:()=>l});var t=i(6540);const a={},r=t.createContext(a);function s(e){const n=t.useContext(r);return t.useMemo(function(){return"function"==typeof e?e(n):{...n,...e}},[n,e])}function l(e){let n;return n=e.disableParentContext?"function"==typeof e.components?e.components(a):e.components||a:s(e.components),t.createElement(r.Provider,{value:n},e.children)}}}]); \ No newline at end of file diff --git a/docs/assets/js/428aafcc.2c1db158.js b/docs/assets/js/428aafcc.2c1db158.js deleted file mode 100644 index 813dbcdf..00000000 --- a/docs/assets/js/428aafcc.2c1db158.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[5503],{702:(e,n,t)=>{t.d(n,{A:()=>i});const 
i=t.p+"assets/images/vss-c482f6eac4c68b3219e4c562a6b717ec.png"},788:e=>{e.exports=JSON.parse('{"permalink":"/BharatMLStack/blog/post-three","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-three/index.md","source":"@site/blog/bharatmlstack-history/post-three/index.md","title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","description":"BharatMLStack","date":"2024-05-21T00:00:00.000Z","tags":[{"inline":true,"label":"model-inference","permalink":"/BharatMLStack/blog/tags/model-inference"},{"inline":true,"label":"embedding-search","permalink":"/BharatMLStack/blog/tags/embedding-search"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":3.6,"hasTruncateMarker":false,"authors":[{"name":"Aditya Kumar","title":"Lead Software Engineer @ Meesho","url":"https://github.com/Adit2607","imageURL":"https://github.com/Adit2607.png","key":"aditya","page":null},{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null},{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"slug":"post-three","title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","authors":["aditya","jaya","adarsha"],"date":"2024-05-21T00:00:00.000Z","tags":["model-inference","embedding-search","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/post-three"},"nextItem":{"title":"Building Meesho\u2019s ML 
Platform: Lessons from the First-Gen System (Part 2)","permalink":"/BharatMLStack/blog/post-two"}}')},6e3:(e,n,t)=>{t.d(n,{A:()=>i});const i=t.p+"assets/images/bms-7399e8796d2cd24617c432518ce3f312.png"},7999:(e,n,t)=>{t.r(n),t.d(n,{assets:()=>o,contentTitle:()=>l,default:()=>h,frontMatter:()=>s,metadata:()=>i,toc:()=>d});var i=t(788),a=t(4848),r=t(8453);const s={slug:"post-three",title:"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search",authors:["aditya","jaya","adarsha"],date:new Date("2024-05-21T00:00:00.000Z"),tags:["model-inference","embedding-search","mlplatform","meesho","bharatmlstack"]},l=void 0,o={authorsImageUrls:[void 0,void 0,void 0]},d=[{value:"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search",id:"cracking-the-code-scaling-model-inference--real-time-embedding-search",level:2},{value:"Breaking Free from the Scalability Ceiling",id:"breaking-free-from-the-scalability-ceiling",level:2},{value:"The Model Serving Bottleneck\u2014A Wake-Up Call",id:"the-model-serving-bottlenecka-wake-up-call",level:3},{value:"Scaling Triton on GKE",id:"scaling-triton-on-gke",level:3},{value:"Fixing the Cold Start Problem",id:"fixing-the-cold-start-problem",level:3},{value:"Embedding Search: The Last Piece of the Puzzle",id:"embedding-search-the-last-piece-of-the-puzzle",level:2},{value:"Choosing the Right Vector Database",id:"choosing-the-right-vector-database",level:3},{value:"Embedding Freshness & Real-Time Updates",id:"embedding-freshness--real-time-updates",level:3},{value:"Final Takeaways: Scaling Smartly for Real-Time ML",id:"final-takeaways-scaling-smartly-for-real-time-ml",level:2}];function c(e){const 
n={h2:"h2",h3:"h3",img:"img",li:"li",p:"p",ul:"ul",...(0,r.R)(),...e.components};return(0,a.jsxs)(a.Fragment,{children:[(0,a.jsx)(n.p,{children:(0,a.jsx)(n.img,{alt:"BharatMLStack",src:t(6e3).A+"",width:"1396",height:"460"})}),"\n",(0,a.jsx)(n.h2,{id:"cracking-the-code-scaling-model-inference--real-time-embedding-search",children:"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search"}),"\n",(0,a.jsx)(n.p,{children:"By mid-2023, we had transformed our ML stack\u2014building a real-time feature store, optimizing model retrieval, and fine-tuning ranking. But two critical gaps remained:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\udd39 Scaling model inference without hitting infrastructure roadblocks"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\udd39 Moving embedding search from batch to real-time for candidate generation"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"Here\u2019s how we tackled these last-mile challenges, broke free from infrastructure constraints, and built a cost-efficient, high-performance system."}),"\n",(0,a.jsx)(n.h2,{id:"breaking-free-from-the-scalability-ceiling",children:"Breaking Free from the Scalability Ceiling"}),"\n",(0,a.jsx)(n.h3,{id:"the-model-serving-bottlenecka-wake-up-call",children:"The Model Serving Bottleneck\u2014A Wake-Up Call"}),"\n",(0,a.jsx)(n.p,{children:"July 2023. With just months left for the Mega Blockbuster Sale (MBS), we noticed a serious issue\u2014scaling our model-serving infrastructure was taking 10\u201315 minutes. 
In real-time ML, that\u2019s an eternity.\nIn one of our war rooms, we ran a quick experiment:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 We deployed an XGBoost model on a self-hosted Triton Inference Server running on a 16-core machine."}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Fired requests and compared the outputs with our existing cloud-hosted setup."}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 The results matched\u2014perfectly."}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:'That moment changed everything. We prepped a backup Triton setup on EKS, just in case our cloud provider couldn\'t allocate enough compute resources in time. Luckily, they did\u2014but the seed was planted.\nThen in October, just two weeks before MBS, we got an alarming response from our infrastructure team:\n"Node availability may be an issue."\nWith no time to waste, we moved 30% of real-time ML traffic to our self-hosted Triton cluster. The results?'}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\u2705 p99 latency dropped from 90\u2013100ms to 30\u201340ms"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Triton handled significantly higher throughput on fewer resources"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 No model changes were needed"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"MBS ran without a hitch, proving that self-hosted inference was the way forward."}),"\n",(0,a.jsx)(n.h3,{id:"scaling-triton-on-gke",children:"Scaling Triton on GKE"}),"\n",(0,a.jsx)(n.p,{children:"This left us with two choices:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"1\ufe0f\u20e3 Port models to a managed cloud inference service, investing time in learning a new deployment stack"}),"\n",(0,a.jsx)(n.li,{children:"2\ufe0f\u20e3 Scale our existing Triton setup on GKE, optimizing for cost and performance"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"We went with Option 2\u2014and it slashed inference costs to 35% of what we previously paid, while 
giving us full control over scaling and optimizations."}),"\n",(0,a.jsx)(n.h3,{id:"fixing-the-cold-start-problem",children:"Fixing the Cold Start Problem"}),"\n",(0,a.jsx)(n.p,{children:"As we onboarded more deep learning (DL) models, we hit a new bottleneck, new inference pods took 7\u20139 minutes to spin up."}),"\n",(0,a.jsx)(n.p,{children:"After profiling, we found the culprits:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"Triton\u2019s base image\u2014a massive 5GB"}),"\n",(0,a.jsx)(n.li,{children:"Model binaries\u2014often 1GB+"}),"\n",(0,a.jsx)(n.li,{children:"Startup delay\u2014mostly due to downloading and initializing these assets"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"To fix this, we built a lightweight Triton image, stripping unused components and shrinking the size to 900MB. This cut cold start times drastically, making auto-scaling faster and smoother."}),"\n",(0,a.jsx)(n.h2,{id:"embedding-search-the-last-piece-of-the-puzzle",children:"Embedding Search: The Last Piece of the Puzzle"}),"\n",(0,a.jsx)(n.p,{children:"By mid-2023, most of our ML stack had gone real-time\u2014except for Candidate Generation (CG), which still ran in batch mode. 
To truly power real-time recommendations, we needed an online embedding search system."}),"\n",(0,a.jsx)(n.h3,{id:"choosing-the-right-vector-database",children:"Choosing the Right Vector Database"}),"\n",(0,a.jsx)(n.p,{children:"We benchmarked three production-ready vector DBs across key parameters:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"Milvus"}),"\n",(0,a.jsx)(n.li,{children:"Qdrant"}),"\n",(0,a.jsx)(n.li,{children:"Weaviate"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"After extensive POCs, Qdrant stood out for its:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\u2705 Blazing-fast search latency on high-dimensional vectors"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Efficient memory usage, crucial for in-memory workloads"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Support for upserts and soft deletes, vital for Ads use cases"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 gRPC + REST APIs, making integration seamless"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Powerful filtering, allowing fine-tuned retrieval (e.g., filtering Ads by category, active status, etc.)"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"At its core, Qdrant uses HNSW indexing, delivering both high recall and low-latency nearest-neighbor search\u2014a perfect fit for our needs."}),"\n",(0,a.jsx)(n.h3,{id:"embedding-freshness--real-time-updates",children:"Embedding Freshness & Real-Time Updates"}),"\n",(0,a.jsx)(n.p,{children:"To ensure embeddings stayed up to date, we built a dual ingestion pipeline:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\udccc Daily Refresh: A bulk pipeline updated embeddings overnight"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\udccc Real-Time Updates: Ads events triggered immediate upserts/deletes"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:'This setup powered real-time "Similar Products" recommendations on the product page and became the foundation for Ads Candidate Generation, ensuring the right ads surfaced in 
milliseconds.'}),"\n",(0,a.jsx)(n.p,{children:(0,a.jsx)(n.img,{alt:"Skye",src:t(702).A+"",width:"1260",height:"644"})}),"\n",(0,a.jsx)(n.h2,{id:"final-takeaways-scaling-smartly-for-real-time-ml",children:"Final Takeaways: Scaling Smartly for Real-Time ML"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Self-hosted inference on Triton gave us lower cost, faster scaling, and better performance than managed services"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Building a custom Triton image reduced cold starts, improving responsiveness"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Qdrant-based embedding search enabled real-time personalization at scale"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Real-time updates for embeddings unlocked dynamic, up-to-date recommendations"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"By early 2024, Meesho\u2019s ML stack had evolved into a fully real-time, scalable, and cost-efficient system, setting the foundation for even bigger leaps ahead."})]})}function h(e={}){const{wrapper:n}={...(0,r.R)(),...e.components};return n?(0,a.jsx)(n,{...e,children:(0,a.jsx)(c,{...e})}):c(e)}},8453:(e,n,t)=>{t.d(n,{R:()=>s,x:()=>l});var i=t(6540);const a={},r=i.createContext(a);function s(e){const n=i.useContext(r);return i.useMemo(function(){return"function"==typeof e?e(n):{...n,...e}},[n,e])}function l(e){let n;return n=e.disableParentContext?"function"==typeof e.components?e.components(a):e.components||a:s(e.components),i.createElement(r.Provider,{value:n},e.children)}}}]); \ No newline at end of file diff --git a/docs/assets/js/6479fb86.3f75012c.js b/docs/assets/js/6479fb86.3f75012c.js new file mode 100644 index 00000000..3d01bfe6 --- /dev/null +++ b/docs/assets/js/6479fb86.3f75012c.js @@ -0,0 +1 @@ +"use 
strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[5579],{3751:e=>{e.exports=JSON.parse('{"archive":{"blogPosts":[{"id":"post-five","metadata":{"permalink":"/BharatMLStack/blog/post-five","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-five/index.md","source":"@site/blog/bharatmlstack-history/post-five/index.md","title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","description":"BharatMLStack","date":"2025-06-02T00:00:00.000Z","tags":[{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"vllm","permalink":"/BharatMLStack/blog/tags/vllm"},{"inline":true,"label":"tensorrt-llm","permalink":"/BharatMLStack/blog/tags/tensorrt-llm"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":4.93,"hasTruncateMarker":false,"authors":[{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null}],"frontMatter":{"slug":"post-five","title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","authors":["jaya"],"date":"2025-6-2","tags":["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"nextItem":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/post-four"}},"content":"![BharatMLStack](./bms.png)\\n## LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale\\n\\nRaw execution of Large Language Models is inherently expensive and memory-intensive. 
To achieve sub-second latency and high throughput, we implement a multi-layered optimization strategy that targets the entire inference stack\u2014from memory management to kernel execution.\\n\\n## 1. Advanced Memory Management: Paged & Prefix KV Caching\\n\\nThe most significant bottleneck in LLM inference is not always compute, but memory bandwidth\u2014specifically managing the Key-Value (KV) cache.\\n\\n### Paged KV caching\\n\\nStandard caching suffers from fragmentation. We use **Paged KV caching**, which operates similarly to an operating system\'s virtual memory: the KV cache is divided into non-contiguous blocks. This lets us serve larger batch sizes without running out of memory.\\n\\n### KV cache quantization\\n\\nTo further maximize available memory, we implement **KV cache quantization** (e.g., FP8). By compressing stored attention keys and values from 16-bit to 8-bit, we nearly double the effective context window capacity of the GPU, allowing longer conversations or larger batches without materially degrading quality.\\n\\n### Prefix caching (the \\"voice bot\\" optimizer)\\n\\nFor use cases like GenAI voice bots where the system prompt (e.g., \\"You are a helpful assistant...\\") is static across thousands of requests, we enable **prefix caching**.\\n\\n- **Impact**: By reusing pre-computed KV states for common prefixes, we achieve a cache hit rate of ~90%. This reduces **Time To First Token (TTFT)** by skipping redundant computation of the system prompt.\\n\\n## 2. Aggressive Quantization (INT4 AWQ & FP8)\\n\\nRunning models in their native 16-bit precision (BF16) restricts maximum batch size and throughput. We use quantization to shrink model weights without sacrificing accuracy.\\n\\n### INT4 AWQ (Activation-aware Weight Quantization)\\n\\nFor the Llama 3 family, we use **AWQ** to compress weights to 4 bits. 
This reduces model size by ~75%, allowing larger models to fit into L4 GPU memory and significantly improving token generation speed.\\n\\n### FP8 precision\\n\\nFor NVIDIA Hopper (H100) architectures, we are exploring **FP8 quantization**, leveraging native FP8 tensor cores to accelerate matrix multiplications while maintaining a higher dynamic range than integer quantization.\\n\\n- **Verification**: We validate quantized models by comparing dot-product similarity of embeddings against the FP16 baseline, consistently achieving **>99% similarity**.\\n\\n## 3. Kernel Fusion & Custom Plugins\\n\\nTo minimize overhead from launching thousands of small GPU operations, we fuse them into monolithic kernels using NVIDIA TensorRT plugins.\\n\\n- **Flash attention & FMHA**: We enable **Fused Multi-Head Attention (FMHA)** combined with flash attention to reduce memory reads/writes.\\n- **GEMM plugins**: We use specialized **GEMM** plugins to accelerate transformer linear layers.\\n- **Removing input padding**: Instead of padding short sequences to match the longest, we remove input padding so the GPU processes only valid tokens.\\n\\n## 4. Inflight (Continuous) Batching\\n\\nTraditional static batching waits for all requests in a batch to finish before returning results\u2014so one long response delays everyone else.\\n\\nWe implement **inflight batching**: as soon as one request completes, its slot is freed and filled by a new request from the queue. This keeps GPUs saturated and decouples latency of short queries from long ones.\\n\\n## 5. Parallelism Strategies: Scaling Beyond One GPU\\n\\nFor large models (e.g., 70B+ parameters) that cannot fit into the VRAM of a single GPU, we use parallelism strategies.\\n\\n- **Tensor parallelism (TP)**: Split weight matrices across multiple GPUs (e.g., 4\xd7 L4 or 8\xd7 A100). 
Each GPU computes a shard and outputs are reduced at every layer.\\n- **Pipeline parallelism (PP)**: Split model layers across GPUs to pipeline compute (e.g., while one GPU computes later layers for Request A, another starts early layers for Request B).\\n\\n## 6. Speculative Decoding\\n\\nTo reduce inter-token latency (ITL), we explore **speculative decoding**.\\n\\n- **Mechanism**: A smaller, faster \\"draft\\" model speculatively generates a short token sequence (e.g., 5 tokens).\\n- **Verification**: The larger target model verifies those tokens in one parallel forward pass. If correct, we effectively generate multiple tokens per large-model step; if not, we discard and regenerate. This is effective for predictable text, improving perceived generation speed.\\n\\n## Few Benchmarks\\n\\nBelow are a couple of representative use cases and performance numbers.\\n\\n### Search query rewriting\\n\\n- **LLM**: Fine-tuned llama-3.2-1B\\n- **Input & output token length**: ~10\u201320\\n- **Response type**: Non-streaming\\n\\n| Inference runtime | Hardware | Max requests/sec | Max p99 latency |\\n| --- | --- | ---: | ---: |\\n| TensorRT-LLM | 4 \xd7 L4 GPUs (multi-GPU) | 1000 | 95 ms |\\n| TensorRT-LLM | 1 \xd7 A100 40 GB GPU | 1000 | 69 ms |\\n\\n### Voice bot query\\n\\n- **LLM**: Llama-3.1-8B\\n- **Input token length**: ~1900\u20132000\\n- **Output token length**: ~200\\n- **Response type**: Streaming\\n\\n| Inference runtime | Concurrency | p99 TTFT (ms) | p99 ITL (ms) | Token throughput (tokens/sec) | Request throughput (req/sec) | Hardware |\\n| --- | ---: | ---: | ---: | ---: | ---: | --- |\\n| TensorRT-LLM | 1 | 36.27 | 22.78 | 45.66 | 0.23 | L4 |\\n| TensorRT-LLM | 2 | 49.81 | 23.21 | 89.37 | 0.45 | L4 |\\n| TensorRT-LLM | 4 | 55.33 | 36.62 | 153.39 | 0.78 | L4 |\\n| TensorRT-LLM | 8 | 66.5 | 39.11 | 279.88 | 1.47 | L4 |\\n| TensorRT-LLM | 16 | 131.8 | 30.39 | 547.8 | 2.77 | L4 |\\n| TensorRT-LLM | 32 | 277.22 | 48.02 | 925.7 | 4.78 | L4 |\\n| TensorRT-LLM | 64 
| 498.52 | 71.62 | 1,164.40 | 6.2 | L4 |\\n| TensorRT-LLM | 128 | 677.31 | 120.37 | 1,445.18 | 7.69 | L4 |\\n| TensorRT-LLM | 256 | 1,926.31 | 216.88 | 1,600.81 | 8.52 | L4 |\\n| TensorRT-LLM | 1 | 21.17 | 9.24 | 130.05 | 0.68 | A100 |\\n| TensorRT-LLM | 2 | 25.78 | 9.21 | 264.5 | 1.35 | A100 |\\n| TensorRT-LLM | 4 | 28.52 | 10.99 | 437.69 | 2.27 | A100 |\\n| TensorRT-LLM | 8 | 34.4 | 12.61 | 760.49 | 3.96 | A100 |\\n| TensorRT-LLM | 16 | 68.03 | 14.32 | 1,343.80 | 7.01 | A100 |\\n| TensorRT-LLM | 32 | 185.96 | 16.82 | 2,287.30 | 11.92 | A100 |\\n| TensorRT-LLM | 64 | 136.87 | 21.17 | 3,625.22 | 18.89 | A100 |\\n| TensorRT-LLM | 128 | 463.78 | 34.15 | 4,456.51 | 23.24 | A100 |\\n| TensorRT-LLM | 256 | 890.12 | 59.18 | 5,188.24 | 27.05 | A100 |\\n\\n## Conclusion\\n\\nHigh-performance LLM inference is fundamentally a systems engineering problem: memory efficiency, kernel execution, batching strategy, and parallelism determine real-world latency and throughput. Techniques such as paged KV caching, aggressive quantization, kernel fusion, and inflight batching improve GPU utilization while reducing latency and memory pressure.\\n\\nThese optimizations enable the platform to deliver sub-second responses, sustain high concurrency, and efficiently serve both lightweight and long-context workloads. 
By continuously optimizing across the full inference stack, we keep LLM serving scalable, cost-efficient, and production-ready for real-time AI applications."},{"id":"post-four","metadata":{"permalink":"/BharatMLStack/blog/post-four","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-four/index.md","source":"@site/blog/bharatmlstack-history/post-four/index.md","title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","description":"BharatMLStack","date":"2025-03-29T00:00:00.000Z","tags":[{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"vllm","permalink":"/BharatMLStack/blog/tags/vllm"},{"inline":true,"label":"tensorrt-llm","permalink":"/BharatMLStack/blog/tags/tensorrt-llm"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":13.38,"hasTruncateMarker":false,"authors":[{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null}],"frontMatter":{"slug":"post-four","title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","authors":["jaya"],"date":"2025-3-29","tags":["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","permalink":"/BharatMLStack/blog/post-five"},"nextItem":{"title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","permalink":"/BharatMLStack/blog/post-three"}},"content":"![BharatMLStack](./bms.png)\\n## Designing a Production-Grade LLM Inference Platform: From Model Weights to 
Scalable GPU Serving\\n\\n\\n\\nServing large language models in production introduces new challenges across infrastructure, performance optimization, and operational lifecycle management. The LLM Inference Platform addresses these challenges by providing a unified system for deploying and managing open-source and fine-tuned LLMs at scale.\\n\\nThe platform implements a complete LLMOps lifecycle \u2014 from model registration and automated compilation to deployment, runtime optimization, and monitoring. Designed as a self-service environment, users can onboard models directly from open repositories such as Hugging Face or upload custom fine-tuned models, and deploy them using a single-click workflow with no manual infrastructure or configuration steps required.\\n\\nIn addition to fully automated deployment, the platform allows users to select and apply custom inference optimization techniques \u2014 such as quantization strategies, batching configurations, and runtime-specific performance enhancements \u2014 enabling teams to balance latency, throughput, and cost based on their use case. The goal is to reduce operational friction while enabling high-performance, production-grade LLM inference.\\n\\n## Why LLM Inference Is not just bigger ML model serving\\n\\nLarge language model (LLM) inference introduces a fundamentally different set of challenges compared to traditional machine learning inference. While classical ML models typically perform a single forward pass to produce a fixed prediction, LLMs operate as autoregressive systems, generating outputs token by token based on previously generated context. This difference dramatically changes how inference systems must be designed, optimized, and scaled.\\n\\n### Autoregressive Generation and Sequential Computation:\\n\\nUnlike traditional models such as classifiers or recommenders \u2014 where inference cost is relatively constant \u2014 LLMs generate responses incrementally. 
Each new token depends on all previously generated tokens, making inference inherently sequential and dynamic. This means latency and compute requirements vary significantly depending on prompt length and output size, introducing complexity in scheduling and resource allocation.\\nBecause tokens cannot be generated fully in parallel during decoding, GPUs may become underutilized without specialized batching and scheduling strategies. This has led to the development of dedicated LLM inference engines optimized for token-level execution.\\n\\n### Prefill and Decode Phases:\\n\\nLLM inference typically consists of two distinct stages:\\n\\n- Prefill phase \u2014 the model processes the input prompt and builds internal representations. This stage is compute-heavy and highly parallelizable.\\n- Decode phase \u2014 the model generates tokens sequentially, predicting one token at a time using previously generated context.\\n\\nThe decode stage often becomes memory-bound rather than compute-bound, which creates new performance bottlenecks compared to traditional ML workloads.\\n\\n### Context Management and KV Caching:\\n\\nAnother fundamental difference lies in how LLMs maintain context. Transformer-based models rely on attention mechanisms that require access to past token representations. 
To avoid recomputing these representations repeatedly, inference engines use key-value (KV) caching, which stores intermediate activations from previous tokens.\\nKV caching significantly improves performance by eliminating redundant computation, but it introduces new challenges:\\n\\n- Memory consumption grows with sequence length and batch size\\n- GPU memory becomes a critical bottleneck\\n- Efficient memory management becomes essential for scaling concurrent requests\\n\\nThis tradeoff between compute efficiency and memory usage is unique to LLM inference workloads.\\n\\n### Dynamic and Irregular Workloads:\\n\\nTraditional ML inference typically operates on fixed-size inputs with predictable latency. In contrast, LLM requests vary widely in prompt length, output length, and runtime behavior. As a result:\\n\\n- Batch sizes must be dynamic rather than static\\n- Requests may enter and leave batches asynchronously\\n- Scheduling systems must continuously rebalance workloads to maximize GPU utilization\\n\\nThese characteristics require specialized serving architectures that differ significantly from standard ML serving pipelines.\\n\\n### Streaming and User Experience Constraints:\\n\\nAnother distinguishing factor is the expectation of real-time streaming responses. Instead of returning a single output, LLM systems often stream tokens to users as they are generated. \\nBecause of these differences \u2014 sequential generation, growing memory requirements, dynamic workloads, and streaming constraints \u2014 LLM inference cannot be treated as a simple extension of existing ML serving systems. 
Production platforms must incorporate specialized runtime engines, advanced optimization techniques, and observability tailored specifically to LLM workloads.\\n\\n## LLMOps: High-Level Architecture \\n\\n![LLM Architecture](./llm-plat.png)\\n\\nThe LLM Inference Framework is designed as a fully automated, end-to-end system for deploying and operating open-source and fine-tuned large language models at scale. The architecture abstracts the complexity of model optimization, hardware selection, deployment, and runtime management into a unified workflow that enables users to move from raw model weights to production-ready inference endpoints with minimal manual intervention.\\n\\nOur LLM Inference Framework is architected not just as a serving engine, but as a complete lifecycle management system. As illustrated in the high-level design below, the platform automates the journey of a model through seven distinct stages, ensuring reproducibility, performance, and scalability.\\n\\n1. Onboarding & Registration (The Source of Truth)\\n\\n The lifecycle begins with the Data Scientist or engineer.\\n\\n - Model Ingestion: Users onboard models\u2014whether open-source (Hugging Face, NeMo) or internally fine-tuned\u2014via the Truffle Box SDK/UI.\\n - LLM + Prompt Registry: Unlike traditional systems that only track model weights, our registry is a unified control plane. It stores both the Model Artifacts and the Prompt Templates. This allows Data Scientists to register and version-control prompts (e.g., \\"customer_support_v2\\") independently of the application code.\\n\\n2. 
The \\"Black Box\\" Build Engine\\n\\n Once a model is registered, the Automated LLM Compiler + Quantizer Module kicks off a background job on ephemeral GPU resources.\\n\\n - Transformation: The raw model is converted into a TRT-LLM Checkpoint.\\n - Quantization: The system automatically applies quantization algorithms (like INT4 AWQ or FP8) to reduce memory footprint.\\n - Engine Building: Finally, it compiles a highly optimized TRT Engine specifically tuned for the target hardware.\\n\\n3. Intelligent Profiling & Validation\\n\\n Before deployment, the new engine passes through the Hardware & Inference Runtime Profiler.\\n\\n - Benchmarking: This module empirically tests the engine against various hardware configurations (L4 vs. A100) and runtimes (TRT-LLM vs. vLLM).\\n - Optimization: It recommends the optimal configuration that meets latency SLAs (Time-To-First-Token) while minimizing cost.\\n\\n4. Smart Artifact Generation & Distribution\\n\\n To solve the Kubernetes \\"Cold Start\\" problem, the LLM Serving Artifacts Generation module packages the model using a bifurcated strategy:\\n\\n - Standard Models: Artifacts are uploaded to Cloud Storage (GCS) and downloaded by pods at startup.\\n - Very Large Models: For massive models (>8GB) where network downloads are too slow, the system pre-caches the model onto Secondary Boot Disks. These disks are attached directly to new GPU nodes during autoscaling, eliminating download wait times.\\n\\n5. Image Streaming & Deployment\\n\\n Simultaneously, the inference runtime container images are pulled from the Artifact Registry.\\n\\n - Image Streaming: We utilize container image streaming to allow pods to start initializing while the massive Triton/Dynamo container layers are still downloading, further shaving seconds off the startup time. link\\n\\n6. 
The Inference Runtime (Kubernetes)\\n\\n The workload lands on Kubernetes with Autoscaling.\\n\\n - Dynamic Backends: Depending on the profile generated in Stage 3, the pod initializes either TensorRT-LLM (for throughput) or vLLM (for flexibility), or spins up a Dynamo worker for distributed inference.\\n - Data Loading: The pod either downloads the model from Cloud Storage or mounts the pre-warmed Secondary Boot Disk (\\"Pull from Disk\\").\\n\\n7. Client Interaction & Observability\\n\\n Finally, the LLM Inference Client executes the request.\\n\\n - Prompt Injection: The client pulls the specific prompt template ID from the Registry, ensuring the exact versioned instructions are used.\\n - Streaming Response: The request is sent via gRPC, and tokens are streamed back to the user in real-time.\\n\\n8. Observability: Monitoring the Pulse of GenAI\\n\\n In traditional microservices, success is measured by CPU utilization and request latency (p99). For Large Language Models, these metrics are insufficient. A user doesn\'t care if the GPU is at 80% utilization; they care about how fast the first word appears and how smoothly the rest of the sentence follows.\\n\\n To capture the true user experience, our platform instrumentation focuses on three critical LLM-specific metrics:\\n\\n 1. Time to First Token (TTFT)\\n - Definition: TTFT measures the time elapsed from the moment a request is received until the very first token is generated and streamed back to the user.\\n - Why it matters: This represents the \\"Prefill Phase\\" latency\u2014the time the model takes to process the input prompt and load weights. A high TTFT makes the application feel unresponsive or \\"hung.\\"\\n - Optimization: We closely monitor TTFT to ensure our Prefix Caching is effective (aiming for high cache hitrates), which drastically lowers this metric by skipping redundant prompt processing.\\n\\n 2. 
Inter-Token Latency (ITL)\\n - Definition: ITL measures the average time interval between the generation of consecutive tokens during the \\"Decode Phase\\".\\n - Why it matters: This defines the \\"perceived speed\\" of reading. Even if the first token is fast (low TTFT), high ITL makes the text generation look \\"jerky\\" or slow to the user.\\n - Benchmarks: In our testing with Llama 3.1, we track p99 ITL to ensure it stays below human reading speeds to maintain a natural conversational flow.\\n\\n 3. Token Throughput vs. Request Throughput\\n - We distinguish between two types of throughput to balance system efficiency with user load:\\n - Token Throughput (tokens/sec): The total number of tokens generated across all concurrent requests. This measures the raw compute efficiency of the GPU and the effectiveness of batching.\\n - Request Throughput (req/sec): The number of distinct user queries served per second. We use this to determine autoscaling thresholds, ensuring we scale out before the queue depth impacts ITL.\\n\\n 4. The Monitoring Stack\\n - Real-time Dashboards: We utilize Grafana to visualize these streaming metrics in real-time, allowing on-call engineers to spot \\"slow generation\\" incidents that generic \\"500 error\\" alerts would miss.\\n - Request Tracing: Since Triton Inference Server does not log request payloads by default, we integrate a Helix Client to asynchronously publish request logs to Log Tables. This allows us to trace a specific \\"slow\\" request back to its prompt to understand if a complex input caused the latency spike.\\n\\n## Supported Inference backends (TensorRT LLM, Dynamo & vLLM)\\n\\nTailored for the Use Case: We do not believe in a \\"one-size-fits-all\\" approach to inference. Different use cases\u2014whether a real-time voice bot requiring ultra-lowsub-second latency or a massive reasoning task requiring huge context windows\u2014demand different runtime characteristics. 
Our platform is designed to be runtime-agnostic, allowing us to automatically select and tailor the best engine based on the specific requirements of the application:\\n\\n1. TensorRT-LLM: The High-Performance Standard\\n\\n Suitable for: High-throughput production workloads where latency is critical (e.g., customer support chat, real-time voice bots).\\n\\n TensorRT-LLM serves as our default backend for these scenarios. Our internal benchmarks on Llama 3.1 and 3.2 models demonstrated that a tuned TensorRT-LLM engine significantly outperforms standard runtimes, especially when utilizing INT4 AWQ and FP8 quantization .\\n\\n Key optimizations we tailor for these high-load cases include:\\n\\n - Optimized execution via TensorRT engine compilation\\n - Quantization-aware execution for reduced memory usage and improved throughput\\n - Inflight Batching: Allowing requests to be processed continuously without waiting for the entire batch to finish, drastically improving GPU utilization .\\n - Custom Plugins: Enabling specific NVIDIA plugins like the GEMM plugin and GPT Attention plugin to accelerate matrix multiplications and attention mechanisms .\\n\\n2. Dynamo: Distributed Inference for Reasoning Models\\n\\n Suitable for: Very large \\"reasoning\\" models (70B+) or scenarios requiring massive context windows where a single GPU\'s memory is insufficient.\\n\\n For these memory-bound tasks, we utilize Dynamo, a low-latency distributed inference framework . Unlike monolithic servers, Dynamo disaggregates the inference process to scale resources horizontally:\\n\\n - KV Aware Routing: A specialized router directs requests to workers that already hold the relevant Key-Value (KV) cache, minimizing redundant computation .\\n - Prefill vs. 
Decode Split: The workload is divided into Prefill Workers (processing the prompt) and Decode Workers (generating tokens), allowing us to scale the compute-heavy \\"reading\\" phase independently from the memory-heavy \\"writing\\" phase .\\n - Distributed execution across multiple GPU resources\\n\\n3. vLLM: The Flexible Baseline\\n\\n Suitable for: Rapid prototyping, testing new model architectures, or low-traffic internal tools where ease of deployment outweighs raw throughput.\\n\\n While TensorRT-LLM is optimized for maximum speed, vLLM provides a robust and flexible baseline .\\n\\n - High throughput through dynamic batching and efficient memory utilization\\n - Paged KV cache management for handling long contexts and concurrent requests\\n - Strong support for open-source model ecosystems\\n - Rapid Adoption: It allows us to onboard new model architectures immediately without waiting for a custom TensorRT build.\\n - Benchmarking Insight: In our internal tests, vLLM provided a strong baseline but often lacked the specific max-token optimizations present in our custom TRT engines . We use it strategically for initial testing before committing to a full TensorRT optimization pipeline.\\n\\n## Conclusion\\n\\nLarge language model inference introduces a fundamentally new class of infrastructure challenges\u2014where performance is governed not just by raw compute, but by memory efficiency, intelligent scheduling, runtime specialization, and lifecycle automation. Unlike traditional ML serving, LLM inference requires systems that understand token-level execution, manage rapidly growing context state, and continuously balance latency, throughput, and cost under highly dynamic workloads.\\n\\nThe LLM Inference Framework addresses these challenges by transforming inference into a fully automated, reproducible lifecycle\u2014from model onboarding and compilation to deployment, optimization, and observability. 
By integrating automated quantization and engine compilation, intelligent runtime selection, cold-start mitigation strategies, and LLM-specific observability metrics such as Time-to-First-Token and Inter-Token Latency, the platform ensures both high performance and operational simplicity.\\n\\nEqually important, the framework is designed with flexibility and future evolution in mind. Its runtime-agnostic architecture enables seamless adoption of emerging inference engines, hardware accelerators, and optimization techniques without requiring platform redesign. This ensures that teams can continuously leverage advancements in the rapidly evolving LLM ecosystem while maintaining consistent operational workflows.\\n\\nUltimately, the goal of the platform is to make production-scale LLM deployment as seamless and reliable as traditional software deployment\u2014allowing teams to focus on building intelligent applications rather than managing infrastructure complexity. By combining lifecycle automation, runtime optimization, and deep observability, the LLM Inference Framework provides a scalable foundation for delivering fast, cost-efficient, and production-ready LLM experiences.\\n\\n## Future Explorations\\n\\nWhile we have achieved significant milestones in latency and throughput, the landscape of GenAI is evolving rapidly. Our roadmap focuses on increasing flexibility, reducing costs, and enhancing reliability for enterprise-grade workloads. Here is what we are building next:\\n\\n- TPU Support: To diversify our hardware supply chain and further optimize cost-per-token, we are evaluating Google Cloud TPUs to bake it into our platform. By leveraging the JAX and PyTorch/XLA ecosystems, we aim to unlock the massive throughput potential of TPU v5e chips, particularly for our open-source Llama models. 
This will allow the hardware profiler to dynamically choose between NVIDIA GPUs and Google TPUs based on real-time availability and price-performance metrics.\\n- Multi-LoRA Serving (Serverless Experience): Currently, deploying a fine-tuned model requires a dedicated GPU. We are building support for Multi-LoRA serving, which will allow us to serve hundreds of unique, fine-tuned adapters on top of a single frozen base model. This will drastically reduce costs for multi-tenant applications, enabling a \\"serverless\\" experience where specific fine-tunes are hot-swapped instantly per request.\\n- Spot Instance Orchestration: To further optimize cloud costs, we are developing fault-tolerant mechanisms to run inference workloads on Spot Instances. By implementing aggressive checkpointing and seamless request draining, we aim to leverage cheaper, preemptible compute capacity without interrupting the user\'s streaming experience.\\n- Semantic Caching Layer: We plan to move beyond standard Prefix Caching to implement Semantic Caching. By using a vector database to fetch responses for semantically similar queries (e.g., \\"How do I reset my password?\\" vs. \\"Password reset steps\\"), we can bypass the GPU entirely for repetitive queries, reducing latency to near-zero.\\n- Context-Aware Autoscaling: Standard CPU/GPU utilization metrics are often insufficient signals for scaling LLMs. We are working on KV-cache pressure metrics for autoscaling. This ensures that we scale out before the memory fills up, preventing eviction-based slowdowns during traffic spikes.\\n- Online Evaluation & Guardrails: We are integrating a lightweight \\"Trust Layer\\" into the proxy. 
This will allow for low-latency input/output filtering (Guardrails) and asynchronous \\"LLM-as-a-Judge\\" evaluation pipelines to monitor response quality in production, not just system health."},{"id":"post-three","metadata":{"permalink":"/BharatMLStack/blog/post-three","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-three/index.md","source":"@site/blog/bharatmlstack-history/post-three/index.md","title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","description":"BharatMLStack","date":"2024-05-21T00:00:00.000Z","tags":[{"inline":true,"label":"model-inference","permalink":"/BharatMLStack/blog/tags/model-inference"},{"inline":true,"label":"embedding-search","permalink":"/BharatMLStack/blog/tags/embedding-search"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":3.6,"hasTruncateMarker":false,"authors":[{"name":"Aditya Kumar","title":"Lead Software Engineer @ Meesho","url":"https://github.com/Adit2607","imageURL":"https://github.com/Adit2607.png","key":"aditya","page":null},{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null},{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"slug":"post-three","title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","authors":["aditya","jaya","adarsha"],"date":"2024-05-21T00:00:00.000Z","tags":["model-inference","embedding-search","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"Designing a Production-Grade LLM Inference Platform: From 
Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/post-four"},"nextItem":{"title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","permalink":"/BharatMLStack/blog/post-two"}},"content":"![BharatMLStack](./bms.png)\\n\\n## Cracking the Code: Scaling Model Inference & Real-Time Embedding Search\\n\\nBy mid-2023, we had transformed our ML stack\u2014building a real-time feature store, optimizing model retrieval, and fine-tuning ranking. But two critical gaps remained:\\n\\n- \ud83d\udd39 Scaling model inference without hitting infrastructure roadblocks\\n- \ud83d\udd39 Moving embedding search from batch to real-time for candidate generation\\n\\nHere\u2019s how we tackled these last-mile challenges, broke free from infrastructure constraints, and built a cost-efficient, high-performance system.\\n\\n## Breaking Free from the Scalability Ceiling\\n\\n### The Model Serving Bottleneck\u2014A Wake-Up Call\\n\\nJuly 2023. With just months left for the Mega Blockbuster Sale (MBS), we noticed a serious issue\u2014scaling our model-serving infrastructure was taking 10\u201315 minutes. In real-time ML, that\u2019s an eternity.\\nIn one of our war rooms, we ran a quick experiment:\\n\\n- \ud83d\ude80 We deployed an XGBoost model on a self-hosted Triton Inference Server running on a 16-core machine.\\n- \ud83d\ude80 Fired requests and compared the outputs with our existing cloud-hosted setup.\\n- \ud83d\ude80 The results matched\u2014perfectly.\\n\\nThat moment changed everything. We prepped a backup Triton setup on EKS, just in case our cloud provider couldn\'t allocate enough compute resources in time. Luckily, they did\u2014but the seed was planted.\\nThen in October, just two weeks before MBS, we got an alarming response from our infrastructure team:\\n \\"Node availability may be an issue.\\"\\nWith no time to waste, we moved 30% of real-time ML traffic to our self-hosted Triton cluster. 
The results?\\n\\n- \u2705 p99 latency dropped from 90\u2013100ms to 30\u201340ms\\n- \u2705 Triton handled significantly higher throughput on fewer resources\\n- \u2705 No model changes were needed\\n\\nMBS ran without a hitch, proving that self-hosted inference was the way forward.\\n\\n### Scaling Triton on GKE\\n\\nThis left us with two choices:\\n\\n- 1\ufe0f\u20e3 Port models to a managed cloud inference service, investing time in learning a new deployment stack\\n- 2\ufe0f\u20e3 Scale our existing Triton setup on GKE, optimizing for cost and performance\\n\\nWe went with Option 2\u2014and it slashed inference costs to 35% of what we previously paid, while giving us full control over scaling and optimizations.\\n\\n### Fixing the Cold Start Problem\\n\\nAs we onboarded more deep learning (DL) models, we hit a new bottleneck, new inference pods took 7\u20139 minutes to spin up.\\n\\nAfter profiling, we found the culprits:\\n\\n- Triton\u2019s base image\u2014a massive 5GB\\n- Model binaries\u2014often 1GB+\\n- Startup delay\u2014mostly due to downloading and initializing these assets\\n\\nTo fix this, we built a lightweight Triton image, stripping unused components and shrinking the size to 900MB. This cut cold start times drastically, making auto-scaling faster and smoother.\\n\\n## Embedding Search: The Last Piece of the Puzzle\\n\\nBy mid-2023, most of our ML stack had gone real-time\u2014except for Candidate Generation (CG), which still ran in batch mode. 
To truly power real-time recommendations, we needed an online embedding search system.\\n\\n### Choosing the Right Vector Database\\n\\nWe benchmarked three production-ready vector DBs across key parameters:\\n\\n- Milvus\\n- Qdrant\\n- Weaviate\\n\\nAfter extensive POCs, Qdrant stood out for its:\\n\\n- \u2705 Blazing-fast search latency on high-dimensional vectors\\n- \u2705 Efficient memory usage, crucial for in-memory workloads\\n- \u2705 Support for upserts and soft deletes, vital for Ads use cases\\n- \u2705 gRPC + REST APIs, making integration seamless\\n- \u2705 Powerful filtering, allowing fine-tuned retrieval (e.g., filtering Ads by category, active status, etc.)\\n\\nAt its core, Qdrant uses HNSW indexing, delivering both high recall and low-latency nearest-neighbor search\u2014a perfect fit for our needs.\\n\\n### Embedding Freshness & Real-Time Updates\\n\\nTo ensure embeddings stayed up to date, we built a dual ingestion pipeline:\\n\\n- \ud83d\udccc Daily Refresh: A bulk pipeline updated embeddings overnight\\n- \ud83d\udccc Real-Time Updates: Ads events triggered immediate upserts/deletes\\n\\nThis setup powered real-time \\"Similar Products\\" recommendations on the product page and became the foundation for Ads Candidate Generation, ensuring the right ads surfaced in milliseconds.\\n\\n![Skye](./vss.png)\\n\\n## Final Takeaways: Scaling Smartly for Real-Time ML\\n\\n- \ud83d\ude80 Self-hosted inference on Triton gave us lower cost, faster scaling, and better performance than managed services\\n- \ud83d\ude80 Building a custom Triton image reduced cold starts, improving responsiveness\\n- \ud83d\ude80 Qdrant-based embedding search enabled real-time personalization at scale\\n- \ud83d\ude80 Real-time updates for embeddings unlocked dynamic, up-to-date recommendations\\n\\nBy early 2024, Meesho\u2019s ML stack had evolved into a fully real-time, scalable, and cost-efficient system, setting the foundation for even bigger leaps 
ahead."},{"id":"post-two","metadata":{"permalink":"/BharatMLStack/blog/post-two","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-two/index.md","source":"@site/blog/bharatmlstack-history/post-two/index.md","title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","description":"BharatMLStack","date":"2023-04-10T00:00:00.000Z","tags":[{"inline":true,"label":"inferflow","permalink":"/BharatMLStack/blog/tags/inferflow"},{"inline":true,"label":"interaction-store","permalink":"/BharatMLStack/blog/tags/interaction-store"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":6.31,"hasTruncateMarker":false,"authors":[{"name":"Bhawani Singh","title":"Architect @ Meesho","url":"https://github.com/singh-bhawani","imageURL":"https://github.com/singh-bhawani.png","key":"bhawani","page":null},{"name":"Jigar Dave","title":"Lead Software Engineer @ Meesho","url":"https://github.com/jigarpatel26","imageURL":"https://github.com/jigarpatel26.png","key":"jigar","page":null},{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"slug":"post-two","title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","authors":["bhawani","jigar","adarsha"],"date":"2023-4-10","tags":["inferflow","interaction-store","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","permalink":"/BharatMLStack/blog/post-three"},"nextItem":{"title":"Building Meesho\u2019s ML Platform: From Chaos to Cutting-Edge (Part 
1)","permalink":"/BharatMLStack/blog/post-one"}},"content":"![BharatMLStack](./bms.png)\\n## Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)\\n\\nBy late 2022, we had built something we were truly proud of\u2014a real-time ML serving system with a DAG-based executor, a feature store, and an interaction store powering key ranking and personalization models. It was a major milestone, the culmination of months of effort from data scientists, ML engineers, and backend teams. Our system was live, and we were ready to push the boundaries of experimentation.\\nAnd it worked. Mostly.\\nBut soon, cracks appeared. Every new model needed custom feature retrieval logic, DAGs became dense and unmanageable, and scaling turned into a constant firefight. Costs surged, and infra bottlenecks slowed experimentation. Our system worked, but it wasn\u2019t built for scale.\\nThis is the story of how we tackled these challenges\u2014building Inferflow for seamless feature retrieval, optimizing real-time infra, and cutting costs while scaling to millions of QPS.\\n\\n### The Cost of Success\\nEvery new Ranker model required its own feature set, often pulling from different entities. Each addition meant:\\n\\n- Adding new DAG nodes in IOP\\n- Writing custom logic to fetch features from multiple sources (e.g., user, product, user \xd7 category)\\n- Inferring intermediate features (e.g., extracting category from a product to fetch user \xd7 category data)\\n- Optimizing I/O and dealing with the inevitable bugs\\n\\nWhat began as clean DAGs soon turned into a tangled web of cross-dependent graphs. Every experimentation cycle meant new nodes, new dependencies, and slower iterations.\\n\\n### Scaling Pains (and Cassandra\u2019s Limits)\\nAt some point, we were hitting:\\n\\n- 250\u2013300K reads/sec\\n- 1M writes/sec (during lean hours)\\n\\nAll of this ran on Cassandra. 
While its distributed architecture had been proven in production, operating large-scale clusters came with considerable infrastructure overhead. Our proof-of-concept (POC) demonstrated throughput of around 100K ops/sec, but as we scaled further, the challenges grew. Ensuring node health, optimizing compaction, and maintaining storage balance became increasingly demanding. We also observed latency spikes under heavy load, alongside a sharp increase in total cost of ownership.\\n\\n### Interaction Store Woes\\nOur interaction store was another ticking time bomb:\\n\\n- \ud83d\udea8 Clusters kept growing in size and cost\\n- \ud83d\udea8 Latency spikes became increasingly frequent\\n- \ud83d\udea8 The DMC proxy occasionally lost locality of nodes against shards, causing cross-node communication and degraded performance\\n\\nEach time this happened, we had to manually rebalance shards just to restore stable latency, making operations unsustainable at scale.\\n\\n### Silver Linings\\nDespite the chaos, the system was live and delivering value:\\n\\n- Real-time infrastructure was in production\\n- Costs dropped by 60\u201370% compared to offline personalization\\n- New experiments rolled out faster and more successfully\\n- User engagement metrics improved\\n\\nIt wasn\u2019t perfect. It was far from easy. But it worked\u2014and that counted for a lot.\\n\\n### Round Two: Solving the Top 2 Bottlenecks\\nWith the first-gen system stretched to its limits, we stepped back. Conversations with data scientists and backend engineers revealed three recurring pain points:\\n\\n1. Coding feature retrieval logic for every new model was becoming unsustainable\\n2. ML scale was exploding\u2014bringing rising infra costs with it\\n3. 
Real-time embedding search was the next big unlock\\n\\nWe tackled them one by one\u2014starting with the biggest pain point.\\n\\n#### Problem 1: No-Code Feature Retrieval for Model Inference\\nWe noticed a pattern: for personalized ranking, models needed features from:\\n\\n- \u2705 Product\\n- \u2705 User\\n- \u2705 User \xd7 Category\\n- \u2705 Region, cohort, sub-category, etc.\\n\\nA key insight emerged: Entities that contribute features for a model always map back to the context entities.\\n\\n![MP Dag](./mp-dag.png)\\n\\nWith this, we designed Inferflow, a graph-driven feature retrieval and model orchestration system:\\n\\n- 1\ufe0f\u20e3 Inferflow takes a modelId and context IDs (e.g., userId, productIds)\\n- 2\ufe0f\u20e3 Loads a pre-defined feature retrieval graph from ZooKeeper\\n- 3\ufe0f\u20e3 Executes the graph to resolve entity relationships dynamically\\n- 4\ufe0f\u20e3 Outputs a 2D matrix of feature vectors\\n\\n\ud83d\udca1 The impact?\\n\\n- \ud83d\ude80 No more custom feature retrieval code\u2014just graph updates in config\\n- \ud83d\ude80 Feature consistency across experiments\\n- \ud83d\ude80 Faster iteration cycles for ranking, fraud detection, and beyond\\n\\nHere\u2019s a visual example that shows how this graph plays out during execution. We further extended the graph to call multiple models as needed:\\n![MP matrix](./mp-matrix.png)\\nWe built Inferflow in GoLang, using gRPC and Proto3 serialization for efficiency.\\n\\n#### Problem 2: Scaling Without Breaking the Bank\\nWith more ML use cases coming online, we needed to cut costs without compromising performance. 
We focused on:\\n\\n- \ud83d\udd39 Online Feature Store\\n- \ud83d\udd39 Interaction Store\\n\\n#### Optimizing the Online Feature Store\\nOur costs were concentrated in:\\n\\n- \ud83d\udccc Database (Cassandra)\\n- \ud83d\udccc Cache (Redis)\\n- \ud83d\udccc Running Pods (Java services)\\n\\n1\ufe0f\u20e3 Replacing Cassandra with ScyllaDB\\nAs we hit the operational limits of large Cassandra clusters, we transitioned to ScyllaDB, which offered a seamless drop-in replacement without major code changes. The switch brought significant benefits:\\n\\n- Throughput: Matched or exceeded Cassandra\'s performance under identical workloads, even under high concurrency.\\n- Latency: Achieved consistently lower P99 latencies due to ScyllaDB\'s shard-per-core architecture and better I/O utilization.\\n- Cost Efficiency: Reduced infra footprint by ~70% through better CPU and memory efficiency, eliminating the need for over-provisioned nodes.\\n\\n2\ufe0f\u20e3 Finding the Right Cache\\nTo reduce backend load and improve response times, we benchmarked multiple caching solutions\u2014Memcached, KeyDB, and Dragonfly\u2014under real production traffic patterns. Dragonfly stood out due to its robust architecture and operational simplicity:\\n\\n- Data Skew Handling: Efficiently managed extreme key hotness and uneven access patterns without performance degradation.\\n- Throughput: Delivered consistently high throughput, even with large object sizes and concurrent access.\\n- Ease of Adoption: Acted as a drop-in Redis replacement with full protocol compatibility\u2014no changes needed in application code or client libraries.\\n\\n3\ufe0f\u20e3 Moving to GoLang for Cost-Efficient Serving\\nJava services were memory-heavy\u2014so we rewrote core services in GoLang. 
The results?\\n\\n\u2705 Memory usage dropped by ~80%\\n\u2705 CPU utilization was significantly lower\\n\u2705 Faster, more efficient deployments\\n\\n#### Optimizing the Interaction Store\\nWe realized that we only need a user\u2019s interaction data in Redis when they open the app. So, we implemented a tiered storage approach:\\n\\n- \ud83d\udccc Cold Tier (ScyllaDB)\u2014Stores click, order, wishlist events\\n- \ud83d\udccc Hot Tier (Redis)\u2014Loads a user\u2019s past interactions only when they open the app\\n\\nSmart Offloading: We introduced an inactivity tracker to detect when a user session ends. At that point, Redis data was flushed back to Scylla, reducing unnecessary writes.\\n\\n![InteractionStore](./interaction-str.png)\\n#### Results\\n\\n- Online Feature Store hit 1M QPS for the first time during the 2023 Mega Blockbuster Sale\u2014without breaking a sweat\\n- Infra costs for Online Feature Store and Interaction Store dropped by ~60%\\n\\n#### The Catch: Our ML Hosting Hit a Hard Limit\\nWhile planning for 2023 MBS, we ran into a critical scalability bottleneck:\\n\\n- \u274c Insufficient compute availability in our region for ML instances\\n- \u274c Couldn\u2019t provision enough nodes to handle real-time inference at scale\\n\\nThis forced us to rethink where and how we hosted our models. The existing setup was great for prototyping\u2014but it wasn\u2019t built to handle the bursty, high-QPS demands of real-world production workloads.\\n\\n### Conclusion: From Firefighting to Future-Proofing\\nWhat started as an ambitious experiment turned into a real-time ML infrastructure that powered millions of requests per second. We battled scaling pains, rethought feature retrieval with Inferflow, and rebuilt our infra stack for efficiency\u2014driving down costs while improving experimentation velocity.\\nBut new challenges emerged. Our infrastructure could now handle scale, but our ML model hosting setup hit a hard limit. 
With compute availability bottlenecks threatening real-time inference, we faced a critical decision: how do we make model serving as scalable and cost-efficient as the rest of our stack? That\u2019s the next piece of the puzzle\u2014and the story of Part 3."},{"id":"post-one","metadata":{"permalink":"/BharatMLStack/blog/post-one","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-one/index.md","source":"@site/blog/bharatmlstack-history/post-one/index.md","title":"Building Meesho\u2019s ML Platform: From Chaos to Cutting-Edge (Part 1)","description":"BharatMLStack","date":"2022-11-15T00:00:00.000Z","tags":[{"inline":true,"label":"online-feature-store","permalink":"/BharatMLStack/blog/tags/online-feature-store"},{"inline":true,"label":"interaction-store","permalink":"/BharatMLStack/blog/tags/interaction-store"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"}],"readingTime":10.25,"hasTruncateMarker":false,"authors":[{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null},{"name":"Aditya Kumar","title":"Lead Software Engineer @ Meesho","url":"https://github.com/Adit2607","imageURL":"https://github.com/Adit2607.png","key":"aditya","page":null},{"name":"Bhawani Singh","title":"Architect @ Meesho","url":"https://github.com/singh-bhawani","imageURL":"https://github.com/singh-bhawani.png","key":"bhawani","page":null},{"name":"Jigar Dave","title":"Lead Software Engineer @ Meesho","url":"https://github.com/jigarpatel26","imageURL":"https://github.com/jigarpatel26.png","key":"jigar","page":null}],"frontMatter":{"slug":"post-one","title":"Building Meesho\u2019s ML Platform: From Chaos to Cutting-Edge (Part 
1)","authors":["adarsha","aditya","bhawani","jigar"],"date":"2022-11-15T00:00:00.000Z","tags":["online-feature-store","interaction-store","mlplatform","meesho"]},"unlisted":false,"prevItem":{"title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","permalink":"/BharatMLStack/blog/post-two"}},"content":"![BharatMLStack](./bms.png)\\n## The Genesis: How a Friday Night Roast Sparked Meesho\u2019s ML Platform\\n\\nIt all started in early 2022, over a casual Friday evening catch-up. Like many great origin stories, this one began with friendly banter between a group of backend engineers and data scientists. As the conversations unfolded, so did the roasting\u2014until one remark hit a little too close to home:\\n\\n*\\"Why are we still crunching data for Monthly Active Users (MAU) when the next day it\u2019s all about Daily Active Users (DAU)?\\"*\\n\\nThe laughter died down, and the question lingered. When we regrouped on Monday\u2014clear-headed and slightly reflective\u2014we decided to dig into the numbers. What they discovered was quite revealing: a large portion of compute resources wasn\u2019t being put to good use.\\nMuch of the system\u2019s effort was spent supporting users who weren\u2019t actively engaging, and even for new users, the experience wasn\u2019t optimized to make a meaningful impact.\\n\\nAt the same time, Meesho had just launched a company-wide initiative to reduce costs\u2014and every team had to contribute. 
This realization sparked the journey that would eventually lead to the **Meesho ML Platform**, known today as **BharatMLStack**.\\n\\n![Alt Text](./old-batch-arch.png)\\n\\nBefore the ML Platform, our recommendation and ranking pipelines followed a batch processing approach:\\n- **Data Ingestion**: The Data Platform team executed ETL jobs to ingest raw user data\u2014including user profiles, interaction logs, and product impressions\u2014into designated S3 buckets.\\n- **Layer 1**: Embedding Generation: On the Data Science side, Spark jobs pulled data from multiple S3 sources, cleaned and preprocessed it, and applied matrix factorization to generate user and item embeddings. The processed data and embeddings were then stored back in S3 in a structured format.\\n- **Layer 2**: Candidate Generation (CG): In this stage, Spark jobs leveraged embeddings and historical interaction data to generate candidate recommendations for users. These candidate lists were subsequently written to S3.\\n- **Layer 3**: Ranking and Merging \u2013 A final round of processing ranked the generated candidates using ML models, combined different candidate lists, and stored the final ranked recommendations in a caching system.\\n- **Serving**: A microservice retrieved ranked recommendations from an in-memory data store via exposed APIs, delivering personalized listings across key surfaces such as \\"For You\\" and Category Landing Pages (CLP).\\n\\nThis approach held up well\u2014until Meesho started seeing a significant surge in traffic.\\n\\n## The Turning Point: From Batch to Real-Time\\n\\nAt this time, the team was iterating on new **Ranker models**, and real-time inference seemed like the next logical step. But Rankers needed **real-time feature retrieval**, which meant an **online feature store** had to be built first.\\n\\nExploring open-source options led to **cost vs. 
performance trade-offs**, but Meesho\u2019s surging traffic meant that **latency and stability were non-negotiable**. After multiple debates and stakeholder discussions, a bold decision was made:\\n\\n*We would build our own feature store.*\\n\\nMeanwhile, efforts began to bring **Candidate Generators (CGs)** to real-time. The challenge? **Storing and retrieving user interactions quickly enough** to power real-time recommendations.\\n\\nAs the team dove deeper, a new roadblock emerged: \\nOur ML jobs were orchestrated using **Airflow DAGs**, giving data scientists flexibility in experimentation. But transitioning to real-time execution threatened this agility. Every change would now require backend engineering support, **slowing down iteration cycles**.\\n\\nThat\u2019s when the idea struck: \\nWe needed a **framework for real-time DAG execution**\u2014one that preserved the same flexibility as Airflow but worked for **streaming data**.\\n\\nThis moment shaped the **next phase of our journey**.\\n\\n## First Generation Design\\n\\n![Alt Text](./first-gen-arch.png)\\n\\n# Laying the Groundwork: The First-Gen ML Platform\\n\\nTo solve these challenges, the team built three foundational components:\\n\\n\\n### 1. IOP Framework: A Real-Time DAG Executor\\n\\n- **Reusable Nodes**: Each DAG node (e.g., an invocation to a CG service, a ranker, or a filter) had to be implemented only once. After that, it could be reused across any workflow by referencing it in config.\\n- **Config-driven Dynamic Graphs**: Execution graphs were defined as adjacency lists stored in **ZooKeeper**, allowing teams to modify the sequence or structure of operations without touching application code.\\n- **Plug-and-play CGs**: The Candidate Generator interface was preserved, so a single CG node could call any CG service by passing `cg_name` in the request. 
This drastically reduced the code surface area and improved maintainability.\\n- **Production-Grade DAGs**: DAGs were designed to execute in **low-latency real-time environments**, with support for **parallel execution, retries, and branching**.\\n\\n[More about IOP DAG](https://www.meesho.io/blog/rebuilding-meeshos-ranking-platform)\\n\\n\\n### 2. Online Feature Store - 0th Version\\n\\n- Used **Cassandra** and **Redis** for low-latency feature serving.\\n- Maintained feature consistency using **Feature Groups** with TTL-based expiry.\\n- A hybrid schema was used: feature keys stored in **ZooKeeper**, data stored in **compact arrays**.\\n\\n\\n### 3. Interaction Store - 0th Version\\n\\n- Captured real-time user interactions like clicks, orders, and add-to-cart events.\\n- Stored event data in **Redis ZSETs (sorted sets)** to enable fast lookups for recommendation engines.\\n- Provided an API to fetch a user\'s **last _k_ interactions** or **interactions within a time window**.\\n\\n\\nWith these components in place, **real-time ML at Meesho became a reality**.\\n\\nThis was just the beginning.\\n\\n## Building the Online Feature Store - 0th Version\\n\\n![Alt text](./online-feature-store-v0.png)\\n\\n### Choosing the Right Tech Stack\\n\\nWe spent considerable time evaluating various databases, caches, and communication protocols for our **online feature store**. 
After carefully weighing **cost, latency, throughput**, and **operational stability**, we settled on a combination of:\\n\\n- **Cassandra** and **Redis** for storage\\n- **gRPC + Proto3** as our communication layer\\n\\n\\n### Streamlining the Data Flow\\n\\nTo keep things simple in the initial version:\\n\\n- **Feature engineering jobs** wrote raw outputs to an **S3 bucket**\\n- A **daily feature push job**:\\n - Read from S3\\n - Grouped related features into **Feature Groups** (ensuring consistency)\\n - Pushed them to **Kafka**\\n\\nFor features requiring frequent updates:\\n\\n- **Ad-hoc jobs** computed features in higher frequency\\n- These jobs pushed to both **Kafka** and **S3** (S3 preserved historical data for future model training)\\n\\n\\n## The Challenges: Data Format and Storage\\n\\nOne of the most critical design challenges was how to store feature data **efficiently and consistently**, especially in databases like **Cassandra** and **Redis**, which come with unique storage constraints.\\n\\nWe had to solve for three key requirements:\\n\\n- ### Feature Consistency\\n When a feature group contains features like `order_count_1h` and `click_count_1h`, both must reflect the **same time window**. Inconsistent updates would lead to **unreliable model predictions**.\\n\\n- ### TTL Granularity\\n Each feature group required an **expiry timestamp**, so that **all features within it expired together**\u2014preserving consistency during reads.\\n\\n- ### Extensibility Across Databases\\n We anticipated that infra needs would evolve. 
To future-proof our system, the data format was designed to be **decoupled from DB-specific layouts**, enabling portability to systems like **ScyllaDB**, **DynamoDB**, **HBase**, or **BigTable**.\\n\\n\\n---\\n\\n## Overcoming Technical Constraints\\nAt the time, we were using Cassandra, which not only imposed a soft limit of 75 columns per row, but also exhibited significant performance degradation as the number of columns increased further, particularly in memory constrained machines. Wide rows caused high memory usage during reads, unpredictable latencies due to heavy deserialization overhead, and inefficiencies during compactions and repairs. This ruled out the naive \\"one column per feature\\" approach. We needed a format that was compact, minimized the number of columns, and remained efficient and portable across different storage systems.\\n\\n## The Solution: Schema Separation\\n\\nWe introduced the concept of Feature Groups\u2014logical groupings of features that must remain consistent with one another.\\nTo represent these groups efficiently, we adopted a layered storage approach:\\n\\n- **Feature Labels (Keys)** were stored in ZooKeeper, serving as the schema.\\n- **Feature Values** were stored as a comma-separated string array in Cassandra or Redis.\\n- **Expiry Timestamp and Schema Version** were appended using a semi-colon delimiter at the end of the string.\\n\\nExample:\\n\\n```bash\\nfeature_1_value,feature_2_value,feature_3_value;expiry_ts\\n```\\n\\nThis format allowed:\\n- Consistent writes and reads at the group level\\n- Easy parsing of feature values using the schema lookup from ZooKeeper\\n- Efficient storage with minimal DB column usage\\n- Support for per-group TTLs and schema evolution\\n\\n## Tracking Changes in Feature Groups\\nFeature groups don\u2019t stay static. As models evolve, features get added, renamed, or removed. 
But schema changes often go live before the data is ready\u2014and stopping ingestion just to wait for everything to align isn\'t feasible.\\n\\n### Common Real-World Scenarios:\\n- A new feature is added to the schema, but ingestion jobs still use the older schema version.\\n- Ongoing writes don\u2019t include the newly added feature, and stopping ingestion would break freshness for existing features.\\n- During serving, models request a mix of old and new features, depending on rollout stages.\\n\\n## The Solution: Schema Versioning\\nWe solved this with versioned feature group schemas, which unlocked several capabilities:\\n- ### Backward Compatibility\\n Older ingestion jobs can continue writing using older schema versions. During reads, the system uses the schema version embedded in the value to interpret the data correctly.\\n- ### Partial Availability Handling \\n During inference, if some features in the request aren\u2019t available (due to rollout delays or missing data), the system serves default values, ensuring the inference call doesn\u2019t fail.\\n- ### Safe Writes Without Pipeline Pauses\\n With schema versioning, we no longer had to stop ingestion pipelines for schema updates. Writes using previous versions can continue safely, and downstream consumers evolve independently.\\nThis design gave us the flexibility to move fast without breaking things\u2014preserving data quality, enabling experimentation, and ensuring reliability at scale.\\n\\n![Alt Text](./schema.png)\\n\\n## Interaction Store - 0th Version\\n\\n![Alt Text](./interaction-store-v0.png)\\n\\nTo power real-time Candidate Generators (CGs), we needed fast access to user behavior signals\u2014like what a user recently clicked, ordered, or added to their cart. 
These interactions form the basis for many real-time recommendations, such as **Similar Products**, **People Also Viewed**, or **Recently Ordered Again**.\\nFor the **0th version** of the Interaction Store, we focused on a design that was **simple, fast, and reliable** \u2014 optimized for high-throughput ingestion and low-latency lookups.\\n\\n## Event Ingestion\\nWe instrumented our backend services to emit key user interaction events to Kafka in real time. These included:\\n- Click\\n- Order\\n- Add to Cart\\n- Wishlist\\n- Share\\n\\nEach event carried essential metadata:\\n- userId \u2014 uniquely identifies the user\\n- productId \u2014 the item being interacted with\\n- timestamp \u2014 the moment the interaction occurred\\n\\nThis decoupled the interaction logging from storage, allowing ingestion and consumption to scale independently.\\n\\n## Storage Design\\nTo store these events, we built Kafka consumers that processed the incoming streams and wrote the data into Redis, using sorted sets (ZSETs) as the primary data structure.\\n\\n### Why Redis?\\nRedis gave us:\\n- **Low-latency** reads and writes\\n- **Time-ordered data** using ZSETs (via score = timestamp)\\n- **Native TTL support**, if needed in later versions\\n- **In-memory performance** \u2014ideal for real-time CGs\\n\\n### Storage Structure\\nEach user\u2019s interactions were stored using a composite key format, uniquely identifying the user and interaction type. 
This structure allowed efficient organization and quick retrieval of recent activity for recommendation generation:\\n\\n```bash\\nuserId_eventType \u2192 ZSET[...(pid, ts)...]\\n```\\n\\nWithin each ZSET:\\n\\n- The **timestamp** served as the score, maintaining temporal order\\n- The **productId** (optionally with metadata) was the **value**\\n\\nThis allowed us to efficiently retrieve the interactions with HTTP-based API server with two query modes:\\n- Fetch the **last k interactions** of a specific type for a given user with `ZREVRANGE(userId_eventType, count)`\\n- Retrieve **all interactions within a time range** (e.g., last 24 hours) with `ZREVRANGEBYSCORE(userId_eventType, timeRange)`\\n\\n### Built-in Guardrails\\nSince Redis was the sole store, we implemented High Availability (HA) to prevent data loss. To optimize memory usage, we also enforced size limits per event type\u2014only storing the last k interactions per user, with older entries getting truncated.\\n\\n## Conclusion: Laying the Foundation for Real-Time ML\\n\\nIn this first phase, we tackled the **fundamentals**\u2014shifting from batch-based recommendations to a **real-time Recommendation** using ML platform that could keep up with Meesho\u2019s growth.\\n\\nWith the **IOP Framework**, **Online Feature Store**, and **Interaction Store**, we built the core infrastructure to support real-time personalization at scale. These wins have already unlocked: \\n- \u2705 Faster, more dynamic recommendations for millions of users. \\n- \u2705 Better infrastructure efficiency, reducing wasted compute power. \\n- \u2705 A flexible, modular system that allows for further experimentation.\\n\\nBut this is just the beginning. 
While we\'ve solved key challenges, **certain roadblocks remain** \u2014from optimizing **cost-performance trade-offs** to **seamlessly evolving schemas**.\\n\\n\\nThis foundational work laid the path for a reliable and scalable **real-time feature serving layer**."}]}}')}}]); \ No newline at end of file diff --git a/docs/assets/js/6479fb86.96631f8d.js b/docs/assets/js/6479fb86.96631f8d.js deleted file mode 100644 index 6f77cfc2..00000000 --- a/docs/assets/js/6479fb86.96631f8d.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[5579],{3751:e=>{e.exports=JSON.parse('{"archive":{"blogPosts":[{"id":"post-five","metadata":{"permalink":"/BharatMLStack/blog/post-five","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-five/index.md","source":"@site/blog/bharatmlstack-history/post-five/index.md","title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","description":"BharatMLStack","date":"2025-06-02T00:00:00.000Z","tags":[{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"vllm","permalink":"/BharatMLStack/blog/tags/vllm"},{"inline":true,"label":"tensorrt-llm","permalink":"/BharatMLStack/blog/tags/tensorrt-llm"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":4.93,"hasTruncateMarker":false,"authors":[{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null}],"frontMatter":{"slug":"post-five","title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at 
Scale","authors":["jaya"],"date":"2025-6-2","tags":["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"nextItem":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/post-three"}},"content":"![BharatMLStack](./bms.png)\\n## LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale\\n\\nRaw execution of Large Language Models is inherently expensive and memory-intensive. To achieve sub-second latency and high throughput, we implement a multi-layered optimization strategy that targets the entire inference stack\u2014from memory management to kernel execution.\\n\\n## 1. Advanced Memory Management: Paged & Prefix KV Caching\\n\\nThe most significant bottleneck in LLM inference is not always compute, but memory bandwidth\u2014specifically managing the Key-Value (KV) cache.\\n\\n### Paged KV caching\\n\\nStandard caching suffers from fragmentation. We use **Paged KV caching**, which operates similarly to an operating system\'s virtual memory: the KV cache is divided into non-contiguous blocks. This lets us serve larger batch sizes without running out of memory.\\n\\n### KV cache quantization\\n\\nTo further maximize available memory, we implement **KV cache quantization** (e.g., FP8). By compressing stored attention keys and values from 16-bit to 8-bit, we nearly double the effective context window capacity of the GPU, allowing longer conversations or larger batches without materially degrading quality.\\n\\n### Prefix caching (the \\"voice bot\\" optimizer)\\n\\nFor use cases like GenAI voice bots where the system prompt (e.g., \\"You are a helpful assistant...\\") is static across thousands of requests, we enable **prefix caching**.\\n\\n- **Impact**: By reusing pre-computed KV states for common prefixes, we achieve a cache hit rate of ~90%. 
This reduces **Time To First Token (TTFT)** by skipping redundant computation of the system prompt.\\n\\n## 2. Aggressive Quantization (INT4 AWQ & FP8)\\n\\nRunning models in their native 16-bit precision (BF16) restricts maximum batch size and throughput. We use quantization to shrink model weights without sacrificing accuracy.\\n\\n### INT4 AWQ (Activation-aware Weight Quantization)\\n\\nFor the Llama 3 family, we use **AWQ** to compress weights to 4 bits. This reduces model size by ~75%, allowing larger models to fit into L4 GPU memory and significantly improving token generation speed.\\n\\n### FP8 precision\\n\\nFor NVIDIA Hopper (H100) architectures, we are exploring **FP8 quantization**, leveraging native FP8 tensor cores to accelerate matrix multiplications while maintaining a higher dynamic range than integer quantization.\\n\\n- **Verification**: We validate quantized models by comparing dot-product similarity of embeddings against the FP16 baseline, consistently achieving **>99% similarity**.\\n\\n## 3. Kernel Fusion & Custom Plugins\\n\\nTo minimize overhead from launching thousands of small GPU operations, we fuse them into monolithic kernels using NVIDIA TensorRT plugins.\\n\\n- **Flash attention & FMHA**: We enable **Fused Multi-Head Attention (FMHA)** combined with flash attention to reduce memory reads/writes.\\n- **GEMM plugins**: We use specialized **GEMM** plugins to accelerate transformer linear layers.\\n- **Removing input padding**: Instead of padding short sequences to match the longest, we remove input padding so the GPU processes only valid tokens.\\n\\n## 4. Inflight (Continuous) Batching\\n\\nTraditional static batching waits for all requests in a batch to finish before returning results\u2014so one long response delays everyone else.\\n\\nWe implement **inflight batching**: as soon as one request completes, its slot is freed and filled by a new request from the queue. 
This keeps GPUs saturated and decouples latency of short queries from long ones.\\n\\n## 5. Parallelism Strategies: Scaling Beyond One GPU\\n\\nFor large models (e.g., 70B+ parameters) that cannot fit into the VRAM of a single GPU, we use parallelism strategies.\\n\\n- **Tensor parallelism (TP)**: Split weight matrices across multiple GPUs (e.g., 4\xd7 L4 or 8\xd7 A100). Each GPU computes a shard and outputs are reduced at every layer.\\n- **Pipeline parallelism (PP)**: Split model layers across GPUs to pipeline compute (e.g., while one GPU computes later layers for Request A, another starts early layers for Request B).\\n\\n## 6. Speculative Decoding\\n\\nTo reduce inter-token latency (ITL), we explore **speculative decoding**.\\n\\n- **Mechanism**: A smaller, faster \\"draft\\" model speculatively generates a short token sequence (e.g., 5 tokens).\\n- **Verification**: The larger target model verifies those tokens in one parallel forward pass. If correct, we effectively generate multiple tokens per large-model step; if not, we discard and regenerate. 
This is effective for predictable text, improving perceived generation speed.\\n\\n## Few Benchmarks\\n\\nBelow are a couple of representative use cases and performance numbers.\\n\\n### Search query rewriting\\n\\n- **LLM**: Fine-tuned llama-3.2-1B\\n- **Input & output token length**: ~10\u201320\\n- **Response type**: Non-streaming\\n\\n| Inference runtime | Hardware | Max requests/sec | Max p99 latency |\\n| --- | --- | ---: | ---: |\\n| TensorRT-LLM | 4 \xd7 L4 GPUs (multi-GPU) | 1000 | 95 ms |\\n| TensorRT-LLM | 1 \xd7 A100 40 GB GPU | 1000 | 69 ms |\\n\\n### Voice bot query\\n\\n- **LLM**: Llama-3.1-8B\\n- **Input token length**: ~1900\u20132000\\n- **Output token length**: ~200\\n- **Response type**: Streaming\\n\\n| Inference runtime | Concurrency | p99 TTFT (ms) | p99 ITL (ms) | Token throughput (tokens/sec) | Request throughput (req/sec) | Hardware |\\n| --- | ---: | ---: | ---: | ---: | ---: | --- |\\n| TensorRT-LLM | 1 | 36.27 | 22.78 | 45.66 | 0.23 | L4 |\\n| TensorRT-LLM | 2 | 49.81 | 23.21 | 89.37 | 0.45 | L4 |\\n| TensorRT-LLM | 4 | 55.33 | 36.62 | 153.39 | 0.78 | L4 |\\n| TensorRT-LLM | 8 | 66.5 | 39.11 | 279.88 | 1.47 | L4 |\\n| TensorRT-LLM | 16 | 131.8 | 30.39 | 547.8 | 2.77 | L4 |\\n| TensorRT-LLM | 32 | 277.22 | 48.02 | 925.7 | 4.78 | L4 |\\n| TensorRT-LLM | 64 | 498.52 | 71.62 | 1,164.40 | 6.2 | L4 |\\n| TensorRT-LLM | 128 | 677.31 | 120.37 | 1,445.18 | 7.69 | L4 |\\n| TensorRT-LLM | 256 | 1,926.31 | 216.88 | 1,600.81 | 8.52 | L4 |\\n| TensorRT-LLM | 1 | 21.17 | 9.24 | 130.05 | 0.68 | A100 |\\n| TensorRT-LLM | 2 | 25.78 | 9.21 | 264.5 | 1.35 | A100 |\\n| TensorRT-LLM | 4 | 28.52 | 10.99 | 437.69 | 2.27 | A100 |\\n| TensorRT-LLM | 8 | 34.4 | 12.61 | 760.49 | 3.96 | A100 |\\n| TensorRT-LLM | 16 | 68.03 | 14.32 | 1,343.80 | 7.01 | A100 |\\n| TensorRT-LLM | 32 | 185.96 | 16.82 | 2,287.30 | 11.92 | A100 |\\n| TensorRT-LLM | 64 | 136.87 | 21.17 | 3,625.22 | 18.89 | A100 |\\n| TensorRT-LLM | 128 | 463.78 | 34.15 | 4,456.51 | 23.24 | A100 |\\n| 
TensorRT-LLM | 256 | 890.12 | 59.18 | 5,188.24 | 27.05 | A100 |\\n\\n## Conclusion\\n\\nHigh-performance LLM inference is fundamentally a systems engineering problem: memory efficiency, kernel execution, batching strategy, and parallelism determine real-world latency and throughput. Techniques such as paged KV caching, aggressive quantization, kernel fusion, and inflight batching improve GPU utilization while reducing latency and memory pressure.\\n\\nThese optimizations enable the platform to deliver sub-second responses, sustain high concurrency, and efficiently serve both lightweight and long-context workloads. By continuously optimizing across the full inference stack, we keep LLM serving scalable, cost-efficient, and production-ready for real-time AI applications."},{"id":"post-three","metadata":{"permalink":"/BharatMLStack/blog/post-three","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-four/index.md","source":"@site/blog/bharatmlstack-history/post-four/index.md","title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","description":"BharatMLStack","date":"2025-03-29T00:00:00.000Z","tags":[{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"vllm","permalink":"/BharatMLStack/blog/tags/vllm"},{"inline":true,"label":"tensorrt-llm","permalink":"/BharatMLStack/blog/tags/tensorrt-llm"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":13.38,"hasTruncateMarker":false,"authors":[{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null}],"frontMatter":{"slug":"post-three","title":"Designing a 
Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","authors":["jaya"],"date":"2025-3-29","tags":["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","permalink":"/BharatMLStack/blog/post-five"},"nextItem":{"title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","permalink":"/BharatMLStack/blog/post-three"}},"content":"![BharatMLStack](./bms.png)\\n## Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving\\n\\n\\n\\nServing large language models in production introduces new challenges across infrastructure, performance optimization, and operational lifecycle management. The LLM Inference Platform addresses these challenges by providing a unified system for deploying and managing open-source and fine-tuned LLMs at scale.\\n\\nThe platform implements a complete LLMOps lifecycle \u2014 from model registration and automated compilation to deployment, runtime optimization, and monitoring. Designed as a self-service environment, users can onboard models directly from open repositories such as Hugging Face or upload custom fine-tuned models, and deploy them using a single-click workflow with no manual infrastructure or configuration steps required.\\n\\nIn addition to fully automated deployment, the platform allows users to select and apply custom inference optimization techniques \u2014 such as quantization strategies, batching configurations, and runtime-specific performance enhancements \u2014 enabling teams to balance latency, throughput, and cost based on their use case. 
The goal is to reduce operational friction while enabling high-performance, production-grade LLM inference.\\n\\n## Why LLM Inference Is not just bigger ML model serving\\n\\nLarge language model (LLM) inference introduces a fundamentally different set of challenges compared to traditional machine learning inference. While classical ML models typically perform a single forward pass to produce a fixed prediction, LLMs operate as autoregressive systems, generating outputs token by token based on previously generated context. This difference dramatically changes how inference systems must be designed, optimized, and scaled.\\n\\n### Autoregressive Generation and Sequential Computation:\\n\\nUnlike traditional models such as classifiers or recommenders \u2014 where inference cost is relatively constant \u2014 LLMs generate responses incrementally. Each new token depends on all previously generated tokens, making inference inherently sequential and dynamic. This means latency and compute requirements vary significantly depending on prompt length and output size, introducing complexity in scheduling and resource allocation.\\nBecause tokens cannot be generated fully in parallel during decoding, GPUs may become underutilized without specialized batching and scheduling strategies. This has led to the development of dedicated LLM inference engines optimized for token-level execution.\\n\\n### Prefill and Decode Phases:\\n\\nLLM inference typically consists of two distinct stages:\\n\\n- Prefill phase \u2014 the model processes the input prompt and builds internal representations. 
This stage is compute-heavy and highly parallelizable.\\n- Decode phase \u2014 the model generates tokens sequentially, predicting one token at a time using previously generated context.\\n\\nThe decode stage often becomes memory-bound rather than compute-bound, which creates new performance bottlenecks compared to traditional ML workloads.\\n\\n### Context Management and KV Caching:\\n\\nAnother fundamental difference lies in how LLMs maintain context. Transformer-based models rely on attention mechanisms that require access to past token representations. To avoid recomputing these representations repeatedly, inference engines use key-value (KV) caching, which stores intermediate activations from previous tokens.\\nKV caching significantly improves performance by eliminating redundant computation, but it introduces new challenges:\\n\\n- Memory consumption grows with sequence length and batch size\\n- GPU memory becomes a critical bottleneck\\n- Efficient memory management becomes essential for scaling concurrent requests\\n\\nThis tradeoff between compute efficiency and memory usage is unique to LLM inference workloads.\\n\\n### Dynamic and Irregular Workloads:\\n\\nTraditional ML inference typically operates on fixed-size inputs with predictable latency. In contrast, LLM requests vary widely in prompt length, output length, and runtime behavior. As a result:\\n\\n- Batch sizes must be dynamic rather than static\\n- Requests may enter and leave batches asynchronously\\n- Scheduling systems must continuously rebalance workloads to maximize GPU utilization\\n\\nThese characteristics require specialized serving architectures that differ significantly from standard ML serving pipelines.\\n\\n### Streaming and User Experience Constraints:\\n\\nAnother distinguishing factor is the expectation of real-time streaming responses. Instead of returning a single output, LLM systems often stream tokens to users as they are generated. 
\\nBecause of these differences \u2014 sequential generation, growing memory requirements, dynamic workloads, and streaming constraints \u2014 LLM inference cannot be treated as a simple extension of existing ML serving systems. Production platforms must incorporate specialized runtime engines, advanced optimization techniques, and observability tailored specifically to LLM workloads.\\n\\n## LLMOps: High-Level Architecture \\n\\n![LLM Architecture](./llm-plat.png)\\n\\nThe LLM Inference Framework is designed as a fully automated, end-to-end system for deploying and operating open-source and fine-tuned large language models at scale. The architecture abstracts the complexity of model optimization, hardware selection, deployment, and runtime management into a unified workflow that enables users to move from raw model weights to production-ready inference endpoints with minimal manual intervention.\\n\\nOur LLM Inference Framework is architected not just as a serving engine, but as a complete lifecycle management system. As illustrated in the high-level design below, the platform automates the journey of a model through seven distinct stages, ensuring reproducibility, performance, and scalability.\\n\\n1. Onboarding & Registration (The Source of Truth)\\n\\n The lifecycle begins with the Data Scientist or engineer.\\n\\n - Model Ingestion: Users onboard models\u2014whether open-source (Hugging Face, NeMo) or internally fine-tuned\u2014via the Truffle Box SDK/UI.\\n - LLM + Prompt Registry: Unlike traditional systems that only track model weights, our registry is a unified control plane. It stores both the Model Artifacts and the Prompt Templates. This allows Data Scientists to register and version-control prompts (e.g., \\"customer_support_v2\\") independently of the application code.\\n\\n2. 
The \\"Black Box\\" Build Engine\\n\\n Once a model is registered, the Automated LLM Compiler + Quantizer Module kicks off a background job on ephemeral GPU resources.\\n\\n - Transformation: The raw model is converted into a TRT-LLM Checkpoint.\\n - Quantization: The system automatically applies quantization algorithms (like INT4 AWQ or FP8) to reduce memory footprint.\\n - Engine Building: Finally, it compiles a highly optimized TRT Engine specifically tuned for the target hardware.\\n\\n3. Intelligent Profiling & Validation\\n\\n Before deployment, the new engine passes through the Hardware & Inference Runtime Profiler.\\n\\n - Benchmarking: This module empirically tests the engine against various hardware configurations (L4 vs. A100) and runtimes (TRT-LLM vs. vLLM).\\n - Optimization: It recommends the optimal configuration that meets latency SLAs (Time-To-First-Token) while minimizing cost.\\n\\n4. Smart Artifact Generation & Distribution\\n\\n To solve the Kubernetes \\"Cold Start\\" problem, the LLM Serving Artifacts Generation module packages the model using a bifurcated strategy:\\n\\n - Standard Models: Artifacts are uploaded to Cloud Storage (GCS) and downloaded by pods at startup.\\n - Very Large Models: For massive models (>8GB) where network downloads are too slow, the system pre-caches the model onto Secondary Boot Disks. These disks are attached directly to new GPU nodes during autoscaling, eliminating download wait times.\\n\\n5. Image Streaming & Deployment\\n\\n Simultaneously, the inference runtime container images are pulled from the Artifact Registry.\\n\\n - Image Streaming: We utilize container image streaming to allow pods to start initializing while the massive Triton/Dynamo container layers are still downloading, further shaving seconds off the startup time. link\\n\\n6. 
The Inference Runtime (Kubernetes)\\n\\n The workload lands on Kubernetes with Autoscaling.\\n\\n - Dynamic Backends: Depending on the profile generated in Stage 3, the pod initializes either TensorRT-LLM (for throughput) or vLLM (for flexibility), or spins up a Dynamo worker for distributed inference.\\n - Data Loading: The pod either downloads the model from Cloud Storage or mounts the pre-warmed Secondary Boot Disk (\\"Pull from Disk\\").\\n\\n7. Client Interaction & Observability\\n\\n Finally, the LLM Inference Client executes the request.\\n\\n - Prompt Injection: The client pulls the specific prompt template ID from the Registry, ensuring the exact versioned instructions are used.\\n - Streaming Response: The request is sent via gRPC, and tokens are streamed back to the user in real-time.\\n\\n8. Observability: Monitoring the Pulse of GenAI\\n\\n In traditional microservices, success is measured by CPU utilization and request latency (p99). For Large Language Models, these metrics are insufficient. A user doesn\'t care if the GPU is at 80% utilization; they care about how fast the first word appears and how smoothly the rest of the sentence follows.\\n\\n To capture the true user experience, our platform instrumentation focuses on three critical LLM-specific metrics:\\n\\n 1. Time to First Token (TTFT)\\n - Definition: TTFT measures the time elapsed from the moment a request is received until the very first token is generated and streamed back to the user.\\n - Why it matters: This represents the \\"Prefill Phase\\" latency\u2014the time the model takes to process the input prompt and load weights. A high TTFT makes the application feel unresponsive or \\"hung.\\"\\n - Optimization: We closely monitor TTFT to ensure our Prefix Caching is effective (aiming for high cache hitrates), which drastically lowers this metric by skipping redundant prompt processing.\\n\\n 2. 
Inter-Token Latency (ITL)\\n - Definition: ITL measures the average time interval between the generation of consecutive tokens during the \\"Decode Phase\\".\\n - Why it matters: This defines the \\"perceived speed\\" of reading. Even if the first token is fast (low TTFT), high ITL makes the text generation look \\"jerky\\" or slow to the user.\\n - Benchmarks: In our testing with Llama 3.1, we track p99 ITL to ensure it stays below human reading speeds to maintain a natural conversational flow.\\n\\n 3. Token Throughput vs. Request Throughput\\n - We distinguish between two types of throughput to balance system efficiency with user load:\\n - Token Throughput (tokens/sec): The total number of tokens generated across all concurrent requests. This measures the raw compute efficiency of the GPU and the effectiveness of batching.\\n - Request Throughput (req/sec): The number of distinct user queries served per second. We use this to determine autoscaling thresholds, ensuring we scale out before the queue depth impacts ITL.\\n\\n 4. The Monitoring Stack\\n - Real-time Dashboards: We utilize Grafana to visualize these streaming metrics in real-time, allowing on-call engineers to spot \\"slow generation\\" incidents that generic \\"500 error\\" alerts would miss.\\n - Request Tracing: Since Triton Inference Server does not log request payloads by default, we integrate a Helix Client to asynchronously publish request logs to Log Tables. This allows us to trace a specific \\"slow\\" request back to its prompt to understand if a complex input caused the latency spike.\\n\\n## Supported Inference backends (TensorRT LLM, Dynamo & vLLM)\\n\\nTailored for the Use Case: We do not believe in a \\"one-size-fits-all\\" approach to inference. Different use cases\u2014whether a real-time voice bot requiring ultra-lowsub-second latency or a massive reasoning task requiring huge context windows\u2014demand different runtime characteristics. 
Our platform is designed to be runtime-agnostic, allowing us to automatically select and tailor the best engine based on the specific requirements of the application:\\n\\n1. TensorRT-LLM: The High-Performance Standard\\n\\n Suitable for: High-throughput production workloads where latency is critical (e.g., customer support chat, real-time voice bots).\\n\\n TensorRT-LLM serves as our default backend for these scenarios. Our internal benchmarks on Llama 3.1 and 3.2 models demonstrated that a tuned TensorRT-LLM engine significantly outperforms standard runtimes, especially when utilizing INT4 AWQ and FP8 quantization .\\n\\n Key optimizations we tailor for these high-load cases include:\\n\\n - Optimized execution via TensorRT engine compilation\\n - Quantization-aware execution for reduced memory usage and improved throughput\\n - Inflight Batching: Allowing requests to be processed continuously without waiting for the entire batch to finish, drastically improving GPU utilization .\\n - Custom Plugins: Enabling specific NVIDIA plugins like the GEMM plugin and GPT Attention plugin to accelerate matrix multiplications and attention mechanisms .\\n\\n2. Dynamo: Distributed Inference for Reasoning Models\\n\\n Suitable for: Very large \\"reasoning\\" models (70B+) or scenarios requiring massive context windows where a single GPU\'s memory is insufficient.\\n\\n For these memory-bound tasks, we utilize Dynamo, a low-latency distributed inference framework . Unlike monolithic servers, Dynamo disaggregates the inference process to scale resources horizontally:\\n\\n - KV Aware Routing: A specialized router directs requests to workers that already hold the relevant Key-Value (KV) cache, minimizing redundant computation .\\n - Prefill vs. 
Decode Split: The workload is divided into Prefill Workers (processing the prompt) and Decode Workers (generating tokens), allowing us to scale the compute-heavy \\"reading\\" phase independently from the memory-heavy \\"writing\\" phase .\\n - Distributed execution across multiple GPU resources\\n\\n3. vLLM: The Flexible Baseline\\n\\n Suitable for: Rapid prototyping, testing new model architectures, or low-traffic internal tools where ease of deployment outweighs raw throughput.\\n\\n While TensorRT-LLM is optimized for maximum speed, vLLM provides a robust and flexible baseline .\\n\\n - High throughput through dynamic batching and efficient memory utilization\\n - Paged KV cache management for handling long contexts and concurrent requests\\n - Strong support for open-source model ecosystems\\n - Rapid Adoption: It allows us to onboard new model architectures immediately without waiting for a custom TensorRT build.\\n - Benchmarking Insight: In our internal tests, vLLM provided a strong baseline but often lacked the specific max-token optimizations present in our custom TRT engines . We use it strategically for initial testing before committing to a full TensorRT optimization pipeline.\\n\\n## Conclusion\\n\\nLarge language model inference introduces a fundamentally new class of infrastructure challenges\u2014where performance is governed not just by raw compute, but by memory efficiency, intelligent scheduling, runtime specialization, and lifecycle automation. Unlike traditional ML serving, LLM inference requires systems that understand token-level execution, manage rapidly growing context state, and continuously balance latency, throughput, and cost under highly dynamic workloads.\\n\\nThe LLM Inference Framework addresses these challenges by transforming inference into a fully automated, reproducible lifecycle\u2014from model onboarding and compilation to deployment, optimization, and observability. 
By integrating automated quantization and engine compilation, intelligent runtime selection, cold-start mitigation strategies, and LLM-specific observability metrics such as Time-to-First-Token and Inter-Token Latency, the platform ensures both high performance and operational simplicity.\\n\\nEqually important, the framework is designed with flexibility and future evolution in mind. Its runtime-agnostic architecture enables seamless adoption of emerging inference engines, hardware accelerators, and optimization techniques without requiring platform redesign. This ensures that teams can continuously leverage advancements in the rapidly evolving LLM ecosystem while maintaining consistent operational workflows.\\n\\nUltimately, the goal of the platform is to make production-scale LLM deployment as seamless and reliable as traditional software deployment\u2014allowing teams to focus on building intelligent applications rather than managing infrastructure complexity. By combining lifecycle automation, runtime optimization, and deep observability, the LLM Inference Framework provides a scalable foundation for delivering fast, cost-efficient, and production-ready LLM experiences.\\n\\n## Future Explorations\\n\\nWhile we have achieved significant milestones in latency and throughput, the landscape of GenAI is evolving rapidly. Our roadmap focuses on increasing flexibility, reducing costs, and enhancing reliability for enterprise-grade workloads. Here is what we are building next:\\n\\n- TPU Support: To diversify our hardware supply chain and further optimize cost-per-token, we are evaluating Google Cloud TPUs to bake it into our platform. By leveraging the JAX and PyTorch/XLA ecosystems, we aim to unlock the massive throughput potential of TPU v5e chips, particularly for our open-source Llama models. 
This will allow the hardware profiler to dynamically choose between NVIDIA GPUs and Google TPUs based on real-time availability and price-performance metrics.\\n- Multi-LoRA Serving (Serverless Experience): Currently, deploying a fine-tuned model requires a dedicated GPU. We are building support for Multi-LoRA serving, which will allow us to serve hundreds of unique, fine-tuned adapters on top of a single frozen base model. This will drastically reduce costs for multi-tenant applications, enabling a \\"serverless\\" experience where specific fine-tunes are hot-swapped instantly per request.\\n- Spot Instance Orchestration: To further optimize cloud costs, we are developing fault-tolerant mechanisms to run inference workloads on Spot Instances. By implementing aggressive checkpointing and seamless request draining, we aim to leverage cheaper, preemptible compute capacity without interrupting the user\'s streaming experience.\\n- Semantic Caching Layer: We plan to move beyond standard Prefix Caching to implement Semantic Caching. By using a vector database to fetch responses for semantically similar queries (e.g., \\"How do I reset my password?\\" vs. \\"Password reset steps\\"), we can bypass the GPU entirely for repetitive queries, reducing latency to near-zero.\\n- Context-Aware Autoscaling: Standard CPU/GPU utilization metrics are often insufficient signals for scaling LLMs. We are working on KV-cache pressure metrics for autoscaling. This ensures that we scale out before the memory fills up, preventing eviction-based slowdowns during traffic spikes.\\n- Online Evaluation & Guardrails: We are integrating a lightweight \\"Trust Layer\\" into the proxy. 
This will allow for low-latency input/output filtering (Guardrails) and asynchronous \\"LLM-as-a-Judge\\" evaluation pipelines to monitor response quality in production, not just system health."},{"id":"post-three","metadata":{"permalink":"/BharatMLStack/blog/post-three","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-three/index.md","source":"@site/blog/bharatmlstack-history/post-three/index.md","title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","description":"BharatMLStack","date":"2024-05-21T00:00:00.000Z","tags":[{"inline":true,"label":"model-inference","permalink":"/BharatMLStack/blog/tags/model-inference"},{"inline":true,"label":"embedding-search","permalink":"/BharatMLStack/blog/tags/embedding-search"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":3.6,"hasTruncateMarker":false,"authors":[{"name":"Aditya Kumar","title":"Lead Software Engineer @ Meesho","url":"https://github.com/Adit2607","imageURL":"https://github.com/Adit2607.png","key":"aditya","page":null},{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null},{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"slug":"post-three","title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","authors":["aditya","jaya","adarsha"],"date":"2024-05-21T00:00:00.000Z","tags":["model-inference","embedding-search","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"Designing a Production-Grade LLM Inference Platform: From 
Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/post-three"},"nextItem":{"title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","permalink":"/BharatMLStack/blog/post-two"}},"content":"![BharatMLStack](./bms.png)\\n\\n## Cracking the Code: Scaling Model Inference & Real-Time Embedding Search\\n\\nBy mid-2023, we had transformed our ML stack\u2014building a real-time feature store, optimizing model retrieval, and fine-tuning ranking. But two critical gaps remained:\\n\\n- \ud83d\udd39 Scaling model inference without hitting infrastructure roadblocks\\n- \ud83d\udd39 Moving embedding search from batch to real-time for candidate generation\\n\\nHere\u2019s how we tackled these last-mile challenges, broke free from infrastructure constraints, and built a cost-efficient, high-performance system.\\n\\n## Breaking Free from the Scalability Ceiling\\n\\n### The Model Serving Bottleneck\u2014A Wake-Up Call\\n\\nJuly 2023. With just months left for the Mega Blockbuster Sale (MBS), we noticed a serious issue\u2014scaling our model-serving infrastructure was taking 10\u201315 minutes. In real-time ML, that\u2019s an eternity.\\nIn one of our war rooms, we ran a quick experiment:\\n\\n- \ud83d\ude80 We deployed an XGBoost model on a self-hosted Triton Inference Server running on a 16-core machine.\\n- \ud83d\ude80 Fired requests and compared the outputs with our existing cloud-hosted setup.\\n- \ud83d\ude80 The results matched\u2014perfectly.\\n\\nThat moment changed everything. We prepped a backup Triton setup on EKS, just in case our cloud provider couldn\'t allocate enough compute resources in time. Luckily, they did\u2014but the seed was planted.\\nThen in October, just two weeks before MBS, we got an alarming response from our infrastructure team:\\n \\"Node availability may be an issue.\\"\\nWith no time to waste, we moved 30% of real-time ML traffic to our self-hosted Triton cluster. 
The results?\\n\\n- \u2705 p99 latency dropped from 90\u2013100ms to 30\u201340ms\\n- \u2705 Triton handled significantly higher throughput on fewer resources\\n- \u2705 No model changes were needed\\n\\nMBS ran without a hitch, proving that self-hosted inference was the way forward.\\n\\n### Scaling Triton on GKE\\n\\nThis left us with two choices:\\n\\n- 1\ufe0f\u20e3 Port models to a managed cloud inference service, investing time in learning a new deployment stack\\n- 2\ufe0f\u20e3 Scale our existing Triton setup on GKE, optimizing for cost and performance\\n\\nWe went with Option 2\u2014and it slashed inference costs to 35% of what we previously paid, while giving us full control over scaling and optimizations.\\n\\n### Fixing the Cold Start Problem\\n\\nAs we onboarded more deep learning (DL) models, we hit a new bottleneck, new inference pods took 7\u20139 minutes to spin up.\\n\\nAfter profiling, we found the culprits:\\n\\n- Triton\u2019s base image\u2014a massive 5GB\\n- Model binaries\u2014often 1GB+\\n- Startup delay\u2014mostly due to downloading and initializing these assets\\n\\nTo fix this, we built a lightweight Triton image, stripping unused components and shrinking the size to 900MB. This cut cold start times drastically, making auto-scaling faster and smoother.\\n\\n## Embedding Search: The Last Piece of the Puzzle\\n\\nBy mid-2023, most of our ML stack had gone real-time\u2014except for Candidate Generation (CG), which still ran in batch mode. 
To truly power real-time recommendations, we needed an online embedding search system.\\n\\n### Choosing the Right Vector Database\\n\\nWe benchmarked three production-ready vector DBs across key parameters:\\n\\n- Milvus\\n- Qdrant\\n- Weaviate\\n\\nAfter extensive POCs, Qdrant stood out for its:\\n\\n- \u2705 Blazing-fast search latency on high-dimensional vectors\\n- \u2705 Efficient memory usage, crucial for in-memory workloads\\n- \u2705 Support for upserts and soft deletes, vital for Ads use cases\\n- \u2705 gRPC + REST APIs, making integration seamless\\n- \u2705 Powerful filtering, allowing fine-tuned retrieval (e.g., filtering Ads by category, active status, etc.)\\n\\nAt its core, Qdrant uses HNSW indexing, delivering both high recall and low-latency nearest-neighbor search\u2014a perfect fit for our needs.\\n\\n### Embedding Freshness & Real-Time Updates\\n\\nTo ensure embeddings stayed up to date, we built a dual ingestion pipeline:\\n\\n- \ud83d\udccc Daily Refresh: A bulk pipeline updated embeddings overnight\\n- \ud83d\udccc Real-Time Updates: Ads events triggered immediate upserts/deletes\\n\\nThis setup powered real-time \\"Similar Products\\" recommendations on the product page and became the foundation for Ads Candidate Generation, ensuring the right ads surfaced in milliseconds.\\n\\n![Skye](./vss.png)\\n\\n## Final Takeaways: Scaling Smartly for Real-Time ML\\n\\n- \ud83d\ude80 Self-hosted inference on Triton gave us lower cost, faster scaling, and better performance than managed services\\n- \ud83d\ude80 Building a custom Triton image reduced cold starts, improving responsiveness\\n- \ud83d\ude80 Qdrant-based embedding search enabled real-time personalization at scale\\n- \ud83d\ude80 Real-time updates for embeddings unlocked dynamic, up-to-date recommendations\\n\\nBy early 2024, Meesho\u2019s ML stack had evolved into a fully real-time, scalable, and cost-efficient system, setting the foundation for even bigger leaps 
ahead."},{"id":"post-two","metadata":{"permalink":"/BharatMLStack/blog/post-two","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-two/index.md","source":"@site/blog/bharatmlstack-history/post-two/index.md","title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","description":"BharatMLStack","date":"2023-04-10T00:00:00.000Z","tags":[{"inline":true,"label":"inferflow","permalink":"/BharatMLStack/blog/tags/inferflow"},{"inline":true,"label":"interaction-store","permalink":"/BharatMLStack/blog/tags/interaction-store"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":6.31,"hasTruncateMarker":false,"authors":[{"name":"Bhawani Singh","title":"Architect @ Meesho","url":"https://github.com/singh-bhawani","imageURL":"https://github.com/singh-bhawani.png","key":"bhawani","page":null},{"name":"Jigar Dave","title":"Lead Software Engineer @ Meesho","url":"https://github.com/jigarpatel26","imageURL":"https://github.com/jigarpatel26.png","key":"jigar","page":null},{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"slug":"post-two","title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","authors":["bhawani","jigar","adarsha"],"date":"2023-4-10","tags":["inferflow","interaction-store","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","permalink":"/BharatMLStack/blog/post-three"},"nextItem":{"title":"Building Meesho\u2019s ML Platform: From Chaos to Cutting-Edge (Part 
1)","permalink":"/BharatMLStack/blog/post-one"}},"content":"![BharatMLStack](./bms.png)\\n## Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)\\n\\nBy late 2022, we had built something we were truly proud of\u2014a real-time ML serving system with a DAG-based executor, a feature store, and an interaction store powering key ranking and personalization models. It was a major milestone, the culmination of months of effort from data scientists, ML engineers, and backend teams. Our system was live, and we were ready to push the boundaries of experimentation.\\nAnd it worked. Mostly.\\nBut soon, cracks appeared. Every new model needed custom feature retrieval logic, DAGs became dense and unmanageable, and scaling turned into a constant firefight. Costs surged, and infra bottlenecks slowed experimentation. Our system worked, but it wasn\u2019t built for scale.\\nThis is the story of how we tackled these challenges\u2014building Inferflow for seamless feature retrieval, optimizing real-time infra, and cutting costs while scaling to millions of QPS.\\n\\n### The Cost of Success\\nEvery new Ranker model required its own feature set, often pulling from different entities. Each addition meant:\\n\\n- Adding new DAG nodes in IOP\\n- Writing custom logic to fetch features from multiple sources (e.g., user, product, user \xd7 category)\\n- Inferring intermediate features (e.g., extracting category from a product to fetch user \xd7 category data)\\n- Optimizing I/O and dealing with the inevitable bugs\\n\\nWhat began as clean DAGs soon turned into a tangled web of cross-dependent graphs. Every experimentation cycle meant new nodes, new dependencies, and slower iterations.\\n\\n### Scaling Pains (and Cassandra\u2019s Limits)\\nAt some point, we were hitting:\\n\\n- 250\u2013300K reads/sec\\n- 1M writes/sec (during lean hours)\\n\\nAll of this ran on Cassandra. 
While its distributed architecture had been proven in production, operating large-scale clusters came with considerable infrastructure overhead. Our proof-of-concept (POC) demonstrated throughput of around 100K ops/sec, but as we scaled further, the challenges grew. Ensuring node health, optimizing compaction, and maintaining storage balance became increasingly demanding. We also observed latency spikes under heavy load, alongside a sharp increase in total cost of ownership.\\n\\n### Interaction Store Woes\\nOur interaction store was another ticking time bomb:\\n\\n- \ud83d\udea8 Clusters kept growing in size and cost\\n- \ud83d\udea8 Latency spikes became increasingly frequent\\n- \ud83d\udea8 The DMC proxy occasionally lost locality of nodes against shards, causing cross-node communication and degraded performance\\n\\nEach time this happened, we had to manually rebalance shards just to restore stable latency, making operations unsustainable at scale.\\n\\n### Silver Linings\\nDespite the chaos, the system was live and delivering value:\\n\\n- Real-time infrastructure was in production\\n- Costs dropped by 60\u201370% compared to offline personalization\\n- New experiments rolled out faster and more successfully\\n- User engagement metrics improved\\n\\nIt wasn\u2019t perfect. It was far from easy. But it worked\u2014and that counted for a lot.\\n\\n### Round Two: Solving the Top 2 Bottlenecks\\nWith the first-gen system stretched to its limits, we stepped back. Conversations with data scientists and backend engineers revealed three recurring pain points:\\n\\n1. Coding feature retrieval logic for every new model was becoming unsustainable\\n2. ML scale was exploding\u2014bringing rising infra costs with it\\n3. 
Real-time embedding search was the next big unlock\\n\\nWe tackled them one by one\u2014starting with the biggest pain point.\\n\\n#### Problem 1: No-Code Feature Retrieval for Model Inference\\nWe noticed a pattern: for personalized ranking, models needed features from:\\n\\n- \u2705 Product\\n- \u2705 User\\n- \u2705 User \xd7 Category\\n- \u2705 Region, cohort, sub-category, etc.\\n\\nA key insight emerged: Entities that contribute features for a model always map back to the context entities.\\n\\n![MP Dag](./mp-dag.png)\\n\\nWith this, we designed Inferflow, a graph-driven feature retrieval and model orchestration system:\\n\\n- 1\ufe0f\u20e3 Inferflow takes a modelId and context IDs (e.g., userId, productIds)\\n- 2\ufe0f\u20e3 Loads a pre-defined feature retrieval graph from ZooKeeper\\n- 3\ufe0f\u20e3 Executes the graph to resolve entity relationships dynamically\\n- 4\ufe0f\u20e3 Outputs a 2D matrix of feature vectors\\n\\n\ud83d\udca1 The impact?\\n\\n- \ud83d\ude80 No more custom feature retrieval code\u2014just graph updates in config\\n- \ud83d\ude80 Feature consistency across experiments\\n- \ud83d\ude80 Faster iteration cycles for ranking, fraud detection, and beyond\\n\\nHere\u2019s a visual example that shows how this graph plays out during execution. We further extended the graph to call multiple models as needed:\\n![MP matrix](./mp-matrix.png)\\nWe built Inferflow in GoLang, using gRPC and Proto3 serialization for efficiency.\\n\\n#### Problem 2: Scaling Without Breaking the Bank\\nWith more ML use cases coming online, we needed to cut costs without compromising performance. 
We focused on:\\n\\n- \ud83d\udd39 Online Feature Store\\n- \ud83d\udd39 Interaction Store\\n\\n#### Optimizing the Online Feature Store\\nOur costs were concentrated in:\\n\\n- \ud83d\udccc Database (Cassandra)\\n- \ud83d\udccc Cache (Redis)\\n- \ud83d\udccc Running Pods (Java services)\\n\\n1\ufe0f\u20e3 Replacing Cassandra with ScyllaDB\\nAs we hit the operational limits of large Cassandra clusters, we transitioned to ScyllaDB, which offered a seamless drop-in replacement without major code changes. The switch brought significant benefits:\\n\\n- Throughput: Matched or exceeded Cassandra\'s performance under identical workloads, even under high concurrency.\\n- Latency: Achieved consistently lower P99 latencies due to ScyllaDB\'s shard-per-core architecture and better I/O utilization.\\n- Cost Efficiency: Reduced infra footprint by ~70% through better CPU and memory efficiency, eliminating the need for over-provisioned nodes.\\n\\n2\ufe0f\u20e3 Finding the Right Cache\\nTo reduce backend load and improve response times, we benchmarked multiple caching solutions\u2014Memcached, KeyDB, and Dragonfly\u2014under real production traffic patterns. Dragonfly stood out due to its robust architecture and operational simplicity:\\n\\n- Data Skew Handling: Efficiently managed extreme key hotness and uneven access patterns without performance degradation.\\n- Throughput: Delivered consistently high throughput, even with large object sizes and concurrent access.\\n- Ease of Adoption: Acted as a drop-in Redis replacement with full protocol compatibility\u2014no changes needed in application code or client libraries.\\n\\n3\ufe0f\u20e3 Moving to GoLang for Cost-Efficient Serving\\nJava services were memory-heavy\u2014so we rewrote core services in GoLang. 
The results?\\n\\n\u2705 Memory usage dropped by ~80%\\n\u2705 CPU utilization was significantly lower\\n\u2705 Faster, more efficient deployments\\n\\n#### Optimizing the Interaction Store\\nWe realized that we only need a user\u2019s interaction data in Redis when they open the app. So, we implemented a tiered storage approach:\\n\\n- \ud83d\udccc Cold Tier (ScyllaDB)\u2014Stores click, order, wishlist events\\n- \ud83d\udccc Hot Tier (Redis)\u2014Loads a user\u2019s past interactions only when they open the app\\n\\nSmart Offloading: We introduced an inactivity tracker to detect when a user session ends. At that point, Redis data was flushed back to Scylla, reducing unnecessary writes.\\n\\n![InteractionStore](./interaction-str.png)\\n#### Results\\n\\n- Online Feature Store hit 1M QPS for the first time during the 2023 Mega Blockbuster Sale\u2014without breaking a sweat\\n- Infra costs for Online Feature Store and Interaction Store dropped by ~60%\\n\\n#### The Catch: Our ML Hosting Hit a Hard Limit\\nWhile planning for 2023 MBS, we ran into a critical scalability bottleneck:\\n\\n- \u274c Insufficient compute availability in our region for ML instances\\n- \u274c Couldn\u2019t provision enough nodes to handle real-time inference at scale\\n\\nThis forced us to rethink where and how we hosted our models. The existing setup was great for prototyping\u2014but it wasn\u2019t built to handle the bursty, high-QPS demands of real-world production workloads.\\n\\n### Conclusion: From Firefighting to Future-Proofing\\nWhat started as an ambitious experiment turned into a real-time ML infrastructure that powered millions of requests per second. We battled scaling pains, rethought feature retrieval with Inferflow, and rebuilt our infra stack for efficiency\u2014driving down costs while improving experimentation velocity.\\nBut new challenges emerged. Our infrastructure could now handle scale, but our ML model hosting setup hit a hard limit. 
With compute availability bottlenecks threatening real-time inference, we faced a critical decision: how do we make model serving as scalable and cost-efficient as the rest of our stack? That\u2019s the next piece of the puzzle\u2014and the story of Part 3."},{"id":"post-one","metadata":{"permalink":"/BharatMLStack/blog/post-one","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-one/index.md","source":"@site/blog/bharatmlstack-history/post-one/index.md","title":"Building Meesho\u2019s ML Platform: From Chaos to Cutting-Edge (Part 1)","description":"BharatMLStack","date":"2022-11-15T00:00:00.000Z","tags":[{"inline":true,"label":"online-feature-store","permalink":"/BharatMLStack/blog/tags/online-feature-store"},{"inline":true,"label":"interaction-store","permalink":"/BharatMLStack/blog/tags/interaction-store"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"}],"readingTime":10.25,"hasTruncateMarker":false,"authors":[{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null},{"name":"Aditya Kumar","title":"Lead Software Engineer @ Meesho","url":"https://github.com/Adit2607","imageURL":"https://github.com/Adit2607.png","key":"aditya","page":null},{"name":"Bhawani Singh","title":"Architect @ Meesho","url":"https://github.com/singh-bhawani","imageURL":"https://github.com/singh-bhawani.png","key":"bhawani","page":null},{"name":"Jigar Dave","title":"Lead Software Engineer @ Meesho","url":"https://github.com/jigarpatel26","imageURL":"https://github.com/jigarpatel26.png","key":"jigar","page":null}],"frontMatter":{"slug":"post-one","title":"Building Meesho\u2019s ML Platform: From Chaos to Cutting-Edge (Part 
1)","authors":["adarsha","aditya","bhawani","jigar"],"date":"2022-11-15T00:00:00.000Z","tags":["online-feature-store","interaction-store","mlplatform","meesho"]},"unlisted":false,"prevItem":{"title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","permalink":"/BharatMLStack/blog/post-two"}},"content":"![BharatMLStack](./bms.png)\\n## The Genesis: How a Friday Night Roast Sparked Meesho\u2019s ML Platform\\n\\nIt all started in early 2022, over a casual Friday evening catch-up. Like many great origin stories, this one began with friendly banter between a group of backend engineers and data scientists. As the conversations unfolded, so did the roasting\u2014until one remark hit a little too close to home:\\n\\n*\\"Why are we still crunching data for Monthly Active Users (MAU) when the next day it\u2019s all about Daily Active Users (DAU)?\\"*\\n\\nThe laughter died down, and the question lingered. When we regrouped on Monday\u2014clear-headed and slightly reflective\u2014we decided to dig into the numbers. What they discovered was quite revealing: a large portion of compute resources wasn\u2019t being put to good use.\\nMuch of the system\u2019s effort was spent supporting users who weren\u2019t actively engaging, and even for new users, the experience wasn\u2019t optimized to make a meaningful impact.\\n\\nAt the same time, Meesho had just launched a company-wide initiative to reduce costs\u2014and every team had to contribute. 
This realization sparked the journey that would eventually lead to the **Meesho ML Platform**, known today as **BharatMLStack**.\\n\\n![Alt Text](./old-batch-arch.png)\\n\\nBefore the ML Platform, our recommendation and ranking pipelines followed a batch processing approach:\\n- **Data Ingestion**: The Data Platform team executed ETL jobs to ingest raw user data\u2014including user profiles, interaction logs, and product impressions\u2014into designated S3 buckets.\\n- **Layer 1**: Embedding Generation: On the Data Science side, Spark jobs pulled data from multiple S3 sources, cleaned and preprocessed it, and applied matrix factorization to generate user and item embeddings. The processed data and embeddings were then stored back in S3 in a structured format.\\n- **Layer 2**: Candidate Generation (CG): In this stage, Spark jobs leveraged embeddings and historical interaction data to generate candidate recommendations for users. These candidate lists were subsequently written to S3.\\n- **Layer 3**: Ranking and Merging \u2013 A final round of processing ranked the generated candidates using ML models, combined different candidate lists, and stored the final ranked recommendations in a caching system.\\n- **Serving**: A microservice retrieved ranked recommendations from an in-memory data store via exposed APIs, delivering personalized listings across key surfaces such as \\"For You\\" and Category Landing Pages (CLP).\\n\\nThis approach held up well\u2014until Meesho started seeing a significant surge in traffic.\\n\\n## The Turning Point: From Batch to Real-Time\\n\\nAt this time, the team was iterating on new **Ranker models**, and real-time inference seemed like the next logical step. But Rankers needed **real-time feature retrieval**, which meant an **online feature store** had to be built first.\\n\\nExploring open-source options led to **cost vs. 
performance trade-offs**, but Meesho\u2019s surging traffic meant that **latency and stability were non-negotiable**. After multiple debates and stakeholder discussions, a bold decision was made:\\n\\n*We would build our own feature store.*\\n\\nMeanwhile, efforts began to bring **Candidate Generators (CGs)** to real-time. The challenge? **Storing and retrieving user interactions quickly enough** to power real-time recommendations.\\n\\nAs the team dove deeper, a new roadblock emerged: \\nOur ML jobs were orchestrated using **Airflow DAGs**, giving data scientists flexibility in experimentation. But transitioning to real-time execution threatened this agility. Every change would now require backend engineering support, **slowing down iteration cycles**.\\n\\nThat\u2019s when the idea struck: \\nWe needed a **framework for real-time DAG execution**\u2014one that preserved the same flexibility as Airflow but worked for **streaming data**.\\n\\nThis moment shaped the **next phase of our journey**.\\n\\n## First Generation Design\\n\\n![Alt Text](./first-gen-arch.png)\\n\\n# Laying the Groundwork: The First-Gen ML Platform\\n\\nTo solve these challenges, the team built three foundational components:\\n\\n\\n### 1. IOP Framework: A Real-Time DAG Executor\\n\\n- **Reusable Nodes**: Each DAG node (e.g., an invocation to a CG service, a ranker, or a filter) had to be implemented only once. After that, it could be reused across any workflow by referencing it in config.\\n- **Config-driven Dynamic Graphs**: Execution graphs were defined as adjacency lists stored in **ZooKeeper**, allowing teams to modify the sequence or structure of operations without touching application code.\\n- **Plug-and-play CGs**: The Candidate Generator interface was preserved, so a single CG node could call any CG service by passing `cg_name` in the request. 
This drastically reduced the code surface area and improved maintainability.\\n- **Production-Grade DAGs**: DAGs were designed to execute in **low-latency real-time environments**, with support for **parallel execution, retries, and branching**.\\n\\n[More about IOP DAG](https://www.meesho.io/blog/rebuilding-meeshos-ranking-platform)\\n\\n\\n### 2. Online Feature Store - 0th Version\\n\\n- Used **Cassandra** and **Redis** for low-latency feature serving.\\n- Maintained feature consistency using **Feature Groups** with TTL-based expiry.\\n- A hybrid schema was used: feature keys stored in **ZooKeeper**, data stored in **compact arrays**.\\n\\n\\n### 3. Interaction Store - 0th Version\\n\\n- Captured real-time user interactions like clicks, orders, and add-to-cart events.\\n- Stored event data in **Redis ZSETs (sorted sets)** to enable fast lookups for recommendation engines.\\n- Provided an API to fetch a user\'s **last _k_ interactions** or **interactions within a time window**.\\n\\n\\nWith these components in place, **real-time ML at Meesho became a reality**.\\n\\nThis was just the beginning.\\n\\n## Building the Online Feature Store - 0th Version\\n\\n![Alt text](./online-feature-store-v0.png)\\n\\n### Choosing the Right Tech Stack\\n\\nWe spent considerable time evaluating various databases, caches, and communication protocols for our **online feature store**. 
After carefully weighing **cost, latency, throughput**, and **operational stability**, we settled on a combination of:\\n\\n- **Cassandra** and **Redis** for storage\\n- **gRPC + Proto3** as our communication layer\\n\\n\\n### Streamlining the Data Flow\\n\\nTo keep things simple in the initial version:\\n\\n- **Feature engineering jobs** wrote raw outputs to an **S3 bucket**\\n- A **daily feature push job**:\\n - Read from S3\\n - Grouped related features into **Feature Groups** (ensuring consistency)\\n - Pushed them to **Kafka**\\n\\nFor features requiring frequent updates:\\n\\n- **Ad-hoc jobs** computed features in higher frequency\\n- These jobs pushed to both **Kafka** and **S3** (S3 preserved historical data for future model training)\\n\\n\\n## The Challenges: Data Format and Storage\\n\\nOne of the most critical design challenges was how to store feature data **efficiently and consistently**, especially in databases like **Cassandra** and **Redis**, which come with unique storage constraints.\\n\\nWe had to solve for three key requirements:\\n\\n- ### Feature Consistency\\n When a feature group contains features like `order_count_1h` and `click_count_1h`, both must reflect the **same time window**. Inconsistent updates would lead to **unreliable model predictions**.\\n\\n- ### TTL Granularity\\n Each feature group required an **expiry timestamp**, so that **all features within it expired together**\u2014preserving consistency during reads.\\n\\n- ### Extensibility Across Databases\\n We anticipated that infra needs would evolve. 
To future-proof our system, the data format was designed to be **decoupled from DB-specific layouts**, enabling portability to systems like **ScyllaDB**, **DynamoDB**, **HBase**, or **BigTable**.\\n\\n\\n---\\n\\n## Overcoming Technical Constraints\\nAt the time, we were using Cassandra, which not only imposed a soft limit of 75 columns per row, but also exhibited significant performance degradation as the number of columns increased further, particularly in memory constrained machines. Wide rows caused high memory usage during reads, unpredictable latencies due to heavy deserialization overhead, and inefficiencies during compactions and repairs. This ruled out the naive \\"one column per feature\\" approach. We needed a format that was compact, minimized the number of columns, and remained efficient and portable across different storage systems.\\n\\n## The Solution: Schema Separation\\n\\nWe introduced the concept of Feature Groups\u2014logical groupings of features that must remain consistent with one another.\\nTo represent these groups efficiently, we adopted a layered storage approach:\\n\\n- **Feature Labels (Keys)** were stored in ZooKeeper, serving as the schema.\\n- **Feature Values** were stored as a comma-separated string array in Cassandra or Redis.\\n- **Expiry Timestamp and Schema Version** were appended using a semi-colon delimiter at the end of the string.\\n\\nExample:\\n\\n```bash\\nfeature_1_value,feature_2_value,feature_3_value;expiry_ts\\n```\\n\\nThis format allowed:\\n- Consistent writes and reads at the group level\\n- Easy parsing of feature values using the schema lookup from ZooKeeper\\n- Efficient storage with minimal DB column usage\\n- Support for per-group TTLs and schema evolution\\n\\n## Tracking Changes in Feature Groups\\nFeature groups don\u2019t stay static. As models evolve, features get added, renamed, or removed. 
But schema changes often go live before the data is ready\u2014and stopping ingestion just to wait for everything to align isn\'t feasible.\\n\\n### Common Real-World Scenarios:\\n- A new feature is added to the schema, but ingestion jobs still use the older schema version.\\n- Ongoing writes don\u2019t include the newly added feature, and stopping ingestion would break freshness for existing features.\\n- During serving, models request a mix of old and new features, depending on rollout stages.\\n\\n## The Solution: Schema Versioning\\nWe solved this with versioned feature group schemas, which unlocked several capabilities:\\n- ### Backward Compatibility\\n Older ingestion jobs can continue writing using older schema versions. During reads, the system uses the schema version embedded in the value to interpret the data correctly.\\n- ### Partial Availability Handling \\n During inference, if some features in the request aren\u2019t available (due to rollout delays or missing data), the system serves default values, ensuring the inference call doesn\u2019t fail.\\n- ### Safe Writes Without Pipeline Pauses\\n With schema versioning, we no longer had to stop ingestion pipelines for schema updates. Writes using previous versions can continue safely, and downstream consumers evolve independently.\\nThis design gave us the flexibility to move fast without breaking things\u2014preserving data quality, enabling experimentation, and ensuring reliability at scale.\\n\\n![Alt Text](./schema.png)\\n\\n## Interaction Store - 0th Version\\n\\n![Alt Text](./interaction-store-v0.png)\\n\\nTo power real-time Candidate Generators (CGs), we needed fast access to user behavior signals\u2014like what a user recently clicked, ordered, or added to their cart. 
These interactions form the basis for many real-time recommendations, such as **Similar Products**, **People Also Viewed**, or **Recently Ordered Again**.\\nFor the **0th version** of the Interaction Store, we focused on a design that was **simple, fast, and reliable** \u2014 optimized for high-throughput ingestion and low-latency lookups.\\n\\n## Event Ingestion\\nWe instrumented our backend services to emit key user interaction events to Kafka in real time. These included:\\n- Click\\n- Order\\n- Add to Cart\\n- Wishlist\\n- Share\\n\\nEach event carried essential metadata:\\n- userId \u2014 uniquely identifies the user\\n- productId \u2014 the item being interacted with\\n- timestamp \u2014 the moment the interaction occurred\\n\\nThis decoupled the interaction logging from storage, allowing ingestion and consumption to scale independently.\\n\\n## Storage Design\\nTo store these events, we built Kafka consumers that processed the incoming streams and wrote the data into Redis, using sorted sets (ZSETs) as the primary data structure.\\n\\n### Why Redis?\\nRedis gave us:\\n- **Low-latency** reads and writes\\n- **Time-ordered data** using ZSETs (via score = timestamp)\\n- **Native TTL support**, if needed in later versions\\n- **In-memory performance** \u2014ideal for real-time CGs\\n\\n### Storage Structure\\nEach user\u2019s interactions were stored using a composite key format, uniquely identifying the user and interaction type. 
This structure allowed efficient organization and quick retrieval of recent activity for recommendation generation:\\n\\n```bash\\nuserId_eventType \u2192 ZSET[...(pid, ts)...]\\n```\\n\\nWithin each ZSET:\\n\\n- The **timestamp** served as the score, maintaining temporal order\\n- The **productId** (optionally with metadata) was the **value**\\n\\nThis allowed us to efficiently retrieve the interactions with HTTP-based API server with two query modes:\\n- Fetch the **last k interactions** of a specific type for a given user with `ZREVRANGE(userId_eventType, count)`\\n- Retrieve **all interactions within a time range** (e.g., last 24 hours) with `ZREVRANGEBYSCORE(userId_eventType, timeRange)`\\n\\n### Built-in Guardrails\\nSince Redis was the sole store, we implemented High Availability (HA) to prevent data loss. To optimize memory usage, we also enforced size limits per event type\u2014only storing the last k interactions per user, with older entries getting truncated.\\n\\n## Conclusion: Laying the Foundation for Real-Time ML\\n\\nIn this first phase, we tackled the **fundamentals**\u2014shifting from batch-based recommendations to a **real-time Recommendation** using ML platform that could keep up with Meesho\u2019s growth.\\n\\nWith the **IOP Framework**, **Online Feature Store**, and **Interaction Store**, we built the core infrastructure to support real-time personalization at scale. These wins have already unlocked: \\n- \u2705 Faster, more dynamic recommendations for millions of users. \\n- \u2705 Better infrastructure efficiency, reducing wasted compute power. \\n- \u2705 A flexible, modular system that allows for further experimentation.\\n\\nBut this is just the beginning. 
While we\'ve solved key challenges, **certain roadblocks remain** \u2014from optimizing **cost-performance trade-offs** to **seamlessly evolving schemas**.\\n\\n\\nThis foundational work laid the path for a reliable and scalable **real-time feature serving layer**."}]}}')}}]); \ No newline at end of file diff --git a/docs/assets/js/79ae4ea7.1416ba4f.js b/docs/assets/js/79ae4ea7.1416ba4f.js deleted file mode 100644 index 8a4eb857..00000000 --- a/docs/assets/js/79ae4ea7.1416ba4f.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[4340],{2173:(e,n,i)=>{i.d(n,{A:()=>t});const t=i.p+"assets/images/llm-plat-9ac69c0ffd8c387d177e582611b8c775.png"},4311:(e,n,i)=>{i.r(n),i.d(n,{assets:()=>l,contentTitle:()=>o,default:()=>h,frontMatter:()=>a,metadata:()=>t,toc:()=>c});const t=JSON.parse('{"permalink":"/BharatMLStack/blog/post-three","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-four/index.md","source":"@site/blog/bharatmlstack-history/post-four/index.md","title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","description":"BharatMLStack","date":"2025-03-29T00:00:00.000Z","tags":[{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"vllm","permalink":"/BharatMLStack/blog/tags/vllm"},{"inline":true,"label":"tensorrt-llm","permalink":"/BharatMLStack/blog/tags/tensorrt-llm"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":13.38,"hasTruncateMarker":false,"authors":[{"name":"Jaya Kumar","title":"Lead ML Engineer @ 
Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null}],"frontMatter":{"slug":"post-three","title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","authors":["jaya"],"date":"2025-3-29","tags":["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","permalink":"/BharatMLStack/blog/post-five"},"nextItem":{"title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","permalink":"/BharatMLStack/blog/post-three"}}');var r=i(4848),s=i(8453);const a={slug:"post-three",title:"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving",authors:["jaya"],date:"2025-3-29",tags:["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},o=void 0,l={authorsImageUrls:[void 0]},c=[{value:"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving",id:"designing-a-production-grade-llm-inference-platform-from-model-weights-to-scalable-gpu-serving",level:2},{value:"Why LLM Inference Is not just bigger ML model serving",id:"why-llm-inference-is-not-just-bigger-ml-model-serving",level:2},{value:"Autoregressive Generation and Sequential Computation:",id:"autoregressive-generation-and-sequential-computation",level:3},{value:"Prefill and Decode Phases:",id:"prefill-and-decode-phases",level:3},{value:"Context Management and KV Caching:",id:"context-management-and-kv-caching",level:3},{value:"Dynamic and Irregular Workloads:",id:"dynamic-and-irregular-workloads",level:3},{value:"Streaming and User Experience Constraints:",id:"streaming-and-user-experience-constraints",level:3},{value:"LLMOps: High-Level Architecture",id:"llmops-high-level-architecture",level:2},{value:"Supported Inference backends (TensorRT LLM, Dynamo & 
vLLM)",id:"supported-inference-backends-tensorrt-llm--dynamo--vllm",level:2},{value:"Conclusion",id:"conclusion",level:2},{value:"Future Explorations",id:"future-explorations",level:2}];function d(e){const n={h2:"h2",h3:"h3",img:"img",li:"li",ol:"ol",p:"p",ul:"ul",...(0,s.R)(),...e.components};return(0,r.jsxs)(r.Fragment,{children:[(0,r.jsx)(n.p,{children:(0,r.jsx)(n.img,{alt:"BharatMLStack",src:i(7996).A+"",width:"1396",height:"460"})}),"\n",(0,r.jsx)(n.h2,{id:"designing-a-production-grade-llm-inference-platform-from-model-weights-to-scalable-gpu-serving",children:"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving"}),"\n",(0,r.jsx)(n.p,{children:"Serving large language models in production introduces new challenges across infrastructure, performance optimization, and operational lifecycle management. The LLM Inference Platform addresses these challenges by providing a unified system for deploying and managing open-source and fine-tuned LLMs at scale."}),"\n",(0,r.jsx)(n.p,{children:"The platform implements a complete LLMOps lifecycle \u2014 from model registration and automated compilation to deployment, runtime optimization, and monitoring. Designed as a self-service environment, users can onboard models directly from open repositories such as Hugging Face or upload custom fine-tuned models, and deploy them using a single-click workflow with no manual infrastructure or configuration steps required."}),"\n",(0,r.jsx)(n.p,{children:"In addition to fully automated deployment, the platform allows users to select and apply custom inference optimization techniques \u2014 such as quantization strategies, batching configurations, and runtime-specific performance enhancements \u2014 enabling teams to balance latency, throughput, and cost based on their use case. 
The goal is to reduce operational friction while enabling high-performance, production-grade LLM inference."}),"\n",(0,r.jsx)(n.h2,{id:"why-llm-inference-is-not-just-bigger-ml-model-serving",children:"Why LLM Inference Is not just bigger ML model serving"}),"\n",(0,r.jsx)(n.p,{children:"Large language model (LLM) inference introduces a fundamentally different set of challenges compared to traditional machine learning inference. While classical ML models typically perform a single forward pass to produce a fixed prediction, LLMs operate as autoregressive systems, generating outputs token by token based on previously generated context. This difference dramatically changes how inference systems must be designed, optimized, and scaled."}),"\n",(0,r.jsx)(n.h3,{id:"autoregressive-generation-and-sequential-computation",children:"Autoregressive Generation and Sequential Computation:"}),"\n",(0,r.jsx)(n.p,{children:"Unlike traditional models such as classifiers or recommenders \u2014 where inference cost is relatively constant \u2014 LLMs generate responses incrementally. Each new token depends on all previously generated tokens, making inference inherently sequential and dynamic. This means latency and compute requirements vary significantly depending on prompt length and output size, introducing complexity in scheduling and resource allocation.\nBecause tokens cannot be generated fully in parallel during decoding, GPUs may become underutilized without specialized batching and scheduling strategies. This has led to the development of dedicated LLM inference engines optimized for token-level execution."}),"\n",(0,r.jsx)(n.h3,{id:"prefill-and-decode-phases",children:"Prefill and Decode Phases:"}),"\n",(0,r.jsx)(n.p,{children:"LLM inference typically consists of two distinct stages:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Prefill phase \u2014 the model processes the input prompt and builds internal representations. 
This stage is compute-heavy and highly parallelizable."}),"\n",(0,r.jsx)(n.li,{children:"Decode phase \u2014 the model generates tokens sequentially, predicting one token at a time using previously generated context."}),"\n"]}),"\n",(0,r.jsx)(n.p,{children:"The decode stage often becomes memory-bound rather than compute-bound, which creates new performance bottlenecks compared to traditional ML workloads."}),"\n",(0,r.jsx)(n.h3,{id:"context-management-and-kv-caching",children:"Context Management and KV Caching:"}),"\n",(0,r.jsx)(n.p,{children:"Another fundamental difference lies in how LLMs maintain context. Transformer-based models rely on attention mechanisms that require access to past token representations. To avoid recomputing these representations repeatedly, inference engines use key-value (KV) caching, which stores intermediate activations from previous tokens.\nKV caching significantly improves performance by eliminating redundant computation, but it introduces new challenges:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Memory consumption grows with sequence length and batch size"}),"\n",(0,r.jsx)(n.li,{children:"GPU memory becomes a critical bottleneck"}),"\n",(0,r.jsx)(n.li,{children:"Efficient memory management becomes essential for scaling concurrent requests"}),"\n"]}),"\n",(0,r.jsx)(n.p,{children:"This tradeoff between compute efficiency and memory usage is unique to LLM inference workloads."}),"\n",(0,r.jsx)(n.h3,{id:"dynamic-and-irregular-workloads",children:"Dynamic and Irregular Workloads:"}),"\n",(0,r.jsx)(n.p,{children:"Traditional ML inference typically operates on fixed-size inputs with predictable latency. In contrast, LLM requests vary widely in prompt length, output length, and runtime behavior. 
As a result:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Batch sizes must be dynamic rather than static"}),"\n",(0,r.jsx)(n.li,{children:"Requests may enter and leave batches asynchronously"}),"\n",(0,r.jsx)(n.li,{children:"Scheduling systems must continuously rebalance workloads to maximize GPU utilization"}),"\n"]}),"\n",(0,r.jsx)(n.p,{children:"These characteristics require specialized serving architectures that differ significantly from standard ML serving pipelines."}),"\n",(0,r.jsx)(n.h3,{id:"streaming-and-user-experience-constraints",children:"Streaming and User Experience Constraints:"}),"\n",(0,r.jsx)(n.p,{children:"Another distinguishing factor is the expectation of real-time streaming responses. Instead of returning a single output, LLM systems often stream tokens to users as they are generated.\nBecause of these differences \u2014 sequential generation, growing memory requirements, dynamic workloads, and streaming constraints \u2014 LLM inference cannot be treated as a simple extension of existing ML serving systems. Production platforms must incorporate specialized runtime engines, advanced optimization techniques, and observability tailored specifically to LLM workloads."}),"\n",(0,r.jsx)(n.h2,{id:"llmops-high-level-architecture",children:"LLMOps: High-Level Architecture"}),"\n",(0,r.jsx)(n.p,{children:(0,r.jsx)(n.img,{alt:"LLM Architecture",src:i(2173).A+"",width:"1302",height:"830"})}),"\n",(0,r.jsx)(n.p,{children:"The LLM Inference Framework is designed as a fully automated, end-to-end system for deploying and operating open-source and fine-tuned large language models at scale. 
The architecture abstracts the complexity of model optimization, hardware selection, deployment, and runtime management into a unified workflow that enables users to move from raw model weights to production-ready inference endpoints with minimal manual intervention."}),"\n",(0,r.jsx)(n.p,{children:"Our LLM Inference Framework is architected not just as a serving engine, but as a complete lifecycle management system. As illustrated in the high-level design below, the platform automates the journey of a model through seven distinct stages, ensuring reproducibility, performance, and scalability."}),"\n",(0,r.jsxs)(n.ol,{children:["\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Onboarding & Registration (The Source of Truth)"}),"\n",(0,r.jsx)(n.p,{children:"The lifecycle begins with the Data Scientist or engineer."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Model Ingestion: Users onboard models\u2014whether open-source (Hugging Face, NeMo) or internally fine-tuned\u2014via the Truffle Box SDK/UI."}),"\n",(0,r.jsx)(n.li,{children:'LLM + Prompt Registry: Unlike traditional systems that only track model weights, our registry is a unified control plane. It stores both the Model Artifacts and the Prompt Templates. 
This allows Data Scientists to register and version-control prompts (e.g., "customer_support_v2") independently of the application code.'}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:'The "Black Box" Build Engine'}),"\n",(0,r.jsx)(n.p,{children:"Once a model is registered, the Automated LLM Compiler + Quantizer Module kicks off a background job on ephemeral GPU resources."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Transformation: The raw model is converted into a TRT-LLM Checkpoint."}),"\n",(0,r.jsx)(n.li,{children:"Quantization: The system automatically applies quantization algorithms (like INT4 AWQ or FP8) to reduce memory footprint."}),"\n",(0,r.jsx)(n.li,{children:"Engine Building: Finally, it compiles a highly optimized TRT Engine specifically tuned for the target hardware."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Intelligent Profiling & Validation"}),"\n",(0,r.jsx)(n.p,{children:"Before deployment, the new engine passes through the Hardware & Inference Runtime Profiler."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Benchmarking: This module empirically tests the engine against various hardware configurations (L4 vs. A100) and runtimes (TRT-LLM vs. 
vLLM)."}),"\n",(0,r.jsx)(n.li,{children:"Optimization: It recommends the optimal configuration that meets latency SLAs (Time-To-First-Token) while minimizing cost."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Smart Artifact Generation & Distribution"}),"\n",(0,r.jsx)(n.p,{children:'To solve the Kubernetes "Cold Start" problem, the LLM Serving Artifacts Generation module packages the model using a bifurcated strategy:'}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Standard Models: Artifacts are uploaded to Cloud Storage (GCS) and downloaded by pods at startup."}),"\n",(0,r.jsx)(n.li,{children:"Very Large Models: For massive models (>8GB) where network downloads are too slow, the system pre-caches the model onto Secondary Boot Disks. These disks are attached directly to new GPU nodes during autoscaling, eliminating download wait times."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Image Streaming & Deployment"}),"\n",(0,r.jsx)(n.p,{children:"Simultaneously, the inference runtime container images are pulled from the Artifact Registry."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Image Streaming: We utilize container image streaming to allow pods to start initializing while the massive Triton/Dynamo container layers are still downloading, further shaving seconds off the startup time. 
link"}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"The Inference Runtime (Kubernetes)"}),"\n",(0,r.jsx)(n.p,{children:"The workload lands on Kubernetes with Autoscaling."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Dynamic Backends: Depending on the profile generated in Stage 3, the pod initializes either TensorRT-LLM (for throughput) or vLLM (for flexibility), or spins up a Dynamo worker for distributed inference."}),"\n",(0,r.jsx)(n.li,{children:'Data Loading: The pod either downloads the model from Cloud Storage or mounts the pre-warmed Secondary Boot Disk ("Pull from Disk").'}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Client Interaction & Observability"}),"\n",(0,r.jsx)(n.p,{children:"Finally, the LLM Inference Client executes the request."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Prompt Injection: The client pulls the specific prompt template ID from the Registry, ensuring the exact versioned instructions are used."}),"\n",(0,r.jsx)(n.li,{children:"Streaming Response: The request is sent via gRPC, and tokens are streamed back to the user in real-time."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Observability: Monitoring the Pulse of GenAI"}),"\n",(0,r.jsx)(n.p,{children:"In traditional microservices, success is measured by CPU utilization and request latency (p99). For Large Language Models, these metrics are insufficient. 
A user doesn't care if the GPU is at 80% utilization; they care about how fast the first word appears and how smoothly the rest of the sentence follows."}),"\n",(0,r.jsx)(n.p,{children:"To capture the true user experience, our platform instrumentation focuses on three critical LLM-specific metrics:"}),"\n",(0,r.jsxs)(n.ol,{children:["\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Time to First Token (TTFT)"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Definition: TTFT measures the time elapsed from the moment a request is received until the very first token is generated and streamed back to the user."}),"\n",(0,r.jsx)(n.li,{children:'Why it matters: This represents the "Prefill Phase" latency\u2014the time the model takes to process the input prompt and load weights. A high TTFT makes the application feel unresponsive or "hung."'}),"\n",(0,r.jsx)(n.li,{children:"Optimization: We closely monitor TTFT to ensure our Prefix Caching is effective (aiming for high cache hitrates), which drastically lowers this metric by skipping redundant prompt processing."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Inter-Token Latency (ITL)"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:'Definition: ITL measures the average time interval between the generation of consecutive tokens during the "Decode Phase".'}),"\n",(0,r.jsx)(n.li,{children:'Why it matters: This defines the "perceived speed" of reading. Even if the first token is fast (low TTFT), high ITL makes the text generation look "jerky" or slow to the user.'}),"\n",(0,r.jsx)(n.li,{children:"Benchmarks: In our testing with Llama 3.1, we track p99 ITL to ensure it stays below human reading speeds to maintain a natural conversational flow."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Token Throughput vs. 
Request Throughput"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"We distinguish between two types of throughput to balance system efficiency with user load:"}),"\n",(0,r.jsx)(n.li,{children:"Token Throughput (tokens/sec): The total number of tokens generated across all concurrent requests. This measures the raw compute efficiency of the GPU and the effectiveness of batching."}),"\n",(0,r.jsx)(n.li,{children:"Request Throughput (req/sec): The number of distinct user queries served per second. We use this to determine autoscaling thresholds, ensuring we scale out before the queue depth impacts ITL."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"The Monitoring Stack"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:'Real-time Dashboards: We utilize Grafana to visualize these streaming metrics in real-time, allowing on-call engineers to spot "slow generation" incidents that generic "500 error" alerts would miss.'}),"\n",(0,r.jsx)(n.li,{children:'Request Tracing: Since Triton Inference Server does not log request payloads by default, we integrate a Helix Client to asynchronously publish request logs to Log Tables. This allows us to trace a specific "slow" request back to its prompt to understand if a complex input caused the latency spike.'}),"\n"]}),"\n"]}),"\n"]}),"\n"]}),"\n"]}),"\n",(0,r.jsx)(n.h2,{id:"supported-inference-backends-tensorrt-llm--dynamo--vllm",children:"Supported Inference backends (TensorRT LLM, Dynamo & vLLM)"}),"\n",(0,r.jsx)(n.p,{children:'Tailored for the Use Case: We do not believe in a "one-size-fits-all" approach to inference. Different use cases\u2014whether a real-time voice bot requiring ultra-lowsub-second latency or a massive reasoning task requiring huge context windows\u2014demand different runtime characteristics. 
Our platform is designed to be runtime-agnostic, allowing us to automatically select and tailor the best engine based on the specific requirements of the application:'}),"\n",(0,r.jsxs)(n.ol,{children:["\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"TensorRT-LLM: The High-Performance Standard"}),"\n",(0,r.jsx)(n.p,{children:"Suitable for: High-throughput production workloads where latency is critical (e.g., customer support chat, real-time voice bots)."}),"\n",(0,r.jsx)(n.p,{children:"TensorRT-LLM serves as our default backend for these scenarios. Our internal benchmarks on Llama 3.1 and 3.2 models demonstrated that a tuned TensorRT-LLM engine significantly outperforms standard runtimes, especially when utilizing INT4 AWQ and FP8 quantization ."}),"\n",(0,r.jsx)(n.p,{children:"Key optimizations we tailor for these high-load cases include:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Optimized execution via TensorRT engine compilation"}),"\n",(0,r.jsx)(n.li,{children:"Quantization-aware execution for reduced memory usage and improved throughput"}),"\n",(0,r.jsx)(n.li,{children:"Inflight Batching: Allowing requests to be processed continuously without waiting for the entire batch to finish, drastically improving GPU utilization ."}),"\n",(0,r.jsx)(n.li,{children:"Custom Plugins: Enabling specific NVIDIA plugins like the GEMM plugin and GPT Attention plugin to accelerate matrix multiplications and attention mechanisms ."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Dynamo: Distributed Inference for Reasoning Models"}),"\n",(0,r.jsx)(n.p,{children:'Suitable for: Very large "reasoning" models (70B+) or scenarios requiring massive context windows where a single GPU\'s memory is insufficient.'}),"\n",(0,r.jsx)(n.p,{children:"For these memory-bound tasks, we utilize Dynamo, a low-latency distributed inference framework . 
Unlike monolithic servers, Dynamo disaggregates the inference process to scale resources horizontally:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"KV Aware Routing: A specialized router directs requests to workers that already hold the relevant Key-Value (KV) cache, minimizing redundant computation ."}),"\n",(0,r.jsx)(n.li,{children:'Prefill vs. Decode Split: The workload is divided into Prefill Workers (processing the prompt) and Decode Workers (generating tokens), allowing us to scale the compute-heavy "reading" phase independently from the memory-heavy "writing" phase .'}),"\n",(0,r.jsx)(n.li,{children:"Distributed execution across multiple GPU resources"}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"vLLM: The Flexible Baseline"}),"\n",(0,r.jsx)(n.p,{children:"Suitable for: Rapid prototyping, testing new model architectures, or low-traffic internal tools where ease of deployment outweighs raw throughput."}),"\n",(0,r.jsx)(n.p,{children:"While TensorRT-LLM is optimized for maximum speed, vLLM provides a robust and flexible baseline ."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"High throughput through dynamic batching and efficient memory utilization"}),"\n",(0,r.jsx)(n.li,{children:"Paged KV cache management for handling long contexts and concurrent requests"}),"\n",(0,r.jsx)(n.li,{children:"Strong support for open-source model ecosystems"}),"\n",(0,r.jsx)(n.li,{children:"Rapid Adoption: It allows us to onboard new model architectures immediately without waiting for a custom TensorRT build."}),"\n",(0,r.jsx)(n.li,{children:"Benchmarking Insight: In our internal tests, vLLM provided a strong baseline but often lacked the specific max-token optimizations present in our custom TRT engines . 
We use it strategically for initial testing before committing to a full TensorRT optimization pipeline."}),"\n"]}),"\n"]}),"\n"]}),"\n",(0,r.jsx)(n.h2,{id:"conclusion",children:"Conclusion"}),"\n",(0,r.jsx)(n.p,{children:"Large language model inference introduces a fundamentally new class of infrastructure challenges\u2014where performance is governed not just by raw compute, but by memory efficiency, intelligent scheduling, runtime specialization, and lifecycle automation. Unlike traditional ML serving, LLM inference requires systems that understand token-level execution, manage rapidly growing context state, and continuously balance latency, throughput, and cost under highly dynamic workloads."}),"\n",(0,r.jsx)(n.p,{children:"The LLM Inference Framework addresses these challenges by transforming inference into a fully automated, reproducible lifecycle\u2014from model onboarding and compilation to deployment, optimization, and observability. By integrating automated quantization and engine compilation, intelligent runtime selection, cold-start mitigation strategies, and LLM-specific observability metrics such as Time-to-First-Token and Inter-Token Latency, the platform ensures both high performance and operational simplicity."}),"\n",(0,r.jsx)(n.p,{children:"Equally important, the framework is designed with flexibility and future evolution in mind. Its runtime-agnostic architecture enables seamless adoption of emerging inference engines, hardware accelerators, and optimization techniques without requiring platform redesign. This ensures that teams can continuously leverage advancements in the rapidly evolving LLM ecosystem while maintaining consistent operational workflows."}),"\n",(0,r.jsx)(n.p,{children:"Ultimately, the goal of the platform is to make production-scale LLM deployment as seamless and reliable as traditional software deployment\u2014allowing teams to focus on building intelligent applications rather than managing infrastructure complexity. 
By combining lifecycle automation, runtime optimization, and deep observability, the LLM Inference Framework provides a scalable foundation for delivering fast, cost-efficient, and production-ready LLM experiences."}),"\n",(0,r.jsx)(n.h2,{id:"future-explorations",children:"Future Explorations"}),"\n",(0,r.jsx)(n.p,{children:"While we have achieved significant milestones in latency and throughput, the landscape of GenAI is evolving rapidly. Our roadmap focuses on increasing flexibility, reducing costs, and enhancing reliability for enterprise-grade workloads. Here is what we are building next:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"TPU Support: To diversify our hardware supply chain and further optimize cost-per-token, we are evaluating Google Cloud TPUs to bake it into our platform. By leveraging the JAX and PyTorch/XLA ecosystems, we aim to unlock the massive throughput potential of TPU v5e chips, particularly for our open-source Llama models. This will allow the hardware profiler to dynamically choose between NVIDIA GPUs and Google TPUs based on real-time availability and price-performance metrics."}),"\n",(0,r.jsx)(n.li,{children:'Multi-LoRA Serving (Serverless Experience): Currently, deploying a fine-tuned model requires a dedicated GPU. We are building support for Multi-LoRA serving, which will allow us to serve hundreds of unique, fine-tuned adapters on top of a single frozen base model. This will drastically reduce costs for multi-tenant applications, enabling a "serverless" experience where specific fine-tunes are hot-swapped instantly per request.'}),"\n",(0,r.jsx)(n.li,{children:"Spot Instance Orchestration: To further optimize cloud costs, we are developing fault-tolerant mechanisms to run inference workloads on Spot Instances. 
By implementing aggressive checkpointing and seamless request draining, we aim to leverage cheaper, preemptible compute capacity without interrupting the user's streaming experience."}),"\n",(0,r.jsx)(n.li,{children:'Semantic Caching Layer: We plan to move beyond standard Prefix Caching to implement Semantic Caching. By using a vector database to fetch responses for semantically similar queries (e.g., "How do I reset my password?" vs. "Password reset steps"), we can bypass the GPU entirely for repetitive queries, reducing latency to near-zero.'}),"\n",(0,r.jsx)(n.li,{children:"Context-Aware Autoscaling: Standard CPU/GPU utilization metrics are often insufficient signals for scaling LLMs. We are working on KV-cache pressure metrics for autoscaling. This ensures that we scale out before the memory fills up, preventing eviction-based slowdowns during traffic spikes."}),"\n",(0,r.jsx)(n.li,{children:'Online Evaluation & Guardrails: We are integrating a lightweight "Trust Layer" into the proxy. 
This will allow for low-latency input/output filtering (Guardrails) and asynchronous "LLM-as-a-Judge" evaluation pipelines to monitor response quality in production, not just system health.'}),"\n"]})]})}function h(e={}){const{wrapper:n}={...(0,s.R)(),...e.components};return n?(0,r.jsx)(n,{...e,children:(0,r.jsx)(d,{...e})}):d(e)}},7996:(e,n,i)=>{i.d(n,{A:()=>t});const t=i.p+"assets/images/bms-7399e8796d2cd24617c432518ce3f312.png"},8453:(e,n,i)=>{i.d(n,{R:()=>a,x:()=>o});var t=i(6540);const r={},s=t.createContext(r);function a(e){const n=t.useContext(s);return t.useMemo(function(){return"function"==typeof e?e(n):{...n,...e}},[n,e])}function o(e){let n;return n=e.disableParentContext?"function"==typeof e.components?e.components(r):e.components||r:a(e.components),t.createElement(s.Provider,{value:n},e.children)}}}]); \ No newline at end of file diff --git a/docs/assets/js/79ae4ea7.1af179c5.js b/docs/assets/js/79ae4ea7.1af179c5.js new file mode 100644 index 00000000..35991085 --- /dev/null +++ b/docs/assets/js/79ae4ea7.1af179c5.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[4340],{2173:(e,n,i)=>{i.d(n,{A:()=>t});const t=i.p+"assets/images/llm-plat-9ac69c0ffd8c387d177e582611b8c775.png"},2233:e=>{e.exports=JSON.parse('{"permalink":"/BharatMLStack/blog/post-four","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-four/index.md","source":"@site/blog/bharatmlstack-history/post-four/index.md","title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU 
Serving","description":"BharatMLStack","date":"2025-03-29T00:00:00.000Z","tags":[{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"vllm","permalink":"/BharatMLStack/blog/tags/vllm"},{"inline":true,"label":"tensorrt-llm","permalink":"/BharatMLStack/blog/tags/tensorrt-llm"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":13.38,"hasTruncateMarker":false,"authors":[{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null}],"frontMatter":{"slug":"post-four","title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","authors":["jaya"],"date":"2025-3-29","tags":["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","permalink":"/BharatMLStack/blog/post-five"},"nextItem":{"title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","permalink":"/BharatMLStack/blog/post-three"}}')},2305:(e,n,i)=>{i.r(n),i.d(n,{assets:()=>l,contentTitle:()=>o,default:()=>h,frontMatter:()=>a,metadata:()=>t,toc:()=>c});var t=i(2233),r=i(4848),s=i(8453);const a={slug:"post-four",title:"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving",authors:["jaya"],date:"2025-3-29",tags:["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},o=void 0,l={authorsImageUrls:[void 0]},c=[{value:"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU 
Serving",id:"designing-a-production-grade-llm-inference-platform-from-model-weights-to-scalable-gpu-serving",level:2},{value:"Why LLM Inference Is not just bigger ML model serving",id:"why-llm-inference-is-not-just-bigger-ml-model-serving",level:2},{value:"Autoregressive Generation and Sequential Computation:",id:"autoregressive-generation-and-sequential-computation",level:3},{value:"Prefill and Decode Phases:",id:"prefill-and-decode-phases",level:3},{value:"Context Management and KV Caching:",id:"context-management-and-kv-caching",level:3},{value:"Dynamic and Irregular Workloads:",id:"dynamic-and-irregular-workloads",level:3},{value:"Streaming and User Experience Constraints:",id:"streaming-and-user-experience-constraints",level:3},{value:"LLMOps: High-Level Architecture",id:"llmops-high-level-architecture",level:2},{value:"Supported Inference backends (TensorRT LLM, Dynamo & vLLM)",id:"supported-inference-backends-tensorrt-llm--dynamo--vllm",level:2},{value:"Conclusion",id:"conclusion",level:2},{value:"Future Explorations",id:"future-explorations",level:2}];function d(e){const n={h2:"h2",h3:"h3",img:"img",li:"li",ol:"ol",p:"p",ul:"ul",...(0,s.R)(),...e.components};return(0,r.jsxs)(r.Fragment,{children:[(0,r.jsx)(n.p,{children:(0,r.jsx)(n.img,{alt:"BharatMLStack",src:i(7996).A+"",width:"1396",height:"460"})}),"\n",(0,r.jsx)(n.h2,{id:"designing-a-production-grade-llm-inference-platform-from-model-weights-to-scalable-gpu-serving",children:"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving"}),"\n",(0,r.jsx)(n.p,{children:"Serving large language models in production introduces new challenges across infrastructure, performance optimization, and operational lifecycle management. 
The LLM Inference Platform addresses these challenges by providing a unified system for deploying and managing open-source and fine-tuned LLMs at scale."}),"\n",(0,r.jsx)(n.p,{children:"The platform implements a complete LLMOps lifecycle \u2014 from model registration and automated compilation to deployment, runtime optimization, and monitoring. Designed as a self-service environment, users can onboard models directly from open repositories such as Hugging Face or upload custom fine-tuned models, and deploy them using a single-click workflow with no manual infrastructure or configuration steps required."}),"\n",(0,r.jsx)(n.p,{children:"In addition to fully automated deployment, the platform allows users to select and apply custom inference optimization techniques \u2014 such as quantization strategies, batching configurations, and runtime-specific performance enhancements \u2014 enabling teams to balance latency, throughput, and cost based on their use case. The goal is to reduce operational friction while enabling high-performance, production-grade LLM inference."}),"\n",(0,r.jsx)(n.h2,{id:"why-llm-inference-is-not-just-bigger-ml-model-serving",children:"Why LLM Inference Is not just bigger ML model serving"}),"\n",(0,r.jsx)(n.p,{children:"Large language model (LLM) inference introduces a fundamentally different set of challenges compared to traditional machine learning inference. While classical ML models typically perform a single forward pass to produce a fixed prediction, LLMs operate as autoregressive systems, generating outputs token by token based on previously generated context. 
This difference dramatically changes how inference systems must be designed, optimized, and scaled."}),"\n",(0,r.jsx)(n.h3,{id:"autoregressive-generation-and-sequential-computation",children:"Autoregressive Generation and Sequential Computation:"}),"\n",(0,r.jsx)(n.p,{children:"Unlike traditional models such as classifiers or recommenders \u2014 where inference cost is relatively constant \u2014 LLMs generate responses incrementally. Each new token depends on all previously generated tokens, making inference inherently sequential and dynamic. This means latency and compute requirements vary significantly depending on prompt length and output size, introducing complexity in scheduling and resource allocation.\nBecause tokens cannot be generated fully in parallel during decoding, GPUs may become underutilized without specialized batching and scheduling strategies. This has led to the development of dedicated LLM inference engines optimized for token-level execution."}),"\n",(0,r.jsx)(n.h3,{id:"prefill-and-decode-phases",children:"Prefill and Decode Phases:"}),"\n",(0,r.jsx)(n.p,{children:"LLM inference typically consists of two distinct stages:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Prefill phase \u2014 the model processes the input prompt and builds internal representations. This stage is compute-heavy and highly parallelizable."}),"\n",(0,r.jsx)(n.li,{children:"Decode phase \u2014 the model generates tokens sequentially, predicting one token at a time using previously generated context."}),"\n"]}),"\n",(0,r.jsx)(n.p,{children:"The decode stage often becomes memory-bound rather than compute-bound, which creates new performance bottlenecks compared to traditional ML workloads."}),"\n",(0,r.jsx)(n.h3,{id:"context-management-and-kv-caching",children:"Context Management and KV Caching:"}),"\n",(0,r.jsx)(n.p,{children:"Another fundamental difference lies in how LLMs maintain context. 
Transformer-based models rely on attention mechanisms that require access to past token representations. To avoid recomputing these representations repeatedly, inference engines use key-value (KV) caching, which stores intermediate activations from previous tokens.\nKV caching significantly improves performance by eliminating redundant computation, but it introduces new challenges:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Memory consumption grows with sequence length and batch size"}),"\n",(0,r.jsx)(n.li,{children:"GPU memory becomes a critical bottleneck"}),"\n",(0,r.jsx)(n.li,{children:"Efficient memory management becomes essential for scaling concurrent requests"}),"\n"]}),"\n",(0,r.jsx)(n.p,{children:"This tradeoff between compute efficiency and memory usage is unique to LLM inference workloads."}),"\n",(0,r.jsx)(n.h3,{id:"dynamic-and-irregular-workloads",children:"Dynamic and Irregular Workloads:"}),"\n",(0,r.jsx)(n.p,{children:"Traditional ML inference typically operates on fixed-size inputs with predictable latency. In contrast, LLM requests vary widely in prompt length, output length, and runtime behavior. As a result:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Batch sizes must be dynamic rather than static"}),"\n",(0,r.jsx)(n.li,{children:"Requests may enter and leave batches asynchronously"}),"\n",(0,r.jsx)(n.li,{children:"Scheduling systems must continuously rebalance workloads to maximize GPU utilization"}),"\n"]}),"\n",(0,r.jsx)(n.p,{children:"These characteristics require specialized serving architectures that differ significantly from standard ML serving pipelines."}),"\n",(0,r.jsx)(n.h3,{id:"streaming-and-user-experience-constraints",children:"Streaming and User Experience Constraints:"}),"\n",(0,r.jsx)(n.p,{children:"Another distinguishing factor is the expectation of real-time streaming responses. 
Instead of returning a single output, LLM systems often stream tokens to users as they are generated.\nBecause of these differences \u2014 sequential generation, growing memory requirements, dynamic workloads, and streaming constraints \u2014 LLM inference cannot be treated as a simple extension of existing ML serving systems. Production platforms must incorporate specialized runtime engines, advanced optimization techniques, and observability tailored specifically to LLM workloads."}),"\n",(0,r.jsx)(n.h2,{id:"llmops-high-level-architecture",children:"LLMOps: High-Level Architecture"}),"\n",(0,r.jsx)(n.p,{children:(0,r.jsx)(n.img,{alt:"LLM Architecture",src:i(2173).A+"",width:"1302",height:"830"})}),"\n",(0,r.jsx)(n.p,{children:"The LLM Inference Framework is designed as a fully automated, end-to-end system for deploying and operating open-source and fine-tuned large language models at scale. The architecture abstracts the complexity of model optimization, hardware selection, deployment, and runtime management into a unified workflow that enables users to move from raw model weights to production-ready inference endpoints with minimal manual intervention."}),"\n",(0,r.jsx)(n.p,{children:"Our LLM Inference Framework is architected not just as a serving engine, but as a complete lifecycle management system. 
As illustrated in the high-level design below, the platform automates the journey of a model through seven distinct stages, ensuring reproducibility, performance, and scalability."}),"\n",(0,r.jsxs)(n.ol,{children:["\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Onboarding & Registration (The Source of Truth)"}),"\n",(0,r.jsx)(n.p,{children:"The lifecycle begins with the Data Scientist or engineer."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Model Ingestion: Users onboard models\u2014whether open-source (Hugging Face, NeMo) or internally fine-tuned\u2014via the Truffle Box SDK/UI."}),"\n",(0,r.jsx)(n.li,{children:'LLM + Prompt Registry: Unlike traditional systems that only track model weights, our registry is a unified control plane. It stores both the Model Artifacts and the Prompt Templates. This allows Data Scientists to register and version-control prompts (e.g., "customer_support_v2") independently of the application code.'}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:'The "Black Box" Build Engine'}),"\n",(0,r.jsx)(n.p,{children:"Once a model is registered, the Automated LLM Compiler + Quantizer Module kicks off a background job on ephemeral GPU resources."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Transformation: The raw model is converted into a TRT-LLM Checkpoint."}),"\n",(0,r.jsx)(n.li,{children:"Quantization: The system automatically applies quantization algorithms (like INT4 AWQ or FP8) to reduce memory footprint."}),"\n",(0,r.jsx)(n.li,{children:"Engine Building: Finally, it compiles a highly optimized TRT Engine specifically tuned for the target hardware."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Intelligent Profiling & Validation"}),"\n",(0,r.jsx)(n.p,{children:"Before deployment, the new engine passes through the Hardware & Inference Runtime 
Profiler."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Benchmarking: This module empirically tests the engine against various hardware configurations (L4 vs. A100) and runtimes (TRT-LLM vs. vLLM)."}),"\n",(0,r.jsx)(n.li,{children:"Optimization: It recommends the optimal configuration that meets latency SLAs (Time-To-First-Token) while minimizing cost."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Smart Artifact Generation & Distribution"}),"\n",(0,r.jsx)(n.p,{children:'To solve the Kubernetes "Cold Start" problem, the LLM Serving Artifacts Generation module packages the model using a bifurcated strategy:'}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Standard Models: Artifacts are uploaded to Cloud Storage (GCS) and downloaded by pods at startup."}),"\n",(0,r.jsx)(n.li,{children:"Very Large Models: For massive models (>8GB) where network downloads are too slow, the system pre-caches the model onto Secondary Boot Disks. These disks are attached directly to new GPU nodes during autoscaling, eliminating download wait times."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Image Streaming & Deployment"}),"\n",(0,r.jsx)(n.p,{children:"Simultaneously, the inference runtime container images are pulled from the Artifact Registry."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Image Streaming: We utilize container image streaming to allow pods to start initializing while the massive Triton/Dynamo container layers are still downloading, further shaving seconds off the startup time. 
link"}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"The Inference Runtime (Kubernetes)"}),"\n",(0,r.jsx)(n.p,{children:"The workload lands on Kubernetes with Autoscaling."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Dynamic Backends: Depending on the profile generated in Stage 3, the pod initializes either TensorRT-LLM (for throughput) or vLLM (for flexibility), or spins up a Dynamo worker for distributed inference."}),"\n",(0,r.jsx)(n.li,{children:'Data Loading: The pod either downloads the model from Cloud Storage or mounts the pre-warmed Secondary Boot Disk ("Pull from Disk").'}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Client Interaction & Observability"}),"\n",(0,r.jsx)(n.p,{children:"Finally, the LLM Inference Client executes the request."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Prompt Injection: The client pulls the specific prompt template ID from the Registry, ensuring the exact versioned instructions are used."}),"\n",(0,r.jsx)(n.li,{children:"Streaming Response: The request is sent via gRPC, and tokens are streamed back to the user in real-time."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Observability: Monitoring the Pulse of GenAI"}),"\n",(0,r.jsx)(n.p,{children:"In traditional microservices, success is measured by CPU utilization and request latency (p99). For Large Language Models, these metrics are insufficient. 
A user doesn't care if the GPU is at 80% utilization; they care about how fast the first word appears and how smoothly the rest of the sentence follows."}),"\n",(0,r.jsx)(n.p,{children:"To capture the true user experience, our platform instrumentation focuses on three critical LLM-specific metrics:"}),"\n",(0,r.jsxs)(n.ol,{children:["\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Time to First Token (TTFT)"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Definition: TTFT measures the time elapsed from the moment a request is received until the very first token is generated and streamed back to the user."}),"\n",(0,r.jsx)(n.li,{children:'Why it matters: This represents the "Prefill Phase" latency\u2014the time the model takes to process the input prompt and load weights. A high TTFT makes the application feel unresponsive or "hung."'}),"\n",(0,r.jsx)(n.li,{children:"Optimization: We closely monitor TTFT to ensure our Prefix Caching is effective (aiming for high cache hitrates), which drastically lowers this metric by skipping redundant prompt processing."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Inter-Token Latency (ITL)"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:'Definition: ITL measures the average time interval between the generation of consecutive tokens during the "Decode Phase".'}),"\n",(0,r.jsx)(n.li,{children:'Why it matters: This defines the "perceived speed" of reading. Even if the first token is fast (low TTFT), high ITL makes the text generation look "jerky" or slow to the user.'}),"\n",(0,r.jsx)(n.li,{children:"Benchmarks: In our testing with Llama 3.1, we track p99 ITL to ensure it stays below human reading speeds to maintain a natural conversational flow."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Token Throughput vs. 
Request Throughput"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"We distinguish between two types of throughput to balance system efficiency with user load:"}),"\n",(0,r.jsx)(n.li,{children:"Token Throughput (tokens/sec): The total number of tokens generated across all concurrent requests. This measures the raw compute efficiency of the GPU and the effectiveness of batching."}),"\n",(0,r.jsx)(n.li,{children:"Request Throughput (req/sec): The number of distinct user queries served per second. We use this to determine autoscaling thresholds, ensuring we scale out before the queue depth impacts ITL."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"The Monitoring Stack"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:'Real-time Dashboards: We utilize Grafana to visualize these streaming metrics in real-time, allowing on-call engineers to spot "slow generation" incidents that generic "500 error" alerts would miss.'}),"\n",(0,r.jsx)(n.li,{children:'Request Tracing: Since Triton Inference Server does not log request payloads by default, we integrate a Helix Client to asynchronously publish request logs to Log Tables. This allows us to trace a specific "slow" request back to its prompt to understand if a complex input caused the latency spike.'}),"\n"]}),"\n"]}),"\n"]}),"\n"]}),"\n"]}),"\n",(0,r.jsx)(n.h2,{id:"supported-inference-backends-tensorrt-llm--dynamo--vllm",children:"Supported Inference backends (TensorRT LLM, Dynamo & vLLM)"}),"\n",(0,r.jsx)(n.p,{children:'Tailored for the Use Case: We do not believe in a "one-size-fits-all" approach to inference. Different use cases\u2014whether a real-time voice bot requiring ultra-lowsub-second latency or a massive reasoning task requiring huge context windows\u2014demand different runtime characteristics. 
Our platform is designed to be runtime-agnostic, allowing us to automatically select and tailor the best engine based on the specific requirements of the application:'}),"\n",(0,r.jsxs)(n.ol,{children:["\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"TensorRT-LLM: The High-Performance Standard"}),"\n",(0,r.jsx)(n.p,{children:"Suitable for: High-throughput production workloads where latency is critical (e.g., customer support chat, real-time voice bots)."}),"\n",(0,r.jsx)(n.p,{children:"TensorRT-LLM serves as our default backend for these scenarios. Our internal benchmarks on Llama 3.1 and 3.2 models demonstrated that a tuned TensorRT-LLM engine significantly outperforms standard runtimes, especially when utilizing INT4 AWQ and FP8 quantization ."}),"\n",(0,r.jsx)(n.p,{children:"Key optimizations we tailor for these high-load cases include:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Optimized execution via TensorRT engine compilation"}),"\n",(0,r.jsx)(n.li,{children:"Quantization-aware execution for reduced memory usage and improved throughput"}),"\n",(0,r.jsx)(n.li,{children:"Inflight Batching: Allowing requests to be processed continuously without waiting for the entire batch to finish, drastically improving GPU utilization ."}),"\n",(0,r.jsx)(n.li,{children:"Custom Plugins: Enabling specific NVIDIA plugins like the GEMM plugin and GPT Attention plugin to accelerate matrix multiplications and attention mechanisms ."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Dynamo: Distributed Inference for Reasoning Models"}),"\n",(0,r.jsx)(n.p,{children:'Suitable for: Very large "reasoning" models (70B+) or scenarios requiring massive context windows where a single GPU\'s memory is insufficient.'}),"\n",(0,r.jsx)(n.p,{children:"For these memory-bound tasks, we utilize Dynamo, a low-latency distributed inference framework . 
Unlike monolithic servers, Dynamo disaggregates the inference process to scale resources horizontally:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"KV Aware Routing: A specialized router directs requests to workers that already hold the relevant Key-Value (KV) cache, minimizing redundant computation ."}),"\n",(0,r.jsx)(n.li,{children:'Prefill vs. Decode Split: The workload is divided into Prefill Workers (processing the prompt) and Decode Workers (generating tokens), allowing us to scale the compute-heavy "reading" phase independently from the memory-heavy "writing" phase .'}),"\n",(0,r.jsx)(n.li,{children:"Distributed execution across multiple GPU resources"}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"vLLM: The Flexible Baseline"}),"\n",(0,r.jsx)(n.p,{children:"Suitable for: Rapid prototyping, testing new model architectures, or low-traffic internal tools where ease of deployment outweighs raw throughput."}),"\n",(0,r.jsx)(n.p,{children:"While TensorRT-LLM is optimized for maximum speed, vLLM provides a robust and flexible baseline ."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"High throughput through dynamic batching and efficient memory utilization"}),"\n",(0,r.jsx)(n.li,{children:"Paged KV cache management for handling long contexts and concurrent requests"}),"\n",(0,r.jsx)(n.li,{children:"Strong support for open-source model ecosystems"}),"\n",(0,r.jsx)(n.li,{children:"Rapid Adoption: It allows us to onboard new model architectures immediately without waiting for a custom TensorRT build."}),"\n",(0,r.jsx)(n.li,{children:"Benchmarking Insight: In our internal tests, vLLM provided a strong baseline but often lacked the specific max-token optimizations present in our custom TRT engines . 
We use it strategically for initial testing before committing to a full TensorRT optimization pipeline."}),"\n"]}),"\n"]}),"\n"]}),"\n",(0,r.jsx)(n.h2,{id:"conclusion",children:"Conclusion"}),"\n",(0,r.jsx)(n.p,{children:"Large language model inference introduces a fundamentally new class of infrastructure challenges\u2014where performance is governed not just by raw compute, but by memory efficiency, intelligent scheduling, runtime specialization, and lifecycle automation. Unlike traditional ML serving, LLM inference requires systems that understand token-level execution, manage rapidly growing context state, and continuously balance latency, throughput, and cost under highly dynamic workloads."}),"\n",(0,r.jsx)(n.p,{children:"The LLM Inference Framework addresses these challenges by transforming inference into a fully automated, reproducible lifecycle\u2014from model onboarding and compilation to deployment, optimization, and observability. By integrating automated quantization and engine compilation, intelligent runtime selection, cold-start mitigation strategies, and LLM-specific observability metrics such as Time-to-First-Token and Inter-Token Latency, the platform ensures both high performance and operational simplicity."}),"\n",(0,r.jsx)(n.p,{children:"Equally important, the framework is designed with flexibility and future evolution in mind. Its runtime-agnostic architecture enables seamless adoption of emerging inference engines, hardware accelerators, and optimization techniques without requiring platform redesign. This ensures that teams can continuously leverage advancements in the rapidly evolving LLM ecosystem while maintaining consistent operational workflows."}),"\n",(0,r.jsx)(n.p,{children:"Ultimately, the goal of the platform is to make production-scale LLM deployment as seamless and reliable as traditional software deployment\u2014allowing teams to focus on building intelligent applications rather than managing infrastructure complexity. 
By combining lifecycle automation, runtime optimization, and deep observability, the LLM Inference Framework provides a scalable foundation for delivering fast, cost-efficient, and production-ready LLM experiences."}),"\n",(0,r.jsx)(n.h2,{id:"future-explorations",children:"Future Explorations"}),"\n",(0,r.jsx)(n.p,{children:"While we have achieved significant milestones in latency and throughput, the landscape of GenAI is evolving rapidly. Our roadmap focuses on increasing flexibility, reducing costs, and enhancing reliability for enterprise-grade workloads. Here is what we are building next:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"TPU Support: To diversify our hardware supply chain and further optimize cost-per-token, we are evaluating Google Cloud TPUs to bake it into our platform. By leveraging the JAX and PyTorch/XLA ecosystems, we aim to unlock the massive throughput potential of TPU v5e chips, particularly for our open-source Llama models. This will allow the hardware profiler to dynamically choose between NVIDIA GPUs and Google TPUs based on real-time availability and price-performance metrics."}),"\n",(0,r.jsx)(n.li,{children:'Multi-LoRA Serving (Serverless Experience): Currently, deploying a fine-tuned model requires a dedicated GPU. We are building support for Multi-LoRA serving, which will allow us to serve hundreds of unique, fine-tuned adapters on top of a single frozen base model. This will drastically reduce costs for multi-tenant applications, enabling a "serverless" experience where specific fine-tunes are hot-swapped instantly per request.'}),"\n",(0,r.jsx)(n.li,{children:"Spot Instance Orchestration: To further optimize cloud costs, we are developing fault-tolerant mechanisms to run inference workloads on Spot Instances. 
By implementing aggressive checkpointing and seamless request draining, we aim to leverage cheaper, preemptible compute capacity without interrupting the user's streaming experience."}),"\n",(0,r.jsx)(n.li,{children:'Semantic Caching Layer: We plan to move beyond standard Prefix Caching to implement Semantic Caching. By using a vector database to fetch responses for semantically similar queries (e.g., "How do I reset my password?" vs. "Password reset steps"), we can bypass the GPU entirely for repetitive queries, reducing latency to near-zero.'}),"\n",(0,r.jsx)(n.li,{children:"Context-Aware Autoscaling: Standard CPU/GPU utilization metrics are often insufficient signals for scaling LLMs. We are working on KV-cache pressure metrics for autoscaling. This ensures that we scale out before the memory fills up, preventing eviction-based slowdowns during traffic spikes."}),"\n",(0,r.jsx)(n.li,{children:'Online Evaluation & Guardrails: We are integrating a lightweight "Trust Layer" into the proxy. 
This will allow for low-latency input/output filtering (Guardrails) and asynchronous "LLM-as-a-Judge" evaluation pipelines to monitor response quality in production, not just system health.'}),"\n"]})]})}function h(e={}){const{wrapper:n}={...(0,s.R)(),...e.components};return n?(0,r.jsx)(n,{...e,children:(0,r.jsx)(d,{...e})}):d(e)}},7996:(e,n,i)=>{i.d(n,{A:()=>t});const t=i.p+"assets/images/bms-7399e8796d2cd24617c432518ce3f312.png"},8453:(e,n,i)=>{i.d(n,{R:()=>a,x:()=>o});var t=i(6540);const r={},s=t.createContext(r);function a(e){const n=t.useContext(s);return t.useMemo(function(){return"function"==typeof e?e(n):{...n,...e}},[n,e])}function o(e){let n;return n=e.disableParentContext?"function"==typeof e.components?e.components(r):e.components||r:a(e.components),t.createElement(s.Provider,{value:n},e.children)}}}]); \ No newline at end of file diff --git a/docs/assets/js/814f3328.189ef834.js b/docs/assets/js/814f3328.189ef834.js new file mode 100644 index 00000000..f366747d --- /dev/null +++ b/docs/assets/js/814f3328.189ef834.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[7472],{5513:e=>{e.exports=JSON.parse('{"title":"Recent posts","items":[{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","permalink":"/BharatMLStack/blog/post-five","unlisted":false,"date":"2025-06-02T00:00:00.000Z"},{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/post-four","unlisted":false,"date":"2025-03-29T00:00:00.000Z"},{"title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","permalink":"/BharatMLStack/blog/post-three","unlisted":false,"date":"2024-05-21T00:00:00.000Z"},{"title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","permalink":"/BharatMLStack/blog/post-two","unlisted":false,"date":"2023-04-10T00:00:00.000Z"},{"title":"Building Meesho\u2019s ML Platform: 
From Chaos to Cutting-Edge (Part 1)","permalink":"/BharatMLStack/blog/post-one","unlisted":false,"date":"2022-11-15T00:00:00.000Z"}]}')}}]); \ No newline at end of file diff --git a/docs/assets/js/814f3328.bfb123e8.js b/docs/assets/js/814f3328.bfb123e8.js deleted file mode 100644 index f1e59d9a..00000000 --- a/docs/assets/js/814f3328.bfb123e8.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[7472],{5513:e=>{e.exports=JSON.parse('{"title":"Recent posts","items":[{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","permalink":"/BharatMLStack/blog/post-five","unlisted":false,"date":"2025-06-02T00:00:00.000Z"},{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/post-three","unlisted":false,"date":"2025-03-29T00:00:00.000Z"},{"title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","permalink":"/BharatMLStack/blog/post-three","unlisted":false,"date":"2024-05-21T00:00:00.000Z"},{"title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","permalink":"/BharatMLStack/blog/post-two","unlisted":false,"date":"2023-04-10T00:00:00.000Z"},{"title":"Building Meesho\u2019s ML Platform: From Chaos to Cutting-Edge (Part 1)","permalink":"/BharatMLStack/blog/post-one","unlisted":false,"date":"2022-11-15T00:00:00.000Z"}]}')}}]); \ No newline at end of file diff --git a/docs/assets/js/8cdb4121.2549a6bf.js b/docs/assets/js/8cdb4121.2549a6bf.js new file mode 100644 index 00000000..0dbaaca2 --- /dev/null +++ b/docs/assets/js/8cdb4121.2549a6bf.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[252],{2173:(e,n,i)=>{i.d(n,{A:()=>t});const 
t=i.p+"assets/images/llm-plat-9ac69c0ffd8c387d177e582611b8c775.png"},2233:e=>{e.exports=JSON.parse('{"permalink":"/BharatMLStack/blog/post-four","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-four/index.md","source":"@site/blog/bharatmlstack-history/post-four/index.md","title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","description":"BharatMLStack","date":"2025-03-29T00:00:00.000Z","tags":[{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"vllm","permalink":"/BharatMLStack/blog/tags/vllm"},{"inline":true,"label":"tensorrt-llm","permalink":"/BharatMLStack/blog/tags/tensorrt-llm"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":13.38,"hasTruncateMarker":false,"authors":[{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null}],"frontMatter":{"slug":"post-four","title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","authors":["jaya"],"date":"2025-3-29","tags":["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","permalink":"/BharatMLStack/blog/post-five"},"nextItem":{"title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","permalink":"/BharatMLStack/blog/post-three"}}')},2531:(e,n,i)=>{i.r(n),i.d(n,{assets:()=>l,contentTitle:()=>o,default:()=>h,frontMatter:()=>a,metadata:()=>t,toc:()=>c});var t=i(2233),r=i(4848),s=i(8453);const a={slug:"post-four",title:"Designing a 
Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving",authors:["jaya"],date:"2025-3-29",tags:["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},o=void 0,l={authorsImageUrls:[void 0]},c=[{value:"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving",id:"designing-a-production-grade-llm-inference-platform-from-model-weights-to-scalable-gpu-serving",level:2},{value:"Why LLM Inference Is not just bigger ML model serving",id:"why-llm-inference-is-not-just-bigger-ml-model-serving",level:2},{value:"Autoregressive Generation and Sequential Computation:",id:"autoregressive-generation-and-sequential-computation",level:3},{value:"Prefill and Decode Phases:",id:"prefill-and-decode-phases",level:3},{value:"Context Management and KV Caching:",id:"context-management-and-kv-caching",level:3},{value:"Dynamic and Irregular Workloads:",id:"dynamic-and-irregular-workloads",level:3},{value:"Streaming and User Experience Constraints:",id:"streaming-and-user-experience-constraints",level:3},{value:"LLMOps: High-Level Architecture",id:"llmops-high-level-architecture",level:2},{value:"Supported Inference backends (TensorRT LLM, Dynamo & vLLM)",id:"supported-inference-backends-tensorrt-llm--dynamo--vllm",level:2},{value:"Conclusion",id:"conclusion",level:2},{value:"Future Explorations",id:"future-explorations",level:2}];function d(e){const n={h2:"h2",h3:"h3",img:"img",li:"li",ol:"ol",p:"p",ul:"ul",...(0,s.R)(),...e.components};return(0,r.jsxs)(r.Fragment,{children:[(0,r.jsx)(n.p,{children:(0,r.jsx)(n.img,{alt:"BharatMLStack",src:i(7996).A+"",width:"1396",height:"460"})}),"\n",(0,r.jsx)(n.h2,{id:"designing-a-production-grade-llm-inference-platform-from-model-weights-to-scalable-gpu-serving",children:"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving"}),"\n",(0,r.jsx)(n.p,{children:"Serving large language models in production introduces new challenges across 
infrastructure, performance optimization, and operational lifecycle management. The LLM Inference Platform addresses these challenges by providing a unified system for deploying and managing open-source and fine-tuned LLMs at scale."}),"\n",(0,r.jsx)(n.p,{children:"The platform implements a complete LLMOps lifecycle \u2014 from model registration and automated compilation to deployment, runtime optimization, and monitoring. Designed as a self-service environment, users can onboard models directly from open repositories such as Hugging Face or upload custom fine-tuned models, and deploy them using a single-click workflow with no manual infrastructure or configuration steps required."}),"\n",(0,r.jsx)(n.p,{children:"In addition to fully automated deployment, the platform allows users to select and apply custom inference optimization techniques \u2014 such as quantization strategies, batching configurations, and runtime-specific performance enhancements \u2014 enabling teams to balance latency, throughput, and cost based on their use case. The goal is to reduce operational friction while enabling high-performance, production-grade LLM inference."}),"\n",(0,r.jsx)(n.h2,{id:"why-llm-inference-is-not-just-bigger-ml-model-serving",children:"Why LLM Inference Is not just bigger ML model serving"}),"\n",(0,r.jsx)(n.p,{children:"Large language model (LLM) inference introduces a fundamentally different set of challenges compared to traditional machine learning inference. While classical ML models typically perform a single forward pass to produce a fixed prediction, LLMs operate as autoregressive systems, generating outputs token by token based on previously generated context. 
This difference dramatically changes how inference systems must be designed, optimized, and scaled."}),"\n",(0,r.jsx)(n.h3,{id:"autoregressive-generation-and-sequential-computation",children:"Autoregressive Generation and Sequential Computation:"}),"\n",(0,r.jsx)(n.p,{children:"Unlike traditional models such as classifiers or recommenders \u2014 where inference cost is relatively constant \u2014 LLMs generate responses incrementally. Each new token depends on all previously generated tokens, making inference inherently sequential and dynamic. This means latency and compute requirements vary significantly depending on prompt length and output size, introducing complexity in scheduling and resource allocation.\nBecause tokens cannot be generated fully in parallel during decoding, GPUs may become underutilized without specialized batching and scheduling strategies. This has led to the development of dedicated LLM inference engines optimized for token-level execution."}),"\n",(0,r.jsx)(n.h3,{id:"prefill-and-decode-phases",children:"Prefill and Decode Phases:"}),"\n",(0,r.jsx)(n.p,{children:"LLM inference typically consists of two distinct stages:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Prefill phase \u2014 the model processes the input prompt and builds internal representations. This stage is compute-heavy and highly parallelizable."}),"\n",(0,r.jsx)(n.li,{children:"Decode phase \u2014 the model generates tokens sequentially, predicting one token at a time using previously generated context."}),"\n"]}),"\n",(0,r.jsx)(n.p,{children:"The decode stage often becomes memory-bound rather than compute-bound, which creates new performance bottlenecks compared to traditional ML workloads."}),"\n",(0,r.jsx)(n.h3,{id:"context-management-and-kv-caching",children:"Context Management and KV Caching:"}),"\n",(0,r.jsx)(n.p,{children:"Another fundamental difference lies in how LLMs maintain context. 
Transformer-based models rely on attention mechanisms that require access to past token representations. To avoid recomputing these representations repeatedly, inference engines use key-value (KV) caching, which stores intermediate activations from previous tokens.\nKV caching significantly improves performance by eliminating redundant computation, but it introduces new challenges:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Memory consumption grows with sequence length and batch size"}),"\n",(0,r.jsx)(n.li,{children:"GPU memory becomes a critical bottleneck"}),"\n",(0,r.jsx)(n.li,{children:"Efficient memory management becomes essential for scaling concurrent requests"}),"\n"]}),"\n",(0,r.jsx)(n.p,{children:"This tradeoff between compute efficiency and memory usage is unique to LLM inference workloads."}),"\n",(0,r.jsx)(n.h3,{id:"dynamic-and-irregular-workloads",children:"Dynamic and Irregular Workloads:"}),"\n",(0,r.jsx)(n.p,{children:"Traditional ML inference typically operates on fixed-size inputs with predictable latency. In contrast, LLM requests vary widely in prompt length, output length, and runtime behavior. As a result:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Batch sizes must be dynamic rather than static"}),"\n",(0,r.jsx)(n.li,{children:"Requests may enter and leave batches asynchronously"}),"\n",(0,r.jsx)(n.li,{children:"Scheduling systems must continuously rebalance workloads to maximize GPU utilization"}),"\n"]}),"\n",(0,r.jsx)(n.p,{children:"These characteristics require specialized serving architectures that differ significantly from standard ML serving pipelines."}),"\n",(0,r.jsx)(n.h3,{id:"streaming-and-user-experience-constraints",children:"Streaming and User Experience Constraints:"}),"\n",(0,r.jsx)(n.p,{children:"Another distinguishing factor is the expectation of real-time streaming responses. 
Instead of returning a single output, LLM systems often stream tokens to users as they are generated.\nBecause of these differences \u2014 sequential generation, growing memory requirements, dynamic workloads, and streaming constraints \u2014 LLM inference cannot be treated as a simple extension of existing ML serving systems. Production platforms must incorporate specialized runtime engines, advanced optimization techniques, and observability tailored specifically to LLM workloads."}),"\n",(0,r.jsx)(n.h2,{id:"llmops-high-level-architecture",children:"LLMOps: High-Level Architecture"}),"\n",(0,r.jsx)(n.p,{children:(0,r.jsx)(n.img,{alt:"LLM Architecture",src:i(2173).A+"",width:"1302",height:"830"})}),"\n",(0,r.jsx)(n.p,{children:"The LLM Inference Framework is designed as a fully automated, end-to-end system for deploying and operating open-source and fine-tuned large language models at scale. The architecture abstracts the complexity of model optimization, hardware selection, deployment, and runtime management into a unified workflow that enables users to move from raw model weights to production-ready inference endpoints with minimal manual intervention."}),"\n",(0,r.jsx)(n.p,{children:"Our LLM Inference Framework is architected not just as a serving engine, but as a complete lifecycle management system. 
As illustrated in the high-level design below, the platform automates the journey of a model through seven distinct stages, ensuring reproducibility, performance, and scalability."}),"\n",(0,r.jsxs)(n.ol,{children:["\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Onboarding & Registration (The Source of Truth)"}),"\n",(0,r.jsx)(n.p,{children:"The lifecycle begins with the Data Scientist or engineer."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Model Ingestion: Users onboard models\u2014whether open-source (Hugging Face, NeMo) or internally fine-tuned\u2014via the Truffle Box SDK/UI."}),"\n",(0,r.jsx)(n.li,{children:'LLM + Prompt Registry: Unlike traditional systems that only track model weights, our registry is a unified control plane. It stores both the Model Artifacts and the Prompt Templates. This allows Data Scientists to register and version-control prompts (e.g., "customer_support_v2") independently of the application code.'}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:'The "Black Box" Build Engine'}),"\n",(0,r.jsx)(n.p,{children:"Once a model is registered, the Automated LLM Compiler + Quantizer Module kicks off a background job on ephemeral GPU resources."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Transformation: The raw model is converted into a TRT-LLM Checkpoint."}),"\n",(0,r.jsx)(n.li,{children:"Quantization: The system automatically applies quantization algorithms (like INT4 AWQ or FP8) to reduce memory footprint."}),"\n",(0,r.jsx)(n.li,{children:"Engine Building: Finally, it compiles a highly optimized TRT Engine specifically tuned for the target hardware."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Intelligent Profiling & Validation"}),"\n",(0,r.jsx)(n.p,{children:"Before deployment, the new engine passes through the Hardware & Inference Runtime 
Profiler."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Benchmarking: This module empirically tests the engine against various hardware configurations (L4 vs. A100) and runtimes (TRT-LLM vs. vLLM)."}),"\n",(0,r.jsx)(n.li,{children:"Optimization: It recommends the optimal configuration that meets latency SLAs (Time-To-First-Token) while minimizing cost."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Smart Artifact Generation & Distribution"}),"\n",(0,r.jsx)(n.p,{children:'To solve the Kubernetes "Cold Start" problem, the LLM Serving Artifacts Generation module packages the model using a bifurcated strategy:'}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Standard Models: Artifacts are uploaded to Cloud Storage (GCS) and downloaded by pods at startup."}),"\n",(0,r.jsx)(n.li,{children:"Very Large Models: For massive models (>8GB) where network downloads are too slow, the system pre-caches the model onto Secondary Boot Disks. These disks are attached directly to new GPU nodes during autoscaling, eliminating download wait times."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Image Streaming & Deployment"}),"\n",(0,r.jsx)(n.p,{children:"Simultaneously, the inference runtime container images are pulled from the Artifact Registry."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Image Streaming: We utilize container image streaming to allow pods to start initializing while the massive Triton/Dynamo container layers are still downloading, further shaving seconds off the startup time. 
link"}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"The Inference Runtime (Kubernetes)"}),"\n",(0,r.jsx)(n.p,{children:"The workload lands on Kubernetes with Autoscaling."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Dynamic Backends: Depending on the profile generated in Stage 3, the pod initializes either TensorRT-LLM (for throughput) or vLLM (for flexibility), or spins up a Dynamo worker for distributed inference."}),"\n",(0,r.jsx)(n.li,{children:'Data Loading: The pod either downloads the model from Cloud Storage or mounts the pre-warmed Secondary Boot Disk ("Pull from Disk").'}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Client Interaction & Observability"}),"\n",(0,r.jsx)(n.p,{children:"Finally, the LLM Inference Client executes the request."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Prompt Injection: The client pulls the specific prompt template ID from the Registry, ensuring the exact versioned instructions are used."}),"\n",(0,r.jsx)(n.li,{children:"Streaming Response: The request is sent via gRPC, and tokens are streamed back to the user in real-time."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Observability: Monitoring the Pulse of GenAI"}),"\n",(0,r.jsx)(n.p,{children:"In traditional microservices, success is measured by CPU utilization and request latency (p99). For Large Language Models, these metrics are insufficient. 
A user doesn't care if the GPU is at 80% utilization; they care about how fast the first word appears and how smoothly the rest of the sentence follows."}),"\n",(0,r.jsx)(n.p,{children:"To capture the true user experience, our platform instrumentation focuses on three critical LLM-specific metrics:"}),"\n",(0,r.jsxs)(n.ol,{children:["\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Time to First Token (TTFT)"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Definition: TTFT measures the time elapsed from the moment a request is received until the very first token is generated and streamed back to the user."}),"\n",(0,r.jsx)(n.li,{children:'Why it matters: This represents the "Prefill Phase" latency\u2014the time the model takes to process the input prompt and load weights. A high TTFT makes the application feel unresponsive or "hung."'}),"\n",(0,r.jsx)(n.li,{children:"Optimization: We closely monitor TTFT to ensure our Prefix Caching is effective (aiming for high cache hitrates), which drastically lowers this metric by skipping redundant prompt processing."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Inter-Token Latency (ITL)"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:'Definition: ITL measures the average time interval between the generation of consecutive tokens during the "Decode Phase".'}),"\n",(0,r.jsx)(n.li,{children:'Why it matters: This defines the "perceived speed" of reading. Even if the first token is fast (low TTFT), high ITL makes the text generation look "jerky" or slow to the user.'}),"\n",(0,r.jsx)(n.li,{children:"Benchmarks: In our testing with Llama 3.1, we track p99 ITL to ensure it stays below human reading speeds to maintain a natural conversational flow."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Token Throughput vs. 
Request Throughput"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"We distinguish between two types of throughput to balance system efficiency with user load:"}),"\n",(0,r.jsx)(n.li,{children:"Token Throughput (tokens/sec): The total number of tokens generated across all concurrent requests. This measures the raw compute efficiency of the GPU and the effectiveness of batching."}),"\n",(0,r.jsx)(n.li,{children:"Request Throughput (req/sec): The number of distinct user queries served per second. We use this to determine autoscaling thresholds, ensuring we scale out before the queue depth impacts ITL."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"The Monitoring Stack"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:'Real-time Dashboards: We utilize Grafana to visualize these streaming metrics in real-time, allowing on-call engineers to spot "slow generation" incidents that generic "500 error" alerts would miss.'}),"\n",(0,r.jsx)(n.li,{children:'Request Tracing: Since Triton Inference Server does not log request payloads by default, we integrate a Helix Client to asynchronously publish request logs to Log Tables. This allows us to trace a specific "slow" request back to its prompt to understand if a complex input caused the latency spike.'}),"\n"]}),"\n"]}),"\n"]}),"\n"]}),"\n"]}),"\n",(0,r.jsx)(n.h2,{id:"supported-inference-backends-tensorrt-llm--dynamo--vllm",children:"Supported Inference backends (TensorRT LLM, Dynamo & vLLM)"}),"\n",(0,r.jsx)(n.p,{children:'Tailored for the Use Case: We do not believe in a "one-size-fits-all" approach to inference. Different use cases\u2014whether a real-time voice bot requiring ultra-lowsub-second latency or a massive reasoning task requiring huge context windows\u2014demand different runtime characteristics. 
Our platform is designed to be runtime-agnostic, allowing us to automatically select and tailor the best engine based on the specific requirements of the application:'}),"\n",(0,r.jsxs)(n.ol,{children:["\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"TensorRT-LLM: The High-Performance Standard"}),"\n",(0,r.jsx)(n.p,{children:"Suitable for: High-throughput production workloads where latency is critical (e.g., customer support chat, real-time voice bots)."}),"\n",(0,r.jsx)(n.p,{children:"TensorRT-LLM serves as our default backend for these scenarios. Our internal benchmarks on Llama 3.1 and 3.2 models demonstrated that a tuned TensorRT-LLM engine significantly outperforms standard runtimes, especially when utilizing INT4 AWQ and FP8 quantization ."}),"\n",(0,r.jsx)(n.p,{children:"Key optimizations we tailor for these high-load cases include:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"Optimized execution via TensorRT engine compilation"}),"\n",(0,r.jsx)(n.li,{children:"Quantization-aware execution for reduced memory usage and improved throughput"}),"\n",(0,r.jsx)(n.li,{children:"Inflight Batching: Allowing requests to be processed continuously without waiting for the entire batch to finish, drastically improving GPU utilization ."}),"\n",(0,r.jsx)(n.li,{children:"Custom Plugins: Enabling specific NVIDIA plugins like the GEMM plugin and GPT Attention plugin to accelerate matrix multiplications and attention mechanisms ."}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"Dynamo: Distributed Inference for Reasoning Models"}),"\n",(0,r.jsx)(n.p,{children:'Suitable for: Very large "reasoning" models (70B+) or scenarios requiring massive context windows where a single GPU\'s memory is insufficient.'}),"\n",(0,r.jsx)(n.p,{children:"For these memory-bound tasks, we utilize Dynamo, a low-latency distributed inference framework . 
Unlike monolithic servers, Dynamo disaggregates the inference process to scale resources horizontally:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"KV Aware Routing: A specialized router directs requests to workers that already hold the relevant Key-Value (KV) cache, minimizing redundant computation ."}),"\n",(0,r.jsx)(n.li,{children:'Prefill vs. Decode Split: The workload is divided into Prefill Workers (processing the prompt) and Decode Workers (generating tokens), allowing us to scale the compute-heavy "reading" phase independently from the memory-heavy "writing" phase .'}),"\n",(0,r.jsx)(n.li,{children:"Distributed execution across multiple GPU resources"}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsx)(n.p,{children:"vLLM: The Flexible Baseline"}),"\n",(0,r.jsx)(n.p,{children:"Suitable for: Rapid prototyping, testing new model architectures, or low-traffic internal tools where ease of deployment outweighs raw throughput."}),"\n",(0,r.jsx)(n.p,{children:"While TensorRT-LLM is optimized for maximum speed, vLLM provides a robust and flexible baseline ."}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"High throughput through dynamic batching and efficient memory utilization"}),"\n",(0,r.jsx)(n.li,{children:"Paged KV cache management for handling long contexts and concurrent requests"}),"\n",(0,r.jsx)(n.li,{children:"Strong support for open-source model ecosystems"}),"\n",(0,r.jsx)(n.li,{children:"Rapid Adoption: It allows us to onboard new model architectures immediately without waiting for a custom TensorRT build."}),"\n",(0,r.jsx)(n.li,{children:"Benchmarking Insight: In our internal tests, vLLM provided a strong baseline but often lacked the specific max-token optimizations present in our custom TRT engines . 
We use it strategically for initial testing before committing to a full TensorRT optimization pipeline."}),"\n"]}),"\n"]}),"\n"]}),"\n",(0,r.jsx)(n.h2,{id:"conclusion",children:"Conclusion"}),"\n",(0,r.jsx)(n.p,{children:"Large language model inference introduces a fundamentally new class of infrastructure challenges\u2014where performance is governed not just by raw compute, but by memory efficiency, intelligent scheduling, runtime specialization, and lifecycle automation. Unlike traditional ML serving, LLM inference requires systems that understand token-level execution, manage rapidly growing context state, and continuously balance latency, throughput, and cost under highly dynamic workloads."}),"\n",(0,r.jsx)(n.p,{children:"The LLM Inference Framework addresses these challenges by transforming inference into a fully automated, reproducible lifecycle\u2014from model onboarding and compilation to deployment, optimization, and observability. By integrating automated quantization and engine compilation, intelligent runtime selection, cold-start mitigation strategies, and LLM-specific observability metrics such as Time-to-First-Token and Inter-Token Latency, the platform ensures both high performance and operational simplicity."}),"\n",(0,r.jsx)(n.p,{children:"Equally important, the framework is designed with flexibility and future evolution in mind. Its runtime-agnostic architecture enables seamless adoption of emerging inference engines, hardware accelerators, and optimization techniques without requiring platform redesign. This ensures that teams can continuously leverage advancements in the rapidly evolving LLM ecosystem while maintaining consistent operational workflows."}),"\n",(0,r.jsx)(n.p,{children:"Ultimately, the goal of the platform is to make production-scale LLM deployment as seamless and reliable as traditional software deployment\u2014allowing teams to focus on building intelligent applications rather than managing infrastructure complexity. 
By combining lifecycle automation, runtime optimization, and deep observability, the LLM Inference Framework provides a scalable foundation for delivering fast, cost-efficient, and production-ready LLM experiences."}),"\n",(0,r.jsx)(n.h2,{id:"future-explorations",children:"Future Explorations"}),"\n",(0,r.jsx)(n.p,{children:"While we have achieved significant milestones in latency and throughput, the landscape of GenAI is evolving rapidly. Our roadmap focuses on increasing flexibility, reducing costs, and enhancing reliability for enterprise-grade workloads. Here is what we are building next:"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsx)(n.li,{children:"TPU Support: To diversify our hardware supply chain and further optimize cost-per-token, we are evaluating Google Cloud TPUs to bake it into our platform. By leveraging the JAX and PyTorch/XLA ecosystems, we aim to unlock the massive throughput potential of TPU v5e chips, particularly for our open-source Llama models. This will allow the hardware profiler to dynamically choose between NVIDIA GPUs and Google TPUs based on real-time availability and price-performance metrics."}),"\n",(0,r.jsx)(n.li,{children:'Multi-LoRA Serving (Serverless Experience): Currently, deploying a fine-tuned model requires a dedicated GPU. We are building support for Multi-LoRA serving, which will allow us to serve hundreds of unique, fine-tuned adapters on top of a single frozen base model. This will drastically reduce costs for multi-tenant applications, enabling a "serverless" experience where specific fine-tunes are hot-swapped instantly per request.'}),"\n",(0,r.jsx)(n.li,{children:"Spot Instance Orchestration: To further optimize cloud costs, we are developing fault-tolerant mechanisms to run inference workloads on Spot Instances. 
By implementing aggressive checkpointing and seamless request draining, we aim to leverage cheaper, preemptible compute capacity without interrupting the user's streaming experience."}),"\n",(0,r.jsx)(n.li,{children:'Semantic Caching Layer: We plan to move beyond standard Prefix Caching to implement Semantic Caching. By using a vector database to fetch responses for semantically similar queries (e.g., "How do I reset my password?" vs. "Password reset steps"), we can bypass the GPU entirely for repetitive queries, reducing latency to near-zero.'}),"\n",(0,r.jsx)(n.li,{children:"Context-Aware Autoscaling: Standard CPU/GPU utilization metrics are often insufficient signals for scaling LLMs. We are working on KV-cache pressure metrics for autoscaling. This ensures that we scale out before the memory fills up, preventing eviction-based slowdowns during traffic spikes."}),"\n",(0,r.jsx)(n.li,{children:'Online Evaluation & Guardrails: We are integrating a lightweight "Trust Layer" into the proxy. 
This will allow for low-latency input/output filtering (Guardrails) and asynchronous "LLM-as-a-Judge" evaluation pipelines to monitor response quality in production, not just system health.'}),"\n"]})]})}function h(e={}){const{wrapper:n}={...(0,s.R)(),...e.components};return n?(0,r.jsx)(n,{...e,children:(0,r.jsx)(d,{...e})}):d(e)}},7996:(e,n,i)=>{i.d(n,{A:()=>t});const t=i.p+"assets/images/bms-7399e8796d2cd24617c432518ce3f312.png"},8453:(e,n,i)=>{i.d(n,{R:()=>a,x:()=>o});var t=i(6540);const r={},s=t.createContext(r);function a(e){const n=t.useContext(s);return t.useMemo(function(){return"function"==typeof e?e(n):{...n,...e}},[n,e])}function o(e){let n;return n=e.disableParentContext?"function"==typeof e.components?e.components(r):e.components||r:a(e.components),t.createElement(s.Provider,{value:n},e.children)}}}]); \ No newline at end of file diff --git a/docs/assets/js/c4f5d8e4.41d5b3c8.js b/docs/assets/js/c4f5d8e4.41d5b3c8.js deleted file mode 100644 index 69b0d45b..00000000 --- a/docs/assets/js/c4f5d8e4.41d5b3c8.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[2634],{6467:(e,i,t)=>{t.r(i),t.d(i,{default:()=>M});var s=t(4164),r=t(8774),n=t(4586),a=t(6025),o=t(1656),c=t(1107);const l={features:"features_t9lD",featureSvg:"featureSvg_GfXr",featuresHeader:"featuresHeader_qR2i",featuresSubtitle:"featuresSubtitle_VdGe","bharatml-card":"bharatml-card_xZ6l","bharatml-icon":"bharatml-icon_XBoJ",featureDescription:"featureDescription_sP1D"};var d=t(4848);const h=[{title:"High-Performance Feature Store",icon:"\ud83d\ude80",description:(0,d.jsx)(d.Fragment,{children:"Sub-10ms P99 latency and 1M+ RPS capacity. 
Built for real-time ML inference with custom PSDB serialization format that outperforms Protocol Buffers and Apache Arrow."})},{title:"Production-Ready ML Infrastructure",icon:"\u26a1",description:(0,d.jsx)(d.Fragment,{children:"Multi-database backends (Scylla, Dragonfly, Redis), comprehensive monitoring, and enterprise-grade features. Deploy with confidence using battle-tested components."})},{title:"Developer-First Experience",icon:"\ud83d\udee0\ufe0f",description:(0,d.jsx)(d.Fragment,{children:"Multi-language SDKs (Go, Python), gRPC APIs, and extensive documentation. From data scientists, ML engineers to backend engineers, everyone gets tools they love."})}],u=[{title:"Feature Catalog & Management",icon:"\ud83d\udccb",description:(0,d.jsx)(d.Fragment,{children:"Comprehensive feature catalog with metadata management, versioning, and governance. Organize and discover features across your ML platform with ease."})},{title:"User Management & Admin Ops",icon:"\ud83d\udc65",description:(0,d.jsx)(d.Fragment,{children:"Role-based access control, user authentication, and administrative operations. Secure your ML platform with enterprise-grade user management capabilities."})},{title:"Modern UI Framework",icon:"\ud83c\udfa8",description:(0,d.jsx)(d.Fragment,{children:"Intuitive, responsive web interface built with modern web technologies. Streamline MLOps workflows with beautiful and functional user experiences."})}],m=[{title:"Multi-Language Support",icon:"\ud83c\udf10",description:(0,d.jsx)(d.Fragment,{children:"Native SDKs for Go and Python with idiomatic APIs. Choose the language that fits your team's expertise and existing infrastructure."})},{title:"gRPC & REST APIs",icon:"\ud83d\udd17",description:(0,d.jsx)(d.Fragment,{children:"High-performance gRPC clients and REST APIs for seamless integration. 
Built-in support for streaming, batching, and async operations."})},{title:"Spark Integration",icon:"\u26a1",description:(0,d.jsx)(d.Fragment,{children:"Native Apache Spark integration for batch feature processing and ingestion. Scale your feature engineering workflows with distributed computing power."})}];function x({icon:e,title:i,description:t}){return(0,d.jsxs)("div",{className:(0,s.A)("col col--4"),children:[(0,d.jsx)("div",{className:"text--center",children:(0,d.jsx)("div",{className:"bharatml-icon",children:e})}),(0,d.jsxs)("div",{className:"text--center padding-horiz--md bharatml-card",children:[(0,d.jsx)(c.A,{as:"h3",children:i}),(0,d.jsx)("p",{className:l.featureDescription,children:t})]})]})}function p({title:e,subtitle:i,features:t}){return(0,d.jsx)("section",{className:l.features,children:(0,d.jsxs)("div",{className:"container",children:[(0,d.jsxs)("div",{className:"text--center margin-bottom--xl",children:[(0,d.jsx)(c.A,{as:"h2",className:l.featuresHeader,children:e}),(0,d.jsx)("p",{className:l.featuresSubtitle,children:i})]}),(0,d.jsx)("div",{className:"row",children:t.map((e,i)=>(0,d.jsx)(x,{...e},i))})]})})}function g(){return(0,d.jsx)(p,{title:"Online Feature Store",subtitle:"High-performance, production-ready feature serving for real-time ML inference",features:h})}function f(){return(0,d.jsx)(p,{title:"Trufflebox UI",subtitle:"Modern, feature-rich UI framework for comprehensive MLOps management",features:u})}function j(){return(0,d.jsx)(p,{title:"SDKs",subtitle:"Developer-friendly client libraries and APIs for seamless platform integration",features:m})}const b={heroBanner:"heroBanner_qdFl",logoContainer:"logoContainer_xdaK",heroLogo:"heroLogo_U6bI",buttons:"buttons_AeoN",statsContainer:"statsContainer_KpvY",statItem:"statItem_bwiZ",aboutSection:"aboutSection_udvw",highlightBox:"highlightBox_Uhe8"};function v(){const{siteConfig:e}=(0,n.A)();return(0,d.jsx)("header",{className:(0,s.A)("hero 
bharatml-hero",b.heroBanner),children:(0,d.jsxs)("div",{className:"container",children:[(0,d.jsx)("div",{className:b.logoContainer,children:(0,d.jsx)("img",{src:(0,a.Ay)("/img/logo.svg"),alt:"BharatMLStack Logo",className:b.heroLogo})}),(0,d.jsxs)(c.A,{as:"h1",className:"hero__title",children:["Welcome to ",e.title]}),(0,d.jsx)("p",{className:"hero__subtitle",children:"Open source, end-to-end ML infrastructure stack built for scale, speed, and simplicity."}),(0,d.jsxs)("div",{className:b.buttons,children:[(0,d.jsx)(r.A,{className:"button button--secondary button--lg margin-right--md bharatml-button",to:"/category/online-feature-store",children:"\ud83d\udcda Get Started"}),(0,d.jsx)(r.A,{className:"button button--outline button--secondary button--lg",href:"https://github.com/Meesho/BharatMLStack",target:"_blank",children:"\u2b50 Star on GitHub"})]}),(0,d.jsxs)("div",{className:b.statsContainer,children:[(0,d.jsxs)("div",{className:b.statItem,children:[(0,d.jsx)("strong",{children:"Sub-10ms"}),(0,d.jsx)("span",{children:"P99 Latency"})]}),(0,d.jsxs)("div",{className:b.statItem,children:[(0,d.jsx)("strong",{children:"1M+ RPS"}),(0,d.jsx)("span",{children:"Tested Capacity"})]}),(0,d.jsxs)("div",{className:b.statItem,children:[(0,d.jsx)("strong",{children:"Multi-DB"}),(0,d.jsx)("span",{children:"Support"})]})]})]})})}function N(){return(0,d.jsx)("section",{className:b.aboutSection,children:(0,d.jsx)("div",{className:"container",children:(0,d.jsxs)("div",{className:"row",children:[(0,d.jsxs)("div",{className:"col col--6",children:[(0,d.jsx)(c.A,{as:"h2",children:"Built for India's Scale"}),(0,d.jsx)("p",{children:"BharatMLStack is a comprehensive, production-ready machine learning infrastructure platform designed to democratize ML capabilities across India and beyond. 
Our mission is to provide a robust, scalable, and accessible ML stack that empowers organizations to build, deploy, and manage machine learning solutions at massive scale."}),(0,d.jsx)(r.A,{className:"button button--primary",to:"/category/online-feature-store",children:"Explore Online Feature Store \u2192"})]}),(0,d.jsx)("div",{className:"col col--6",children:(0,d.jsxs)("div",{className:b.highlightBox,children:[(0,d.jsx)("h3",{children:"\ud83c\udfc6 Key Achievements"}),(0,d.jsxs)("ul",{children:[(0,d.jsx)("li",{children:"\u2705 Sub-10ms P99 latency for real-time inference"}),(0,d.jsx)("li",{children:"\u2705 1M+ RPS tested with 100 IDs per request"}),(0,d.jsx)("li",{children:"\u2705 PSDB format outperforms Proto3 & Arrow"}),(0,d.jsx)("li",{children:"\u2705 Multi-database: Scylla, Dragonfly, Redis"}),(0,d.jsx)("li",{children:"\u2705 Production-ready with comprehensive monitoring"})]})]})})]})})})}function y(){return(0,d.jsx)("section",{className:b.aboutSection,children:(0,d.jsx)("div",{className:"container",children:(0,d.jsxs)("div",{className:"row",children:[(0,d.jsxs)("div",{className:"col col--6",children:[(0,d.jsx)(c.A,{as:"h2",children:"Modern MLOps Management"}),(0,d.jsx)("p",{children:"Trufflebox UI provides a comprehensive, modern web interface for managing your entire ML infrastructure. Built with cutting-edge web technologies, it delivers an intuitive experience for feature management, user administration, and operational oversight. 
Streamline your MLOps workflows with enterprise-grade UI components."}),(0,d.jsx)(r.A,{className:"button button--primary",to:"/category/trufflebox-ui",children:"Explore Trufflebox UI \u2192"})]}),(0,d.jsx)("div",{className:"col col--6",children:(0,d.jsxs)("div",{className:b.highlightBox,children:[(0,d.jsx)("h3",{children:"\ud83c\udfa8 UI Features"}),(0,d.jsxs)("ul",{children:[(0,d.jsx)("li",{children:"\u2705 Comprehensive feature catalog & discovery"}),(0,d.jsx)("li",{children:"\u2705 Role-based access control & user management"}),(0,d.jsx)("li",{children:"\u2705 Job, Store, Admin Ops management"}),(0,d.jsx)("li",{children:"\u2705 Approval flow for everything"}),(0,d.jsx)("li",{children:"\u2705 Responsive design for desktop & mobile"})]})]})})]})})})}function S(){return(0,d.jsx)("section",{className:b.aboutSection,children:(0,d.jsx)("div",{className:"container",children:(0,d.jsxs)("div",{className:"row",children:[(0,d.jsxs)("div",{className:"col col--6",children:[(0,d.jsx)(c.A,{as:"h2",children:"Developer-First Integration"}),(0,d.jsx)("p",{children:"Our SDKs are designed with developers in mind, providing idiomatic APIs for Go and Python that feel natural in your existing codebase. 
Whether you're building microservices, data pipelines, or ML applications, our SDKs provide the tools you need for seamless integration with BharatMLStack's powerful infrastructure."}),(0,d.jsx)(r.A,{className:"button button--primary",to:"/category/sdks",children:"Explore SDKs \u2192"})]}),(0,d.jsx)("div",{className:"col col--6",children:(0,d.jsxs)("div",{className:b.highlightBox,children:[(0,d.jsx)("h3",{children:"\ud83d\udee0\ufe0f Developer Tools"}),(0,d.jsxs)("ul",{children:[(0,d.jsx)("li",{children:"\u2705 Native Go & Python SDKs with type safety"}),(0,d.jsx)("li",{children:"\u2705 High-performance gRPC"}),(0,d.jsx)("li",{children:"\u2705 Apache Spark integration for publishing features"})]})]})})]})})})}function w(){return(0,d.jsx)("section",{className:b.aboutSection,children:(0,d.jsx)("div",{className:"container",children:(0,d.jsxs)("div",{className:"row",children:[(0,d.jsxs)("div",{className:"col col--6",children:[(0,d.jsx)(c.A,{as:"h2",children:"Numerix"}),(0,d.jsx)("p",{children:"Numerix is a mathematical compute engine for BharatML Stack. 
It is used to perform mathematical operations on matrices and vectors."}),(0,d.jsx)(r.A,{className:"button button--primary",to:"/category/numerix",children:"Explore Numerix \u2192"})]}),(0,d.jsx)("div",{className:"col col--6",children:(0,d.jsxs)("div",{className:b.highlightBox,children:[(0,d.jsx)("h3",{children:"\ud83d\udee0\ufe0f Numerix Features"}),(0,d.jsxs)("ul",{children:[(0,d.jsx)("li",{children:"\u2705 Postfix expression evaluation"}),(0,d.jsx)("li",{children:"\u2705 Vectorized math operations"}),(0,d.jsx)("li",{children:"\u2705 Typed evaluation"}),(0,d.jsx)("li",{children:"\u2705 Compiler-assisted SIMD"}),(0,d.jsx)("li",{children:"\u2705 ARM & AMD support"}),(0,d.jsx)("li",{children:"\u2705 Multi-arch builds"}),(0,d.jsx)("li",{children:"\u2705 Deterministic runtime"})]})]})})]})})})}function M(){const{siteConfig:e}=(0,n.A)();return(0,d.jsxs)(o.A,{title:`${e.title} - Open Source ML Infrastructure`,description:"Open source, end-to-end ML infrastructure stack built for scale, speed, and simplicity. 
Features high-performance Online Feature Store with sub-10ms latency.",children:[(0,d.jsx)(v,{}),(0,d.jsxs)("main",{children:[(0,d.jsx)(g,{}),(0,d.jsx)(N,{}),(0,d.jsx)(f,{}),(0,d.jsx)(y,{}),(0,d.jsx)(j,{}),(0,d.jsx)(S,{}),(0,d.jsx)(w,{})]})]})}}}]); \ No newline at end of file diff --git a/docs/assets/js/c4f5d8e4.e88e308f.js b/docs/assets/js/c4f5d8e4.e88e308f.js new file mode 100644 index 00000000..f222d2f5 --- /dev/null +++ b/docs/assets/js/c4f5d8e4.e88e308f.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[2634],{1459:(e,t,s)=>{s.r(t),s.d(t,{default:()=>S});var i=s(6540),n=s(1656),r=s(4586),a=s(6025);const o={homepageWrapper:"homepageWrapper_H_rv",customNav:"customNav_xRNg",navContainer:"navContainer_E5Tz",logo:"logo_Ukns",hpGradientShift:"hpGradientShift_w9XB",navLinks:"navLinks_FO3Z",navLink:"navLink_aQaq",btn:"btn_bvfa",btnPrimary:"btnPrimary_hBjO",btnSecondary:"btnSecondary_mRVh",btnWhite:"btnWhite_DoE5",btnOutlineWhite:"btnOutlineWhite_Kzbe",hero:"hero_aEcG",heroContent:"heroContent_mKPX",hpFadeInUp:"hpFadeInUp_NspS",heroBadge:"heroBadge_Z6oq",heroTitle:"heroTitle_qg2I",heroSubtitle:"heroSubtitle_jFu1",heroButtons:"heroButtons_r52D",heroImage:"heroImage_xZN7",adoptionBadge:"adoptionBadge_hbYR",section:"section_Q9Zo",container:"container_bfhl",sectionHeader:"sectionHeader_Gahl",sectionSubtitle:"sectionSubtitle_AZuW",sectionTitle:"sectionTitle_Ut5p",sectionDescription:"sectionDescription_cpL1",barriersGrid:"barriersGrid_u0Jf",barrierCard:"barrierCard_tMSq",barrierIcon:"barrierIcon_HTIA",barrierQuestions:"barrierQuestions_jlWA",barrierAnswer:"barrierAnswer_ZtxW",componentsGrid:"componentsGrid_KtT5",componentCard:"componentCard_LlUg",componentCardVisible:"componentCardVisible_hAJc",componentContent:"componentContent_xz2v",componentLink:"componentLink_RzJT",componentIcon:"componentIcon_JDYs",statsSection:"statsSection_GUBq",statsGrid:"statsGrid_wBRk",statCard:"statCard_w2S8",statLabel:"statLabel_I99V",statValue:"statValue_tB6D
",statDescription:"statDescription_WIU_",videosGrid:"videosGrid_FXHY",videoCard:"videoCard_jGks",videoWrapper:"videoWrapper_XWWU",videoPlayer:"videoPlayer_Nt7m",videoContent:"videoContent_pd0B",blogGrid:"blogGrid_Qec3",blogCard:"blogCard_hyds",blogCardIcon:"blogCardIcon_JPeR",blogContent:"blogContent_dJxs",blogCategory:"blogCategory_UY54",blogMeta:"blogMeta_skDH",ctaSection:"ctaSection_bmsv",hpRotate:"hpRotate_a55V",ctaTitle:"ctaTitle_arch",ctaDescription:"ctaDescription_HswS",ctaButtons:"ctaButtons_vsp7",customFooter:"customFooter_Ymmc",footerContent:"footerContent_obNo",footerSection:"footerSection__c07",footerList:"footerList_2l2h",footerBottom:"footerBottom_nS2f",footerLinks:"footerLinks_lH9U"};var c=s(4848);const l=[{icon:"\ud83e\udde0",title:"Focus on building intelligence, not infrastructure",questions:["Does every model deployment require a full-stack integration effort?","Do engineers have to rebuild feature retrieval, endpoint integrations, and logging for each new model?","Does changing a simple expression like 0.2\xd7s\u2081 + 0.8\xd7s\u2082 to 0.3\xd7s\u2081 + 0.7\xd7s\u2082 really need code reviews and redeployments?","Why does deploying intelligence require the devops team to provision infra?"],answer:"Machine learning teams should be iterating on models, not systems. 
Yet today, infrastructure complexity turns simple improvements into weeks of engineering effort, slowing experimentation and innovation."},{icon:"\ud83d\udcb0",title:"Built for scale without exponential cost growth",questions:["Do your infrastructure costs scale faster than your ML impact?","Are you recomputing the same features, reloading the same data, and moving the same bytes across systems repeatedly?","Are expensive GPUs and compute sitting underutilized while workloads wait on data or inefficient pipelines?","Why does scaling ML often mean scaling cost linearly\u2014or worse?"],answer:"A modern ML platform should eliminate redundant computation, reuse features intelligently, and optimize data access across memory, NVMe, and object storage. Compute should be pooled, scheduled efficiently, and fully utilized\u2014ensuring that scale drives impact, not runaway infrastructure costs."},{icon:"\ud83c\udf0d",title:"Freedom to deploy anywhere, without lock-in",questions:["Are your models tied to a single cloud, making migration costly and complex?","Does adopting managed services today limit your ability to optimize cost or move infrastructure tomorrow?","Can you deploy the same ML stack across public cloud, private cloud, or sovereign environments without redesigning everything?","Why should infrastructure choices dictate the future of your ML systems?"],answer:"A modern ML platform should be built on open standards and cloud-neutral abstractions, allowing you to deploy anywhere\u2014public cloud, private infrastructure, or sovereign environments. This ensures complete control over your data, freedom from vendor lock-in, and the ability to optimize for cost, performance, and compliance without architectural constraints."}],d=[{icon:"\u26a1",title:"Online Feature Store",description:"BharatMLStack Online Feature Store delivers sub-10ms, high-throughput access to machine learning features for real-time inference. 
It seamlessly ingests batch and streaming data, validates schemas, and persists compact, versioned feature groups optimized for low latency and efficiency. With scalable storage backends, gRPC APIs, and binary-optimized formats, it ensures consistent, reliable feature serving across ML pipelines.",cta:"/online-feature-store/v1.0.0"},{icon:"\ud83d\udd00",title:"Inferflow",description:"Inferflow is BharatMLStack's intelligent inference gateway that dynamically retrieves and assembles features required by ML models using a graph-based configuration called Inferpipes. It automatically resolves entity relationships, fetches features from the Online Feature Store, and constructs feature vectors without custom code.",cta:"/inferflow/v1.0.0"},{icon:"\ud83d\udd0d",title:"Skye",description:"Skye enables fast similarity retrieval by representing data as vectors and querying nearest matches in high-dimensional space. It supports pluggable vector databases, ensuring flexibility across infrastructure. The system provides tenant-level index isolation while allowing single embedding ingestion even when shared across tenants, reducing redundancy.",cta:"/skye/v1.0.0"},{icon:"\ud83e\uddee",title:"Numerix",description:"Numerix is a high-performance compute engine designed for ultra-fast element-wise matrix operations. Built in Rust and accelerated using SIMD, it delivers exceptional efficiency and predictable performance. Optimized for real-time inference workloads, it achieves strict sub-5ms p99 latency on matrices up to 1000\xd710.",cta:"/numerix/v1.0.0"},{icon:"\ud83d\ude80",title:"Predator",description:"Predator streamlines infrastructure and model lifecycle management. It enables the creation of deployables with specific Triton Server versions and supports seamless model rollouts. 
Leveraging Helm charts and Argo CD, Predator automates Kubernetes-based deployments while integrating with KEDA for auto-scaling and performance tuning.",cta:"/predator/v1.0.0"}],h=[{value:"4.5M+",label:"Daily Orders",description:"Daily orders processed via ML pipelines"},{value:"2.4M",label:"QPS on FS",description:"QPS on Feature Store with batch size of 100 id lookups"},{value:"1M+",label:"QPS Inference",description:"QPS on Model Inference"},{value:"500K",label:"QPS Embedding",description:"QPS Embedding Search"}],m=[{title:"Embedding Platform",description:"See how Skye powers real-time similarity search and embedding retrieval at scale.",url:"https://videos.meesho.com/reels/embedding_platform.mp4"},{title:"Feature Store",description:"Watch the Online Feature Store deliver sub-10ms feature serving for real-time inference.",url:"https://videos.meesho.com/reels/feature_store.mp4"},{title:"Numerix",description:"Explore ultra-fast matrix operations powered by Rust and SIMD acceleration.",url:"https://videos.meesho.com/reels/numerix.mp4"},{title:"Predator",description:"Automated Kubernetes-based model deployment with Helm, Argo CD, and KEDA.",url:"https://videos.meesho.com/reels/predator.mp4"},{title:"Inferflow",description:"Graph-based feature assembly and intelligent inference gateway in action.",url:"https://videos.meesho.com/reels/inferflow.mp4"}],u=[{title:"Building Meesho's ML Platform: From Chaos to Cutting-Edge (Part 1)",category:"ML Platform",icon:"\ud83d\ude80",link:"/blog/post-one"},{title:"Building Meesho's ML Platform: Lessons from the First-Gen System (Part 2)",category:"ML Platform",icon:"\ud83e\udde9",link:"/blog/post-two"},{title:"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search",category:"Inference",icon:"\u26a1",link:"/blog/post-three"},{title:"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving",category:"LLM",icon:"\ud83e\udde0",link:"/blog/post-four"},{title:"LLM Inference 
Optimization Techniques: Engineering Sub-Second Latency at Scale",category:"Optimization",icon:"\ud83d\udd2c",link:"/blog/post-five"}];function p(){const e=(0,a.Ay)("/"),t=(0,a.Ay)("/blog");return(0,c.jsx)("nav",{className:o.customNav,children:(0,c.jsxs)("div",{className:o.navContainer,children:[(0,c.jsx)("a",{href:e,className:o.logo,children:"BharatMLStack"}),(0,c.jsxs)("div",{className:o.navLinks,children:[(0,c.jsx)("a",{href:"#components",className:o.navLink,children:"Components"}),(0,c.jsx)("a",{href:"#stats",className:o.navLink,children:"Scale"}),(0,c.jsx)("a",{href:"#demos",className:o.navLink,children:"Demos"}),(0,c.jsx)("a",{href:t,className:o.navLink,children:"Blog"}),(0,c.jsx)("a",{href:"https://github.com/Meesho/BharatMLStack",className:`${o.btn} ${o.btnPrimary}`,target:"_blank",rel:"noopener noreferrer",children:"GitHub"})]})]})})}function f(){const e=(0,a.Ay)("/category/online-feature-store");return(0,c.jsxs)("section",{className:o.hero,children:[(0,c.jsxs)("div",{className:o.heroContent,children:[(0,c.jsx)("div",{className:o.heroBadge,children:"Open-source, scalable stack for enterprise ML"}),(0,c.jsx)("h1",{className:o.heroTitle,children:"Build production ML pipelines faster"}),(0,c.jsx)("p",{className:o.heroSubtitle,children:"Open source, end-to-end ML infrastructure stack built for scale, speed, and simplicity. 
Integrate, deploy, and manage robust ML workflows with full reliability and control."}),(0,c.jsxs)("div",{className:o.heroButtons,children:[(0,c.jsx)("a",{href:e,className:`${o.btn} ${o.btnPrimary}`,children:"Get Started"}),(0,c.jsx)("a",{href:"https://github.com/Meesho/BharatMLStack",className:`${o.btn} ${o.btnSecondary}`,target:"_blank",rel:"noopener noreferrer",children:"View on GitHub"})]}),(0,c.jsx)("div",{className:o.adoptionBadge,children:(0,c.jsx)("p",{children:"Adopted by data teams building at scale"})})]}),(0,c.jsx)("div",{className:o.heroImage,children:(0,c.jsx)("img",{src:"https://cdn.prod.website-files.com/698b10a6a7625eb340f3ae09/698b5bbdf7c6b6cda9addc72_ade02631-3927-487e-a128-c38ab39588a4.avif",alt:"ML Infrastructure",loading:"eager"})})]})}function g(){return(0,c.jsx)("section",{className:o.section,children:(0,c.jsxs)("div",{className:o.container,children:[(0,c.jsxs)("div",{className:o.sectionHeader,children:[(0,c.jsx)("p",{className:o.sectionSubtitle,children:"Why BharatMLStack"}),(0,c.jsx)("h2",{className:o.sectionTitle,children:"The Real Barriers to Scaling Machine Learning"}),(0,c.jsx)("p",{className:o.sectionDescription,children:"ML teams spend more time fighting infrastructure than building intelligence. 
BharatMLStack removes those barriers."})]}),(0,c.jsx)("div",{className:o.barriersGrid,children:l.map((e,t)=>(0,c.jsxs)("div",{className:o.barrierCard,children:[(0,c.jsx)("div",{className:o.barrierIcon,children:e.icon}),(0,c.jsx)("h3",{children:e.title}),(0,c.jsx)("ul",{className:o.barrierQuestions,children:e.questions.map((e,t)=>(0,c.jsx)("li",{children:e},t))}),(0,c.jsx)("p",{className:o.barrierAnswer,children:e.answer})]},t))})]})})}function b(){const e=(0,i.useRef)([]),t=(0,a.Ay)("/");return(0,i.useEffect)(()=>{const t=new IntersectionObserver(e=>{e.forEach(e=>{e.isIntersecting&&e.target.classList.add(o.componentCardVisible)})},{threshold:.1,rootMargin:"0px 0px -80px 0px"});return e.current.forEach(e=>{e&&t.observe(e)}),()=>t.disconnect()},[]),(0,c.jsx)("section",{className:o.section,id:"components",children:(0,c.jsxs)("div",{className:o.container,children:[(0,c.jsxs)("div",{className:o.sectionHeader,children:[(0,c.jsx)("p",{className:o.sectionSubtitle,children:"Platform Components"}),(0,c.jsx)("h2",{className:o.sectionTitle,children:"BharatMLStack Components"}),(0,c.jsx)("p",{className:o.sectionDescription,children:"Purpose-built components for every stage of the ML lifecycle, from feature serving to model deployment."})]}),(0,c.jsx)("div",{className:o.componentsGrid,children:d.map((s,i)=>(0,c.jsxs)("div",{className:o.componentCard,ref:t=>e.current[i]=t,children:[(0,c.jsx)("div",{className:o.componentIcon,children:s.icon}),(0,c.jsxs)("div",{className:o.componentContent,children:[(0,c.jsx)("h3",{children:s.title}),(0,c.jsx)("p",{children:s.description}),(0,c.jsx)("a",{href:`${t}${s.cta.replace(/^\//,"")}`,className:o.componentLink,children:"Learn more \u2192"})]})]},i))})]})})}function x(){return(0,c.jsx)("section",{className:`${o.section} ${o.statsSection}`,id:"stats",children:(0,c.jsxs)("div",{className:o.container,children:[(0,c.jsxs)("div",{className:o.sectionHeader,children:[(0,c.jsx)("p",{className:o.sectionSubtitle,children:"Proven at 
scale"}),(0,c.jsx)("h2",{className:o.sectionTitle,children:"Scaling Numbers"})]}),(0,c.jsx)("div",{className:o.statsGrid,children:h.map((e,t)=>(0,c.jsxs)("div",{className:o.statCard,children:[(0,c.jsx)("p",{className:o.statLabel,children:e.label}),(0,c.jsx)("div",{className:o.statValue,children:e.value}),(0,c.jsx)("p",{className:o.statDescription,children:e.description})]},t))})]})})}function v(){return(0,c.jsx)("section",{className:o.section,id:"demos",children:(0,c.jsxs)("div",{className:o.container,children:[(0,c.jsxs)("div",{className:o.sectionHeader,children:[(0,c.jsx)("p",{className:o.sectionSubtitle,children:"See it in action"}),(0,c.jsx)("h2",{className:o.sectionTitle,children:"Demo Videos"}),(0,c.jsx)("p",{className:o.sectionDescription,children:"Watch short demos of each BharatMLStack component in action."})]}),(0,c.jsx)("div",{className:o.videosGrid,children:m.map((e,t)=>(0,c.jsxs)("div",{className:o.videoCard,children:[(0,c.jsx)("div",{className:o.videoWrapper,children:(0,c.jsxs)("video",{className:o.videoPlayer,controls:!0,preload:"metadata",playsInline:!0,children:[(0,c.jsx)("source",{src:e.url,type:"video/mp4"}),"Your browser does not support the video tag."]})}),(0,c.jsxs)("div",{className:o.videoContent,children:[(0,c.jsx)("h3",{children:e.title}),(0,c.jsx)("p",{children:e.description})]})]},t))})]})})}function j(){const e=(0,a.Ay)("/");return(0,c.jsx)("section",{className:o.section,id:"blog",children:(0,c.jsxs)("div",{className:o.container,children:[(0,c.jsxs)("div",{className:o.sectionHeader,children:[(0,c.jsx)("p",{className:o.sectionSubtitle,children:"From our blog"}),(0,c.jsx)("h2",{className:o.sectionTitle,children:"View Our Blogs"}),(0,c.jsx)("p",{className:o.sectionDescription,children:"Technical articles, architecture deep-dives, and the story behind 
BharatMLStack."})]}),(0,c.jsx)("div",{className:o.blogGrid,children:u.map((t,s)=>(0,c.jsxs)("a",{href:`${e}${t.link.replace(/^\//,"")}`,className:o.blogCard,children:[(0,c.jsx)("div",{className:o.blogCardIcon,children:t.icon}),(0,c.jsxs)("div",{className:o.blogContent,children:[(0,c.jsx)("span",{className:o.blogCategory,children:t.category}),(0,c.jsx)("h3",{children:t.title}),(0,c.jsx)("div",{className:o.blogMeta,children:(0,c.jsx)("span",{children:"BharatMLStack Team"})})]})]},s))})]})})}function y(){const e=(0,a.Ay)("/category/online-feature-store");return(0,c.jsx)("section",{className:o.section,children:(0,c.jsx)("div",{className:o.container,children:(0,c.jsxs)("div",{className:o.ctaSection,children:[(0,c.jsx)("h2",{className:o.ctaTitle,children:"Deploy ML models with confidence"}),(0,c.jsx)("p",{className:o.ctaDescription,children:"Comprehensive stack for business-ready ML. Integrates seamlessly with enterprise systems. Robust security and regulatory compliance."}),(0,c.jsxs)("div",{className:o.ctaButtons,children:[(0,c.jsx)("a",{href:e,className:`${o.btn} ${o.btnWhite}`,children:"Start Now"}),(0,c.jsx)("a",{href:"https://github.com/Meesho/BharatMLStack",className:`${o.btn} ${o.btnOutlineWhite}`,target:"_blank",rel:"noopener noreferrer",children:"View on GitHub"})]})]})})})}function N(){const e=(0,a.Ay)("/"),t=(0,a.Ay)("/blog");return(0,c.jsxs)("footer",{className:o.customFooter,children:[(0,c.jsxs)("div",{className:o.footerContent,children:[(0,c.jsxs)("div",{className:o.footerSection,children:[(0,c.jsx)("h4",{children:"BharatMLStack"}),(0,c.jsx)("p",{children:"Enterprise-ready open-source ML infrastructure built for scale, speed, and simplicity."})]}),(0,c.jsxs)("div",{className:o.footerSection,children:[(0,c.jsx)("h4",{children:"Platform"}),(0,c.jsxs)("ul",{className:o.footerList,children:[(0,c.jsx)("li",{children:(0,c.jsx)("a",{href:(0,a.Ay)("/online-feature-store/v1.0.0"),children:"Online Feature 
Store"})}),(0,c.jsx)("li",{children:(0,c.jsx)("a",{href:(0,a.Ay)("/inferflow/v1.0.0"),children:"Inferflow"})}),(0,c.jsx)("li",{children:(0,c.jsx)("a",{href:(0,a.Ay)("/skye/v1.0.0"),children:"Skye"})}),(0,c.jsx)("li",{children:(0,c.jsx)("a",{href:(0,a.Ay)("/numerix/v1.0.0"),children:"Numerix"})}),(0,c.jsx)("li",{children:(0,c.jsx)("a",{href:(0,a.Ay)("/predator/v1.0.0"),children:"Predator"})})]})]}),(0,c.jsxs)("div",{className:o.footerSection,children:[(0,c.jsx)("h4",{children:"Resources"}),(0,c.jsxs)("ul",{className:o.footerList,children:[(0,c.jsx)("li",{children:(0,c.jsx)("a",{href:t,children:"Blog"})}),(0,c.jsx)("li",{children:(0,c.jsx)("a",{href:e,children:"Documentation"})}),(0,c.jsx)("li",{children:(0,c.jsx)("a",{href:"https://github.com/Meesho/BharatMLStack/discussions",children:"Forum"})})]})]}),(0,c.jsxs)("div",{className:o.footerSection,children:[(0,c.jsx)("h4",{children:"Community"}),(0,c.jsxs)("ul",{className:o.footerList,children:[(0,c.jsx)("li",{children:(0,c.jsx)("a",{href:"https://github.com/Meesho/BharatMLStack",children:"GitHub"})}),(0,c.jsx)("li",{children:(0,c.jsx)("a",{href:"https://discord.gg/XkT7XsV2AU",children:"Discord"})}),(0,c.jsx)("li",{children:(0,c.jsx)("a",{href:"https://github.com/Meesho/BharatMLStack/blob/main/CONTRIBUTING.md",children:"Contributing"})})]})]})]}),(0,c.jsxs)("div",{className:o.footerBottom,children:[(0,c.jsxs)("p",{children:["\xa9 ",(new Date).getFullYear()," Meesho Ltd. All rights reserved. 
Open Source under Apache 2.0 License."]}),(0,c.jsx)("div",{className:o.footerLinks,children:(0,c.jsx)("a",{href:"https://github.com/Meesho/BharatMLStack",children:"GitHub"})})]})]})}function S(){const{siteConfig:e}=(0,r.A)();return(0,i.useLayoutEffect)(()=>(document.documentElement.classList.add("homepage-active"),()=>{document.documentElement.classList.remove("homepage-active")}),[]),(0,c.jsxs)(n.A,{title:`${e.title} - Open Source ML Infrastructure`,description:"Open source, end-to-end ML infrastructure stack built for scale, speed, and simplicity.",children:[(0,c.jsx)("style",{children:"\n .navbar { display: none !important; }\n .footer { display: none !important; }\n [class*='docMainContainer'], [class*='mainWrapper'] { padding-top: 0 !important; }\n main { margin-top: 0 !important; }\n "}),(0,c.jsxs)("div",{className:o.homepageWrapper,children:[(0,c.jsx)(p,{}),(0,c.jsx)(f,{}),(0,c.jsx)(g,{}),(0,c.jsx)(b,{}),(0,c.jsx)(x,{}),(0,c.jsx)(v,{}),(0,c.jsx)(j,{}),(0,c.jsx)(y,{}),(0,c.jsx)(N,{})]})]})}}}]); \ No newline at end of file diff --git a/docs/assets/js/f9755c6e.8811662b.js b/docs/assets/js/f9755c6e.8811662b.js deleted file mode 100644 index d17be1e7..00000000 --- a/docs/assets/js/f9755c6e.8811662b.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[8315],{5969:e=>{e.exports=JSON.parse('{"permalink":"/BharatMLStack/blog/post-five","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-five/index.md","source":"@site/blog/bharatmlstack-history/post-five/index.md","title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at 
Scale","description":"BharatMLStack","date":"2025-06-02T00:00:00.000Z","tags":[{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"vllm","permalink":"/BharatMLStack/blog/tags/vllm"},{"inline":true,"label":"tensorrt-llm","permalink":"/BharatMLStack/blog/tags/tensorrt-llm"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":4.93,"hasTruncateMarker":false,"authors":[{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null}],"frontMatter":{"slug":"post-five","title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","authors":["jaya"],"date":"2025-6-2","tags":["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"nextItem":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/post-three"}}')},8319:(e,t,i)=>{i.r(t),i.d(t,{assets:()=>h,contentTitle:()=>d,default:()=>o,frontMatter:()=>r,metadata:()=>n,toc:()=>c});var n=i(5969),s=i(4848),l=i(8453);const r={slug:"post-five",title:"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale",authors:["jaya"],date:"2025-6-2",tags:["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},d=void 0,h={authorsImageUrls:[void 0]},c=[{value:"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale",id:"llm-inference-optimization-techniques-engineering-sub-second-latency-at-scale",level:2},{value:"1. 
Advanced Memory Management: Paged & Prefix KV Caching",id:"1-advanced-memory-management-paged--prefix-kv-caching",level:2},{value:"Paged KV caching",id:"paged-kv-caching",level:3},{value:"KV cache quantization",id:"kv-cache-quantization",level:3},{value:"Prefix caching (the "voice bot" optimizer)",id:"prefix-caching-the-voice-bot-optimizer",level:3},{value:"2. Aggressive Quantization (INT4 AWQ & FP8)",id:"2-aggressive-quantization-int4-awq--fp8",level:2},{value:"INT4 AWQ (Activation-aware Weight Quantization)",id:"int4-awq-activation-aware-weight-quantization",level:3},{value:"FP8 precision",id:"fp8-precision",level:3},{value:"3. Kernel Fusion & Custom Plugins",id:"3-kernel-fusion--custom-plugins",level:2},{value:"4. Inflight (Continuous) Batching",id:"4-inflight-continuous-batching",level:2},{value:"5. Parallelism Strategies: Scaling Beyond One GPU",id:"5-parallelism-strategies-scaling-beyond-one-gpu",level:2},{value:"6. Speculative Decoding",id:"6-speculative-decoding",level:2},{value:"Few Benchmarks",id:"few-benchmarks",level:2},{value:"Search query rewriting",id:"search-query-rewriting",level:3},{value:"Voice bot query",id:"voice-bot-query",level:3},{value:"Conclusion",id:"conclusion",level:2}];function a(e){const t={h2:"h2",h3:"h3",img:"img",li:"li",p:"p",strong:"strong",table:"table",tbody:"tbody",td:"td",th:"th",thead:"thead",tr:"tr",ul:"ul",...(0,l.R)(),...e.components};return(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)(t.p,{children:(0,s.jsx)(t.img,{alt:"BharatMLStack",src:i(9200).A+"",width:"1396",height:"460"})}),"\n",(0,s.jsx)(t.h2,{id:"llm-inference-optimization-techniques-engineering-sub-second-latency-at-scale",children:"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale"}),"\n",(0,s.jsx)(t.p,{children:"Raw execution of Large Language Models is inherently expensive and memory-intensive. 
To achieve sub-second latency and high throughput, we implement a multi-layered optimization strategy that targets the entire inference stack\u2014from memory management to kernel execution."}),"\n",(0,s.jsx)(t.h2,{id:"1-advanced-memory-management-paged--prefix-kv-caching",children:"1. Advanced Memory Management: Paged & Prefix KV Caching"}),"\n",(0,s.jsx)(t.p,{children:"The most significant bottleneck in LLM inference is not always compute, but memory bandwidth\u2014specifically managing the Key-Value (KV) cache."}),"\n",(0,s.jsx)(t.h3,{id:"paged-kv-caching",children:"Paged KV caching"}),"\n",(0,s.jsxs)(t.p,{children:["Standard caching suffers from fragmentation. We use ",(0,s.jsx)(t.strong,{children:"Paged KV caching"}),", which operates similarly to an operating system's virtual memory: the KV cache is divided into non-contiguous blocks. This lets us serve larger batch sizes without running out of memory."]}),"\n",(0,s.jsx)(t.h3,{id:"kv-cache-quantization",children:"KV cache quantization"}),"\n",(0,s.jsxs)(t.p,{children:["To further maximize available memory, we implement ",(0,s.jsx)(t.strong,{children:"KV cache quantization"})," (e.g., FP8). By compressing stored attention keys and values from 16-bit to 8-bit, we nearly double the effective context window capacity of the GPU, allowing longer conversations or larger batches without materially degrading quality."]}),"\n",(0,s.jsx)(t.h3,{id:"prefix-caching-the-voice-bot-optimizer",children:'Prefix caching (the "voice bot" optimizer)'}),"\n",(0,s.jsxs)(t.p,{children:['For use cases like GenAI voice bots where the system prompt (e.g., "You are a helpful assistant...") is static across thousands of requests, we enable ',(0,s.jsx)(t.strong,{children:"prefix caching"}),"."]}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Impact"}),": By reusing pre-computed KV states for common prefixes, we achieve a cache hit rate of ~90%. 
This reduces ",(0,s.jsx)(t.strong,{children:"Time To First Token (TTFT)"})," by skipping redundant computation of the system prompt."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"2-aggressive-quantization-int4-awq--fp8",children:"2. Aggressive Quantization (INT4 AWQ & FP8)"}),"\n",(0,s.jsx)(t.p,{children:"Running models in their native 16-bit precision (BF16) restricts maximum batch size and throughput. We use quantization to shrink model weights without sacrificing accuracy."}),"\n",(0,s.jsx)(t.h3,{id:"int4-awq-activation-aware-weight-quantization",children:"INT4 AWQ (Activation-aware Weight Quantization)"}),"\n",(0,s.jsxs)(t.p,{children:["For the Llama 3 family, we use ",(0,s.jsx)(t.strong,{children:"AWQ"})," to compress weights to 4 bits. This reduces model size by ~75%, allowing larger models to fit into L4 GPU memory and significantly improving token generation speed."]}),"\n",(0,s.jsx)(t.h3,{id:"fp8-precision",children:"FP8 precision"}),"\n",(0,s.jsxs)(t.p,{children:["For NVIDIA Hopper (H100) architectures, we are exploring ",(0,s.jsx)(t.strong,{children:"FP8 quantization"}),", leveraging native FP8 tensor cores to accelerate matrix multiplications while maintaining a higher dynamic range than integer quantization."]}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Verification"}),": We validate quantized models by comparing dot-product similarity of embeddings against the FP16 baseline, consistently achieving ",(0,s.jsx)(t.strong,{children:">99% similarity"}),"."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"3-kernel-fusion--custom-plugins",children:"3. 
Kernel Fusion & Custom Plugins"}),"\n",(0,s.jsx)(t.p,{children:"To minimize overhead from launching thousands of small GPU operations, we fuse them into monolithic kernels using NVIDIA TensorRT plugins."}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Flash attention & FMHA"}),": We enable ",(0,s.jsx)(t.strong,{children:"Fused Multi-Head Attention (FMHA)"})," combined with flash attention to reduce memory reads/writes."]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"GEMM plugins"}),": We use specialized ",(0,s.jsx)(t.strong,{children:"GEMM"})," plugins to accelerate transformer linear layers."]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Removing input padding"}),": Instead of padding short sequences to match the longest, we remove input padding so the GPU processes only valid tokens."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"4-inflight-continuous-batching",children:"4. Inflight (Continuous) Batching"}),"\n",(0,s.jsx)(t.p,{children:"Traditional static batching waits for all requests in a batch to finish before returning results\u2014so one long response delays everyone else."}),"\n",(0,s.jsxs)(t.p,{children:["We implement ",(0,s.jsx)(t.strong,{children:"inflight batching"}),": as soon as one request completes, its slot is freed and filled by a new request from the queue. This keeps GPUs saturated and decouples latency of short queries from long ones."]}),"\n",(0,s.jsx)(t.h2,{id:"5-parallelism-strategies-scaling-beyond-one-gpu",children:"5. Parallelism Strategies: Scaling Beyond One GPU"}),"\n",(0,s.jsx)(t.p,{children:"For large models (e.g., 70B+ parameters) that cannot fit into the VRAM of a single GPU, we use parallelism strategies."}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Tensor parallelism (TP)"}),": Split weight matrices across multiple GPUs (e.g., 4\xd7 L4 or 8\xd7 A100). 
Each GPU computes a shard and outputs are reduced at every layer."]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Pipeline parallelism (PP)"}),": Split model layers across GPUs to pipeline compute (e.g., while one GPU computes later layers for Request A, another starts early layers for Request B)."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"6-speculative-decoding",children:"6. Speculative Decoding"}),"\n",(0,s.jsxs)(t.p,{children:["To reduce inter-token latency (ITL), we explore ",(0,s.jsx)(t.strong,{children:"speculative decoding"}),"."]}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Mechanism"}),': A smaller, faster "draft" model speculatively generates a short token sequence (e.g., 5 tokens).']}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Verification"}),": The larger target model verifies those tokens in one parallel forward pass. If correct, we effectively generate multiple tokens per large-model step; if not, we discard and regenerate. 
This is effective for predictable text, improving perceived generation speed."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"few-benchmarks",children:"Few Benchmarks"}),"\n",(0,s.jsx)(t.p,{children:"Below are a couple of representative use cases and performance numbers."}),"\n",(0,s.jsx)(t.h3,{id:"search-query-rewriting",children:"Search query rewriting"}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"LLM"}),": Fine-tuned llama-3.2-1B"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Input & output token length"}),": ~10\u201320"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Response type"}),": Non-streaming"]}),"\n"]}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{children:"Inference runtime"}),(0,s.jsx)(t.th,{children:"Hardware"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Max requests/sec"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Max p99 latency"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{children:"4 \xd7 L4 GPUs (multi-GPU)"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1000"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"95 ms"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{children:"1 \xd7 A100 40 GB GPU"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1000"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"69 ms"})]})]})]}),"\n",(0,s.jsx)(t.h3,{id:"voice-bot-query",children:"Voice bot query"}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"LLM"}),": Llama-3.1-8B"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Input token length"}),": ~1900\u20132000"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Output token length"}),": 
~200"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Response type"}),": Streaming"]}),"\n"]}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{children:"Inference runtime"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Concurrency"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"p99 TTFT (ms)"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"p99 ITL (ms)"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Token throughput (tokens/sec)"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Request throughput (req/sec)"}),(0,s.jsx)(t.th,{children:"Hardware"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"36.27"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"22.78"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"45.66"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.23"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"49.81"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"23.21"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"89.37"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.45"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"55.33"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"36.62"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"153.39"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.78"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:
{textAlign:"right"},children:"8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"66.5"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"39.11"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"279.88"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1.47"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"16"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"131.8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"30.39"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"547.8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2.77"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"32"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"277.22"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"48.02"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"925.7"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4.78"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"64"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"498.52"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"71.62"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,164.40"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"6.2"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"128"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"677.31"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"120.37"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,445.18"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"7.69"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"Tens
orRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"256"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,926.31"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"216.88"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,600.81"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"8.52"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"21.17"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"9.24"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"130.05"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.68"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"25.78"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"9.21"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"264.5"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1.35"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"28.52"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"10.99"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"437.69"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2.27"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"34.4"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"12.61"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"760.49"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"3.96"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{chi
ldren:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"16"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"68.03"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"14.32"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,343.80"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"7.01"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"32"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"185.96"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"16.82"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2,287.30"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"11.92"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"64"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"136.87"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"21.17"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"3,625.22"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"18.89"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"128"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"463.78"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"34.15"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4,456.51"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"23.24"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"256"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"890.12"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"59.18"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"5,188.24"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"27.05"
}),(0,s.jsx)(t.td,{children:"A100"})]})]})]}),"\n",(0,s.jsx)(t.h2,{id:"conclusion",children:"Conclusion"}),"\n",(0,s.jsx)(t.p,{children:"High-performance LLM inference is fundamentally a systems engineering problem: memory efficiency, kernel execution, batching strategy, and parallelism determine real-world latency and throughput. Techniques such as paged KV caching, aggressive quantization, kernel fusion, and inflight batching improve GPU utilization while reducing latency and memory pressure."}),"\n",(0,s.jsx)(t.p,{children:"These optimizations enable the platform to deliver sub-second responses, sustain high concurrency, and efficiently serve both lightweight and long-context workloads. By continuously optimizing across the full inference stack, we keep LLM serving scalable, cost-efficient, and production-ready for real-time AI applications."})]})}function o(e={}){const{wrapper:t}={...(0,l.R)(),...e.components};return t?(0,s.jsx)(t,{...e,children:(0,s.jsx)(a,{...e})}):a(e)}},8453:(e,t,i)=>{i.d(t,{R:()=>r,x:()=>d});var n=i(6540);const s={},l=n.createContext(s);function r(e){const t=n.useContext(l);return n.useMemo(function(){return"function"==typeof e?e(t):{...t,...e}},[t,e])}function d(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(s):e.components||s:r(e.components),n.createElement(l.Provider,{value:t},e.children)}},9200:(e,t,i)=>{i.d(t,{A:()=>n});const n=i.p+"assets/images/bms-7399e8796d2cd24617c432518ce3f312.png"}}]); \ No newline at end of file diff --git a/docs/assets/js/3aeb33c7.b4a8c40f.js b/docs/assets/js/f9755c6e.ac48cb60.js similarity index 92% rename from docs/assets/js/3aeb33c7.b4a8c40f.js rename to docs/assets/js/f9755c6e.ac48cb60.js index 854f0ff5..2f9d6cff 100644 --- a/docs/assets/js/3aeb33c7.b4a8c40f.js +++ b/docs/assets/js/f9755c6e.ac48cb60.js @@ -1 +1 @@ -"use 
strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[974],{5969:e=>{e.exports=JSON.parse('{"permalink":"/BharatMLStack/blog/post-five","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-five/index.md","source":"@site/blog/bharatmlstack-history/post-five/index.md","title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","description":"BharatMLStack","date":"2025-06-02T00:00:00.000Z","tags":[{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"vllm","permalink":"/BharatMLStack/blog/tags/vllm"},{"inline":true,"label":"tensorrt-llm","permalink":"/BharatMLStack/blog/tags/tensorrt-llm"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":4.93,"hasTruncateMarker":false,"authors":[{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null}],"frontMatter":{"slug":"post-five","title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","authors":["jaya"],"date":"2025-6-2","tags":["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"nextItem":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/post-three"}}')},7309:(e,t,i)=>{i.r(t),i.d(t,{assets:()=>h,contentTitle:()=>d,default:()=>o,frontMatter:()=>r,metadata:()=>n,toc:()=>c});var n=i(5969),s=i(4848),l=i(8453);const r={slug:"post-five",title:"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at 
Scale",authors:["jaya"],date:"2025-6-2",tags:["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},d=void 0,h={authorsImageUrls:[void 0]},c=[{value:"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale",id:"llm-inference-optimization-techniques-engineering-sub-second-latency-at-scale",level:2},{value:"1. Advanced Memory Management: Paged & Prefix KV Caching",id:"1-advanced-memory-management-paged--prefix-kv-caching",level:2},{value:"Paged KV caching",id:"paged-kv-caching",level:3},{value:"KV cache quantization",id:"kv-cache-quantization",level:3},{value:"Prefix caching (the "voice bot" optimizer)",id:"prefix-caching-the-voice-bot-optimizer",level:3},{value:"2. Aggressive Quantization (INT4 AWQ & FP8)",id:"2-aggressive-quantization-int4-awq--fp8",level:2},{value:"INT4 AWQ (Activation-aware Weight Quantization)",id:"int4-awq-activation-aware-weight-quantization",level:3},{value:"FP8 precision",id:"fp8-precision",level:3},{value:"3. Kernel Fusion & Custom Plugins",id:"3-kernel-fusion--custom-plugins",level:2},{value:"4. Inflight (Continuous) Batching",id:"4-inflight-continuous-batching",level:2},{value:"5. Parallelism Strategies: Scaling Beyond One GPU",id:"5-parallelism-strategies-scaling-beyond-one-gpu",level:2},{value:"6. 
Speculative Decoding",id:"6-speculative-decoding",level:2},{value:"Few Benchmarks",id:"few-benchmarks",level:2},{value:"Search query rewriting",id:"search-query-rewriting",level:3},{value:"Voice bot query",id:"voice-bot-query",level:3},{value:"Conclusion",id:"conclusion",level:2}];function a(e){const t={h2:"h2",h3:"h3",img:"img",li:"li",p:"p",strong:"strong",table:"table",tbody:"tbody",td:"td",th:"th",thead:"thead",tr:"tr",ul:"ul",...(0,l.R)(),...e.components};return(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)(t.p,{children:(0,s.jsx)(t.img,{alt:"BharatMLStack",src:i(9200).A+"",width:"1396",height:"460"})}),"\n",(0,s.jsx)(t.h2,{id:"llm-inference-optimization-techniques-engineering-sub-second-latency-at-scale",children:"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale"}),"\n",(0,s.jsx)(t.p,{children:"Raw execution of Large Language Models is inherently expensive and memory-intensive. To achieve sub-second latency and high throughput, we implement a multi-layered optimization strategy that targets the entire inference stack\u2014from memory management to kernel execution."}),"\n",(0,s.jsx)(t.h2,{id:"1-advanced-memory-management-paged--prefix-kv-caching",children:"1. Advanced Memory Management: Paged & Prefix KV Caching"}),"\n",(0,s.jsx)(t.p,{children:"The most significant bottleneck in LLM inference is not always compute, but memory bandwidth\u2014specifically managing the Key-Value (KV) cache."}),"\n",(0,s.jsx)(t.h3,{id:"paged-kv-caching",children:"Paged KV caching"}),"\n",(0,s.jsxs)(t.p,{children:["Standard caching suffers from fragmentation. We use ",(0,s.jsx)(t.strong,{children:"Paged KV caching"}),", which operates similarly to an operating system's virtual memory: the KV cache is divided into non-contiguous blocks. 
This lets us serve larger batch sizes without running out of memory."]}),"\n",(0,s.jsx)(t.h3,{id:"kv-cache-quantization",children:"KV cache quantization"}),"\n",(0,s.jsxs)(t.p,{children:["To further maximize available memory, we implement ",(0,s.jsx)(t.strong,{children:"KV cache quantization"})," (e.g., FP8). By compressing stored attention keys and values from 16-bit to 8-bit, we nearly double the effective context window capacity of the GPU, allowing longer conversations or larger batches without materially degrading quality."]}),"\n",(0,s.jsx)(t.h3,{id:"prefix-caching-the-voice-bot-optimizer",children:'Prefix caching (the "voice bot" optimizer)'}),"\n",(0,s.jsxs)(t.p,{children:['For use cases like GenAI voice bots where the system prompt (e.g., "You are a helpful assistant...") is static across thousands of requests, we enable ',(0,s.jsx)(t.strong,{children:"prefix caching"}),"."]}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Impact"}),": By reusing pre-computed KV states for common prefixes, we achieve a cache hit rate of ~90%. This reduces ",(0,s.jsx)(t.strong,{children:"Time To First Token (TTFT)"})," by skipping redundant computation of the system prompt."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"2-aggressive-quantization-int4-awq--fp8",children:"2. Aggressive Quantization (INT4 AWQ & FP8)"}),"\n",(0,s.jsx)(t.p,{children:"Running models in their native 16-bit precision (BF16) restricts maximum batch size and throughput. We use quantization to shrink model weights without sacrificing accuracy."}),"\n",(0,s.jsx)(t.h3,{id:"int4-awq-activation-aware-weight-quantization",children:"INT4 AWQ (Activation-aware Weight Quantization)"}),"\n",(0,s.jsxs)(t.p,{children:["For the Llama 3 family, we use ",(0,s.jsx)(t.strong,{children:"AWQ"})," to compress weights to 4 bits. 
This reduces model size by ~75%, allowing larger models to fit into L4 GPU memory and significantly improving token generation speed."]}),"\n",(0,s.jsx)(t.h3,{id:"fp8-precision",children:"FP8 precision"}),"\n",(0,s.jsxs)(t.p,{children:["For NVIDIA Hopper (H100) architectures, we are exploring ",(0,s.jsx)(t.strong,{children:"FP8 quantization"}),", leveraging native FP8 tensor cores to accelerate matrix multiplications while maintaining a higher dynamic range than integer quantization."]}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Verification"}),": We validate quantized models by comparing dot-product similarity of embeddings against the FP16 baseline, consistently achieving ",(0,s.jsx)(t.strong,{children:">99% similarity"}),"."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"3-kernel-fusion--custom-plugins",children:"3. Kernel Fusion & Custom Plugins"}),"\n",(0,s.jsx)(t.p,{children:"To minimize overhead from launching thousands of small GPU operations, we fuse them into monolithic kernels using NVIDIA TensorRT plugins."}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Flash attention & FMHA"}),": We enable ",(0,s.jsx)(t.strong,{children:"Fused Multi-Head Attention (FMHA)"})," combined with flash attention to reduce memory reads/writes."]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"GEMM plugins"}),": We use specialized ",(0,s.jsx)(t.strong,{children:"GEMM"})," plugins to accelerate transformer linear layers."]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Removing input padding"}),": Instead of padding short sequences to match the longest, we remove input padding so the GPU processes only valid tokens."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"4-inflight-continuous-batching",children:"4. 
Inflight (Continuous) Batching"}),"\n",(0,s.jsx)(t.p,{children:"Traditional static batching waits for all requests in a batch to finish before returning results\u2014so one long response delays everyone else."}),"\n",(0,s.jsxs)(t.p,{children:["We implement ",(0,s.jsx)(t.strong,{children:"inflight batching"}),": as soon as one request completes, its slot is freed and filled by a new request from the queue. This keeps GPUs saturated and decouples latency of short queries from long ones."]}),"\n",(0,s.jsx)(t.h2,{id:"5-parallelism-strategies-scaling-beyond-one-gpu",children:"5. Parallelism Strategies: Scaling Beyond One GPU"}),"\n",(0,s.jsx)(t.p,{children:"For large models (e.g., 70B+ parameters) that cannot fit into the VRAM of a single GPU, we use parallelism strategies."}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Tensor parallelism (TP)"}),": Split weight matrices across multiple GPUs (e.g., 4\xd7 L4 or 8\xd7 A100). Each GPU computes a shard and outputs are reduced at every layer."]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Pipeline parallelism (PP)"}),": Split model layers across GPUs to pipeline compute (e.g., while one GPU computes later layers for Request A, another starts early layers for Request B)."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"6-speculative-decoding",children:"6. Speculative Decoding"}),"\n",(0,s.jsxs)(t.p,{children:["To reduce inter-token latency (ITL), we explore ",(0,s.jsx)(t.strong,{children:"speculative decoding"}),"."]}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Mechanism"}),': A smaller, faster "draft" model speculatively generates a short token sequence (e.g., 5 tokens).']}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Verification"}),": The larger target model verifies those tokens in one parallel forward pass. 
If correct, we effectively generate multiple tokens per large-model step; if not, we discard and regenerate. This is effective for predictable text, improving perceived generation speed."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"few-benchmarks",children:"Few Benchmarks"}),"\n",(0,s.jsx)(t.p,{children:"Below are a couple of representative use cases and performance numbers."}),"\n",(0,s.jsx)(t.h3,{id:"search-query-rewriting",children:"Search query rewriting"}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"LLM"}),": Fine-tuned llama-3.2-1B"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Input & output token length"}),": ~10\u201320"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Response type"}),": Non-streaming"]}),"\n"]}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{children:"Inference runtime"}),(0,s.jsx)(t.th,{children:"Hardware"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Max requests/sec"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Max p99 latency"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{children:"4 \xd7 L4 GPUs (multi-GPU)"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1000"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"95 ms"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{children:"1 \xd7 A100 40 GB GPU"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1000"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"69 ms"})]})]})]}),"\n",(0,s.jsx)(t.h3,{id:"voice-bot-query",children:"Voice bot query"}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"LLM"}),": Llama-3.1-8B"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Input token length"}),": 
~1900\u20132000"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Output token length"}),": ~200"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Response type"}),": Streaming"]}),"\n"]}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{children:"Inference runtime"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Concurrency"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"p99 TTFT (ms)"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"p99 ITL (ms)"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Token throughput (tokens/sec)"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Request throughput (req/sec)"}),(0,s.jsx)(t.th,{children:"Hardware"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"36.27"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"22.78"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"45.66"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.23"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"49.81"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"23.21"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"89.37"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.45"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"55.33"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"36.62"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"153.39"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.78"}),(0,s.jsx)(t.td,{ch
ildren:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"66.5"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"39.11"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"279.88"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1.47"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"16"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"131.8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"30.39"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"547.8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2.77"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"32"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"277.22"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"48.02"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"925.7"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4.78"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"64"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"498.52"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"71.62"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,164.40"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"6.2"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"128"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"677.31"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"120.37"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,445.18"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},ch
ildren:"7.69"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"256"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,926.31"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"216.88"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,600.81"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"8.52"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"21.17"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"9.24"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"130.05"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.68"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"25.78"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"9.21"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"264.5"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1.35"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"28.52"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"10.99"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"437.69"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2.27"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"34.4"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"12.61"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"760.49"}),(0,s.jsx)
(t.td,{style:{textAlign:"right"},children:"3.96"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"16"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"68.03"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"14.32"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,343.80"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"7.01"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"32"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"185.96"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"16.82"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2,287.30"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"11.92"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"64"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"136.87"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"21.17"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"3,625.22"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"18.89"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"128"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"463.78"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"34.15"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4,456.51"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"23.24"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"256"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"890.12"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"59.18"}),(0,s.jsx)(t.td
,{style:{textAlign:"right"},children:"5,188.24"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"27.05"}),(0,s.jsx)(t.td,{children:"A100"})]})]})]}),"\n",(0,s.jsx)(t.h2,{id:"conclusion",children:"Conclusion"}),"\n",(0,s.jsx)(t.p,{children:"High-performance LLM inference is fundamentally a systems engineering problem: memory efficiency, kernel execution, batching strategy, and parallelism determine real-world latency and throughput. Techniques such as paged KV caching, aggressive quantization, kernel fusion, and inflight batching improve GPU utilization while reducing latency and memory pressure."}),"\n",(0,s.jsx)(t.p,{children:"These optimizations enable the platform to deliver sub-second responses, sustain high concurrency, and efficiently serve both lightweight and long-context workloads. By continuously optimizing across the full inference stack, we keep LLM serving scalable, cost-efficient, and production-ready for real-time AI applications."})]})}function o(e={}){const{wrapper:t}={...(0,l.R)(),...e.components};return t?(0,s.jsx)(t,{...e,children:(0,s.jsx)(a,{...e})}):a(e)}},8453:(e,t,i)=>{i.d(t,{R:()=>r,x:()=>d});var n=i(6540);const s={},l=n.createContext(s);function r(e){const t=n.useContext(l);return n.useMemo(function(){return"function"==typeof e?e(t):{...t,...e}},[t,e])}function d(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(s):e.components||s:r(e.components),n.createElement(l.Provider,{value:t},e.children)}},9200:(e,t,i)=>{i.d(t,{A:()=>n});const n=i.p+"assets/images/bms-7399e8796d2cd24617c432518ce3f312.png"}}]); \ No newline at end of file +"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[8315],{5969:e=>{e.exports=JSON.parse('{"permalink":"/BharatMLStack/blog/post-five","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-five/index.md","source":"@site/blog/bharatmlstack-history/post-five/index.md","title":"LLM Inference Optimization Techniques: 
Engineering Sub-Second Latency at Scale","description":"BharatMLStack","date":"2025-06-02T00:00:00.000Z","tags":[{"inline":true,"label":"llm","permalink":"/BharatMLStack/blog/tags/llm"},{"inline":true,"label":"vllm","permalink":"/BharatMLStack/blog/tags/vllm"},{"inline":true,"label":"tensorrt-llm","permalink":"/BharatMLStack/blog/tags/tensorrt-llm"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":4.93,"hasTruncateMarker":false,"authors":[{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null}],"frontMatter":{"slug":"post-five","title":"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale","authors":["jaya"],"date":"2025-6-2","tags":["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"nextItem":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/post-four"}}')},8319:(e,t,i)=>{i.r(t),i.d(t,{assets:()=>h,contentTitle:()=>d,default:()=>o,frontMatter:()=>r,metadata:()=>n,toc:()=>c});var n=i(5969),s=i(4848),l=i(8453);const r={slug:"post-five",title:"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale",authors:["jaya"],date:"2025-6-2",tags:["llm","vllm","tensorrt-llm","mlplatform","meesho","bharatmlstack"]},d=void 0,h={authorsImageUrls:[void 0]},c=[{value:"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale",id:"llm-inference-optimization-techniques-engineering-sub-second-latency-at-scale",level:2},{value:"1. 
Advanced Memory Management: Paged & Prefix KV Caching",id:"1-advanced-memory-management-paged--prefix-kv-caching",level:2},{value:"Paged KV caching",id:"paged-kv-caching",level:3},{value:"KV cache quantization",id:"kv-cache-quantization",level:3},{value:"Prefix caching (the "voice bot" optimizer)",id:"prefix-caching-the-voice-bot-optimizer",level:3},{value:"2. Aggressive Quantization (INT4 AWQ & FP8)",id:"2-aggressive-quantization-int4-awq--fp8",level:2},{value:"INT4 AWQ (Activation-aware Weight Quantization)",id:"int4-awq-activation-aware-weight-quantization",level:3},{value:"FP8 precision",id:"fp8-precision",level:3},{value:"3. Kernel Fusion & Custom Plugins",id:"3-kernel-fusion--custom-plugins",level:2},{value:"4. Inflight (Continuous) Batching",id:"4-inflight-continuous-batching",level:2},{value:"5. Parallelism Strategies: Scaling Beyond One GPU",id:"5-parallelism-strategies-scaling-beyond-one-gpu",level:2},{value:"6. Speculative Decoding",id:"6-speculative-decoding",level:2},{value:"Few Benchmarks",id:"few-benchmarks",level:2},{value:"Search query rewriting",id:"search-query-rewriting",level:3},{value:"Voice bot query",id:"voice-bot-query",level:3},{value:"Conclusion",id:"conclusion",level:2}];function a(e){const t={h2:"h2",h3:"h3",img:"img",li:"li",p:"p",strong:"strong",table:"table",tbody:"tbody",td:"td",th:"th",thead:"thead",tr:"tr",ul:"ul",...(0,l.R)(),...e.components};return(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)(t.p,{children:(0,s.jsx)(t.img,{alt:"BharatMLStack",src:i(9200).A+"",width:"1396",height:"460"})}),"\n",(0,s.jsx)(t.h2,{id:"llm-inference-optimization-techniques-engineering-sub-second-latency-at-scale",children:"LLM Inference Optimization Techniques: Engineering Sub-Second Latency at Scale"}),"\n",(0,s.jsx)(t.p,{children:"Raw execution of Large Language Models is inherently expensive and memory-intensive. 
To achieve sub-second latency and high throughput, we implement a multi-layered optimization strategy that targets the entire inference stack\u2014from memory management to kernel execution."}),"\n",(0,s.jsx)(t.h2,{id:"1-advanced-memory-management-paged--prefix-kv-caching",children:"1. Advanced Memory Management: Paged & Prefix KV Caching"}),"\n",(0,s.jsx)(t.p,{children:"The most significant bottleneck in LLM inference is not always compute, but memory bandwidth\u2014specifically managing the Key-Value (KV) cache."}),"\n",(0,s.jsx)(t.h3,{id:"paged-kv-caching",children:"Paged KV caching"}),"\n",(0,s.jsxs)(t.p,{children:["Standard caching suffers from fragmentation. We use ",(0,s.jsx)(t.strong,{children:"Paged KV caching"}),", which operates similarly to an operating system's virtual memory: the KV cache is divided into non-contiguous blocks. This lets us serve larger batch sizes without running out of memory."]}),"\n",(0,s.jsx)(t.h3,{id:"kv-cache-quantization",children:"KV cache quantization"}),"\n",(0,s.jsxs)(t.p,{children:["To further maximize available memory, we implement ",(0,s.jsx)(t.strong,{children:"KV cache quantization"})," (e.g., FP8). By compressing stored attention keys and values from 16-bit to 8-bit, we nearly double the effective context window capacity of the GPU, allowing longer conversations or larger batches without materially degrading quality."]}),"\n",(0,s.jsx)(t.h3,{id:"prefix-caching-the-voice-bot-optimizer",children:'Prefix caching (the "voice bot" optimizer)'}),"\n",(0,s.jsxs)(t.p,{children:['For use cases like GenAI voice bots where the system prompt (e.g., "You are a helpful assistant...") is static across thousands of requests, we enable ',(0,s.jsx)(t.strong,{children:"prefix caching"}),"."]}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Impact"}),": By reusing pre-computed KV states for common prefixes, we achieve a cache hit rate of ~90%. 
This reduces ",(0,s.jsx)(t.strong,{children:"Time To First Token (TTFT)"})," by skipping redundant computation of the system prompt."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"2-aggressive-quantization-int4-awq--fp8",children:"2. Aggressive Quantization (INT4 AWQ & FP8)"}),"\n",(0,s.jsx)(t.p,{children:"Running models in their native 16-bit precision (BF16) restricts maximum batch size and throughput. We use quantization to shrink model weights without sacrificing accuracy."}),"\n",(0,s.jsx)(t.h3,{id:"int4-awq-activation-aware-weight-quantization",children:"INT4 AWQ (Activation-aware Weight Quantization)"}),"\n",(0,s.jsxs)(t.p,{children:["For the Llama 3 family, we use ",(0,s.jsx)(t.strong,{children:"AWQ"})," to compress weights to 4 bits. This reduces model size by ~75%, allowing larger models to fit into L4 GPU memory and significantly improving token generation speed."]}),"\n",(0,s.jsx)(t.h3,{id:"fp8-precision",children:"FP8 precision"}),"\n",(0,s.jsxs)(t.p,{children:["For NVIDIA Hopper (H100) architectures, we are exploring ",(0,s.jsx)(t.strong,{children:"FP8 quantization"}),", leveraging native FP8 tensor cores to accelerate matrix multiplications while maintaining a higher dynamic range than integer quantization."]}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Verification"}),": We validate quantized models by comparing dot-product similarity of embeddings against the FP16 baseline, consistently achieving ",(0,s.jsx)(t.strong,{children:">99% similarity"}),"."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"3-kernel-fusion--custom-plugins",children:"3. 
Kernel Fusion & Custom Plugins"}),"\n",(0,s.jsx)(t.p,{children:"To minimize overhead from launching thousands of small GPU operations, we fuse them into monolithic kernels using NVIDIA TensorRT plugins."}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Flash attention & FMHA"}),": We enable ",(0,s.jsx)(t.strong,{children:"Fused Multi-Head Attention (FMHA)"})," combined with flash attention to reduce memory reads/writes."]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"GEMM plugins"}),": We use specialized ",(0,s.jsx)(t.strong,{children:"GEMM"})," plugins to accelerate transformer linear layers."]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Removing input padding"}),": Instead of padding short sequences to match the longest, we remove input padding so the GPU processes only valid tokens."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"4-inflight-continuous-batching",children:"4. Inflight (Continuous) Batching"}),"\n",(0,s.jsx)(t.p,{children:"Traditional static batching waits for all requests in a batch to finish before returning results\u2014so one long response delays everyone else."}),"\n",(0,s.jsxs)(t.p,{children:["We implement ",(0,s.jsx)(t.strong,{children:"inflight batching"}),": as soon as one request completes, its slot is freed and filled by a new request from the queue. This keeps GPUs saturated and decouples latency of short queries from long ones."]}),"\n",(0,s.jsx)(t.h2,{id:"5-parallelism-strategies-scaling-beyond-one-gpu",children:"5. Parallelism Strategies: Scaling Beyond One GPU"}),"\n",(0,s.jsx)(t.p,{children:"For large models (e.g., 70B+ parameters) that cannot fit into the VRAM of a single GPU, we use parallelism strategies."}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Tensor parallelism (TP)"}),": Split weight matrices across multiple GPUs (e.g., 4\xd7 L4 or 8\xd7 A100). 
Each GPU computes a shard and outputs are reduced at every layer."]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Pipeline parallelism (PP)"}),": Split model layers across GPUs to pipeline compute (e.g., while one GPU computes later layers for Request A, another starts early layers for Request B)."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"6-speculative-decoding",children:"6. Speculative Decoding"}),"\n",(0,s.jsxs)(t.p,{children:["To reduce inter-token latency (ITL), we explore ",(0,s.jsx)(t.strong,{children:"speculative decoding"}),"."]}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Mechanism"}),': A smaller, faster "draft" model speculatively generates a short token sequence (e.g., 5 tokens).']}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Verification"}),": The larger target model verifies those tokens in one parallel forward pass. If correct, we effectively generate multiple tokens per large-model step; if not, we discard and regenerate. 
This is effective for predictable text, improving perceived generation speed."]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"few-benchmarks",children:"Few Benchmarks"}),"\n",(0,s.jsx)(t.p,{children:"Below are a couple of representative use cases and performance numbers."}),"\n",(0,s.jsx)(t.h3,{id:"search-query-rewriting",children:"Search query rewriting"}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"LLM"}),": Fine-tuned llama-3.2-1B"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Input & output token length"}),": ~10\u201320"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Response type"}),": Non-streaming"]}),"\n"]}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{children:"Inference runtime"}),(0,s.jsx)(t.th,{children:"Hardware"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Max requests/sec"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Max p99 latency"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{children:"4 \xd7 L4 GPUs (multi-GPU)"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1000"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"95 ms"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{children:"1 \xd7 A100 40 GB GPU"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1000"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"69 ms"})]})]})]}),"\n",(0,s.jsx)(t.h3,{id:"voice-bot-query",children:"Voice bot query"}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"LLM"}),": Llama-3.1-8B"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Input token length"}),": ~1900\u20132000"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Output token length"}),": 
~200"]}),"\n",(0,s.jsxs)(t.li,{children:[(0,s.jsx)(t.strong,{children:"Response type"}),": Streaming"]}),"\n"]}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{children:"Inference runtime"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Concurrency"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"p99 TTFT (ms)"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"p99 ITL (ms)"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Token throughput (tokens/sec)"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Request throughput (req/sec)"}),(0,s.jsx)(t.th,{children:"Hardware"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"36.27"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"22.78"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"45.66"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.23"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"49.81"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"23.21"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"89.37"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.45"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"55.33"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"36.62"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"153.39"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.78"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:
{textAlign:"right"},children:"8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"66.5"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"39.11"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"279.88"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1.47"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"16"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"131.8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"30.39"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"547.8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2.77"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"32"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"277.22"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"48.02"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"925.7"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4.78"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"64"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"498.52"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"71.62"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,164.40"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"6.2"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"128"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"677.31"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"120.37"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,445.18"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"7.69"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"Tens
orRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"256"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,926.31"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"216.88"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,600.81"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"8.52"}),(0,s.jsx)(t.td,{children:"L4"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"21.17"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"9.24"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"130.05"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"0.68"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"25.78"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"9.21"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"264.5"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1.35"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"28.52"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"10.99"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"437.69"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2.27"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"8"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"34.4"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"12.61"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"760.49"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"3.96"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{chi
ldren:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"16"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"68.03"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"14.32"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"1,343.80"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"7.01"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"32"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"185.96"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"16.82"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"2,287.30"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"11.92"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"64"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"136.87"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"21.17"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"3,625.22"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"18.89"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"128"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"463.78"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"34.15"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"4,456.51"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"23.24"}),(0,s.jsx)(t.td,{children:"A100"})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{children:"TensorRT-LLM"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"256"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"890.12"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"59.18"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"5,188.24"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:"27.05"
}),(0,s.jsx)(t.td,{children:"A100"})]})]})]}),"\n",(0,s.jsx)(t.h2,{id:"conclusion",children:"Conclusion"}),"\n",(0,s.jsx)(t.p,{children:"High-performance LLM inference is fundamentally a systems engineering problem: memory efficiency, kernel execution, batching strategy, and parallelism determine real-world latency and throughput. Techniques such as paged KV caching, aggressive quantization, kernel fusion, and inflight batching improve GPU utilization while reducing latency and memory pressure."}),"\n",(0,s.jsx)(t.p,{children:"These optimizations enable the platform to deliver sub-second responses, sustain high concurrency, and efficiently serve both lightweight and long-context workloads. By continuously optimizing across the full inference stack, we keep LLM serving scalable, cost-efficient, and production-ready for real-time AI applications."})]})}function o(e={}){const{wrapper:t}={...(0,l.R)(),...e.components};return t?(0,s.jsx)(t,{...e,children:(0,s.jsx)(a,{...e})}):a(e)}},8453:(e,t,i)=>{i.d(t,{R:()=>r,x:()=>d});var n=i(6540);const s={},l=n.createContext(s);function r(e){const t=n.useContext(l);return n.useMemo(function(){return"function"==typeof e?e(t):{...t,...e}},[t,e])}function d(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(s):e.components||s:r(e.components),n.createElement(l.Provider,{value:t},e.children)}},9200:(e,t,i)=>{i.d(t,{A:()=>n});const n=i.p+"assets/images/bms-7399e8796d2cd24617c432518ce3f312.png"}}]); \ No newline at end of file diff --git a/docs/assets/js/fccc4c42.4690f84a.js b/docs/assets/js/fccc4c42.4690f84a.js deleted file mode 100644 index 539fe26f..00000000 --- a/docs/assets/js/fccc4c42.4690f84a.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[2117],{702:(e,n,t)=>{t.d(n,{A:()=>i});const 
i=t.p+"assets/images/vss-c482f6eac4c68b3219e4c562a6b717ec.png"},788:e=>{e.exports=JSON.parse('{"permalink":"/BharatMLStack/blog/post-three","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-three/index.md","source":"@site/blog/bharatmlstack-history/post-three/index.md","title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","description":"BharatMLStack","date":"2024-05-21T00:00:00.000Z","tags":[{"inline":true,"label":"model-inference","permalink":"/BharatMLStack/blog/tags/model-inference"},{"inline":true,"label":"embedding-search","permalink":"/BharatMLStack/blog/tags/embedding-search"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":3.6,"hasTruncateMarker":false,"authors":[{"name":"Aditya Kumar","title":"Lead Software Engineer @ Meesho","url":"https://github.com/Adit2607","imageURL":"https://github.com/Adit2607.png","key":"aditya","page":null},{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null},{"name":"Adarsha Das","title":"Senior Architect @ Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"slug":"post-three","title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","authors":["aditya","jaya","adarsha"],"date":"2024-05-21T00:00:00.000Z","tags":["model-inference","embedding-search","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/post-three"},"nextItem":{"title":"Building Meesho\u2019s ML 
Platform: Lessons from the First-Gen System (Part 2)","permalink":"/BharatMLStack/blog/post-two"}}')},2561:(e,n,t)=>{t.r(n),t.d(n,{assets:()=>o,contentTitle:()=>l,default:()=>h,frontMatter:()=>s,metadata:()=>i,toc:()=>d});var i=t(788),a=t(4848),r=t(8453);const s={slug:"post-three",title:"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search",authors:["aditya","jaya","adarsha"],date:new Date("2024-05-21T00:00:00.000Z"),tags:["model-inference","embedding-search","mlplatform","meesho","bharatmlstack"]},l=void 0,o={authorsImageUrls:[void 0,void 0,void 0]},d=[{value:"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search",id:"cracking-the-code-scaling-model-inference--real-time-embedding-search",level:2},{value:"Breaking Free from the Scalability Ceiling",id:"breaking-free-from-the-scalability-ceiling",level:2},{value:"The Model Serving Bottleneck\u2014A Wake-Up Call",id:"the-model-serving-bottlenecka-wake-up-call",level:3},{value:"Scaling Triton on GKE",id:"scaling-triton-on-gke",level:3},{value:"Fixing the Cold Start Problem",id:"fixing-the-cold-start-problem",level:3},{value:"Embedding Search: The Last Piece of the Puzzle",id:"embedding-search-the-last-piece-of-the-puzzle",level:2},{value:"Choosing the Right Vector Database",id:"choosing-the-right-vector-database",level:3},{value:"Embedding Freshness & Real-Time Updates",id:"embedding-freshness--real-time-updates",level:3},{value:"Final Takeaways: Scaling Smartly for Real-Time ML",id:"final-takeaways-scaling-smartly-for-real-time-ml",level:2}];function c(e){const n={h2:"h2",h3:"h3",img:"img",li:"li",p:"p",ul:"ul",...(0,r.R)(),...e.components};return(0,a.jsxs)(a.Fragment,{children:[(0,a.jsx)(n.p,{children:(0,a.jsx)(n.img,{alt:"BharatMLStack",src:t(6e3).A+"",width:"1396",height:"460"})}),"\n",(0,a.jsx)(n.h2,{id:"cracking-the-code-scaling-model-inference--real-time-embedding-search",children:"Cracking the Code: Scaling Model Inference & Real-Time Embedding 
Search"}),"\n",(0,a.jsx)(n.p,{children:"By mid-2023, we had transformed our ML stack\u2014building a real-time feature store, optimizing model retrieval, and fine-tuning ranking. But two critical gaps remained:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\udd39 Scaling model inference without hitting infrastructure roadblocks"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\udd39 Moving embedding search from batch to real-time for candidate generation"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"Here\u2019s how we tackled these last-mile challenges, broke free from infrastructure constraints, and built a cost-efficient, high-performance system."}),"\n",(0,a.jsx)(n.h2,{id:"breaking-free-from-the-scalability-ceiling",children:"Breaking Free from the Scalability Ceiling"}),"\n",(0,a.jsx)(n.h3,{id:"the-model-serving-bottlenecka-wake-up-call",children:"The Model Serving Bottleneck\u2014A Wake-Up Call"}),"\n",(0,a.jsx)(n.p,{children:"July 2023. With just months left for the Mega Blockbuster Sale (MBS), we noticed a serious issue\u2014scaling our model-serving infrastructure was taking 10\u201315 minutes. In real-time ML, that\u2019s an eternity.\nIn one of our war rooms, we ran a quick experiment:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 We deployed an XGBoost model on a self-hosted Triton Inference Server running on a 16-core machine."}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Fired requests and compared the outputs with our existing cloud-hosted setup."}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 The results matched\u2014perfectly."}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:'That moment changed everything. We prepped a backup Triton setup on EKS, just in case our cloud provider couldn\'t allocate enough compute resources in time. 
Luckily, they did\u2014but the seed was planted.\nThen in October, just two weeks before MBS, we got an alarming response from our infrastructure team:\n"Node availability may be an issue."\nWith no time to waste, we moved 30% of real-time ML traffic to our self-hosted Triton cluster. The results?'}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\u2705 p99 latency dropped from 90\u2013100ms to 30\u201340ms"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Triton handled significantly higher throughput on fewer resources"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 No model changes were needed"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"MBS ran without a hitch, proving that self-hosted inference was the way forward."}),"\n",(0,a.jsx)(n.h3,{id:"scaling-triton-on-gke",children:"Scaling Triton on GKE"}),"\n",(0,a.jsx)(n.p,{children:"This left us with two choices:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"1\ufe0f\u20e3 Port models to a managed cloud inference service, investing time in learning a new deployment stack"}),"\n",(0,a.jsx)(n.li,{children:"2\ufe0f\u20e3 Scale our existing Triton setup on GKE, optimizing for cost and performance"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"We went with Option 2\u2014and it slashed inference costs to 35% of what we previously paid, while giving us full control over scaling and optimizations."}),"\n",(0,a.jsx)(n.h3,{id:"fixing-the-cold-start-problem",children:"Fixing the Cold Start Problem"}),"\n",(0,a.jsx)(n.p,{children:"As we onboarded more deep learning (DL) models, we hit a new bottleneck, new inference pods took 7\u20139 minutes to spin up."}),"\n",(0,a.jsx)(n.p,{children:"After profiling, we found the culprits:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"Triton\u2019s base image\u2014a massive 5GB"}),"\n",(0,a.jsx)(n.li,{children:"Model binaries\u2014often 1GB+"}),"\n",(0,a.jsx)(n.li,{children:"Startup delay\u2014mostly due to downloading and initializing these 
assets"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"To fix this, we built a lightweight Triton image, stripping unused components and shrinking the size to 900MB. This cut cold start times drastically, making auto-scaling faster and smoother."}),"\n",(0,a.jsx)(n.h2,{id:"embedding-search-the-last-piece-of-the-puzzle",children:"Embedding Search: The Last Piece of the Puzzle"}),"\n",(0,a.jsx)(n.p,{children:"By mid-2023, most of our ML stack had gone real-time\u2014except for Candidate Generation (CG), which still ran in batch mode. To truly power real-time recommendations, we needed an online embedding search system."}),"\n",(0,a.jsx)(n.h3,{id:"choosing-the-right-vector-database",children:"Choosing the Right Vector Database"}),"\n",(0,a.jsx)(n.p,{children:"We benchmarked three production-ready vector DBs across key parameters:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"Milvus"}),"\n",(0,a.jsx)(n.li,{children:"Qdrant"}),"\n",(0,a.jsx)(n.li,{children:"Weaviate"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"After extensive POCs, Qdrant stood out for its:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\u2705 Blazing-fast search latency on high-dimensional vectors"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Efficient memory usage, crucial for in-memory workloads"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Support for upserts and soft deletes, vital for Ads use cases"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 gRPC + REST APIs, making integration seamless"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Powerful filtering, allowing fine-tuned retrieval (e.g., filtering Ads by category, active status, etc.)"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"At its core, Qdrant uses HNSW indexing, delivering both high recall and low-latency nearest-neighbor search\u2014a perfect fit for our needs."}),"\n",(0,a.jsx)(n.h3,{id:"embedding-freshness--real-time-updates",children:"Embedding Freshness & Real-Time Updates"}),"\n",(0,a.jsx)(n.p,{children:"To ensure embeddings 
stayed up to date, we built a dual ingestion pipeline:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\udccc Daily Refresh: A bulk pipeline updated embeddings overnight"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\udccc Real-Time Updates: Ads events triggered immediate upserts/deletes"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:'This setup powered real-time "Similar Products" recommendations on the product page and became the foundation for Ads Candidate Generation, ensuring the right ads surfaced in milliseconds.'}),"\n",(0,a.jsx)(n.p,{children:(0,a.jsx)(n.img,{alt:"Skye",src:t(702).A+"",width:"1260",height:"644"})}),"\n",(0,a.jsx)(n.h2,{id:"final-takeaways-scaling-smartly-for-real-time-ml",children:"Final Takeaways: Scaling Smartly for Real-Time ML"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Self-hosted inference on Triton gave us lower cost, faster scaling, and better performance than managed services"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Building a custom Triton image reduced cold starts, improving responsiveness"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Qdrant-based embedding search enabled real-time personalization at scale"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Real-time updates for embeddings unlocked dynamic, up-to-date recommendations"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"By early 2024, Meesho\u2019s ML stack had evolved into a fully real-time, scalable, and cost-efficient system, setting the foundation for even bigger leaps ahead."})]})}function h(e={}){const{wrapper:n}={...(0,r.R)(),...e.components};return n?(0,a.jsx)(n,{...e,children:(0,a.jsx)(c,{...e})}):c(e)}},6e3:(e,n,t)=>{t.d(n,{A:()=>i});const i=t.p+"assets/images/bms-7399e8796d2cd24617c432518ce3f312.png"},8453:(e,n,t)=>{t.d(n,{R:()=>s,x:()=>l});var i=t(6540);const a={},r=i.createContext(a);function s(e){const n=i.useContext(r);return i.useMemo(function(){return"function"==typeof e?e(n):{...n,...e}},[n,e])}function 
l(e){let n;return n=e.disableParentContext?"function"==typeof e.components?e.components(a):e.components||a:s(e.components),i.createElement(r.Provider,{value:n},e.children)}}}]); \ No newline at end of file diff --git a/docs/assets/js/fccc4c42.796edc5f.js b/docs/assets/js/fccc4c42.796edc5f.js new file mode 100644 index 00000000..be0b348f --- /dev/null +++ b/docs/assets/js/fccc4c42.796edc5f.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[2117],{702:(e,n,i)=>{i.d(n,{A:()=>t});const t=i.p+"assets/images/vss-c482f6eac4c68b3219e4c562a6b717ec.png"},788:e=>{e.exports=JSON.parse('{"permalink":"/BharatMLStack/blog/post-three","editUrl":"https://github.com/Meesho/BharatMLStack/tree/main/docs/blog/bharatmlstack-history/post-three/index.md","source":"@site/blog/bharatmlstack-history/post-three/index.md","title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","description":"BharatMLStack","date":"2024-05-21T00:00:00.000Z","tags":[{"inline":true,"label":"model-inference","permalink":"/BharatMLStack/blog/tags/model-inference"},{"inline":true,"label":"embedding-search","permalink":"/BharatMLStack/blog/tags/embedding-search"},{"inline":true,"label":"mlplatform","permalink":"/BharatMLStack/blog/tags/mlplatform"},{"inline":true,"label":"meesho","permalink":"/BharatMLStack/blog/tags/meesho"},{"inline":true,"label":"bharatmlstack","permalink":"/BharatMLStack/blog/tags/bharatmlstack"}],"readingTime":3.6,"hasTruncateMarker":false,"authors":[{"name":"Aditya Kumar","title":"Lead Software Engineer @ Meesho","url":"https://github.com/Adit2607","imageURL":"https://github.com/Adit2607.png","key":"aditya","page":null},{"name":"Jaya Kumar","title":"Lead ML Engineer @ Meesho","url":"https://github.com/jayakommuru","imageURL":"https://github.com/jayakommuru.png","key":"jaya","page":null},{"name":"Adarsha Das","title":"Senior Architect @ 
Meesho","url":"https://github.com/a0d00kc","imageURL":"https://github.com/a0d00kc.png","key":"adarsha","page":null}],"frontMatter":{"slug":"post-three","title":"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search","authors":["aditya","jaya","adarsha"],"date":"2024-05-21T00:00:00.000Z","tags":["model-inference","embedding-search","mlplatform","meesho","bharatmlstack"]},"unlisted":false,"prevItem":{"title":"Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving","permalink":"/BharatMLStack/blog/post-four"},"nextItem":{"title":"Building Meesho\u2019s ML Platform: Lessons from the First-Gen System (Part 2)","permalink":"/BharatMLStack/blog/post-two"}}')},2561:(e,n,i)=>{i.r(n),i.d(n,{assets:()=>o,contentTitle:()=>l,default:()=>h,frontMatter:()=>s,metadata:()=>t,toc:()=>d});var t=i(788),a=i(4848),r=i(8453);const s={slug:"post-three",title:"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search",authors:["aditya","jaya","adarsha"],date:new Date("2024-05-21T00:00:00.000Z"),tags:["model-inference","embedding-search","mlplatform","meesho","bharatmlstack"]},l=void 0,o={authorsImageUrls:[void 0,void 0,void 0]},d=[{value:"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search",id:"cracking-the-code-scaling-model-inference--real-time-embedding-search",level:2},{value:"Breaking Free from the Scalability Ceiling",id:"breaking-free-from-the-scalability-ceiling",level:2},{value:"The Model Serving Bottleneck\u2014A Wake-Up Call",id:"the-model-serving-bottlenecka-wake-up-call",level:3},{value:"Scaling Triton on GKE",id:"scaling-triton-on-gke",level:3},{value:"Fixing the Cold Start Problem",id:"fixing-the-cold-start-problem",level:3},{value:"Embedding Search: The Last Piece of the Puzzle",id:"embedding-search-the-last-piece-of-the-puzzle",level:2},{value:"Choosing the Right Vector Database",id:"choosing-the-right-vector-database",level:3},{value:"Embedding Freshness & Real-Time 
Updates",id:"embedding-freshness--real-time-updates",level:3},{value:"Final Takeaways: Scaling Smartly for Real-Time ML",id:"final-takeaways-scaling-smartly-for-real-time-ml",level:2}];function c(e){const n={h2:"h2",h3:"h3",img:"img",li:"li",p:"p",ul:"ul",...(0,r.R)(),...e.components};return(0,a.jsxs)(a.Fragment,{children:[(0,a.jsx)(n.p,{children:(0,a.jsx)(n.img,{alt:"BharatMLStack",src:i(6e3).A+"",width:"1396",height:"460"})}),"\n",(0,a.jsx)(n.h2,{id:"cracking-the-code-scaling-model-inference--real-time-embedding-search",children:"Cracking the Code: Scaling Model Inference & Real-Time Embedding Search"}),"\n",(0,a.jsx)(n.p,{children:"By mid-2023, we had transformed our ML stack\u2014building a real-time feature store, optimizing model retrieval, and fine-tuning ranking. But two critical gaps remained:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\udd39 Scaling model inference without hitting infrastructure roadblocks"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\udd39 Moving embedding search from batch to real-time for candidate generation"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"Here\u2019s how we tackled these last-mile challenges, broke free from infrastructure constraints, and built a cost-efficient, high-performance system."}),"\n",(0,a.jsx)(n.h2,{id:"breaking-free-from-the-scalability-ceiling",children:"Breaking Free from the Scalability Ceiling"}),"\n",(0,a.jsx)(n.h3,{id:"the-model-serving-bottlenecka-wake-up-call",children:"The Model Serving Bottleneck\u2014A Wake-Up Call"}),"\n",(0,a.jsx)(n.p,{children:"July 2023. With just months left for the Mega Blockbuster Sale (MBS), we noticed a serious issue\u2014scaling our model-serving infrastructure was taking 10\u201315 minutes. 
In real-time ML, that\u2019s an eternity.\nIn one of our war rooms, we ran a quick experiment:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 We deployed an XGBoost model on a self-hosted Triton Inference Server running on a 16-core machine."}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Fired requests and compared the outputs with our existing cloud-hosted setup."}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 The results matched\u2014perfectly."}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:'That moment changed everything. We prepped a backup Triton setup on EKS, just in case our cloud provider couldn\'t allocate enough compute resources in time. Luckily, they did\u2014but the seed was planted.\nThen in October, just two weeks before MBS, we got an alarming response from our infrastructure team:\n"Node availability may be an issue."\nWith no time to waste, we moved 30% of real-time ML traffic to our self-hosted Triton cluster. The results?'}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\u2705 p99 latency dropped from 90\u2013100ms to 30\u201340ms"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Triton handled significantly higher throughput on fewer resources"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 No model changes were needed"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"MBS ran without a hitch, proving that self-hosted inference was the way forward."}),"\n",(0,a.jsx)(n.h3,{id:"scaling-triton-on-gke",children:"Scaling Triton on GKE"}),"\n",(0,a.jsx)(n.p,{children:"This left us with two choices:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"1\ufe0f\u20e3 Port models to a managed cloud inference service, investing time in learning a new deployment stack"}),"\n",(0,a.jsx)(n.li,{children:"2\ufe0f\u20e3 Scale our existing Triton setup on GKE, optimizing for cost and performance"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"We went with Option 2\u2014and it slashed inference costs to 35% of what we previously paid, while 
giving us full control over scaling and optimizations."}),"\n",(0,a.jsx)(n.h3,{id:"fixing-the-cold-start-problem",children:"Fixing the Cold Start Problem"}),"\n",(0,a.jsx)(n.p,{children:"As we onboarded more deep learning (DL) models, we hit a new bottleneck, new inference pods took 7\u20139 minutes to spin up."}),"\n",(0,a.jsx)(n.p,{children:"After profiling, we found the culprits:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"Triton\u2019s base image\u2014a massive 5GB"}),"\n",(0,a.jsx)(n.li,{children:"Model binaries\u2014often 1GB+"}),"\n",(0,a.jsx)(n.li,{children:"Startup delay\u2014mostly due to downloading and initializing these assets"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"To fix this, we built a lightweight Triton image, stripping unused components and shrinking the size to 900MB. This cut cold start times drastically, making auto-scaling faster and smoother."}),"\n",(0,a.jsx)(n.h2,{id:"embedding-search-the-last-piece-of-the-puzzle",children:"Embedding Search: The Last Piece of the Puzzle"}),"\n",(0,a.jsx)(n.p,{children:"By mid-2023, most of our ML stack had gone real-time\u2014except for Candidate Generation (CG), which still ran in batch mode. 
To truly power real-time recommendations, we needed an online embedding search system."}),"\n",(0,a.jsx)(n.h3,{id:"choosing-the-right-vector-database",children:"Choosing the Right Vector Database"}),"\n",(0,a.jsx)(n.p,{children:"We benchmarked three production-ready vector DBs across key parameters:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"Milvus"}),"\n",(0,a.jsx)(n.li,{children:"Qdrant"}),"\n",(0,a.jsx)(n.li,{children:"Weaviate"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"After extensive POCs, Qdrant stood out for its:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\u2705 Blazing-fast search latency on high-dimensional vectors"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Efficient memory usage, crucial for in-memory workloads"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Support for upserts and soft deletes, vital for Ads use cases"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 gRPC + REST APIs, making integration seamless"}),"\n",(0,a.jsx)(n.li,{children:"\u2705 Powerful filtering, allowing fine-tuned retrieval (e.g., filtering Ads by category, active status, etc.)"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"At its core, Qdrant uses HNSW indexing, delivering both high recall and low-latency nearest-neighbor search\u2014a perfect fit for our needs."}),"\n",(0,a.jsx)(n.h3,{id:"embedding-freshness--real-time-updates",children:"Embedding Freshness & Real-Time Updates"}),"\n",(0,a.jsx)(n.p,{children:"To ensure embeddings stayed up to date, we built a dual ingestion pipeline:"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\udccc Daily Refresh: A bulk pipeline updated embeddings overnight"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\udccc Real-Time Updates: Ads events triggered immediate upserts/deletes"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:'This setup powered real-time "Similar Products" recommendations on the product page and became the foundation for Ads Candidate Generation, ensuring the right ads surfaced in 
milliseconds.'}),"\n",(0,a.jsx)(n.p,{children:(0,a.jsx)(n.img,{alt:"Skye",src:i(702).A+"",width:"1260",height:"644"})}),"\n",(0,a.jsx)(n.h2,{id:"final-takeaways-scaling-smartly-for-real-time-ml",children:"Final Takeaways: Scaling Smartly for Real-Time ML"}),"\n",(0,a.jsxs)(n.ul,{children:["\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Self-hosted inference on Triton gave us lower cost, faster scaling, and better performance than managed services"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Building a custom Triton image reduced cold starts, improving responsiveness"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Qdrant-based embedding search enabled real-time personalization at scale"}),"\n",(0,a.jsx)(n.li,{children:"\ud83d\ude80 Real-time updates for embeddings unlocked dynamic, up-to-date recommendations"}),"\n"]}),"\n",(0,a.jsx)(n.p,{children:"By early 2024, Meesho\u2019s ML stack had evolved into a fully real-time, scalable, and cost-efficient system, setting the foundation for even bigger leaps ahead."})]})}function h(e={}){const{wrapper:n}={...(0,r.R)(),...e.components};return n?(0,a.jsx)(n,{...e,children:(0,a.jsx)(c,{...e})}):c(e)}},6e3:(e,n,i)=>{i.d(n,{A:()=>t});const t=i.p+"assets/images/bms-7399e8796d2cd24617c432518ce3f312.png"},8453:(e,n,i)=>{i.d(n,{R:()=>s,x:()=>l});var t=i(6540);const a={},r=t.createContext(a);function s(e){const n=t.useContext(r);return t.useMemo(function(){return"function"==typeof e?e(n):{...n,...e}},[n,e])}function l(e){let n;return n=e.disableParentContext?"function"==typeof e.components?e.components(a):e.components||a:s(e.components),t.createElement(r.Provider,{value:n},e.children)}}}]); \ No newline at end of file diff --git a/docs/assets/js/main.3e15e71d.js b/docs/assets/js/main.5b79a858.js similarity index 69% rename from docs/assets/js/main.3e15e71d.js rename to docs/assets/js/main.5b79a858.js index 1a92bb08..ba855303 100644 --- a/docs/assets/js/main.3e15e71d.js +++ b/docs/assets/js/main.5b79a858.js @@ -1,2 +1,2 @@ -/*! 
For license information please see main.3e15e71d.js.LICENSE.txt */ -(self.webpackChunkdocs=self.webpackChunkdocs||[]).push([[8792],{115:e=>{var t="undefined"!=typeof Element,n="function"==typeof Map,r="function"==typeof Set,a="function"==typeof ArrayBuffer&&!!ArrayBuffer.isView;function o(e,i){if(e===i)return!0;if(e&&i&&"object"==typeof e&&"object"==typeof i){if(e.constructor!==i.constructor)return!1;var l,s,c,u;if(Array.isArray(e)){if((l=e.length)!=i.length)return!1;for(s=l;0!==s--;)if(!o(e[s],i[s]))return!1;return!0}if(n&&e instanceof Map&&i instanceof Map){if(e.size!==i.size)return!1;for(u=e.entries();!(s=u.next()).done;)if(!i.has(s.value[0]))return!1;for(u=e.entries();!(s=u.next()).done;)if(!o(s.value[1],i.get(s.value[0])))return!1;return!0}if(r&&e instanceof Set&&i instanceof Set){if(e.size!==i.size)return!1;for(u=e.entries();!(s=u.next()).done;)if(!i.has(s.value[0]))return!1;return!0}if(a&&ArrayBuffer.isView(e)&&ArrayBuffer.isView(i)){if((l=e.length)!=i.length)return!1;for(s=l;0!==s--;)if(e[s]!==i[s])return!1;return!0}if(e.constructor===RegExp)return e.source===i.source&&e.flags===i.flags;if(e.valueOf!==Object.prototype.valueOf&&"function"==typeof e.valueOf&&"function"==typeof i.valueOf)return e.valueOf()===i.valueOf();if(e.toString!==Object.prototype.toString&&"function"==typeof e.toString&&"function"==typeof i.toString)return e.toString()===i.toString();if((l=(c=Object.keys(e)).length)!==Object.keys(i).length)return!1;for(s=l;0!==s--;)if(!Object.prototype.hasOwnProperty.call(i,c[s]))return!1;if(t&&e instanceof Element)return!1;for(s=l;0!==s--;)if(("_owner"!==c[s]&&"__v"!==c[s]&&"__o"!==c[s]||!e.$$typeof)&&!o(e[c[s]],i[c[s]]))return!1;return!0}return e!=e&&i!=i}e.exports=function(e,t){try{return o(e,t)}catch(n){if((n.message||"").match(/stack|recursion/i))return console.warn("react-fast-compare cannot handle circular refs"),!1;throw n}}},119:(e,t,n)=>{"use strict";n.r(t)},205:(e,t,n)=>{"use strict";n.d(t,{A:()=>a});var r=n(6540);const 
a=n(8193).A.canUseDOM?r.useLayoutEffect:r.useEffect},253:(e,t)=>{"use strict";Object.defineProperty(t,"__esModule",{value:!0}),t.getErrorCausalChain=function e(t){if(t.cause)return[t,...e(t.cause)];return[t]}},311:e=>{"use strict";e.exports=function(e,t,n,r,a,o,i,l){if(!e){var s;if(void 0===t)s=new Error("Minified exception occurred; use the non-minified dev environment for the full error message and additional helpful warnings.");else{var c=[n,r,a,o,i,l],u=0;(s=new Error(t.replace(/%s/g,function(){return c[u++]}))).name="Invariant Violation"}throw s.framesToPop=1,s}}},418:(e,t,n)=>{"use strict";n.d(t,{A:()=>r});const r=()=>null},440:(e,t,n)=>{"use strict";t.rA=t.Ks=t.LU=void 0;const r=n(1635);t.LU="__blog-post-container";var a=n(2983);Object.defineProperty(t,"Ks",{enumerable:!0,get:function(){return r.__importDefault(a).default}});var o=n(2566);var i=n(253);Object.defineProperty(t,"rA",{enumerable:!0,get:function(){return i.getErrorCausalChain}})},545:(e,t,n)=>{"use strict";n.d(t,{mg:()=>J,vd:()=>G});var r=n(6540),a=n(5556),o=n.n(a),i=n(115),l=n.n(i),s=n(311),c=n.n(s),u=n(2833),d=n.n(u);function f(){return f=Object.assign||function(e){for(var t=1;t=0||(a[n]=e[n]);return a}var g={BASE:"base",BODY:"body",HEAD:"head",HTML:"html",LINK:"link",META:"meta",NOSCRIPT:"noscript",SCRIPT:"script",STYLE:"style",TITLE:"title",FRAGMENT:"Symbol(react.fragment)"},b={rel:["amphtml","canonical","alternate"]},y={type:["application/ld+json"]},v={charset:"",name:["robots","description"],property:["og:type","og:title","og:url","og:image","og:image:alt","og:description","twitter:url","twitter:title","twitter:description","twitter:image","twitter:image:alt","twitter:card","twitter:site"]},w=Object.keys(g).map(function(e){return g[e]}),k={accesskey:"accessKey",charset:"charSet",class:"className",contenteditable:"contentEditable",contextmenu:"contextMenu","http-equiv":"httpEquiv",itemprop:"itemProp",tabindex:"tabIndex"},S=Object.keys(k).reduce(function(e,t){return 
e[k[t]]=t,e},{}),x=function(e,t){for(var n=e.length-1;n>=0;n-=1){var r=e[n];if(Object.prototype.hasOwnProperty.call(r,t))return r[t]}return null},_=function(e){var t=x(e,g.TITLE),n=x(e,"titleTemplate");if(Array.isArray(t)&&(t=t.join("")),n&&t)return n.replace(/%s/g,function(){return t});var r=x(e,"defaultTitle");return t||r||void 0},E=function(e){return x(e,"onChangeClientState")||function(){}},C=function(e,t){return t.filter(function(t){return void 0!==t[e]}).map(function(t){return t[e]}).reduce(function(e,t){return f({},e,t)},{})},A=function(e,t){return t.filter(function(e){return void 0!==e[g.BASE]}).map(function(e){return e[g.BASE]}).reverse().reduce(function(t,n){if(!t.length)for(var r=Object.keys(n),a=0;a/g,">").replace(/"/g,""").replace(/'/g,"'")},R=function(e){return Object.keys(e).reduce(function(t,n){var r=void 0!==e[n]?n+'="'+e[n]+'"':""+n;return t?t+" "+r:r},"")},D=function(e,t){return void 0===t&&(t={}),Object.keys(e).reduce(function(t,n){return t[k[n]||n]=e[n],t},t)},B=function(e,t){return t.map(function(t,n){var a,o=((a={key:n})["data-rh"]=!0,a);return Object.keys(t).forEach(function(e){var n=k[e]||e;"innerHTML"===n||"cssText"===n?o.dangerouslySetInnerHTML={__html:t.innerHTML||t.cssText}:o[n]=t[e]}),r.createElement(e,o)})},F=function(e,t,n){switch(e){case g.TITLE:return{toComponent:function(){return n=t.titleAttributes,(a={key:e=t.title})["data-rh"]=!0,o=D(n,a),[r.createElement(g.TITLE,o,e)];var e,n,a,o},toString:function(){return function(e,t,n,r){var a=R(n),o=j(t);return a?"<"+e+' data-rh="true" '+a+">"+O(o,r)+"":"<"+e+' data-rh="true">'+O(o,r)+""}(e,t.title,t.titleAttributes,n)}};case"bodyAttributes":case"htmlAttributes":return{toComponent:function(){return D(t)},toString:function(){return R(t)}};default:return{toComponent:function(){return B(e,t)},toString:function(){return function(e,t,n){return t.reduce(function(t,r){var a=Object.keys(r).filter(function(e){return!("innerHTML"===e||"cssText"===e)}).reduce(function(e,t){var a=void 
0===r[t]?t:t+'="'+O(r[t],n)+'"';return e?e+" "+a:a},""),o=r.innerHTML||r.cssText||"",i=-1===N.indexOf(e);return t+"<"+e+' data-rh="true" '+a+(i?"/>":">"+o+"")},"")}(e,t,n)}}}},I=function(e){var t=e.baseTag,n=e.bodyAttributes,r=e.encode,a=e.htmlAttributes,o=e.noscriptTags,i=e.styleTags,l=e.title,s=void 0===l?"":l,c=e.titleAttributes,u=e.linkTags,d=e.metaTags,f=e.scriptTags,p={toComponent:function(){},toString:function(){return""}};if(e.prioritizeSeoTags){var h=function(e){var t=e.linkTags,n=e.scriptTags,r=e.encode,a=P(e.metaTags,v),o=P(t,b),i=P(n,y);return{priorityMethods:{toComponent:function(){return[].concat(B(g.META,a.priority),B(g.LINK,o.priority),B(g.SCRIPT,i.priority))},toString:function(){return F(g.META,a.priority,r)+" "+F(g.LINK,o.priority,r)+" "+F(g.SCRIPT,i.priority,r)}},metaTags:a.default,linkTags:o.default,scriptTags:i.default}}(e);p=h.priorityMethods,u=h.linkTags,d=h.metaTags,f=h.scriptTags}return{priority:p,base:F(g.BASE,t,r),bodyAttributes:F("bodyAttributes",n,r),htmlAttributes:F("htmlAttributes",a,r),link:F(g.LINK,u,r),meta:F(g.META,d,r),noscript:F(g.NOSCRIPT,o,r),script:F(g.SCRIPT,f,r),style:F(g.STYLE,i,r),title:F(g.TITLE,{title:s,titleAttributes:c},r)}},z=[],$=function(e,t){var n=this;void 0===t&&(t="undefined"!=typeof document),this.instances=[],this.value={setHelmet:function(e){n.context.helmet=e},helmetInstances:{get:function(){return n.canUseDOM?z:n.instances},add:function(e){(n.canUseDOM?z:n.instances).push(e)},remove:function(e){var t=(n.canUseDOM?z:n.instances).indexOf(e);(n.canUseDOM?z:n.instances).splice(t,1)}}},this.context=e,this.canUseDOM=t,t||(e.helmet=I({baseTag:[],bodyAttributes:{},encodeSpecialCharacters:!0,htmlAttributes:{},linkTags:[],metaTags:[],noscriptTags:[],scriptTags:[],styleTags:[],title:"",titleAttributes:{}}))},U=r.createContext({}),q=o().shape({setHelmet:o().func,helmetInstances:o().shape({get:o().func,add:o().func,remove:o().func})}),H="undefined"!=typeof document,G=function(e){function t(n){var 
r;return(r=e.call(this,n)||this).helmetData=new $(r.props.context,t.canUseDOM),r}return p(t,e),t.prototype.render=function(){return r.createElement(U.Provider,{value:this.helmetData.value},this.props.children)},t}(r.Component);G.canUseDOM=H,G.propTypes={context:o().shape({helmet:o().shape()}),children:o().node.isRequired},G.defaultProps={context:{}},G.displayName="HelmetProvider";var V=function(e,t){var n,r=document.head||document.querySelector(g.HEAD),a=r.querySelectorAll(e+"[data-rh]"),o=[].slice.call(a),i=[];return t&&t.length&&t.forEach(function(t){var r=document.createElement(e);for(var a in t)Object.prototype.hasOwnProperty.call(t,a)&&("innerHTML"===a?r.innerHTML=t.innerHTML:"cssText"===a?r.styleSheet?r.styleSheet.cssText=t.cssText:r.appendChild(document.createTextNode(t.cssText)):r.setAttribute(a,void 0===t[a]?"":t[a]));r.setAttribute("data-rh","true"),o.some(function(e,t){return n=t,r.isEqualNode(e)})?o.splice(n,1):i.push(r)}),o.forEach(function(e){return e.parentNode.removeChild(e)}),i.forEach(function(e){return r.appendChild(e)}),{oldTags:o,newTags:i}},W=function(e,t){var n=document.getElementsByTagName(e)[0];if(n){for(var r=n.getAttribute("data-rh"),a=r?r.split(","):[],o=[].concat(a),i=Object.keys(t),l=0;l=0;d-=1)n.removeAttribute(o[d]);a.length===o.length?n.removeAttribute("data-rh"):n.getAttribute("data-rh")!==i.join(",")&&n.setAttribute("data-rh",i.join(","))}},Q=function(e,t){var n=e.baseTag,r=e.htmlAttributes,a=e.linkTags,o=e.metaTags,i=e.noscriptTags,l=e.onChangeClientState,s=e.scriptTags,c=e.styleTags,u=e.title,d=e.titleAttributes;W(g.BODY,e.bodyAttributes),W(g.HTML,r),function(e,t){void 0!==e&&document.title!==e&&(document.title=j(e)),W(g.TITLE,t)}(u,d);var f={baseTag:V(g.BASE,n),linkTags:V(g.LINK,a),metaTags:V(g.META,o),noscriptTags:V(g.NOSCRIPT,i),scriptTags:V(g.SCRIPT,s),styleTags:V(g.STYLE,c)},p={},h={};Object.keys(f).forEach(function(e){var 
t=f[e],n=t.newTags,r=t.oldTags;n.length&&(p[e]=n),r.length&&(h[e]=f[e].oldTags)}),t&&t(),l(e,p,h)},K=null,Y=function(e){function t(){for(var t,n=arguments.length,r=new Array(n),a=0;a elements are self-closing and can not contain children. Refer to our API for more information.")}},n.flattenArrayTypeChildren=function(e){var t,n=e.child,r=e.arrayTypeChildren;return f({},r,((t={})[n.type]=[].concat(r[n.type]||[],[f({},e.newChildProps,this.mapNestedChildrenToProps(n,e.nestedChildren))]),t))},n.mapObjectTypeChildren=function(e){var t,n,r=e.child,a=e.newProps,o=e.newChildProps,i=e.nestedChildren;switch(r.type){case g.TITLE:return f({},a,((t={})[r.type]=i,t.titleAttributes=f({},o),t));case g.BODY:return f({},a,{bodyAttributes:f({},o)});case g.HTML:return f({},a,{htmlAttributes:f({},o)});default:return f({},a,((n={})[r.type]=f({},o),n))}},n.mapArrayTypeChildrenToProps=function(e,t){var n=f({},t);return Object.keys(e).forEach(function(t){var r;n=f({},n,((r={})[t]=e[t],r))}),n},n.warnOnInvalidChildren=function(e,t){return c()(w.some(function(t){return e.type===t}),"function"==typeof e.type?"You may be attempting to nest components within each other, which is not allowed. Refer to our API for more information.":"Only elements types "+w.join(", ")+" are allowed. Helmet does not support rendering <"+e.type+"> elements. Refer to our API for more information."),c()(!t||"string"==typeof t||Array.isArray(t)&&!t.some(function(e){return"string"!=typeof e}),"Helmet expects a string as a child of <"+e.type+">. Did you forget to wrap your children in braces? 
( <"+e.type+">{``} ) Refer to our API for more information."),!0},n.mapChildrenToProps=function(e,t){var n=this,a={};return r.Children.forEach(e,function(e){if(e&&e.props){var r=e.props,o=r.children,i=m(r,X),l=Object.keys(i).reduce(function(e,t){return e[S[t]||t]=i[t],e},{}),s=e.type;switch("symbol"==typeof s?s=s.toString():n.warnOnInvalidChildren(e,o),s){case g.FRAGMENT:t=n.mapChildrenToProps(o,t);break;case g.LINK:case g.META:case g.NOSCRIPT:case g.SCRIPT:case g.STYLE:a=n.flattenArrayTypeChildren({child:e,arrayTypeChildren:a,newChildProps:l,nestedChildren:o});break;default:t=n.mapObjectTypeChildren({child:e,newProps:t,newChildProps:l,nestedChildren:o})}}}),this.mapArrayTypeChildrenToProps(a,t)},n.render=function(){var e=this.props,t=e.children,n=m(e,Z),a=f({},n),o=n.helmetData;return t&&(a=this.mapChildrenToProps(t,a)),!o||o instanceof $||(o=new $(o.context,o.instances)),o?r.createElement(Y,f({},a,{context:o.value,helmetData:void 0})):r.createElement(U.Consumer,null,function(e){return r.createElement(Y,f({},a,{context:e}))})},t}(r.Component);J.propTypes={base:o().object,bodyAttributes:o().object,children:o().oneOfType([o().arrayOf(o().node),o().node]),defaultTitle:o().string,defer:o().bool,encodeSpecialCharacters:o().bool,htmlAttributes:o().object,link:o().arrayOf(o().object),meta:o().arrayOf(o().object),noscript:o().arrayOf(o().object),onChangeClientState:o().func,script:o().arrayOf(o().object),style:o().arrayOf(o().object),title:o().string,titleAttributes:o().object,titleTemplate:o().string,prioritizeSeoTags:o().bool,helmetData:o().object},J.defaultProps={defer:!0,encodeSpecialCharacters:!0,prioritizeSeoTags:!1},J.displayName="Helmet"},609:(e,t,n)=>{"use strict";n.d(t,{V:()=>s,t:()=>c});var r=n(6540),a=n(9532),o=n(4848);const i=Symbol("EmptyContext"),l=r.createContext(i);function s({children:e,name:t,items:n}){const a=(0,r.useMemo)(()=>t&&n?{name:t,items:n}:null,[t,n]);return(0,o.jsx)(l.Provider,{value:a,children:e})}function c(){const 
e=(0,r.useContext)(l);if(e===i)throw new a.dV("DocsSidebarProvider");return e}},679:(e,t,n)=>{"use strict";n.d(t,{Wf:()=>c});n(6540);const r=JSON.parse('{"N":"localStorage","M":""}'),a=r.N;function o({key:e,oldValue:t,newValue:n,storage:r}){if(t===n)return;const a=document.createEvent("StorageEvent");a.initStorageEvent("storage",!1,!1,e,t,n,window.location.href,r),window.dispatchEvent(a)}function i(e=a){if("undefined"==typeof window)throw new Error("Browser storage is not available on Node.js/Docusaurus SSR process.");if("none"===e)return null;try{return window[e]}catch(n){return t=n,l||(console.warn("Docusaurus browser storage is not available.\nPossible reasons: running Docusaurus in an iframe, in an incognito browser session, or using too strict browser privacy settings.",t),l=!0),null}var t}let l=!1;const s={get:()=>null,set:()=>{},del:()=>{},listen:()=>()=>{}};function c(e,t){const n=`${e}${r.M}`;if("undefined"==typeof window)return function(e){function t(){throw new Error(`Illegal storage API usage for storage key "${e}".\nDocusaurus storage APIs are not supposed to be called on the server-rendering process.\nPlease only call storage APIs in effects and event handlers.`)}return{get:t,set:t,del:t,listen:t}}(n);const a=i(t?.persistence);return null===a?s:{get:()=>{try{return a.getItem(n)}catch(e){return console.error(`Docusaurus storage error, can't get key=${n}`,e),null}},set:e=>{try{const t=a.getItem(n);a.setItem(n,e),o({key:n,oldValue:t,newValue:e,storage:a})}catch(t){console.error(`Docusaurus storage error, can't set ${n}=${e}`,t)}},del:()=>{try{const e=a.getItem(n);a.removeItem(n),o({key:n,oldValue:e,newValue:null,storage:a})}catch(e){console.error(`Docusaurus storage error, can't delete key=${n}`,e)}},listen:e=>{try{const t=t=>{t.storageArea===a&&t.key===n&&e(t)};return window.addEventListener("storage",t),()=>window.removeEventListener("storage",t)}catch(t){return console.error(`Docusaurus storage error, can't listen for changes of 
key=${n}`,t),()=>{}}}}}},961:(e,t,n)=>{"use strict";!function e(){if("undefined"!=typeof __REACT_DEVTOOLS_GLOBAL_HOOK__&&"function"==typeof __REACT_DEVTOOLS_GLOBAL_HOOK__.checkDCE)try{__REACT_DEVTOOLS_GLOBAL_HOOK__.checkDCE(e)}catch(t){console.error(t)}}(),e.exports=n(6221)},1043:(e,t,n)=>{"use strict";n.r(t)},1107:(e,t,n)=>{"use strict";n.d(t,{A:()=>u});n(6540);var r=n(4164),a=n(1312),o=n(6342),i=n(8774),l=n(3427);const s={anchorWithStickyNavbar:"anchorWithStickyNavbar_LWe7",anchorWithHideOnScrollNavbar:"anchorWithHideOnScrollNavbar_WYt5"};var c=n(4848);function u({as:e,id:t,...n}){const u=(0,l.A)(),{navbar:{hideOnScroll:d}}=(0,o.p)();if("h1"===e||!t)return(0,c.jsx)(e,{...n,id:void 0});u.collectAnchor(t);const f=(0,a.T)({id:"theme.common.headingLinkTitle",message:"Direct link to {heading}",description:"Title for link to heading"},{heading:"string"==typeof n.children?n.children:t});return(0,c.jsxs)(e,{...n,className:(0,r.A)("anchor",d?s.anchorWithHideOnScrollNavbar:s.anchorWithStickyNavbar,n.className),id:t,children:[n.children,(0,c.jsx)(i.A,{className:"hash-link",to:`#${t}`,"aria-label":f,title:f,children:"\u200b"})]})}},1122:(e,t,n)=>{"use strict";n.d(t,{A:()=>u});var r=n(6540),a=n(4164),o=n(2303),i=n(5293);const l={themedComponent:"themedComponent_mlkZ","themedComponent--light":"themedComponent--light_NVdE","themedComponent--dark":"themedComponent--dark_xIcU"};var s=n(4848);function c({className:e,children:t}){const n=(0,o.A)(),{colorMode:c}=(0,i.G)();return(0,s.jsx)(s.Fragment,{children:(n?"dark"===c?["dark"]:["light"]:["light","dark"]).map(n=>{const o=t({theme:n,className:(0,a.A)(e,l.themedComponent,l[`themedComponent--${n}`])});return(0,s.jsx)(r.Fragment,{children:o},n)})})}function u(e){const{sources:t,className:n,alt:r,...a}=e;return(0,s.jsx)(c,{className:n,children:({theme:e,className:n})=>(0,s.jsx)("img",{src:t[e],alt:r,className:n,...a})})}},1247:(e,t,n)=>{"use strict";var r=n(9982),a=n(6540),o=n(961);function i(e){var 
t="https://react.dev/errors/"+e;if(1F||(e.current=B[F],B[F]=null,F--)}function $(e,t){F++,B[F]=e.current,e.current=t}var U=I(null),q=I(null),H=I(null),G=I(null);function V(e,t){switch($(H,t),$(q,e),$(U,null),t.nodeType){case 9:case 11:e=(e=t.documentElement)&&(e=e.namespaceURI)?ad(e):0;break;default:if(e=t.tagName,t=t.namespaceURI)e=od(t=ad(t),e);else switch(e){case"svg":e=1;break;case"math":e=2;break;default:e=0}}z(U),$(U,e)}function W(){z(U),z(q),z(H)}function Q(e){null!==e.memoizedState&&$(G,e);var t=U.current,n=od(t,e.type);t!==n&&($(q,e),$(U,n))}function K(e){q.current===e&&(z(U),z(q)),G.current===e&&(z(G),Qd._currentValue=D)}var Y=Object.prototype.hasOwnProperty,X=r.unstable_scheduleCallback,Z=r.unstable_cancelCallback,J=r.unstable_shouldYield,ee=r.unstable_requestPaint,te=r.unstable_now,ne=r.unstable_getCurrentPriorityLevel,re=r.unstable_ImmediatePriority,ae=r.unstable_UserBlockingPriority,oe=r.unstable_NormalPriority,ie=r.unstable_LowPriority,le=r.unstable_IdlePriority,se=r.log,ce=r.unstable_setDisableYieldValue,ue=null,de=null;function fe(e){if("function"==typeof se&&ce(e),de&&"function"==typeof de.setStrictMode)try{de.setStrictMode(ue,e)}catch(t){}}var pe=Math.clz32?Math.clz32:function(e){return 0===(e>>>=0)?32:31-(he(e)/me|0)|0},he=Math.log,me=Math.LN2;var ge=256,be=4194304;function ye(e){var t=42&e;if(0!==t)return t;switch(e&-e){case 1:return 1;case 2:return 2;case 4:return 4;case 8:return 8;case 16:return 16;case 32:return 32;case 64:return 64;case 128:return 128;case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return 4194048&e;case 4194304:case 8388608:case 16777216:case 33554432:return 62914560&e;case 67108864:return 67108864;case 134217728:return 134217728;case 268435456:return 268435456;case 536870912:return 536870912;case 1073741824:return 0;default:return e}}function ve(e,t,n){var r=e.pendingLanes;if(0===r)return 0;var 
a=0,o=e.suspendedLanes,i=e.pingedLanes;e=e.warmLanes;var l=134217727&r;return 0!==l?0!==(r=l&~o)?a=ye(r):0!==(i&=l)?a=ye(i):n||0!==(n=l&~e)&&(a=ye(n)):0!==(l=r&~o)?a=ye(l):0!==i?a=ye(i):n||0!==(n=r&~e)&&(a=ye(n)),0===a?0:0!==t&&t!==a&&0===(t&o)&&((o=a&-a)>=(n=t&-t)||32===o&&4194048&n)?t:a}function we(e,t){return 0===(e.pendingLanes&~(e.suspendedLanes&~e.pingedLanes)&t)}function ke(e,t){switch(e){case 1:case 2:case 4:case 8:case 64:return t+250;case 16:case 32:case 128:case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return t+5e3;default:return-1}}function Se(){var e=ge;return!(4194048&(ge<<=1))&&(ge=256),e}function xe(){var e=be;return!(62914560&(be<<=1))&&(be=4194304),e}function _e(e){for(var t=[],n=0;31>n;n++)t.push(e);return t}function Ee(e,t){e.pendingLanes|=t,268435456!==t&&(e.suspendedLanes=0,e.pingedLanes=0,e.warmLanes=0)}function Ce(e,t,n){e.pendingLanes|=t,e.suspendedLanes&=~t;var r=31-pe(t);e.entangledLanes|=t,e.entanglements[r]=1073741824|e.entanglements[r]|4194090&n}function Ae(e,t){var n=e.entangledLanes|=t;for(e=e.entanglements;n;){var r=31-pe(n),a=1<)":-1--a||s[r]!==c[a]){var u="\n"+s[r].replace(" at new "," at ");return e.displayName&&u.includes("")&&(u=u.replace("",e.displayName)),u}}while(1<=r&&0<=a);break}}}finally{ot=!1,Error.prepareStackTrace=n}return(n=e?e.displayName||e.name:"")?at(n):""}function lt(e){switch(e.tag){case 26:case 27:case 5:return at(e.type);case 16:return at("Lazy");case 13:return at("Suspense");case 19:return at("SuspenseList");case 0:case 15:return it(e.type,!1);case 11:return it(e.type.render,!1);case 1:return it(e.type,!0);case 31:return at("Activity");default:return""}}function st(e){try{var t="";do{t+=lt(e),e=e.return}while(e);return t}catch(n){return"\nError generating stack: "+n.message+"\n"+n.stack}}function ct(e){switch(typeof 
e){case"bigint":case"boolean":case"number":case"string":case"undefined":case"object":return e;default:return""}}function ut(e){var t=e.type;return(e=e.nodeName)&&"input"===e.toLowerCase()&&("checkbox"===t||"radio"===t)}function dt(e){e._valueTracker||(e._valueTracker=function(e){var t=ut(e)?"checked":"value",n=Object.getOwnPropertyDescriptor(e.constructor.prototype,t),r=""+e[t];if(!e.hasOwnProperty(t)&&void 0!==n&&"function"==typeof n.get&&"function"==typeof n.set){var a=n.get,o=n.set;return Object.defineProperty(e,t,{configurable:!0,get:function(){return a.call(this)},set:function(e){r=""+e,o.call(this,e)}}),Object.defineProperty(e,t,{enumerable:n.enumerable}),{getValue:function(){return r},setValue:function(e){r=""+e},stopTracking:function(){e._valueTracker=null,delete e[t]}}}}(e))}function ft(e){if(!e)return!1;var t=e._valueTracker;if(!t)return!0;var n=t.getValue(),r="";return e&&(r=ut(e)?e.checked?"true":"false":e.value),(e=r)!==n&&(t.setValue(e),!0)}function pt(e){if(void 0===(e=e||("undefined"!=typeof document?document:void 0)))return null;try{return e.activeElement||e.body}catch(t){return e.body}}var ht=/[\n"\\]/g;function mt(e){return e.replace(ht,function(e){return"\\"+e.charCodeAt(0).toString(16)+" "})}function gt(e,t,n,r,a,o,i,l){e.name="",null!=i&&"function"!=typeof i&&"symbol"!=typeof i&&"boolean"!=typeof i?e.type=i:e.removeAttribute("type"),null!=t?"number"===i?(0===t&&""===e.value||e.value!=t)&&(e.value=""+ct(t)):e.value!==""+ct(t)&&(e.value=""+ct(t)):"submit"!==i&&"reset"!==i||e.removeAttribute("value"),null!=t?yt(e,i,ct(t)):null!=n?yt(e,i,ct(n)):null!=r&&e.removeAttribute("value"),null==a&&null!=o&&(e.defaultChecked=!!o),null!=a&&(e.checked=a&&"function"!=typeof a&&"symbol"!=typeof a),null!=l&&"function"!=typeof l&&"symbol"!=typeof l&&"boolean"!=typeof l?e.name=""+ct(l):e.removeAttribute("name")}function bt(e,t,n,r,a,o,i,l){if(null!=o&&"function"!=typeof o&&"symbol"!=typeof o&&"boolean"!=typeof 
o&&(e.type=o),null!=t||null!=n){if(("submit"===o||"reset"===o)&&null==t)return;n=null!=n?""+ct(n):"",t=null!=t?""+ct(t):n,l||t===e.value||(e.value=t),e.defaultValue=t}r="function"!=typeof(r=null!=r?r:a)&&"symbol"!=typeof r&&!!r,e.checked=l?e.checked:!!r,e.defaultChecked=!!r,null!=i&&"function"!=typeof i&&"symbol"!=typeof i&&"boolean"!=typeof i&&(e.name=i)}function yt(e,t,n){"number"===t&&pt(e.ownerDocument)===e||e.defaultValue===""+n||(e.defaultValue=""+n)}function vt(e,t,n,r){if(e=e.options,t){t={};for(var a=0;a=xn),Cn=String.fromCharCode(32),An=!1;function Ln(e,t){switch(e){case"keyup":return-1!==kn.indexOf(t.keyCode);case"keydown":return 229!==t.keyCode;case"keypress":case"mousedown":case"focusout":return!0;default:return!1}}function Tn(e){return"object"==typeof(e=e.detail)&&"data"in e?e.data:null}var jn=!1;var Pn={color:!0,date:!0,datetime:!0,"datetime-local":!0,email:!0,month:!0,number:!0,password:!0,range:!0,search:!0,tel:!0,text:!0,time:!0,url:!0,week:!0};function Mn(e){var t=e&&e.nodeName&&e.nodeName.toLowerCase();return"input"===t?!!Pn[e.type]:"textarea"===t}function Nn(e,t,n,r){Mt?Nt?Nt.push(r):Nt=[r]:Mt=r,0<(t=Hu(t,"onChange")).length&&(n=new Jt("onChange","change",null,n,r),e.push({event:n,listeners:t}))}var On=null,Rn=null;function Dn(e){Du(e,0)}function Bn(e){if(ft(qe(e)))return e}function Fn(e,t){if("change"===e)return t}var In=!1;if(Ft){var zn;if(Ft){var $n="oninput"in document;if(!$n){var Un=document.createElement("div");Un.setAttribute("oninput","return;"),$n="function"==typeof Un.oninput}zn=$n}else zn=!1;In=zn&&(!document.documentMode||9=t)return{node:r,offset:t-e};e=n}e:{for(;r;){if(r.nextSibling){r=r.nextSibling;break e}r=r.parentNode}r=void 0}r=Xn(r)}}function Jn(e,t){return!(!e||!t)&&(e===t||(!e||3!==e.nodeType)&&(t&&3===t.nodeType?Jn(e,t.parentNode):"contains"in e?e.contains(t):!!e.compareDocumentPosition&&!!(16&e.compareDocumentPosition(t))))}function er(e){for(var 
t=pt((e=null!=e&&null!=e.ownerDocument&&null!=e.ownerDocument.defaultView?e.ownerDocument.defaultView:window).document);t instanceof e.HTMLIFrameElement;){try{var n="string"==typeof t.contentWindow.location.href}catch(r){n=!1}if(!n)break;t=pt((e=t.contentWindow).document)}return t}function tr(e){var t=e&&e.nodeName&&e.nodeName.toLowerCase();return t&&("input"===t&&("text"===e.type||"search"===e.type||"tel"===e.type||"url"===e.type||"password"===e.type)||"textarea"===t||"true"===e.contentEditable)}var nr=Ft&&"documentMode"in document&&11>=document.documentMode,rr=null,ar=null,or=null,ir=!1;function lr(e,t,n){var r=n.window===n?n.document:9===n.nodeType?n:n.ownerDocument;ir||null==rr||rr!==pt(r)||("selectionStart"in(r=rr)&&tr(r)?r={start:r.selectionStart,end:r.selectionEnd}:r={anchorNode:(r=(r.ownerDocument&&r.ownerDocument.defaultView||window).getSelection()).anchorNode,anchorOffset:r.anchorOffset,focusNode:r.focusNode,focusOffset:r.focusOffset},or&&Yn(or,r)||(or=r,0<(r=Hu(ar,"onSelect")).length&&(t=new Jt("onSelect","select",null,t,n),e.push({event:t,listeners:r}),t.target=rr)))}function sr(e,t){var n={};return n[e.toLowerCase()]=t.toLowerCase(),n["Webkit"+e]="webkit"+t,n["Moz"+e]="moz"+t,n}var cr={animationend:sr("Animation","AnimationEnd"),animationiteration:sr("Animation","AnimationIteration"),animationstart:sr("Animation","AnimationStart"),transitionrun:sr("Transition","TransitionRun"),transitionstart:sr("Transition","TransitionStart"),transitioncancel:sr("Transition","TransitionCancel"),transitionend:sr("Transition","TransitionEnd")},ur={},dr={};function fr(e){if(ur[e])return ur[e];if(!cr[e])return e;var t,n=cr[e];for(t in n)if(n.hasOwnProperty(t)&&t in dr)return ur[e]=n[t];return e}Ft&&(dr=document.createElement("div").style,"AnimationEvent"in window||(delete cr.animationend.animation,delete cr.animationiteration.animation,delete cr.animationstart.animation),"TransitionEvent"in window||delete cr.transitionend.transition);var 
pr=fr("animationend"),hr=fr("animationiteration"),mr=fr("animationstart"),gr=fr("transitionrun"),br=fr("transitionstart"),yr=fr("transitioncancel"),vr=fr("transitionend"),wr=new Map,kr="abort auxClick beforeToggle cancel canPlay canPlayThrough click close contextMenu copy cut drag dragEnd dragEnter dragExit dragLeave dragOver dragStart drop durationChange emptied encrypted ended error gotPointerCapture input invalid keyDown keyPress keyUp load loadedData loadedMetadata loadStart lostPointerCapture mouseDown mouseMove mouseOut mouseOver mouseUp paste pause play playing pointerCancel pointerDown pointerMove pointerOut pointerOver pointerUp progress rateChange reset resize seeked seeking stalled submit suspend timeUpdate touchCancel touchEnd touchStart volumeChange scroll toggle touchMove waiting wheel".split(" ");function Sr(e,t){wr.set(e,t),Qe(t,[e])}kr.push("scrollEnd");var xr=new WeakMap;function _r(e,t){if("object"==typeof e&&null!==e){var n=xr.get(e);return void 0!==n?n:(t={value:e,source:t,stack:st(t)},xr.set(e,t),t)}return{value:e,source:t,stack:st(t)}}var Er=[],Cr=0,Ar=0;function Lr(){for(var e=Cr,t=Ar=Cr=0;t>=i,a-=i,Xr=1<<32-pe(t)+a|n<o?o:8;var i,l,s,c=O.T,u={};O.T=u,$i(e,!1,t,n);try{var d=a(),f=O.S;if(null!==f&&f(u,d),null!==d&&"object"==typeof d&&"function"==typeof d.then)zi(e,t,(i=r,l=[],s={status:"pending",value:null,reason:null,then:function(e){l.push(e)}},d.then(function(){s.status="fulfilled",s.value=i;for(var e=0;eh?(m=d,d=null):m=d.sibling;var g=p(a,d,l[h],s);if(null===g){null===d&&(d=m);break}e&&d&&null===g.alternate&&t(a,d),i=o(g,i,h),null===u?c=g:u.sibling=g,u=g,d=m}if(h===l.length)return n(a,d),oa&&Jr(a,h),c;if(null===d){for(;hm?(g=h,h=null):g=h.sibling;var v=p(a,h,y.value,c);if(null===v){null===h&&(h=g);break}e&&h&&null===v.alternate&&t(a,h),l=o(v,l,m),null===d?u=v:d.sibling=v,d=v,h=g}if(y.done)return n(a,h),oa&&Jr(a,m),u;if(null===h){for(;!y.done;m++,y=s.next())null!==(y=f(a,y.value,c))&&(l=o(y,l,m),null===d?u=y:d.sibling=y,d=y);return 
oa&&Jr(a,m),u}for(h=r(h);!y.done;m++,y=s.next())null!==(y=b(h,a,m,y.value,c))&&(e&&null!==y.alternate&&h.delete(null===y.key?m:y.key),l=o(y,l,m),null===d?u=y:d.sibling=y,d=y);return e&&h.forEach(function(e){return t(a,e)}),oa&&Jr(a,m),u}(s,c,u=v.call(u),d)}if("function"==typeof u.then)return y(s,c,Xi(u),d);if(u.$$typeof===k)return y(s,c,Aa(s,u),d);Ji(s,u)}return"string"==typeof u&&""!==u||"number"==typeof u||"bigint"==typeof u?(u=""+u,null!==c&&6===c.tag?(n(s,c.sibling),(d=a(c,u)).return=s,s=d):(n(s,c),(d=Ur(u,s.mode,d)).return=s,s=d),l(s)):n(s,c)}return function(e,t,n,r){try{Yi=0;var a=y(e,t,n,r);return Ki=null,a}catch(i){if(i===Ga||i===Wa)throw i;var o=Dr(29,i,null,e.mode);return o.lanes=r,o.return=e,o}}}var nl=tl(!0),rl=tl(!1),al=I(null),ol=null;function il(e){var t=e.alternate;$(ul,1&ul.current),$(al,e),null===ol&&(null===t||null!==ho.current||null!==t.memoizedState)&&(ol=e)}function ll(e){if(22===e.tag){if($(ul,ul.current),$(al,e),null===ol){var t=e.alternate;null!==t&&null!==t.memoizedState&&(ol=e)}}else sl()}function sl(){$(ul,ul.current),$(al,al.current)}function cl(e){z(al),ol===e&&(ol=null),z(ul)}var ul=I(0);function dl(e){for(var t=e;null!==t;){if(13===t.tag){var n=t.memoizedState;if(null!==n&&(null===(n=n.dehydrated)||"$?"===n.data||gd(n)))return t}else if(19===t.tag&&void 0!==t.memoizedProps.revealOrder){if(128&t.flags)return t}else if(null!==t.child){t.child.return=t,t=t.child;continue}if(t===e)break;for(;null===t.sibling;){if(null===t.return||t.return===e)return null;t=t.return}t.sibling.return=t.return,t=t.sibling}return null}function fl(e,t,n,r){n=null==(n=n(r,t=e.memoizedState))?t:f({},t,n),e.memoizedState=n,0===e.lanes&&(e.updateQueue.baseState=n)}var pl={enqueueSetState:function(e,t,n){e=e._reactInternals;var r=Oc(),a=ao(r);a.payload=t,null!=n&&(a.callback=n),null!==(t=oo(e,a,r))&&(Dc(t,e,r),io(t,e,r))},enqueueReplaceState:function(e,t,n){e=e._reactInternals;var 
r=Oc(),a=ao(r);a.tag=1,a.payload=t,null!=n&&(a.callback=n),null!==(t=oo(e,a,r))&&(Dc(t,e,r),io(t,e,r))},enqueueForceUpdate:function(e,t){e=e._reactInternals;var n=Oc(),r=ao(n);r.tag=2,null!=t&&(r.callback=t),null!==(t=oo(e,r,n))&&(Dc(t,e,n),io(t,e,n))}};function hl(e,t,n,r,a,o,i){return"function"==typeof(e=e.stateNode).shouldComponentUpdate?e.shouldComponentUpdate(r,o,i):!t.prototype||!t.prototype.isPureReactComponent||(!Yn(n,r)||!Yn(a,o))}function ml(e,t,n,r){e=t.state,"function"==typeof t.componentWillReceiveProps&&t.componentWillReceiveProps(n,r),"function"==typeof t.UNSAFE_componentWillReceiveProps&&t.UNSAFE_componentWillReceiveProps(n,r),t.state!==e&&pl.enqueueReplaceState(t,t.state,null)}function gl(e,t){var n=t;if("ref"in t)for(var r in n={},t)"ref"!==r&&(n[r]=t[r]);if(e=e.defaultProps)for(var a in n===t&&(n=f({},n)),e)void 0===n[a]&&(n[a]=e[a]);return n}var bl="function"==typeof reportError?reportError:function(e){if("object"==typeof window&&"function"==typeof window.ErrorEvent){var t=new window.ErrorEvent("error",{bubbles:!0,cancelable:!0,message:"object"==typeof e&&null!==e&&"string"==typeof e.message?String(e.message):String(e),error:e});if(!window.dispatchEvent(t))return}else if("object"==typeof process&&"function"==typeof process.emit)return void process.emit("uncaughtException",e);console.error(e)};function yl(e){bl(e)}function vl(e){console.error(e)}function wl(e){bl(e)}function kl(e,t){try{(0,e.onUncaughtError)(t.value,{componentStack:t.stack})}catch(n){setTimeout(function(){throw n})}}function Sl(e,t,n){try{(0,e.onCaughtError)(n.value,{componentStack:n.stack,errorBoundary:1===t.tag?t.stateNode:null})}catch(r){setTimeout(function(){throw r})}}function xl(e,t,n){return(n=ao(n)).tag=3,n.payload={element:null},n.callback=function(){kl(e,t)},n}function _l(e){return(e=ao(e)).tag=3,e}function El(e,t,n,r){var a=n.type.getDerivedStateFromError;if("function"==typeof a){var o=r.value;e.payload=function(){return a(o)},e.callback=function(){Sl(t,n,r)}}var 
i=n.stateNode;null!==i&&"function"==typeof i.componentDidCatch&&(e.callback=function(){Sl(t,n,r),"function"!=typeof a&&(null===_c?_c=new Set([this]):_c.add(this));var e=r.stack;this.componentDidCatch(r.value,{componentStack:null!==e?e:""})})}var Cl=Error(i(461)),Al=!1;function Ll(e,t,n,r){t.child=null===e?rl(t,null,n,r):nl(t,e.child,n,r)}function Tl(e,t,n,r,a){n=n.render;var o=t.ref;if("ref"in r){var i={};for(var l in r)"ref"!==l&&(i[l]=r[l])}else i=r;return Ea(t),r=Mo(e,t,n,i,o,a),l=Do(),null===e||Al?(oa&&l&&ta(t),t.flags|=1,Ll(e,t,r,a),t.child):(Bo(e,t,a),Kl(e,t,a))}function jl(e,t,n,r,a){if(null===e){var o=n.type;return"function"!=typeof o||Br(o)||void 0!==o.defaultProps||null!==n.compare?((e=zr(n.type,null,r,t,t.mode,a)).ref=t.ref,e.return=t,t.child=e):(t.tag=15,t.type=o,Pl(e,t,o,r,a))}if(o=e.child,!Yl(e,a)){var i=o.memoizedProps;if((n=null!==(n=n.compare)?n:Yn)(i,r)&&e.ref===t.ref)return Kl(e,t,a)}return t.flags|=1,(e=Fr(o,r)).ref=t.ref,e.return=t,t.child=e}function Pl(e,t,n,r,a){if(null!==e){var o=e.memoizedProps;if(Yn(o,r)&&e.ref===t.ref){if(Al=!1,t.pendingProps=r=o,!Yl(e,a))return t.lanes=e.lanes,Kl(e,t,a);131072&e.flags&&(Al=!0)}}return Rl(e,t,n,r,a)}function Ml(e,t,n){var r=t.pendingProps,a=r.children,o=null!==e?e.memoizedState:null;if("hidden"===r.mode){if(128&t.flags){if(r=null!==o?o.baseLanes|n:n,null!==e){for(a=t.child=e.child,o=0;null!==a;)o=o|a.lanes|a.childLanes,a=a.sibling;t.childLanes=o&~r}else t.childLanes=0,t.child=null;return Nl(e,t,r,n)}if(!(536870912&n))return t.lanes=t.childLanes=536870912,Nl(e,t,null!==o?o.baseLanes|n:n,n);t.memoizedState={baseLanes:0,cachePool:null},null!==e&&qa(0,null!==o?o.cachePool:null),null!==o?go(t,o):bo(),ll(t)}else null!==o?(qa(0,o.cachePool),go(t,o),sl(),t.memoizedState=null):(null!==e&&qa(0,null),bo(),sl());return Ll(e,t,a,n),t.child}function Nl(e,t,n,r){var a=Ua();return 
a=null===a?null:{parent:Ma._currentValue,pool:a},t.memoizedState={baseLanes:n,cachePool:a},null!==e&&qa(0,null),bo(),ll(t),null!==e&&xa(e,t,r,!0),null}function Ol(e,t){var n=t.ref;if(null===n)null!==e&&null!==e.ref&&(t.flags|=4194816);else{if("function"!=typeof n&&"object"!=typeof n)throw Error(i(284));null!==e&&e.ref===n||(t.flags|=4194816)}}function Rl(e,t,n,r,a){return Ea(t),n=Mo(e,t,n,r,void 0,a),r=Do(),null===e||Al?(oa&&r&&ta(t),t.flags|=1,Ll(e,t,n,a),t.child):(Bo(e,t,a),Kl(e,t,a))}function Dl(e,t,n,r,a,o){return Ea(t),t.updateQueue=null,n=Oo(t,r,n,a),No(e),r=Do(),null===e||Al?(oa&&r&&ta(t),t.flags|=1,Ll(e,t,n,o),t.child):(Bo(e,t,o),Kl(e,t,o))}function Bl(e,t,n,r,a){if(Ea(t),null===t.stateNode){var o=Or,i=n.contextType;"object"==typeof i&&null!==i&&(o=Ca(i)),o=new n(r,o),t.memoizedState=null!==o.state&&void 0!==o.state?o.state:null,o.updater=pl,t.stateNode=o,o._reactInternals=t,(o=t.stateNode).props=r,o.state=t.memoizedState,o.refs={},no(t),i=n.contextType,o.context="object"==typeof i&&null!==i?Ca(i):Or,o.state=t.memoizedState,"function"==typeof(i=n.getDerivedStateFromProps)&&(fl(t,n,i,r),o.state=t.memoizedState),"function"==typeof n.getDerivedStateFromProps||"function"==typeof o.getSnapshotBeforeUpdate||"function"!=typeof o.UNSAFE_componentWillMount&&"function"!=typeof o.componentWillMount||(i=o.state,"function"==typeof o.componentWillMount&&o.componentWillMount(),"function"==typeof o.UNSAFE_componentWillMount&&o.UNSAFE_componentWillMount(),i!==o.state&&pl.enqueueReplaceState(o,o.state,null),uo(t,r,o,a),co(),o.state=t.memoizedState),"function"==typeof o.componentDidMount&&(t.flags|=4194308),r=!0}else if(null===e){o=t.stateNode;var l=t.memoizedProps,s=gl(n,l);o.props=s;var c=o.context,u=n.contextType;i=Or,"object"==typeof u&&null!==u&&(i=Ca(u));var d=n.getDerivedStateFromProps;u="function"==typeof d||"function"==typeof o.getSnapshotBeforeUpdate,l=t.pendingProps!==l,u||"function"!=typeof o.UNSAFE_componentWillReceiveProps&&"function"!=typeof 
o.componentWillReceiveProps||(l||c!==i)&&ml(t,o,r,i),to=!1;var f=t.memoizedState;o.state=f,uo(t,r,o,a),co(),c=t.memoizedState,l||f!==c||to?("function"==typeof d&&(fl(t,n,d,r),c=t.memoizedState),(s=to||hl(t,n,s,r,f,c,i))?(u||"function"!=typeof o.UNSAFE_componentWillMount&&"function"!=typeof o.componentWillMount||("function"==typeof o.componentWillMount&&o.componentWillMount(),"function"==typeof o.UNSAFE_componentWillMount&&o.UNSAFE_componentWillMount()),"function"==typeof o.componentDidMount&&(t.flags|=4194308)):("function"==typeof o.componentDidMount&&(t.flags|=4194308),t.memoizedProps=r,t.memoizedState=c),o.props=r,o.state=c,o.context=i,r=s):("function"==typeof o.componentDidMount&&(t.flags|=4194308),r=!1)}else{o=t.stateNode,ro(e,t),u=gl(n,i=t.memoizedProps),o.props=u,d=t.pendingProps,f=o.context,c=n.contextType,s=Or,"object"==typeof c&&null!==c&&(s=Ca(c)),(c="function"==typeof(l=n.getDerivedStateFromProps)||"function"==typeof o.getSnapshotBeforeUpdate)||"function"!=typeof o.UNSAFE_componentWillReceiveProps&&"function"!=typeof o.componentWillReceiveProps||(i!==d||f!==s)&&ml(t,o,r,s),to=!1,f=t.memoizedState,o.state=f,uo(t,r,o,a),co();var p=t.memoizedState;i!==d||f!==p||to||null!==e&&null!==e.dependencies&&_a(e.dependencies)?("function"==typeof l&&(fl(t,n,l,r),p=t.memoizedState),(u=to||hl(t,n,u,r,f,p,s)||null!==e&&null!==e.dependencies&&_a(e.dependencies))?(c||"function"!=typeof o.UNSAFE_componentWillUpdate&&"function"!=typeof o.componentWillUpdate||("function"==typeof o.componentWillUpdate&&o.componentWillUpdate(r,p,s),"function"==typeof o.UNSAFE_componentWillUpdate&&o.UNSAFE_componentWillUpdate(r,p,s)),"function"==typeof o.componentDidUpdate&&(t.flags|=4),"function"==typeof o.getSnapshotBeforeUpdate&&(t.flags|=1024)):("function"!=typeof o.componentDidUpdate||i===e.memoizedProps&&f===e.memoizedState||(t.flags|=4),"function"!=typeof 
o.getSnapshotBeforeUpdate||i===e.memoizedProps&&f===e.memoizedState||(t.flags|=1024),t.memoizedProps=r,t.memoizedState=p),o.props=r,o.state=p,o.context=s,r=u):("function"!=typeof o.componentDidUpdate||i===e.memoizedProps&&f===e.memoizedState||(t.flags|=4),"function"!=typeof o.getSnapshotBeforeUpdate||i===e.memoizedProps&&f===e.memoizedState||(t.flags|=1024),r=!1)}return o=r,Ol(e,t),r=!!(128&t.flags),o||r?(o=t.stateNode,n=r&&"function"!=typeof n.getDerivedStateFromError?null:o.render(),t.flags|=1,null!==e&&r?(t.child=nl(t,e.child,null,a),t.child=nl(t,null,n,a)):Ll(e,t,n,a),t.memoizedState=o.state,e=t.child):e=Kl(e,t,a),e}function Fl(e,t,n,r){return pa(),t.flags|=256,Ll(e,t,n,r),t.child}var Il={dehydrated:null,treeContext:null,retryLane:0,hydrationErrors:null};function zl(e){return{baseLanes:e,cachePool:Ha()}}function $l(e,t,n){return e=null!==e?e.childLanes&~n:0,t&&(e|=gc),e}function Ul(e,t,n){var r,a=t.pendingProps,o=!1,l=!!(128&t.flags);if((r=l)||(r=(null===e||null!==e.memoizedState)&&!!(2&ul.current)),r&&(o=!0,t.flags&=-129),r=!!(32&t.flags),t.flags&=-33,null===e){if(oa){if(o?il(t):sl(),oa){var s,c=aa;if(s=c){e:{for(s=c,c=la;8!==s.nodeType;){if(!c){c=null;break e}if(null===(s=bd(s.nextSibling))){c=null;break e}}c=s}null!==c?(t.memoizedState={dehydrated:c,treeContext:null!==Yr?{id:Xr,overflow:Zr}:null,retryLane:536870912,hydrationErrors:null},(s=Dr(18,null,null,0)).stateNode=c,s.return=t,t.child=s,ra=t,aa=null,s=!0):s=!1}s||ca(t)}if(null!==(c=t.memoizedState)&&null!==(c=c.dehydrated))return gd(c)?t.lanes=32:t.lanes=536870912,null;cl(t)}return 
c=a.children,a=a.fallback,o?(sl(),c=Hl({mode:"hidden",children:c},o=t.mode),a=$r(a,o,n,null),c.return=t,a.return=t,c.sibling=a,t.child=c,(o=t.child).memoizedState=zl(n),o.childLanes=$l(e,r,n),t.memoizedState=Il,a):(il(t),ql(t,c))}if(null!==(s=e.memoizedState)&&null!==(c=s.dehydrated)){if(l)256&t.flags?(il(t),t.flags&=-257,t=Gl(e,t,n)):null!==t.memoizedState?(sl(),t.child=e.child,t.flags|=128,t=null):(sl(),o=a.fallback,c=t.mode,a=Hl({mode:"visible",children:a.children},c),(o=$r(o,c,n,null)).flags|=2,a.return=t,o.return=t,a.sibling=o,t.child=a,nl(t,e.child,null,n),(a=t.child).memoizedState=zl(n),a.childLanes=$l(e,r,n),t.memoizedState=Il,t=o);else if(il(t),gd(c)){if(r=c.nextSibling&&c.nextSibling.dataset)var u=r.dgst;r=u,(a=Error(i(419))).stack="",a.digest=r,ma({value:a,source:null,stack:null}),t=Gl(e,t,n)}else if(Al||xa(e,t,n,!1),r=0!==(n&e.childLanes),Al||r){if(null!==(r=rc)&&(0!==(a=0!==((a=42&(a=n&-n)?1:Le(a))&(r.suspendedLanes|n))?0:a)&&a!==s.retryLane))throw s.retryLane=a,Pr(e,a),Dc(r,e,a),Cl;"$?"===c.data||Wc(),t=Gl(e,t,n)}else"$?"===c.data?(t.flags|=192,t.child=e.child,t=null):(e=s.treeContext,aa=bd(c.nextSibling),ra=t,oa=!0,ia=null,la=!1,null!==e&&(Qr[Kr++]=Xr,Qr[Kr++]=Zr,Qr[Kr++]=Yr,Xr=e.id,Zr=e.overflow,Yr=t),(t=ql(t,a.children)).flags|=4096);return t}return o?(sl(),o=a.fallback,c=t.mode,u=(s=e.child).sibling,(a=Fr(s,{mode:"hidden",children:a.children})).subtreeFlags=65011712&s.subtreeFlags,null!==u?o=Fr(u,o):(o=$r(o,c,n,null)).flags|=2,o.return=t,a.return=t,a.sibling=o,t.child=a,a=o,o=t.child,null===(c=e.child.memoizedState)?c=zl(n):(null!==(s=c.cachePool)?(u=Ma._currentValue,s=s.parent!==u?{parent:u,pool:u}:s):s=Ha(),c={baseLanes:c.baseLanes|n,cachePool:s}),o.memoizedState=c,o.childLanes=$l(e,r,n),t.memoizedState=Il,a):(il(t),e=(n=e.child).sibling,(n=Fr(n,{mode:"visible",children:a.children})).return=t,n.sibling=null,null!==e&&(null===(r=t.deletions)?(t.deletions=[e],t.flags|=16):r.push(e)),t.child=n,t.memoizedState=null,n)}function 
ql(e,t){return(t=Hl({mode:"visible",children:t},e.mode)).return=e,e.child=t}function Hl(e,t){return(e=Dr(22,e,null,t)).lanes=0,e.stateNode={_visibility:1,_pendingMarkers:null,_retryCache:null,_transitions:null},e}function Gl(e,t,n){return nl(t,e.child,null,n),(e=ql(t,t.pendingProps.children)).flags|=2,t.memoizedState=null,e}function Vl(e,t,n){e.lanes|=t;var r=e.alternate;null!==r&&(r.lanes|=t),ka(e.return,t,n)}function Wl(e,t,n,r,a){var o=e.memoizedState;null===o?e.memoizedState={isBackwards:t,rendering:null,renderingStartTime:0,last:r,tail:n,tailMode:a}:(o.isBackwards=t,o.rendering=null,o.renderingStartTime=0,o.last=r,o.tail=n,o.tailMode=a)}function Ql(e,t,n){var r=t.pendingProps,a=r.revealOrder,o=r.tail;if(Ll(e,t,r.children,n),2&(r=ul.current))r=1&r|2,t.flags|=128;else{if(null!==e&&128&e.flags)e:for(e=t.child;null!==e;){if(13===e.tag)null!==e.memoizedState&&Vl(e,n,t);else if(19===e.tag)Vl(e,n,t);else if(null!==e.child){e.child.return=e,e=e.child;continue}if(e===t)break e;for(;null===e.sibling;){if(null===e.return||e.return===t)break e;e=e.return}e.sibling.return=e.return,e=e.sibling}r&=1}switch($(ul,r),a){case"forwards":for(n=t.child,a=null;null!==n;)null!==(e=n.alternate)&&null===dl(e)&&(a=n),n=n.sibling;null===(n=a)?(a=t.child,t.child=null):(a=n.sibling,n.sibling=null),Wl(t,!1,a,n,o);break;case"backwards":for(n=null,a=t.child,t.child=null;null!==a;){if(null!==(e=a.alternate)&&null===dl(e)){t.child=a;break}e=a.sibling,a.sibling=n,n=a,a=e}Wl(t,!0,n,null,o);break;case"together":Wl(t,!1,null,null,void 0);break;default:t.memoizedState=null}return t.child}function Kl(e,t,n){if(null!==e&&(t.dependencies=e.dependencies),pc|=t.lanes,0===(n&t.childLanes)){if(null===e)return null;if(xa(e,t,n,!1),0===(n&t.childLanes))return null}if(null!==e&&t.child!==e.child)throw Error(i(153));if(null!==t.child){for(n=Fr(e=t.child,e.pendingProps),t.child=n,n.return=t;null!==e.sibling;)e=e.sibling,(n=n.sibling=Fr(e,e.pendingProps)).return=t;n.sibling=null}return t.child}function 
Yl(e,t){return 0!==(e.lanes&t)||!(null===(e=e.dependencies)||!_a(e))}function Xl(e,t,n){if(null!==e)if(e.memoizedProps!==t.pendingProps)Al=!0;else{if(!(Yl(e,n)||128&t.flags))return Al=!1,function(e,t,n){switch(t.tag){case 3:V(t,t.stateNode.containerInfo),va(0,Ma,e.memoizedState.cache),pa();break;case 27:case 5:Q(t);break;case 4:V(t,t.stateNode.containerInfo);break;case 10:va(0,t.type,t.memoizedProps.value);break;case 13:var r=t.memoizedState;if(null!==r)return null!==r.dehydrated?(il(t),t.flags|=128,null):0!==(n&t.child.childLanes)?Ul(e,t,n):(il(t),null!==(e=Kl(e,t,n))?e.sibling:null);il(t);break;case 19:var a=!!(128&e.flags);if((r=0!==(n&t.childLanes))||(xa(e,t,n,!1),r=0!==(n&t.childLanes)),a){if(r)return Ql(e,t,n);t.flags|=128}if(null!==(a=t.memoizedState)&&(a.rendering=null,a.tail=null,a.lastEffect=null),$(ul,ul.current),r)break;return null;case 22:case 23:return t.lanes=0,Ml(e,t,n);case 24:va(0,Ma,e.memoizedState.cache)}return Kl(e,t,n)}(e,t,n);Al=!!(131072&e.flags)}else Al=!1,oa&&1048576&t.flags&&ea(t,Wr,t.index);switch(t.lanes=0,t.tag){case 16:e:{e=t.pendingProps;var r=t.elementType,a=r._init;if(r=a(r._payload),t.type=r,"function"!=typeof r){if(null!=r){if((a=r.$$typeof)===S){t.tag=11,t=Tl(null,t,r,e,n);break e}if(a===E){t.tag=14,t=jl(null,t,r,e,n);break e}}throw t=M(r)||r,Error(i(306,t,""))}Br(r)?(e=gl(r,e),t.tag=1,t=Bl(null,t,r,e,n)):(t.tag=0,t=Rl(null,t,r,e,n))}return t;case 0:return Rl(e,t,t.type,t.pendingProps,n);case 1:return Bl(e,t,r=t.type,a=gl(r,t.pendingProps),n);case 3:e:{if(V(t,t.stateNode.containerInfo),null===e)throw Error(i(387));r=t.pendingProps;var o=t.memoizedState;a=o.element,ro(e,t),uo(t,r,null,n);var l=t.memoizedState;if(r=l.cache,va(0,Ma,r),r!==o.cache&&Sa(t,[Ma],n,!0),co(),r=l.element,o.isDehydrated){if(o={element:r,isDehydrated:!1,cache:l.cache},t.updateQueue.baseState=o,t.memoizedState=o,256&t.flags){t=Fl(e,t,r,n);break e}if(r!==a){ma(a=_r(Error(i(424)),t)),t=Fl(e,t,r,n);break 
e}if(9===(e=t.stateNode.containerInfo).nodeType)e=e.body;else e="HTML"===e.nodeName?e.ownerDocument.body:e;for(aa=bd(e.firstChild),ra=t,oa=!0,ia=null,la=!0,n=rl(t,null,r,n),t.child=n;n;)n.flags=-3&n.flags|4096,n=n.sibling}else{if(pa(),r===a){t=Kl(e,t,n);break e}Ll(e,t,r,n)}t=t.child}return t;case 26:return Ol(e,t),null===e?(n=Ld(t.type,null,t.pendingProps,null))?t.memoizedState=n:oa||(n=t.type,e=t.pendingProps,(r=rd(H.current).createElement(n))[Me]=t,r[Ne]=e,ed(r,n,e),Ge(r),t.stateNode=r):t.memoizedState=Ld(t.type,e.memoizedProps,t.pendingProps,e.memoizedState),null;case 27:return Q(t),null===e&&oa&&(r=t.stateNode=wd(t.type,t.pendingProps,H.current),ra=t,la=!0,a=aa,pd(t.type)?(yd=a,aa=bd(r.firstChild)):aa=a),Ll(e,t,t.pendingProps.children,n),Ol(e,t),null===e&&(t.flags|=4194304),t.child;case 5:return null===e&&oa&&((a=r=aa)&&(null!==(r=function(e,t,n,r){for(;1===e.nodeType;){var a=n;if(e.nodeName.toLowerCase()!==t.toLowerCase()){if(!r&&("INPUT"!==e.nodeName||"hidden"!==e.type))break}else if(r){if(!e[Ie])switch(t){case"meta":if(!e.hasAttribute("itemprop"))break;return e;case"link":if("stylesheet"===(o=e.getAttribute("rel"))&&e.hasAttribute("data-precedence"))break;if(o!==a.rel||e.getAttribute("href")!==(null==a.href||""===a.href?null:a.href)||e.getAttribute("crossorigin")!==(null==a.crossOrigin?null:a.crossOrigin)||e.getAttribute("title")!==(null==a.title?null:a.title))break;return e;case"style":if(e.hasAttribute("data-precedence"))break;return e;case"script":if(((o=e.getAttribute("src"))!==(null==a.src?null:a.src)||e.getAttribute("type")!==(null==a.type?null:a.type)||e.getAttribute("crossorigin")!==(null==a.crossOrigin?null:a.crossOrigin))&&o&&e.hasAttribute("async")&&!e.hasAttribute("itemprop"))break;return e;default:return e}}else{if("input"!==t||"hidden"!==e.type)return e;var o=null==a.name?null:""+a.name;if("hidden"===a.type&&e.getAttribute("name")===o)return e}if(null===(e=bd(e.nextSibling)))break}return 
null}(r,t.type,t.pendingProps,la))?(t.stateNode=r,ra=t,aa=bd(r.firstChild),la=!1,a=!0):a=!1),a||ca(t)),Q(t),a=t.type,o=t.pendingProps,l=null!==e?e.memoizedProps:null,r=o.children,id(a,o)?r=null:null!==l&&id(a,l)&&(t.flags|=32),null!==t.memoizedState&&(a=Mo(e,t,Ro,null,null,n),Qd._currentValue=a),Ol(e,t),Ll(e,t,r,n),t.child;case 6:return null===e&&oa&&((e=n=aa)&&(null!==(n=function(e,t,n){if(""===t)return null;for(;3!==e.nodeType;){if((1!==e.nodeType||"INPUT"!==e.nodeName||"hidden"!==e.type)&&!n)return null;if(null===(e=bd(e.nextSibling)))return null}return e}(n,t.pendingProps,la))?(t.stateNode=n,ra=t,aa=null,e=!0):e=!1),e||ca(t)),null;case 13:return Ul(e,t,n);case 4:return V(t,t.stateNode.containerInfo),r=t.pendingProps,null===e?t.child=nl(t,null,r,n):Ll(e,t,r,n),t.child;case 11:return Tl(e,t,t.type,t.pendingProps,n);case 7:return Ll(e,t,t.pendingProps,n),t.child;case 8:case 12:return Ll(e,t,t.pendingProps.children,n),t.child;case 10:return r=t.pendingProps,va(0,t.type,r.value),Ll(e,t,r.children,n),t.child;case 9:return a=t.type._context,r=t.pendingProps.children,Ea(t),r=r(a=Ca(a)),t.flags|=1,Ll(e,t,r,n),t.child;case 14:return jl(e,t,t.type,t.pendingProps,n);case 15:return Pl(e,t,t.type,t.pendingProps,n);case 19:return Ql(e,t,n);case 31:return r=t.pendingProps,n=t.mode,r={mode:r.mode,children:r.children},null===e?((n=Hl(r,n)).ref=t.ref,t.child=n,n.return=t,t=n):((n=Fr(e.child,r)).ref=t.ref,t.child=n,n.return=t,t=n),t;case 22:return Ml(e,t,n);case 24:return Ea(t),r=Ca(Ma),null===e?(null===(a=Ua())&&(a=rc,o=Na(),a.pooledCache=o,o.refCount++,null!==o&&(a.pooledCacheLanes|=n),a=o),t.memoizedState={parent:r,cache:a},no(t),va(0,Ma,a)):(0!==(e.lanes&n)&&(ro(e,t),uo(t,null,null,n),co()),a=e.memoizedState,o=t.memoizedState,a.parent!==r?(a={parent:r,cache:r},t.memoizedState=a,0===t.lanes&&(t.memoizedState=t.updateQueue.baseState=a),va(0,Ma,r)):(r=o.cache,va(0,Ma,r),r!==a.cache&&Sa(t,[Ma],n,!0))),Ll(e,t,t.pendingProps.children,n),t.child;case 29:throw t.pendingProps}throw 
Error(i(156,t.tag))}function Zl(e){e.flags|=4}function Jl(e,t){if("stylesheet"!==t.type||4&t.state.loading)e.flags&=-16777217;else if(e.flags|=16777216,!$d(t)){if(null!==(t=al.current)&&((4194048&oc)===oc?null!==ol:(62914560&oc)!==oc&&!(536870912&oc)||t!==ol))throw Za=Qa,Va;e.flags|=8192}}function es(e,t){null!==t&&(e.flags|=4),16384&e.flags&&(t=22!==e.tag?xe():536870912,e.lanes|=t,bc|=t)}function ts(e,t){if(!oa)switch(e.tailMode){case"hidden":t=e.tail;for(var n=null;null!==t;)null!==t.alternate&&(n=t),t=t.sibling;null===n?e.tail=null:n.sibling=null;break;case"collapsed":n=e.tail;for(var r=null;null!==n;)null!==n.alternate&&(r=n),n=n.sibling;null===r?t||null===e.tail?e.tail=null:e.tail.sibling=null:r.sibling=null}}function ns(e){var t=null!==e.alternate&&e.alternate.child===e.child,n=0,r=0;if(t)for(var a=e.child;null!==a;)n|=a.lanes|a.childLanes,r|=65011712&a.subtreeFlags,r|=65011712&a.flags,a.return=e,a=a.sibling;else for(a=e.child;null!==a;)n|=a.lanes|a.childLanes,r|=a.subtreeFlags,r|=a.flags,a.return=e,a=a.sibling;return e.subtreeFlags|=r,e.childLanes=n,t}function rs(e,t,n){var r=t.pendingProps;switch(na(t),t.tag){case 31:case 16:case 15:case 0:case 11:case 7:case 8:case 12:case 9:case 14:case 1:return ns(t),null;case 3:return n=t.stateNode,r=null,null!==e&&(r=e.memoizedState.cache),t.memoizedState.cache!==r&&(t.flags|=2048),wa(Ma),W(),n.pendingContext&&(n.context=n.pendingContext,n.pendingContext=null),null!==e&&null!==e.child||(fa(t)?Zl(t):null===e||e.memoizedState.isDehydrated&&!(256&t.flags)||(t.flags|=1024,ha())),ns(t),null;case 26:return n=t.memoizedState,null===e?(Zl(t),null!==n?(ns(t),Jl(t,n)):(ns(t),t.flags&=-16777217)):n?n!==e.memoizedState?(Zl(t),ns(t),Jl(t,n)):(ns(t),t.flags&=-16777217):(e.memoizedProps!==r&&Zl(t),ns(t),t.flags&=-16777217),null;case 27:K(t),n=H.current;var a=t.type;if(null!==e&&null!=t.stateNode)e.memoizedProps!==r&&Zl(t);else{if(!r){if(null===t.stateNode)throw Error(i(166));return 
ns(t),null}e=U.current,fa(t)?ua(t):(e=wd(a,r,n),t.stateNode=e,Zl(t))}return ns(t),null;case 5:if(K(t),n=t.type,null!==e&&null!=t.stateNode)e.memoizedProps!==r&&Zl(t);else{if(!r){if(null===t.stateNode)throw Error(i(166));return ns(t),null}if(e=U.current,fa(t))ua(t);else{switch(a=rd(H.current),e){case 1:e=a.createElementNS("http://www.w3.org/2000/svg",n);break;case 2:e=a.createElementNS("http://www.w3.org/1998/Math/MathML",n);break;default:switch(n){case"svg":e=a.createElementNS("http://www.w3.org/2000/svg",n);break;case"math":e=a.createElementNS("http://www.w3.org/1998/Math/MathML",n);break;case"script":(e=a.createElement("div")).innerHTML=" - + + + - + \ No newline at end of file diff --git a/docs/blog/atom.xml b/docs/blog/atom.xml index 321c11b1..d11a6164 100644 --- a/docs/blog/atom.xml +++ b/docs/blog/atom.xml @@ -91,28 +91,28 @@ <![CDATA[Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving]]> - https://meesho.github.io/BharatMLStack/blog/post-three - + https://meesho.github.io/BharatMLStack/blog/post-four + 2025-03-29T00:00:00.000Z BharatMLStack

-

Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving

+

Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving

Serving large language models in production introduces new challenges across infrastructure, performance optimization, and operational lifecycle management. The LLM Inference Platform addresses these challenges by providing a unified system for deploying and managing open-source and fine-tuned LLMs at scale.

The platform implements a complete LLMOps lifecycle — from model registration and automated compilation to deployment, runtime optimization, and monitoring. Designed as a self-service environment, users can onboard models directly from open repositories such as Hugging Face or upload custom fine-tuned models, and deploy them using a single-click workflow with no manual infrastructure or configuration steps required.

In addition to fully automated deployment, the platform allows users to select and apply custom inference optimization techniques — such as quantization strategies, batching configurations, and runtime-specific performance enhancements — enabling teams to balance latency, throughput, and cost based on their use case. The goal is to reduce operational friction while enabling high-performance, production-grade LLM inference.

-

Why LLM Inference Is not just bigger ML model serving

+

Why LLM Inference Is not just bigger ML model serving

Large language model (LLM) inference introduces a fundamentally different set of challenges compared to traditional machine learning inference. While classical ML models typically perform a single forward pass to produce a fixed prediction, LLMs operate as autoregressive systems, generating outputs token by token based on previously generated context. This difference dramatically changes how inference systems must be designed, optimized, and scaled.

-

Autoregressive Generation and Sequential Computation:

+

Autoregressive Generation and Sequential Computation:

Unlike traditional models such as classifiers or recommenders — where inference cost is relatively constant — LLMs generate responses incrementally. Each new token depends on all previously generated tokens, making inference inherently sequential and dynamic. This means latency and compute requirements vary significantly depending on prompt length and output size, introducing complexity in scheduling and resource allocation. Because tokens cannot be generated fully in parallel during decoding, GPUs may become underutilized without specialized batching and scheduling strategies. This has led to the development of dedicated LLM inference engines optimized for token-level execution.

-

Prefill and Decode Phases:

+

Prefill and Decode Phases:

LLM inference typically consists of two distinct stages:

  • Prefill phase — the model processes the input prompt and builds internal representations. This stage is compute-heavy and highly parallelizable.
  • Decode phase — the model generates tokens sequentially, predicting one token at a time using previously generated context.

The decode stage often becomes memory-bound rather than compute-bound, which creates new performance bottlenecks compared to traditional ML workloads.

-

Context Management and KV Caching:

+

Context Management and KV Caching:

Another fundamental difference lies in how LLMs maintain context. Transformer-based models rely on attention mechanisms that require access to past token representations. To avoid recomputing these representations repeatedly, inference engines use key-value (KV) caching, which stores intermediate activations from previous tokens. KV caching significantly improves performance by eliminating redundant computation, but it introduces new challenges:

    @@ -121,7 +121,7 @@ KV caching significantly improves performance by eliminating redundant computati
  • Efficient memory management becomes essential for scaling concurrent requests

This tradeoff between compute efficiency and memory usage is unique to LLM inference workloads.

-

Dynamic and Irregular Workloads:

+

Dynamic and Irregular Workloads:

Traditional ML inference typically operates on fixed-size inputs with predictable latency. In contrast, LLM requests vary widely in prompt length, output length, and runtime behavior. As a result:

  • Batch sizes must be dynamic rather than static
  • @@ -129,10 +129,10 @@ KV caching significantly improves performance by eliminating redundant computati
  • Scheduling systems must continuously rebalance workloads to maximize GPU utilization

These characteristics require specialized serving architectures that differ significantly from standard ML serving pipelines.

-

Streaming and User Experience Constraints:

+

Streaming and User Experience Constraints:

Another distinguishing factor is the expectation of real-time streaming responses. Instead of returning a single output, LLM systems often stream tokens to users as they are generated. Because of these differences — sequential generation, growing memory requirements, dynamic workloads, and streaming constraints — LLM inference cannot be treated as a simple extension of existing ML serving systems. Production platforms must incorporate specialized runtime engines, advanced optimization techniques, and observability tailored specifically to LLM workloads.

-

LLMOps: High-Level Architecture

+

LLMOps: High-Level Architecture

LLM Architecture

The LLM Inference Framework is designed as a fully automated, end-to-end system for deploying and operating open-source and fine-tuned large language models at scale. The architecture abstracts the complexity of model optimization, hardware selection, deployment, and runtime management into a unified workflow that enables users to move from raw model weights to production-ready inference endpoints with minimal manual intervention.

Our LLM Inference Framework is architected not just as a serving engine, but as a complete lifecycle management system. As illustrated in the high-level design below, the platform automates the journey of a model through seven distinct stages, ensuring reproducibility, performance, and scalability.

@@ -232,7 +232,7 @@ Because of these differences — sequential generation, growing memory requireme -

Supported Inference backends (TensorRT LLM, Dynamo & vLLM)

+

Supported Inference backends (TensorRT LLM, Dynamo & vLLM)

Tailored for the Use Case: We do not believe in a "one-size-fits-all" approach to inference. Different use cases—whether a real-time voice bot requiring ultra-lowsub-second latency or a massive reasoning task requiring huge context windows—demand different runtime characteristics. Our platform is designed to be runtime-agnostic, allowing us to automatically select and tailor the best engine based on the specific requirements of the application:

  1. @@ -270,12 +270,12 @@ Because of these differences — sequential generation, growing memory requireme
-

Conclusion

+

Conclusion

Large language model inference introduces a fundamentally new class of infrastructure challenges—where performance is governed not just by raw compute, but by memory efficiency, intelligent scheduling, runtime specialization, and lifecycle automation. Unlike traditional ML serving, LLM inference requires systems that understand token-level execution, manage rapidly growing context state, and continuously balance latency, throughput, and cost under highly dynamic workloads.

The LLM Inference Framework addresses these challenges by transforming inference into a fully automated, reproducible lifecycle—from model onboarding and compilation to deployment, optimization, and observability. By integrating automated quantization and engine compilation, intelligent runtime selection, cold-start mitigation strategies, and LLM-specific observability metrics such as Time-to-First-Token and Inter-Token Latency, the platform ensures both high performance and operational simplicity.

Equally important, the framework is designed with flexibility and future evolution in mind. Its runtime-agnostic architecture enables seamless adoption of emerging inference engines, hardware accelerators, and optimization techniques without requiring platform redesign. This ensures that teams can continuously leverage advancements in the rapidly evolving LLM ecosystem while maintaining consistent operational workflows.

Ultimately, the goal of the platform is to make production-scale LLM deployment as seamless and reliable as traditional software deployment—allowing teams to focus on building intelligent applications rather than managing infrastructure complexity. By combining lifecycle automation, runtime optimization, and deep observability, the LLM Inference Framework provides a scalable foundation for delivering fast, cost-efficient, and production-ready LLM experiences.

-

Future Explorations

+

Future Explorations

While we have achieved significant milestones in latency and throughput, the landscape of GenAI is evolving rapidly. Our roadmap focuses on increasing flexibility, reducing costs, and enhancing reliability for enterprise-grade workloads. Here is what we are building next:

  • TPU Support: To diversify our hardware supply chain and further optimize cost-per-token, we are evaluating Google Cloud TPUs to bake it into our platform. By leveraging the JAX and PyTorch/XLA ecosystems, we aim to unlock the massive throughput potential of TPU v5e chips, particularly for our open-source Llama models. This will allow the hardware profiler to dynamically choose between NVIDIA GPUs and Google TPUs based on real-time availability and price-performance metrics.
  • @@ -303,195 +303,82 @@ Because of these differences — sequential generation, growing memory requireme 2024-05-21T00:00:00.000Z BharatMLStack

    -

    Designing a Production-Grade LLM Inference Platform: From Model Weights to Scalable GPU Serving

    -

    Serving large language models in production introduces new challenges across infrastructure, performance optimization, and operational lifecycle management. The LLM Inference Platform addresses these challenges by providing a unified system for deploying and managing open-source and fine-tuned LLMs at scale.

    -

    The platform implements a complete LLMOps lifecycle — from model registration and automated compilation to deployment, runtime optimization, and monitoring. Designed as a self-service environment, users can onboard models directly from open repositories such as Hugging Face or upload custom fine-tuned models, and deploy them using a single-click workflow with no manual infrastructure or configuration steps required.

    -

    In addition to fully automated deployment, the platform allows users to select and apply custom inference optimization techniques — such as quantization strategies, batching configurations, and runtime-specific performance enhancements — enabling teams to balance latency, throughput, and cost based on their use case. The goal is to reduce operational friction while enabling high-performance, production-grade LLM inference.

    -

    Why LLM Inference Is not just bigger ML model serving

    -

    Large language model (LLM) inference introduces a fundamentally different set of challenges compared to traditional machine learning inference. While classical ML models typically perform a single forward pass to produce a fixed prediction, LLMs operate as autoregressive systems, generating outputs token by token based on previously generated context. This difference dramatically changes how inference systems must be designed, optimized, and scaled.

    -

    Autoregressive Generation and Sequential Computation:

    -

    Unlike traditional models such as classifiers or recommenders — where inference cost is relatively constant — LLMs generate responses incrementally. Each new token depends on all previously generated tokens, making inference inherently sequential and dynamic. This means latency and compute requirements vary significantly depending on prompt length and output size, introducing complexity in scheduling and resource allocation. -Because tokens cannot be generated fully in parallel during decoding, GPUs may become underutilized without specialized batching and scheduling strategies. This has led to the development of dedicated LLM inference engines optimized for token-level execution.

    -

    Prefill and Decode Phases:

    -

    LLM inference typically consists of two distinct stages:

    -
      -
    • Prefill phase — the model processes the input prompt and builds internal representations. This stage is compute-heavy and highly parallelizable.
    • -
    • Decode phase — the model generates tokens sequentially, predicting one token at a time using previously generated context.
    • -
    -

    The decode stage often becomes memory-bound rather than compute-bound, which creates new performance bottlenecks compared to traditional ML workloads.

    -

    Context Management and KV Caching:

    -

    Another fundamental difference lies in how LLMs maintain context. Transformer-based models rely on attention mechanisms that require access to past token representations. To avoid recomputing these representations repeatedly, inference engines use key-value (KV) caching, which stores intermediate activations from previous tokens. -KV caching significantly improves performance by eliminating redundant computation, but it introduces new challenges:

    -
      -
    • Memory consumption grows with sequence length and batch size
    • -
    • GPU memory becomes a critical bottleneck
    • -
    • Efficient memory management becomes essential for scaling concurrent requests
    • -
    -

    This tradeoff between compute efficiency and memory usage is unique to LLM inference workloads.

    -

    Dynamic and Irregular Workloads:

    -

    Traditional ML inference typically operates on fixed-size inputs with predictable latency. In contrast, LLM requests vary widely in prompt length, output length, and runtime behavior. As a result:

    -
      -
    • Batch sizes must be dynamic rather than static
    • -
    • Requests may enter and leave batches asynchronously
    • -
    • Scheduling systems must continuously rebalance workloads to maximize GPU utilization
    • -
    -

    These characteristics require specialized serving architectures that differ significantly from standard ML serving pipelines.

    -

    Streaming and User Experience Constraints:

    -

    Another distinguishing factor is the expectation of real-time streaming responses. Instead of returning a single output, LLM systems often stream tokens to users as they are generated. -Because of these differences — sequential generation, growing memory requirements, dynamic workloads, and streaming constraints — LLM inference cannot be treated as a simple extension of existing ML serving systems. Production platforms must incorporate specialized runtime engines, advanced optimization techniques, and observability tailored specifically to LLM workloads.

    -

    LLMOps: High-Level Architecture

    -

    LLM Architecture

    -

    The LLM Inference Framework is designed as a fully automated, end-to-end system for deploying and operating open-source and fine-tuned large language models at scale. The architecture abstracts the complexity of model optimization, hardware selection, deployment, and runtime management into a unified workflow that enables users to move from raw model weights to production-ready inference endpoints with minimal manual intervention.

    -

    Our LLM Inference Framework is architected not just as a serving engine, but as a complete lifecycle management system. As illustrated in the high-level design below, the platform automates the journey of a model through seven distinct stages, ensuring reproducibility, performance, and scalability.

    -
      -
    1. -

      Onboarding & Registration (The Source of Truth)

      -

      The lifecycle begins with the Data Scientist or engineer.

      -
        -
      • Model Ingestion: Users onboard models—whether open-source (Hugging Face, NeMo) or internally fine-tuned—via the Truffle Box SDK/UI.
      • -
      • LLM + Prompt Registry: Unlike traditional systems that only track model weights, our registry is a unified control plane. It stores both the Model Artifacts and the Prompt Templates. This allows Data Scientists to register and version-control prompts (e.g., "customer_support_v2") independently of the application code.
      • -
      -
    2. -
    3. -

      The "Black Box" Build Engine

      -

      Once a model is registered, the Automated LLM Compiler + Quantizer Module kicks off a background job on ephemeral GPU resources.

      -
        -
      • Transformation: The raw model is converted into a TRT-LLM Checkpoint.
      • -
      • Quantization: The system automatically applies quantization algorithms (like INT4 AWQ or FP8) to reduce memory footprint.
      • -
      • Engine Building: Finally, it compiles a highly optimized TRT Engine specifically tuned for the target hardware.
      • -
      -
    4. -
    5. -

      Intelligent Profiling & Validation

      -

      Before deployment, the new engine passes through the Hardware & Inference Runtime Profiler.

      -
        -
      • Benchmarking: This module empirically tests the engine against various hardware configurations (L4 vs. A100) and runtimes (TRT-LLM vs. vLLM).
      • -
      • Optimization: It recommends the optimal configuration that meets latency SLAs (Time-To-First-Token) while minimizing cost.
      • -
      -
    6. -
    7. -

      Smart Artifact Generation & Distribution

      -

      To solve the Kubernetes "Cold Start" problem, the LLM Serving Artifacts Generation module packages the model using a bifurcated strategy:

      -
        -
      • Standard Models: Artifacts are uploaded to Cloud Storage (GCS) and downloaded by pods at startup.
      • -
      • Very Large Models: For massive models (>8GB) where network downloads are too slow, the system pre-caches the model onto Secondary Boot Disks. These disks are attached directly to new GPU nodes during autoscaling, eliminating download wait times.
      • -
      -
    8. -
    9. -

      Image Streaming & Deployment

      -

      Simultaneously, the inference runtime container images are pulled from the Artifact Registry.

      -
        -
      • Image Streaming: We utilize container image streaming to allow pods to start initializing while the massive Triton/Dynamo container layers are still downloading, further shaving seconds off the startup time. link
      • -
      -
    10. -
    11. -

      The Inference Runtime (Kubernetes)

      -

      The workload lands on Kubernetes with Autoscaling.

      -
        -
      • Dynamic Backends: Depending on the profile generated in Stage 3, the pod initializes either TensorRT-LLM (for throughput) or vLLM (for flexibility), or spins up a Dynamo worker for distributed inference.
      • -
      • Data Loading: The pod either downloads the model from Cloud Storage or mounts the pre-warmed Secondary Boot Disk ("Pull from Disk").
      • -
      -
    12. -
    13. -

      Client Interaction & Observability

      -

      Finally, the LLM Inference Client executes the request.

      -
        -
      • Prompt Injection: The client pulls the specific prompt template ID from the Registry, ensuring the exact versioned instructions are used.
      • -
      • Streaming Response: The request is sent via gRPC, and tokens are streamed back to the user in real-time.
      • -
      -
    14. -
    15. -

      Observability: Monitoring the Pulse of GenAI

      -

      In traditional microservices, success is measured by CPU utilization and request latency (p99). For Large Language Models, these metrics are insufficient. A user doesn't care if the GPU is at 80% utilization; they care about how fast the first word appears and how smoothly the rest of the sentence follows.

      -

      To capture the true user experience, our platform instrumentation focuses on three critical LLM-specific metrics:

      -
        -
      1. -

        Time to First Token (TTFT)

        -
          -
        • Definition: TTFT measures the time elapsed from the moment a request is received until the very first token is generated and streamed back to the user.
        • -
        • Why it matters: This represents the "Prefill Phase" latency—the time the model takes to process the input prompt and load weights. A high TTFT makes the application feel unresponsive or "hung."
        • -
        • Optimization: We closely monitor TTFT to ensure our Prefix Caching is effective (aiming for high cache hitrates), which drastically lowers this metric by skipping redundant prompt processing.
        • -
        -
      2. -
      3. -

        Inter-Token Latency (ITL)

        -
          -
        • Definition: ITL measures the average time interval between the generation of consecutive tokens during the "Decode Phase".
        • -
        • Why it matters: This defines the "perceived speed" of reading. Even if the first token is fast (low TTFT), high ITL makes the text generation look "jerky" or slow to the user.
        • -
        • Benchmarks: In our testing with Llama 3.1, we track p99 ITL to ensure it stays below human reading speeds to maintain a natural conversational flow.
        • -
        -
      4. -
      5. -

        Token Throughput vs. Request Throughput

        -
          -
        • We distinguish between two types of throughput to balance system efficiency with user load:
        • -
        • Token Throughput (tokens/sec): The total number of tokens generated across all concurrent requests. This measures the raw compute efficiency of the GPU and the effectiveness of batching.
        • -
        • Request Throughput (req/sec): The number of distinct user queries served per second. We use this to determine autoscaling thresholds, ensuring we scale out before the queue depth impacts ITL.
        • -
        -
      6. -
      7. -

        The Monitoring Stack

        -
          -
        • Real-time Dashboards: We utilize Grafana to visualize these streaming metrics in real-time, allowing on-call engineers to spot "slow generation" incidents that generic "500 error" alerts would miss.
        • -
        • Request Tracing: Since Triton Inference Server does not log request payloads by default, we integrate a Helix Client to asynchronously publish request logs to Log Tables. This allows us to trace a specific "slow" request back to its prompt to understand if a complex input caused the latency spike.
        • -
        -
      8. -
      -
    16. -
    -

    Supported Inference Backends (TensorRT-LLM, Dynamo & vLLM)

    -

    Tailored for the Use Case: We do not believe in a "one-size-fits-all" approach to inference. Different use cases—whether a real-time voice bot requiring ultra-low, sub-second latency or a massive reasoning task requiring huge context windows—demand different runtime characteristics. Our platform is designed to be runtime-agnostic, allowing us to automatically select and tailor the best engine based on the specific requirements of the application:

    -
      -
    1. -

      TensorRT-LLM: The High-Performance Standard

      -

      Suitable for: High-throughput production workloads where latency is critical (e.g., customer support chat, real-time voice bots).

      -

      TensorRT-LLM serves as our default backend for these scenarios. Our internal benchmarks on Llama 3.1 and 3.2 models demonstrated that a tuned TensorRT-LLM engine significantly outperforms standard runtimes, especially when utilizing INT4 AWQ and FP8 quantization.

      -

      Key optimizations we tailor for these high-load cases include:

      -
        -
      • Optimized execution via TensorRT engine compilation
      • -
      • Quantization-aware execution for reduced memory usage and improved throughput
      • -
      • Inflight Batching: Allowing requests to be processed continuously without waiting for the entire batch to finish, drastically improving GPU utilization.
      • -
      • Custom Plugins: Enabling specific NVIDIA plugins like the GEMM plugin and GPT Attention plugin to accelerate matrix multiplications and attention mechanisms.
      • -
      -
    2. -
    3. -

      Dynamo: Distributed Inference for Reasoning Models

      -

      Suitable for: Very large "reasoning" models (70B+) or scenarios requiring massive context windows where a single GPU's memory is insufficient.

      -

      For these memory-bound tasks, we utilize Dynamo, a low-latency distributed inference framework. Unlike monolithic servers, Dynamo disaggregates the inference process to scale resources horizontally:

      -
        -
      • KV Aware Routing: A specialized router directs requests to workers that already hold the relevant Key-Value (KV) cache, minimizing redundant computation.
      • -
      • Prefill vs. Decode Split: The workload is divided into Prefill Workers (processing the prompt) and Decode Workers (generating tokens), allowing us to scale the compute-heavy "reading" phase independently from the memory-heavy "writing" phase.
      • -
      • Distributed execution across multiple GPU resources
      • -
      -
    4. -
    5. -

      vLLM: The Flexible Baseline

      -

      Suitable for: Rapid prototyping, testing new model architectures, or low-traffic internal tools where ease of deployment outweighs raw throughput.

      -

      While TensorRT-LLM is optimized for maximum speed, vLLM provides a robust and flexible baseline.

      -
        -
      • High throughput through dynamic batching and efficient memory utilization
      • -
      • Paged KV cache management for handling long contexts and concurrent requests
      • -
      • Strong support for open-source model ecosystems
      • -
      • Rapid Adoption: It allows us to onboard new model architectures immediately without waiting for a custom TensorRT build.
      • -
      • Benchmarking Insight: In our internal tests, vLLM provided a strong baseline but often lacked the specific max-token optimizations present in our custom TRT engines. We use it strategically for initial testing before committing to a full TensorRT optimization pipeline.
      • -
      -
    6. -
    -

    Conclusion

    -

    Large language model inference introduces a fundamentally new class of infrastructure challenges—where performance is governed not just by raw compute, but by memory efficiency, intelligent scheduling, runtime specialization, and lifecycle automation. Unlike traditional ML serving, LLM inference requires systems that understand token-level execution, manage rapidly growing context state, and continuously balance latency, throughput, and cost under highly dynamic workloads.

    -

    The LLM Inference Framework addresses these challenges by transforming inference into a fully automated, reproducible lifecycle—from model onboarding and compilation to deployment, optimization, and observability. By integrating automated quantization and engine compilation, intelligent runtime selection, cold-start mitigation strategies, and LLM-specific observability metrics such as Time-to-First-Token and Inter-Token Latency, the platform ensures both high performance and operational simplicity.

    -

    Equally important, the framework is designed with flexibility and future evolution in mind. Its runtime-agnostic architecture enables seamless adoption of emerging inference engines, hardware accelerators, and optimization techniques without requiring platform redesign. This ensures that teams can continuously leverage advancements in the rapidly evolving LLM ecosystem while maintaining consistent operational workflows.

    -

    Ultimately, the goal of the platform is to make production-scale LLM deployment as seamless and reliable as traditional software deployment—allowing teams to focus on building intelligent applications rather than managing infrastructure complexity. By combining lifecycle automation, runtime optimization, and deep observability, the LLM Inference Framework provides a scalable foundation for delivering fast, cost-efficient, and production-ready LLM experiences.

    -

    Future Explorations

    -

    While we have achieved significant milestones in latency and throughput, the landscape of GenAI is evolving rapidly. Our roadmap focuses on increasing flexibility, reducing costs, and enhancing reliability for enterprise-grade workloads. Here is what we are building next:

    -
      -
    • TPU Support: To diversify our hardware supply chain and further optimize cost-per-token, we are evaluating Google Cloud TPUs to bake them into our platform. By leveraging the JAX and PyTorch/XLA ecosystems, we aim to unlock the massive throughput potential of TPU v5e chips, particularly for our open-source Llama models. This will allow the hardware profiler to dynamically choose between NVIDIA GPUs and Google TPUs based on real-time availability and price-performance metrics.
    • -
    • Multi-LoRA Serving (Serverless Experience): Currently, deploying a fine-tuned model requires a dedicated GPU. We are building support for Multi-LoRA serving, which will allow us to serve hundreds of unique, fine-tuned adapters on top of a single frozen base model. This will drastically reduce costs for multi-tenant applications, enabling a "serverless" experience where specific fine-tunes are hot-swapped instantly per request.
    • -
    • Spot Instance Orchestration: To further optimize cloud costs, we are developing fault-tolerant mechanisms to run inference workloads on Spot Instances. By implementing aggressive checkpointing and seamless request draining, we aim to leverage cheaper, preemptible compute capacity without interrupting the user's streaming experience.
    • -
    • Semantic Caching Layer: We plan to move beyond standard Prefix Caching to implement Semantic Caching. By using a vector database to fetch responses for semantically similar queries (e.g., "How do I reset my password?" vs. "Password reset steps"), we can bypass the GPU entirely for repetitive queries, reducing latency to near-zero.
    • -
    • Context-Aware Autoscaling: Standard CPU/GPU utilization metrics are often insufficient signals for scaling LLMs. We are working on KV-cache pressure metrics for autoscaling. This ensures that we scale out before the memory fills up, preventing eviction-based slowdowns during traffic spikes.
    • -
    • Online Evaluation & Guardrails: We are integrating a lightweight "Trust Layer" into the proxy. This will allow for low-latency input/output filtering (Guardrails) and asynchronous "LLM-as-a-Judge" evaluation pipelines to monitor response quality in production, not just system health.
    • -
    ]]>
    + +

    By mid-2023, we had transformed our ML stack—building a real-time feature store, optimizing model retrieval, and fine-tuning ranking. But two critical gaps remained:

    +
      +
    • 🔹 Scaling model inference without hitting infrastructure roadblocks
    • +
    • 🔹 Moving embedding search from batch to real-time for candidate generation
    • +
    +

    Here’s how we tackled these last-mile challenges, broke free from infrastructure constraints, and built a cost-efficient, high-performance system.

    +

    Breaking Free from the Scalability Ceiling

    +

    The Model Serving Bottleneck—A Wake-Up Call

    +

    July 2023. With just months left for the Mega Blockbuster Sale (MBS), we noticed a serious issue—scaling our model-serving infrastructure was taking 10–15 minutes. In real-time ML, that’s an eternity. +In one of our war rooms, we ran a quick experiment:

    +
      +
    • 🚀 We deployed an XGBoost model on a self-hosted Triton Inference Server running on a 16-core machine.
    • +
    • 🚀 Fired requests and compared the outputs with our existing cloud-hosted setup.
    • +
    • 🚀 The results matched—perfectly.
    • +
    +

    That moment changed everything. We prepped a backup Triton setup on EKS, just in case our cloud provider couldn't allocate enough compute resources in time. Luckily, they did—but the seed was planted. +Then in October, just two weeks before MBS, we got an alarming response from our infrastructure team: +"Node availability may be an issue." +With no time to waste, we moved 30% of real-time ML traffic to our self-hosted Triton cluster. The results?

    +
      +
    • ✅ p99 latency dropped from 90–100ms to 30–40ms
    • +
    • ✅ Triton handled significantly higher throughput on fewer resources
    • +
    • ✅ No model changes were needed
    • +
    +

    MBS ran without a hitch, proving that self-hosted inference was the way forward.

    +

    Scaling Triton on GKE

    +

    This left us with two choices:

    +
      +
    • 1️⃣ Port models to a managed cloud inference service, investing time in learning a new deployment stack
    • +
    • 2️⃣ Scale our existing Triton setup on GKE, optimizing for cost and performance
    • +
    +

    We went with Option 2—and it slashed inference costs to 35% of what we previously paid, while giving us full control over scaling and optimizations.

    +

    Fixing the Cold Start Problem

    +

    As we onboarded more deep learning (DL) models, we hit a new bottleneck: new inference pods took 7–9 minutes to spin up.

    +

    After profiling, we found the culprits:

    +
      +
    • Triton’s base image—a massive 5GB
    • +
    • Model binaries—often 1GB+
    • +
    • Startup delay—mostly due to downloading and initializing these assets
    • +
    +

    To fix this, we built a lightweight Triton image, stripping unused components and shrinking the size to 900MB. This cut cold start times drastically, making auto-scaling faster and smoother.

    +

    Embedding Search: The Last Piece of the Puzzle

    +

    By mid-2023, most of our ML stack had gone real-time—except for Candidate Generation (CG), which still ran in batch mode. To truly power real-time recommendations, we needed an online embedding search system.

    +

    Choosing the Right Vector Database

    +

    We benchmarked three production-ready vector DBs across key parameters:

    +
      +
    • Milvus
    • +
    • Qdrant
    • +
    • Weaviate
    • +
    +

    After extensive POCs, Qdrant stood out for its:

    +
      +
    • ✅ Blazing-fast search latency on high-dimensional vectors
    • +
    • ✅ Efficient memory usage, crucial for in-memory workloads
    • +
    • ✅ Support for upserts and soft deletes, vital for Ads use cases
    • +
    • ✅ gRPC + REST APIs, making integration seamless
    • +
    • ✅ Powerful filtering, allowing fine-tuned retrieval (e.g., filtering Ads by category, active status, etc.)
    • +
    +

    At its core, Qdrant uses HNSW indexing, delivering both high recall and low-latency nearest-neighbor search—a perfect fit for our needs.

    +

    Embedding Freshness & Real-Time Updates

    +

    To ensure embeddings stayed up to date, we built a dual ingestion pipeline:

    +
      +
    • 📌 Daily Refresh: A bulk pipeline updated embeddings overnight
    • +
    • 📌 Real-Time Updates: Ads events triggered immediate upserts/deletes
    • +
    +

    This setup powered real-time "Similar Products" recommendations on the product page and became the foundation for Ads Candidate Generation, ensuring the right ads surfaced in milliseconds.

    +

    Skye

    +

    Final Takeaways: Scaling Smartly for Real-Time ML

    +
      +
    • 🚀 Self-hosted inference on Triton gave us lower cost, faster scaling, and better performance than managed services
    • +
    • 🚀 Building a custom Triton image reduced cold starts, improving responsiveness
    • +
    • 🚀 Qdrant-based embedding search enabled real-time personalization at scale
    • +
    • 🚀 Real-time updates for embeddings unlocked dynamic, up-to-date recommendations
    • +
    +

    By early 2024, Meesho’s ML stack had evolved into a fully real-time, scalable, and cost-efficient system, setting the foundation for even bigger leaps ahead.

    ]]> Aditya Kumar https://github.com/Adit2607 @@ -775,7 +662,7 @@ To represent these groups efficiently, we adopted a layered storage approach:

    Expiry Timestamp and Schema Version were appended using a semi-colon delimiter at the end of the string.

Example:

-
feature_1_value,feature_2_value,feature_3_value;expiry_ts
+
feature_1_value,feature_2_value,feature_3_value;expiry_ts

This format allowed:

  • Consistent writes and reads at the group level
  • @@ -838,7 +725,7 @@ For the 0th version of the Interaction Store, we focused on a d

Storage Structure

Each user’s interactions were stored using a composite key format, uniquely identifying the user and interaction type. This structure allowed efficient organization and quick retrieval of recent activity for recommendation generation:

-
userId_eventType → ZSET[...(pid, ts)...]
+
userId_eventType → ZSET[...(pid, ts)...]

Within each ZSET: