diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 26a04e3..5ee2951 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -22,6 +22,7 @@ "react-router-dom": "^7.6.2", "react-syntax-highlighter": "^15.6.6", "rehype-raw": "^7.0.0", + "remark-gfm": "^4.0.1", "zustand": "^5.0.7" }, "devDependencies": { @@ -4100,6 +4101,44 @@ "@jridgewell/sourcemap-codec": "^1.5.0" } }, + "node_modules/markdown-table": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.4.tgz", + "integrity": "sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/mdast-util-find-and-replace": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/mdast-util-find-and-replace/-/mdast-util-find-and-replace-3.0.2.tgz", + "integrity": "sha512-Tmd1Vg/m3Xz43afeNxDIhWRtFZgM2VLyaf4vSTYwudTyeuTneoL3qtWMA5jeLyz/O1vDJmmV4QuScFCA2tBPwg==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "escape-string-regexp": "^5.0.0", + "unist-util-is": "^6.0.0", + "unist-util-visit-parents": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-find-and-replace/node_modules/escape-string-regexp": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-5.0.0.tgz", + "integrity": "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/mdast-util-from-markdown": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/mdast-util-from-markdown/-/mdast-util-from-markdown-2.0.2.tgz", @@ -4123,6 +4162,107 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/mdast-util-gfm": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm/-/mdast-util-gfm-3.1.0.tgz", + "integrity": "sha512-0ulfdQOM3ysHhCJ1p06l0b0VKlhU0wuQs3thxZQagjcjPrlFRqY215uZGHHJan9GEAXd9MbfPjFJz+qMkVR6zQ==", + "license": "MIT", + "dependencies": { + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-gfm-autolink-literal": "^2.0.0", + "mdast-util-gfm-footnote": "^2.0.0", + "mdast-util-gfm-strikethrough": "^2.0.0", + "mdast-util-gfm-table": "^2.0.0", + "mdast-util-gfm-task-list-item": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-autolink-literal": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-autolink-literal/-/mdast-util-gfm-autolink-literal-2.0.1.tgz", + "integrity": "sha512-5HVP2MKaP6L+G6YaxPNjuL0BPrq9orG3TsrZ9YXbA3vDw/ACI4MEsnoDpn6ZNm7GnZgtAcONJyPhOP8tNJQavQ==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "ccount": "^2.0.0", + "devlop": "^1.0.0", + "mdast-util-find-and-replace": "^3.0.0", + "micromark-util-character": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-footnote": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-footnote/-/mdast-util-gfm-footnote-2.1.0.tgz", + "integrity": 
"sha512-sqpDWlsHn7Ac9GNZQMeUzPQSMzR6Wv0WKRNvQRg0KqHh02fpTz69Qc1QSseNX29bhz1ROIyNyxExfawVKTm1GQ==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "devlop": "^1.1.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0", + "micromark-util-normalize-identifier": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-strikethrough": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-strikethrough/-/mdast-util-gfm-strikethrough-2.0.0.tgz", + "integrity": "sha512-mKKb915TF+OC5ptj5bJ7WFRPdYtuHv0yTRxK2tJvi+BDqbkiG7h7u/9SI89nRAYcmap2xHQL9D+QG/6wSrTtXg==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-table": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-table/-/mdast-util-gfm-table-2.0.0.tgz", + "integrity": "sha512-78UEvebzz/rJIxLvE7ZtDd/vIQ0RHv+3Mh5DR96p7cS7HsBhYIICDBCu8csTNWNO6tBWfqXPWekRuj2FNOGOZg==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "markdown-table": "^3.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-task-list-item": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-task-list-item/-/mdast-util-gfm-task-list-item-2.0.0.tgz", + "integrity": "sha512-IrtvNvjxC1o06taBAVJznEnkiHxLFTzgonUdy8hzFVeDun0uTjxxrRGVaNFqkU1wJR3RBPEfsxmU6jDWPofrTQ==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/mdast-util-mdx-expression": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/mdast-util-mdx-expression/-/mdast-util-mdx-expression-2.0.1.tgz", @@ -4321,6 +4461,127 @@ "micromark-util-types": "^2.0.0" } }, + "node_modules/micromark-extension-gfm": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm/-/micromark-extension-gfm-3.0.0.tgz", + "integrity": "sha512-vsKArQsicm7t0z2GugkCKtZehqUm31oeGBV/KVSorWSy8ZlNAv7ytjFhvaryUiCUJYqs+NoE6AFhpQvBTM6Q4w==", + "license": "MIT", + "dependencies": { + "micromark-extension-gfm-autolink-literal": "^2.0.0", + "micromark-extension-gfm-footnote": "^2.0.0", + "micromark-extension-gfm-strikethrough": "^2.0.0", + "micromark-extension-gfm-table": "^2.0.0", + "micromark-extension-gfm-tagfilter": "^2.0.0", + "micromark-extension-gfm-task-list-item": "^2.0.0", + "micromark-util-combine-extensions": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-autolink-literal": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-autolink-literal/-/micromark-extension-gfm-autolink-literal-2.1.0.tgz", + "integrity": "sha512-oOg7knzhicgQ3t4QCjCWgTmfNhvQbDDnJeVu9v81r7NltNCVmhPy1fJRX27pISafdjL+SVc4d3l48Gb6pbRypw==", + "license": "MIT", + "dependencies": { + 
"micromark-util-character": "^2.0.0", + "micromark-util-sanitize-uri": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-footnote": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-footnote/-/micromark-extension-gfm-footnote-2.1.0.tgz", + "integrity": "sha512-/yPhxI1ntnDNsiHtzLKYnE3vf9JZ6cAisqVDauhp4CEHxlb4uoOTxOCJ+9s51bIB8U1N1FJ1RXOKTIlD5B/gqw==", + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-core-commonmark": "^2.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-normalize-identifier": "^2.0.0", + "micromark-util-sanitize-uri": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-strikethrough": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-strikethrough/-/micromark-extension-gfm-strikethrough-2.1.0.tgz", + "integrity": "sha512-ADVjpOOkjz1hhkZLlBiYA9cR2Anf8F4HqZUO6e5eDcPQd0Txw5fxLzzxnEkSkfnD0wziSGiv7sYhk/ktvbf1uw==", + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-util-chunked": "^2.0.0", + "micromark-util-classify-character": "^2.0.0", + "micromark-util-resolve-all": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-table": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-table/-/micromark-extension-gfm-table-2.1.1.tgz", + "integrity": "sha512-t2OU/dXXioARrC6yWfJ4hqB7rct14e8f7m0cbI5hUmDyyIlwv5vEtooptH8INkbLzOatzKuVbQmAYcbWoyz6Dg==", + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-tagfilter": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-tagfilter/-/micromark-extension-gfm-tagfilter-2.0.0.tgz", + "integrity": "sha512-xHlTOmuCSotIA8TW1mDIM6X2O1SiX5P9IuDtqGonFhEK0qgRI4yeC6vMxEV2dgyr2TiD+2PQ10o+cOhdVAcwfg==", + "license": "MIT", + "dependencies": { + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-task-list-item": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-task-list-item/-/micromark-extension-gfm-task-list-item-2.1.0.tgz", + "integrity": "sha512-qIBZhqxqI6fjLDYFTBIa4eivDMnP+OZqsNwmQ3xNLE4Cxwc+zfQEfbs6tzAo2Hjq+bh6q5F+Z8/cksrLFYWQQw==", + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/micromark-factory-destination": { "version": "2.0.1", "resolved": 
"https://registry.npmjs.org/micromark-factory-destination/-/micromark-factory-destination-2.0.1.tgz", @@ -5432,6 +5693,24 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/remark-gfm": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/remark-gfm/-/remark-gfm-4.0.1.tgz", + "integrity": "sha512-1quofZ2RQ9EWdeN34S79+KExV1764+wCUGop5CPL1WGdD0ocPpu91lzPGbwWMECpEpd42kJGQwzRfyov9j4yNg==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-gfm": "^3.0.0", + "micromark-extension-gfm": "^3.0.0", + "remark-parse": "^11.0.0", + "remark-stringify": "^11.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/remark-parse": { "version": "11.0.0", "resolved": "https://registry.npmjs.org/remark-parse/-/remark-parse-11.0.0.tgz", @@ -5463,6 +5742,21 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/remark-stringify": { + "version": "11.0.0", + "resolved": "https://registry.npmjs.org/remark-stringify/-/remark-stringify-11.0.0.tgz", + "integrity": "sha512-1OSmLd3awB/t8qdoEOMazZkNsfVTeY4fTsgzcQFdXNq8ToTN4ZGwrMnlda4K6smTFKD+GRV6O48i6Z4iKgPPpw==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-to-markdown": "^2.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/resolve": { "version": "1.22.10", "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.10.tgz", diff --git a/frontend/package.json b/frontend/package.json index dc90aaa..b460f6d 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -26,6 +26,7 @@ "react-router-dom": "^7.6.2", "react-syntax-highlighter": "^15.6.6", "rehype-raw": "^7.0.0", + "remark-gfm": "^4.0.1", "zustand": "^5.0.7" }, "devDependencies": { diff --git a/frontend/src/components/markdown-renderer/MarkdownRenderer.tsx b/frontend/src/components/markdown-renderer/MarkdownRenderer.tsx index 69d08d3..a32513d 100644 --- a/frontend/src/components/markdown-renderer/MarkdownRenderer.tsx +++ b/frontend/src/components/markdown-renderer/MarkdownRenderer.tsx @@ -1,6 +1,7 @@ import React from "react"; import ReactMarkdown from "react-markdown"; import rehypeRaw from "rehype-raw"; +import remarkGfm from "remark-gfm"; type MarkdownRendererProps = { content: string; @@ -45,6 +46,7 @@ const MarkdownRenderer: React.FC = ({ const { align, ...styleProps } = mergedImageProps; return ( ( diff --git a/kernelboard/static/news/2026-01-21-gpumode-2026.md b/kernelboard/static/news/2026-01-21-gpumode-2026.md new file mode 100644 index 0000000..a3e9bb1 --- /dev/null +++ b/kernelboard/static/news/2026-01-21-gpumode-2026.md @@ -0,0 +1,145 @@ +## Our plans for 2026 + +This post is split into two parts: a non-technical retrospective on 2025 and our technical plans for 2026. The context helps, but feel free to [skip ahead](#gpu-mode-2026) to learn more about we're going to ship the world's best open Kernel LLM model and how you can get involved. + +### 2025 retrospective + +2025 was a wild year for us. We're now at 26.6K YouTube subscribers, 92 lectures, 24K Discord members, 3 $100K+ prize pool [kernel competitions](https://www.gpumode.com/v2/home) with over 400K submissions collectively, [3 events](https://www.gpumode.com/v2/news) with our partners NVIDIA, Jane Street, and Accel, and 10 active working groups with some [pioneering work around LLM kernel generation](https://gpu-mode.github.io/popcorn/). 
+ +Most communities don't die; they slowly rot. Yet we've managed to stick around and only become more relevant to the space since our founding two years ago. This is because we found ourselves a niche between hardware vendors that would like to sell more hardware and hackers that would like to get more out of the hardware they buy. Our mission has remained consistent, which is to make GPU programming [^1] more accessible, even though our approach has changed a lot since our early days. + +In the distant days of 2024, we were but a humble reading group for the fantastic PMPP book [^2], and Andreas's and my primary responsibility was keeping a consistent schedule and responding to every single message. We got lucky that one of my role models, Jeremy Howard, came to teach our [3rd lecture](https://www.youtube.com/watch?v=4sgKnKbR-WE) and introduced the idea of writing GPU code in Python and letting an LLM rewrite it into CUDA. Karpathy would later become an active member of the community in a public working group, [llm.c](https://github.com/karpathy/llm.c), where some of the best engineers on our server (Arun, Erik, and Aleksa) built a training loop in raw CUDA that beat torch.compile. 2024 wrapped up with perhaps my favorite hackathon, [CUDA MODE IRL](https://www.accel.com/noteworthies/6-keynotes-from-the-first-cuda-mode-irl-hackathon#all), which was masterminded by our resident gigachad Casey Aylward. + +All the above started a vibe shift where the community went from asking the question "why are we relying on vendors for speed-of-light performance?" to shipping libraries that became important in their own right, and those libraries started to gain more commercial relevance. Vikram from NVIDIA was one of the first industry folks to notice this trend. + +But the above introduced a major problem for our community, in that the best people would often "graduate" to AGI labs, never to be seen again. + +So the main question I tried to answer in 2025 was how I could take more people on this journey: how can we convert people from watching a lecture to shipping an important systems project? The answer I landed on was designing a smooth difficulty curve: +1. Watch a lecture +2. Participate in a kernel competition +3. Meet colleagues at a hackathon +4. Start a working group + +And so I got to work on creating a competition platform for kernels, and along the way I managed to recruit some brilliant folks, Matej, Alex, Erik, Ben, Emre, and Elaine, to help me make it real, and the growth there has been staggering. + +In particular, many of you have probably already interacted with Matej and Alex, who have become key mods. I was really worried the community would stagnate while I was out on paternity leave, but thanks to Matej and Alex, the public-facing stuff kept going full steam ahead, and Emre Guven from Meta helped me manage all the new partnerships with hardware vendors and neo-clouds. + +KernelBot is now at close to 400K submissions, with 3 $100K kernel competitions with AMD and NVIDIA, and it's been a close race between human experts like @gau-nernst, who writes [cursed PTX code](https://gau-nernst.github.io/tcgen05/), @Simon, who wrote some of the [best public material on CuteDSL](https://veitner.bearblog.dev/an-applied-introduction-to-cutedsl/), and finally shiyeegao, who had [never written GPU code](https://x.com/marksaroufim/status/2009497284418130202?s=20) in his life before.
[^3] + + +### GPU MODE 2026 + +Going into 2026, my main goal for all of us is to create massively impactful systems projects in the open. Personally I'll continue working on Kernel LLM research, but I'd also like to throw out a call for community projects that I think will be important. [^4] + +Before we get started though, I'd like to state that two things can be true at the same time about Kernel LLMs: most published Kernel LLM work is slop, but LLMs will also be transformational for accelerating systems research. + +#### The goal for 2026 + +The high-level goal is to post-train a Kernel LLM and get some kernels merged into important repos such as PyTorch and VLLM. We're planning on shipping our first results around GTC in San Jose in March 2026 and the second batch around ICML in Seoul in July 2026. + +This will be split up into several independent workstreams, and you are welcome to join and contribute to any of them since our meetings and chats will be in public. You can follow along in the popcorn channel on discord.gg/gpumode, where we also share links to our weekly meetings. + +#### Deslopifying LLM generated kernels + +In collaboration with PyTorch, VLLM and NVIDIA. + +If you visually inspect most LLM generated kernels, one striking thing is how verbose they are: they tend to look like splatted autotuning results, with tons of if conditions and weird try-except logic that falls back to different kernels, and the code is rarely numerically stable, let alone deterministic. + +It's important that kernel LLM researchers understand that no PyTorch or VLLM maintainer is going to accept a 5K LOC patch that's faster if they don't understand how it works or why. In PyTorch, thanks to the work of Alban Desmaison and Richard Zou, we also have the luxury of shipping out-of-core extensions to easily benchmark LLM generated kernels (a minimal sketch of what this can look like is included below). So while [KernelBench](https://simonguo.tech/blog/2025-10-automated-gpu-kernels.html) was a critical first step, the community is moving towards benchmarking within real systems; one example is [BackendBench](https://github.com/meta-pytorch/BackendBench) for PyTorch and another is [FlashInfer-Bench](https://bench.flashinfer.ai/), which both ask a similar question: "what is the bar to merge an LLM generated kernel into an important repo?" I'd like to see more such projects. I'm also optimistic about "micro-evals" that test, for instance, whether an LLM can use tcgen05 instructions, and we'll be collaborating with the legendary Tianqi Chen to make progress here. [^8] + +In 2025, I spent a lot of time manually reading LLM generated kernels to deslopify them, but it's likely I can be replaced by a Claude Skill. I think "some" manual intervention is probably still OK, since even one important kernel produced by this group a week would still be transformational. + +#### Post training a Kernel LLM model + +In collaboration with Prime Intellect, Lambda and MIT. + +When I first proposed [popcorn](https://gpu-mode.github.io/popcorn/), my goal was to have an example of a community-trained model, with my biggest inspirations being the pioneering [BLOOM](https://huggingface.co/bigscience/bloom) and similar work done by communities such as Eleuther and Nous Research.
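As an aside, here is the minimal sketch promised in the deslopifying section above: a hypothetical illustration of the out-of-core mechanism, using `torch.library` to route an existing aten op through a candidate kernel so a real model exercises it without patching PyTorch core. The function `my_relu` is just a placeholder standing in for an LLM-generated kernel, and this is an illustration of the idea rather than our actual harness.

```python
import torch

def my_relu(x: torch.Tensor) -> torch.Tensor:
    # Placeholder for an LLM-generated kernel; a real candidate would launch a
    # custom CUDA/Triton kernel here. Implemented via clamp_min so it does not
    # re-dispatch back into aten::relu.
    return torch.clamp_min(x, 0)

# Out-of-core override: route aten::relu on CUDA tensors through my_relu.
lib = torch.library.Library("aten", "IMPL")
lib.impl("relu", my_relu, "CUDA")

# Requires a CUDA device. Correctness gate before any speed claims:
# compare the overridden op against an independent reference.
x = torch.randn(4096, 4096, device="cuda")
torch.testing.assert_close(torch.relu(x), torch.clamp_min(x, 0))
```

From here, any eager-mode model that calls the op hits the candidate kernel, which is exactly the setting where "would a maintainer actually merge this?" can start to be answered.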
+ +And while in 2025 we did end up shipping [KernelLLM](https://huggingface.co/facebook/KernelLLM), we couldn't practically share the cluster with everyone who was working with us because it's tied to Meta infrastructure, and we didn't really have the necessary level of influence to raise money for a large external cluster. In 2026, this is finally changing: I am beyond grateful to Zach from Lambda for sponsoring $100K in compute credits to make our research possible. + +However, if I look at my recent core competencies and those of the server, it's easier for me to work on infra and evals than on science, so we're going to be partnering with another OSS community, Prime Intellect, to push the state of the art. I've known Johannes since the inception of CUDA MODE and I've been immensely impressed by their research team; we'll be working closely with Will and Sami. + +We're hypothesizing that the gains will come from two main areas. The first is profiler-guided optimization, which has historically been bogged down by cloud vendors not enabling ncu by default, something that is [super easy to fix](https://gist.github.com/msaroufim/9e56ce5d42a5e9ccd5e938c83181ea47). [^5] The second is the role of memory: the best CUDA hackers write extensive worklogs in the [tradition of siboehm](https://siboehm.com/articles/22/CUDA-MMM), and it turns out this helps AI quite a bit and keeps it from getting bogged down as reasoning traces become quite long. Our very own Alex has a framework called [Recursive Language Models](https://arxiv.org/abs/2512.24601), which we believe is the right way to approach memory. + +#### End to end kernel competitions and social evals + +In collaboration with any hardware vendor or neo-cloud that would like to work with us. + +I'm already quite happy with how KernelBot has been doing, and I hope non-GPU vendors will reach out to us so we can start doing more kinds of kernel competitions. But I also think it's time for us to take things a step further. + +We started with kernels because they're easy, well-scoped problems to define; if we instead ran an end-to-end systems optimization competition and expected people to submit a VLLM rewrite, I'm fairly confident we'd receive ~0 submissions. + +But end-to-end optimizations have much more real-world relevance, and I believe we can make it happen. Simon and our good friends at Verda have some compelling ideas for making this work in practice, and if you'd like to be involved in designing these new competitions, you can join the [kernelbot discord](https://discord.gg/RPM7xrnwGY). I draw a lot of inspiration here from our experience doing [close-ended hackathons](https://www.gpumode.com/v2/news/jane-street-hackathon) with Jane Street vs the [open ended ones](https://www.gpumode.com/v2/news/irl-at-accel) we've done with Accel. It very much feels like designing a video game! + +Somewhat related to the above, KernelBot was initially conceived as a way of aggregating high-quality human data, and while the best contestants are still humans, the AIs are catching up, so researchers are now using our infrastructure as an eval suite. This feels far better than asking researchers to locally install some eval script and run over N problems; we can make the results significantly more competitive if we make them more social. + + +#### From scratch repos + +Finally, as we work on all the above problems, we're going to learn the limitations of the current most popular tools.
This is why I'm quite excited about minimal repos that try to get 80% of the performance of a popular repo with 10% of the code, such as [teenygrad](https://github.com/j4orz/teenygrad), an even tinier tinygrad, and [penny](https://github.com/SzymonOzog/Penny), a handrolled version of NCCL. + +Typically these repos are built with the primary purpose of educating the author, as Szymon puts it: “What I cannot create I do not understand.” + +But I'd argue that these repos are tremendously relevant for industrial applications because of the [staggering volume of new Kernel DSLs](https://x.com/tetsuo_cpp/status/2009238107309461782?s=20) that have been released in 2025. I'm very thankful that all of those DSLs have active support groups on [discord.gg/gpumode](https://discord.gg/gpumode), including #triton/gluon, #helion, #cutlass, #cutile, #thunderkittens, #tilelang. [^6] So the $5T question is: what's the right way to program GPUs post-Blackwell? + +Honestly, I think the answer is nobody knows, but if we have minimal repos that capture the gist of important workflows, whether it be pretraining or inference, we could start using these repos as a benchmark suite for new kernel DSLs and get to the answer ourselves. [^7] + +Erik reminded me late last year that our community needs to get back to its CUDA MODE roots, and this will be my contribution towards that end. + + +### Thank you + +At this point we have a long list of companies that have become close friends of GPU MODE, and I'd like to take some time to thank them all. If you're looking for a job in systems, I think you'd benefit from talking to any one of PyTorch at Meta, NVIDIA, AMD, Accel, Lambda, Nebius, Modal, Jane Street, Prime Intellect, SemiAnalysis, Stanford, Mako, Tensorwave, CoreWeave, Hot Aisle, Dell, Unsloth, HuggingFace, Sesterce, and Verda. It's almost criminal not to list out the names of everyone involved, but I worry too much that I'll miss someone. I just want to say to all our partners: thank you for believing in our community. + +But most importantly, thank you to everyone who shows up on the server every day.
You make it all worth it ❤️ + +-- Mark + + +### Popcorn impact in 2025 +Popcorn/KernelLLM +* [NeurIPS tutorial](https://www.linkedin.com/posts/zhousharon_excited-to-share-our-neurips-2025-tutorial-activity-7401628716218638336-Px5m/?utm_source=share&utm_medium=member_desktop&rcm=ACoAAAWbErgBtCPmRJStcIqOGtOiTogXhv9YJ8g) on kernel LLM generation +* Shoutout from [Soumith](https://mlsys.org/virtual/2025/2886) at MLSys 2025 +* [MLSys paper](https://fb.workplace.com/groups/831302610278251/permalink/31990448537270251/) on agentic op generation for ML ASICs +* Shoutout from [Ian Buck](https://www.youtube.com/watch?v=mdDVkBeFy9A&t=2160s) +* Shoutout from [Vamsi Boppana](https://www.amd.com/en/developer/resources/technical-articles/2025/inside-amd-ai-devday-2025.html) +* Shoutout from [Lisa Su](https://x.com/marksaroufim/status/1934712037881647607?s=20) + +Evals: KernelBench and BackendBench +* BackendBench: first eval focused on correctness, [adopted by Prime Intellect](https://app.primeintellect.ai/dashboard/environments/siro/backend-bench) +* [KernelBench](https://simonguo.tech/blog/2025-10-automated-gpu-kernels.html): de facto kernel LLM eval, built at a GPU MODE hackathon +* Shoutout from [Sholto Douglas](https://stateofai.online/) at the State of AI meetup +* Shoutout from [Tianqi Chen](https://arxiv.org/abs/2601.00227v1) at the PyTorch conference + +KernelBot +* Stanford parallel programming class [used our infra](https://github.com/stanford-cs149/asst5-kernels) for kernel competitions +* Used by the NVIDIA C++ team for PMPP perf validation: [PTC poster](https://naderalawar.github.io/files/High-Performance%20CUDA%20Ops%20in%20Python%2C%20JIT-Compiling%20CUB%20with%20cuda.compute.pdf) +* Shoutout from [Tri Dao](https://icml.cc/virtual/2025/48204) at ICML CodeML 2025 +* [KernelBot](https://icml.cc/virtual/2025/48204) CodeML workshop spotlight +* ncu kernel profiling is now something [neoclouds are graded on](https://newsletter.semianalysis.com/p/clustermax-20-the-industry-standard) + +Kernelbook used by +* [NVIDIA Nemotron](https://research.nvidia.com/labs/nemotron/files/NVIDIA-Nemotron-3-White-Paper.pdf) +* [Cornell CWM paper](https://arxiv.org/pdf/2509.26476) +* [FAIR CWM](https://arxiv.org/abs/2510.02387) +* [TritonForge](https://github.com/RLsys-Foundation/TritonForge) +* [TritonRL](https://openreview.net/pdf?id=feJ5T9sFSJ) + + +[^1]: Technically our mission is to make high-performance compute more accessible, but HETEROGENEOUS COMPUTE MODE wouldn't fit nicely on a hat + +[^2]: I'm proud to have written the most popular [Amazon review](https://www.amazon.com/gp/customer-reviews/R1C6W7BNKTRFVQ/ref=cm_cr_dp_d_rvw_ttl?ie=UTF8) for this excellent book + +[^3]: And honestly I did not expect AI to get as good as it did this year, but if we have [5th grade teachers teaching themselves GPU programming](https://x.com/clarkkitchen22/status/2011474826997731385?s=20) in a few days and submitting a working NVFP4 kernel, then we should start paying attention.
[^4]: If you're working on a cool OSS project that touches any of the themes I mention and you're missing compute to make your project a reality, please pitch your project on the server and tag me + +[^5]: We're grateful to our friends at SemiAnalysis for socializing this in [ClusterMax 2.0](https://newsletter.semianalysis.com/p/clustermax-20-the-industry-standard) + +[^6]: [Phil Tillet](https://youtu.be/o3DrHb-mVLM?si=_bgVq0bTDMKkEL79&t=841) himself was one of the first DSL authors to tell people to join our server + +[^7]: After I read Horace's wonderful post on [non-determinism](https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/), my takeaway was that it's easier to have strong guarantees on numerics if an entire post-training infra can fit in one person's head + +[^8]: If you think having more than 3-4 evals for Kernel LLMs is too much, remember that [Deepseek 3.2](https://api-docs.deepseek.com/news/news251201) used over 1,800 environments \ No newline at end of file