From c3c72e83a093be85e40638dfad967e9d950195c4 Mon Sep 17 00:00:00 2001 From: shijiashuai Date: Fri, 15 May 2026 09:43:17 +0800 Subject: [PATCH 1/2] docs(site): fix pages baseline and repo identity Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/workflows/pages.yml | 6 ++++- README.md | 32 +++++++++++++------------- docs/.vitepress/config.ts | 8 +++---- docs/index.md | 18 ++------------- docs/package.json | 3 ++- docs/scripts/verify-site.mjs | 44 ++++++++++++++++++++++++++++++++++++ 6 files changed, 73 insertions(+), 38 deletions(-) create mode 100644 docs/scripts/verify-site.mjs diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index e274dff..bd725a0 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -21,7 +21,7 @@ concurrency: jobs: deploy: # Only run on the original repository, not on forks - if: github.repository == 'LessUp/gpu-spmv' + if: github.repository == 'AICL-Lab/gpu-spmv' runs-on: ubuntu-latest environment: name: github-pages @@ -68,6 +68,10 @@ jobs: working-directory: docs run: npm run sync + - name: Verify docs site + working-directory: docs + run: npm run verify:site + - name: Build docs working-directory: docs env: diff --git a/README.md b/README.md index 735abfe..cf7ff4b 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,16 @@

- - CI + + CI - + Documentation - - Release + + Release - + License

@@ -105,7 +105,7 @@ if (result.error != SpMVError::SUCCESS) { ```bash # 1. Clone -git clone https://github.com/LessUp/gpu-spmv.git && cd gpu-spmv +git clone https://github.com/AICL-Lab/gpu-spmv.git && cd gpu-spmv # 2. Build cmake --preset release && cmake --build --preset release @@ -143,7 +143,7 @@ int main() { } ``` -📚 **More examples**: [Documentation Site](https://lessup.github.io/gpu-spmv/examples) +📚 **More examples**: [Documentation Site](https://aicl-lab.github.io/gpu-spmv/en/examples/basic-spmv) --- @@ -167,7 +167,7 @@ Benchmark on **NVIDIA RTX 3090** (Ampere, 936 GB/s peak): # Avg time: 23.5 ms | Bandwidth: 69.8 GB/s (71.5% of peak) ``` -📈 **Full performance guide**: [Performance Optimization](https://lessup.github.io/gpu-spmv/performance) +📈 **Full performance guide**: [Performance Optimization](https://aicl-lab.github.io/gpu-spmv/en/performance/optimization-guide) --- @@ -194,16 +194,16 @@ gpu-spmv/ ## 📚 Documentation -Complete documentation is available at **[https://lessup.github.io/gpu-spmv/](https://lessup.github.io/gpu-spmv/)**: +Complete documentation is available at **[https://aicl-lab.github.io/gpu-spmv/](https://aicl-lab.github.io/gpu-spmv/)**: | Document | Description | |:---------|:------------| -| [📦 Installation Guide](https://lessup.github.io/gpu-spmv/installation) | System requirements, detailed installation | -| [📚 API Reference](https://lessup.github.io/gpu-spmv/api) | Complete API documentation, data structures | -| [📝 Examples](https://lessup.github.io/gpu-spmv/examples) | 7 complete code examples (basic → advanced) | -| [🚀 Performance Guide](https://lessup.github.io/gpu-spmv/performance) | Tuning strategies, benchmark data | -| [🏗️ Architecture](https://lessup.github.io/gpu-spmv/architecture) | System design, kernel selection | -| [📋 Changelog](https://lessup.github.io/gpu-spmv/changelog) | Version history, migration guide | +| [📦 Installation Guide](https://aicl-lab.github.io/gpu-spmv/en/quickstart) | System requirements, detailed 
installation | +| [📚 API Reference](https://aicl-lab.github.io/gpu-spmv/en/api/spmv) | Complete API documentation, data structures | +| [📝 Examples](https://aicl-lab.github.io/gpu-spmv/en/examples/basic-spmv) | End-to-end code example and walkthrough | +| [🚀 Performance Guide](https://aicl-lab.github.io/gpu-spmv/en/performance/optimization-guide) | Tuning strategies, benchmark data | +| [🏗️ Architecture](https://aicl-lab.github.io/gpu-spmv/en/architecture/overview) | System design, kernel selection | +| [📋 Changelog](https://aicl-lab.github.io/gpu-spmv/en/changelog) | Version history, migration guide | --- diff --git a/docs/.vitepress/config.ts b/docs/.vitepress/config.ts index 3cf4ba1..5da08f9 100644 --- a/docs/.vitepress/config.ts +++ b/docs/.vitepress/config.ts @@ -124,7 +124,7 @@ export default withMermaid( ] }, editLink: { - pattern: 'https://github.com/LessUp/gpu-spmv/edit/main/docs/:path', + pattern: 'https://github.com/AICL-Lab/gpu-spmv/edit/main/docs/:path', text: '在 GitHub 上编辑此页' }, docFooter: { prev: '上一页', next: '下一页' }, @@ -210,7 +210,7 @@ export default withMermaid( ] }, editLink: { - pattern: 'https://github.com/LessUp/gpu-spmv/edit/main/docs/:path', + pattern: 'https://github.com/AICL-Lab/gpu-spmv/edit/main/docs/:path', text: 'Edit this page on GitHub' }, outline: { label: 'On This Page', level: [2, 3] } @@ -222,12 +222,12 @@ export default withMermaid( logo: '/images/logo.svg', siteTitle: 'GPU SpMV', socialLinks: [ - { icon: 'github', link: 'https://github.com/LessUp/gpu-spmv' } + { icon: 'github', link: 'https://github.com/AICL-Lab/gpu-spmv' } ], search: { provider: 'local' }, footer: { message: 'MIT License', - copyright: '© 2024-2026 LessUp' + copyright: '© 2024-2026 AICL-Lab' }, outline: [2, 3] }, diff --git a/docs/index.md b/docs/index.md index 0177402..71e521c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,7 +2,8 @@ layout: home hero: name: GPU SpMV - text: ' ' + text: Bilingual Technical Whitepaper and Architecture Showcase + tagline: 
Read the project as a serious engineering artifact, not only as source code. actions: - theme: brand text: 简体中文 @@ -11,18 +12,3 @@ hero: text: English link: /en/ --- - - diff --git a/docs/package.json b/docs/package.json index b51c899..6dd19ef 100644 --- a/docs/package.json +++ b/docs/package.json @@ -5,8 +5,9 @@ "type": "module", "scripts": { "sync": "node scripts/sync-changelog.mjs", + "verify:site": "node scripts/verify-site.mjs", "dev": "npm run sync && vitepress dev", - "build": "npm run sync && vitepress build", + "build": "npm run sync && npm run verify:site && vitepress build", "preview": "vitepress preview" }, "devDependencies": { diff --git a/docs/scripts/verify-site.mjs b/docs/scripts/verify-site.mjs new file mode 100644 index 0000000..7106669 --- /dev/null +++ b/docs/scripts/verify-site.mjs @@ -0,0 +1,44 @@ +import { readFileSync } from 'node:fs' +import { join } from 'node:path' + +const root = process.cwd() +const canonicalRepo = 'AICL-Lab/gpu-spmv' + +const files = { + readme: join(root, '..', 'README.md'), + config: join(root, '.vitepress', 'config.ts'), + pages: join(root, '..', '.github', 'workflows', 'pages.yml'), + index: join(root, 'index.md') +} + +const contents = Object.fromEntries( + Object.entries(files).map(([key, filePath]) => [key, readFileSync(filePath, 'utf8')]) +) + +const failures = [] + +if (!contents.config.includes(canonicalRepo)) { + failures.push('config missing canonical repo') +} + +if (!contents.pages.includes("github.repository == 'AICL-Lab/gpu-spmv'")) { + failures.push('pages workflow missing canonical repo guard') +} + +if (/LessUp\/gpu-spmv|github\.com\/LessUp/.test(Object.values(contents).join('\n'))) { + failures.push('legacy LessUp repo references still present') +} + +if (/useRouter\(|router\.go\('\/(zh|en)\//.test(contents.index)) { + failures.push('root docs index still auto-redirects by locale') +} + +if (failures.length > 0) { + console.error('verify-site failed:') + for (const failure of failures) { + 
console.error(`- ${failure}`) + } + process.exit(1) +} + +console.log('verify-site: ok') From e9b8ca774795818bde4dcabcc7010a7543e2b889 Mon Sep 17 00:00:00 2001 From: shijiashuai Date: Fri, 15 May 2026 10:02:43 +0800 Subject: [PATCH 2/2] feat(docs): rebuild whitepaper site and content Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CHANGELOG.md | 6 +- docs/.vitepress/config.ts | 23 +- docs/.vitepress/data/benchmarks.ts | 20 + docs/.vitepress/data/references.ts | 48 + docs/.vitepress/data/site.ts | 29 + docs/.vitepress/theme/Layout.vue | 13 + .../theme/components/ArchitectureCanvas.vue | 73 ++ .../theme/components/CalloutPanel.vue | 20 + .../theme/components/CitationGrid.vue | 20 + .../theme/components/HeroEvidence.vue | 47 + .../theme/components/MetricStrip.vue | 19 + .../theme/components/ThemeAwareArt.vue | 17 + .../theme/components/WhitepaperSection.vue | 24 + docs/.vitepress/theme/index.ts | 20 +- docs/.vitepress/theme/style.css | 933 +----------------- docs/.vitepress/theme/styles/base.css | 67 ++ docs/.vitepress/theme/styles/citation.css | 25 + docs/.vitepress/theme/styles/diagram.css | 54 + docs/.vitepress/theme/styles/home.css | 89 ++ docs/.vitepress/theme/styles/paper.css | 44 + docs/.vitepress/theme/styles/tokens.css | 40 + docs/en/architecture/execution-pipeline.md | 28 + docs/en/architecture/overview.md | 10 +- docs/en/architecture/reliability.md | 25 + docs/en/architecture/spec-driven.md | 2 +- docs/en/changelog.md | 6 +- docs/en/citation.md | 63 +- docs/en/contributing.md | 4 +- docs/en/faq.md | 2 +- docs/en/index.md | 179 +--- docs/en/performance/benchmarks.md | 36 +- docs/en/performance/methodology.md | 19 + docs/en/quickstart.md | 2 +- docs/en/references.md | 83 +- docs/en/whitepaper/index.md | 114 +-- docs/en/whitepaper/performance.md | 2 +- docs/public/images/brand/logo-mark-dark.svg | 11 + docs/public/images/brand/logo-mark-light.svg | 11 + docs/public/images/favicon.svg | 2 +- docs/public/images/logo.svg | 2 +- 
docs/public/images/og-image.svg | 4 +- docs/public/images/social/og-dark.svg | 28 + docs/public/images/social/og-light.svg | 28 + docs/scripts/verify-site.mjs | 147 ++- docs/zh/architecture/execution-pipeline.md | 28 + docs/zh/architecture/overview.md | 38 +- docs/zh/architecture/reliability.md | 25 + docs/zh/architecture/spec-driven.md | 2 +- docs/zh/changelog.md | 6 +- docs/zh/citation.md | 63 +- docs/zh/contributing.md | 4 +- docs/zh/faq.md | 2 +- docs/zh/index.md | 179 +--- docs/zh/performance/benchmarks.md | 36 +- docs/zh/performance/methodology.md | 19 + docs/zh/quickstart.md | 2 +- docs/zh/references.md | 81 +- docs/zh/whitepaper/index.md | 112 +-- docs/zh/whitepaper/performance.md | 2 +- 59 files changed, 1306 insertions(+), 1732 deletions(-) create mode 100644 docs/.vitepress/data/benchmarks.ts create mode 100644 docs/.vitepress/data/references.ts create mode 100644 docs/.vitepress/data/site.ts create mode 100644 docs/.vitepress/theme/Layout.vue create mode 100644 docs/.vitepress/theme/components/ArchitectureCanvas.vue create mode 100644 docs/.vitepress/theme/components/CalloutPanel.vue create mode 100644 docs/.vitepress/theme/components/CitationGrid.vue create mode 100644 docs/.vitepress/theme/components/HeroEvidence.vue create mode 100644 docs/.vitepress/theme/components/MetricStrip.vue create mode 100644 docs/.vitepress/theme/components/ThemeAwareArt.vue create mode 100644 docs/.vitepress/theme/components/WhitepaperSection.vue create mode 100644 docs/.vitepress/theme/styles/base.css create mode 100644 docs/.vitepress/theme/styles/citation.css create mode 100644 docs/.vitepress/theme/styles/diagram.css create mode 100644 docs/.vitepress/theme/styles/home.css create mode 100644 docs/.vitepress/theme/styles/paper.css create mode 100644 docs/.vitepress/theme/styles/tokens.css create mode 100644 docs/en/architecture/execution-pipeline.md create mode 100644 docs/en/architecture/reliability.md create mode 100644 docs/en/performance/methodology.md create mode 
100644 docs/public/images/brand/logo-mark-dark.svg create mode 100644 docs/public/images/brand/logo-mark-light.svg create mode 100644 docs/public/images/social/og-dark.svg create mode 100644 docs/public/images/social/og-light.svg create mode 100644 docs/zh/architecture/execution-pipeline.md create mode 100644 docs/zh/architecture/reliability.md create mode 100644 docs/zh/performance/methodology.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 0df6d91..2392e06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,7 +42,7 @@ This is the first stable release of GPU SpMV, featuring complete CSR and ELL for - Doxygen-compatible documentation #### Documentation -- Full documentation site at https://lessup.github.io/gpu-spmv/ +- Full documentation site at https://aicl-lab.github.io/gpu-spmv/ - Bilingual README (English and Chinese) - API reference, performance guide, and code examples - Architecture documentation and design decision records @@ -140,5 +140,5 @@ No breaking changes from pre-release versions. The API is now stable. 
--- -[1.0.0]: https://github.com/LessUp/gpu-spmv/releases/tag/v1.0.0 -[0.1.0]: https://github.com/LessUp/gpu-spmv/tree/7d6dd0c +[1.0.0]: https://github.com/AICL-Lab/gpu-spmv/releases/tag/v1.0.0 +[0.1.0]: https://github.com/AICL-Lab/gpu-spmv/tree/7d6dd0c diff --git a/docs/.vitepress/config.ts b/docs/.vitepress/config.ts index 5da08f9..051eefd 100644 --- a/docs/.vitepress/config.ts +++ b/docs/.vitepress/config.ts @@ -26,11 +26,11 @@ export default withMermaid( content: 'High-Performance CUDA Sparse Matrix-Vector Multiplication Library' } ], - ['meta', { property: 'og:image', content: `${base}images/og-image.svg` }], + ['meta', { property: 'og:image', content: `${base}images/social/og-dark.svg` }], ['meta', { name: 'twitter:card', content: 'summary_large_image' }], ['meta', { name: 'twitter:title', content: 'GPU SpMV' }], ['meta', { name: 'twitter:description', content: 'High-Performance CUDA Sparse Matrix-Vector Multiplication Library' }], - ['meta', { name: 'twitter:image', content: `${base}images/og-image.svg` }], + ['meta', { name: 'twitter:image', content: `${base}images/social/og-dark.svg` }], ['link', { rel: 'icon', href: `${base}images/favicon.svg`, type: 'image/svg+xml' }], ['link', { rel: 'preconnect', href: 'https://fonts.googleapis.com' }], ['link', { rel: 'preconnect', href: 'https://fonts.gstatic.com', crossorigin: '' }], @@ -59,10 +59,10 @@ export default withMermaid( themeConfig: { nav: [ { text: '技术白皮书', link: '/zh/whitepaper/', activeMatch: '/zh/whitepaper/' }, - { text: '快速开始', link: '/zh/quickstart', activeMatch: '/zh/(quickstart|examples)/' }, { text: '架构设计', link: '/zh/architecture/overview', activeMatch: '/zh/architecture/' }, + { text: '性能测试', link: '/zh/performance/benchmarks', activeMatch: '/zh/performance/' }, { text: 'API 参考', link: '/zh/api/spmv', activeMatch: '/zh/api/' }, - { text: '性能测试', link: '/zh/performance/benchmarks', activeMatch: '/zh/performance/' } + { text: '学术引用', link: '/zh/references', activeMatch: 
'/zh/(references|citation)/' } ], sidebar: { '/zh/': [ @@ -88,8 +88,10 @@ export default withMermaid( collapsed: false, items: [ { text: '系统概览', link: '/zh/architecture/overview' }, + { text: '执行流水线', link: '/zh/architecture/execution-pipeline' }, { text: 'Kernel 选择策略', link: '/zh/architecture/kernel-selection' }, { text: '内存布局', link: '/zh/architecture/memory-layout' }, + { text: '可靠性约束', link: '/zh/architecture/reliability' }, { text: 'Spec-Driven 开发', link: '/zh/architecture/spec-driven' } ] }, @@ -97,6 +99,7 @@ export default withMermaid( text: '性能优化', collapsed: false, items: [ + { text: '性能方法学', link: '/zh/performance/methodology' }, { text: '基准测试', link: '/zh/performance/benchmarks' }, { text: '优化指南', link: '/zh/performance/optimization-guide' } ] @@ -145,10 +148,10 @@ export default withMermaid( themeConfig: { nav: [ { text: 'Whitepaper', link: '/en/whitepaper/', activeMatch: '/en/whitepaper/' }, - { text: 'Getting Started', link: '/en/quickstart', activeMatch: '/en/(quickstart|examples)/' }, { text: 'Architecture', link: '/en/architecture/overview', activeMatch: '/en/architecture/' }, + { text: 'Benchmarks', link: '/en/performance/benchmarks', activeMatch: '/en/performance/' }, { text: 'API Reference', link: '/en/api/spmv', activeMatch: '/en/api/' }, - { text: 'Benchmarks', link: '/en/performance/benchmarks', activeMatch: '/en/performance/' } + { text: 'References', link: '/en/references', activeMatch: '/en/(references|citation)/' } ], sidebar: { '/en/': [ @@ -174,8 +177,10 @@ export default withMermaid( collapsed: false, items: [ { text: 'System Overview', link: '/en/architecture/overview' }, + { text: 'Execution Pipeline', link: '/en/architecture/execution-pipeline' }, { text: 'Kernel Selection', link: '/en/architecture/kernel-selection' }, { text: 'Memory Layout', link: '/en/architecture/memory-layout' }, + { text: 'Reliability Constraints', link: '/en/architecture/reliability' }, { text: 'Spec-Driven Dev', link: '/en/architecture/spec-driven' } ] }, @@ 
-183,6 +188,7 @@ export default withMermaid( text: 'Performance', collapsed: false, items: [ + { text: 'Methodology', link: '/en/performance/methodology' }, { text: 'Benchmarks', link: '/en/performance/benchmarks' }, { text: 'Optimization Guide', link: '/en/performance/optimization-guide' } ] @@ -219,7 +225,10 @@ export default withMermaid( }, themeConfig: { - logo: '/images/logo.svg', + logo: { + light: '/images/brand/logo-mark-light.svg', + dark: '/images/brand/logo-mark-dark.svg' + }, siteTitle: 'GPU SpMV', socialLinks: [ { icon: 'github', link: 'https://github.com/AICL-Lab/gpu-spmv' } diff --git a/docs/.vitepress/data/benchmarks.ts b/docs/.vitepress/data/benchmarks.ts new file mode 100644 index 0000000..6cb02eb --- /dev/null +++ b/docs/.vitepress/data/benchmarks.ts @@ -0,0 +1,20 @@ +export const benchmarkData = { + environment: [ + { label: 'GPU', value: 'RTX 3090' }, + { label: 'Peak BW', value: '936 GB/s' }, + { label: 'CUDA', value: '12.0' }, + { label: 'CPU', value: 'Ryzen 9 5950X' } + ], + summary: [ + { label: 'Typical Utilization', value: '70%+' }, + { label: 'Best Kernel Family', value: 'Merge Path' }, + { label: 'Best Regular Pattern', value: 'ELL' }, + { label: 'Selector Accuracy', value: '100%' } + ], + scenarios: [ + { label: 'Very sparse', value: 'Scalar CSR', description: 'avg_nnz_per_row < 4' }, + { label: 'Uniform rows', value: 'Vector CSR', description: 'Low skewness, good warp utilization' }, + { label: 'High skew', value: 'Merge Path', description: 'Irregular row lengths with better balancing' }, + { label: 'ELL-friendly', value: 'ELL Kernel', description: 'Uniform row width, coalesced memory access' } + ] +} diff --git a/docs/.vitepress/data/references.ts b/docs/.vitepress/data/references.ts new file mode 100644 index 0000000..b39962c --- /dev/null +++ b/docs/.vitepress/data/references.ts @@ -0,0 +1,48 @@ +export const references = { + papers: [ + { + key: 'bell-garland-2009', + title: 'Implementing Sparse Matrix-Vector Multiplication on 
Throughput-Oriented Processors', + meta: 'Nathan Bell, Michael Garland · SC 2009', + url: 'https://doi.org/10.1145/1654059.1654121' + }, + { + key: 'merrill-garland-2016', + title: 'Merge-based Parallel Sparse Matrix-Vector Multiplication', + meta: 'Duane Merrill, Michael Garland · SC 2016', + url: 'https://doi.org/10.1145/3016078.2851141' + }, + { + key: 'vazquez-ellrt-2011', + title: 'Automatic Tuning of the Sparse Matrix Vector Product on GPUs Based on the ELL-R-T Format', + meta: 'Fernando Vázquez et al. · Concurrency and Computation 2011', + url: 'https://doi.org/10.1002/cpe.1761' + } + ], + projects: [ + { + key: 'cusparse', + title: 'NVIDIA cuSPARSE', + meta: 'Vendor baseline and API reference for sparse GPU primitives', + url: 'https://developer.nvidia.com/cusparse' + }, + { + key: 'ginkgo', + title: 'Ginkgo', + meta: 'Production-quality sparse linear algebra library with strong docs', + url: 'https://github.com/ginkgo-project/ginkgo' + }, + { + key: 'moderngpu', + title: 'ModernGPU', + meta: 'Useful for understanding scan / merge / partitioning techniques on GPU', + url: 'https://github.com/moderngpu/moderngpu' + }, + { + key: 'suitesparse', + title: 'SuiteSparse Matrix Collection', + meta: 'Representative real-world sparse matrices for benchmark reasoning', + url: 'https://github.com/DrTimothyAldenDavis/SuiteSparse' + } + ] +} diff --git a/docs/.vitepress/data/site.ts b/docs/.vitepress/data/site.ts new file mode 100644 index 0000000..bdfb3fd --- /dev/null +++ b/docs/.vitepress/data/site.ts @@ -0,0 +1,29 @@ +export type SiteMetric = { + label: string + value: string + description?: string +} + +export const siteData = { + repo: 'https://github.com/AICL-Lab/gpu-spmv', + zh: { + heroTitle: 'GPU SpMV:技术白皮书与架构展示站', + heroLead: '把 CUDA 稀疏矩阵向量乘法项目打造成可读、可证、可展示的工程作品。', + metrics: [ + { label: 'Bandwidth Utilization', value: '70%+' }, + { label: 'Adaptive Kernels', value: '4' }, + { label: 'Sparse Formats', value: 'CSR + ELL' }, + { label: 'Property Tests', value: 
'100+' } + ] satisfies SiteMetric[] + }, + en: { + heroTitle: 'GPU SpMV: Technical Whitepaper and Architecture Showcase', + heroLead: 'Present the CUDA sparse matrix-vector multiplication project as a serious engineering artifact.', + metrics: [ + { label: 'Bandwidth Utilization', value: '70%+' }, + { label: 'Adaptive Kernels', value: '4' }, + { label: 'Sparse Formats', value: 'CSR + ELL' }, + { label: 'Property Tests', value: '100+' } + ] satisfies SiteMetric[] + } +} diff --git a/docs/.vitepress/theme/Layout.vue b/docs/.vitepress/theme/Layout.vue new file mode 100644 index 0000000..68d3147 --- /dev/null +++ b/docs/.vitepress/theme/Layout.vue @@ -0,0 +1,13 @@ + + + diff --git a/docs/.vitepress/theme/components/ArchitectureCanvas.vue b/docs/.vitepress/theme/components/ArchitectureCanvas.vue new file mode 100644 index 0000000..1043e38 --- /dev/null +++ b/docs/.vitepress/theme/components/ArchitectureCanvas.vue @@ -0,0 +1,73 @@ + + + diff --git a/docs/.vitepress/theme/components/CalloutPanel.vue b/docs/.vitepress/theme/components/CalloutPanel.vue new file mode 100644 index 0000000..a44aaf3 --- /dev/null +++ b/docs/.vitepress/theme/components/CalloutPanel.vue @@ -0,0 +1,20 @@ + + + diff --git a/docs/.vitepress/theme/components/CitationGrid.vue b/docs/.vitepress/theme/components/CitationGrid.vue new file mode 100644 index 0000000..7855fc0 --- /dev/null +++ b/docs/.vitepress/theme/components/CitationGrid.vue @@ -0,0 +1,20 @@ + + + diff --git a/docs/.vitepress/theme/components/HeroEvidence.vue b/docs/.vitepress/theme/components/HeroEvidence.vue new file mode 100644 index 0000000..26bc538 --- /dev/null +++ b/docs/.vitepress/theme/components/HeroEvidence.vue @@ -0,0 +1,47 @@ + + + diff --git a/docs/.vitepress/theme/components/MetricStrip.vue b/docs/.vitepress/theme/components/MetricStrip.vue new file mode 100644 index 0000000..d210b10 --- /dev/null +++ b/docs/.vitepress/theme/components/MetricStrip.vue @@ -0,0 +1,19 @@ + + + diff --git 
a/docs/.vitepress/theme/components/ThemeAwareArt.vue b/docs/.vitepress/theme/components/ThemeAwareArt.vue new file mode 100644 index 0000000..1ec9f6c --- /dev/null +++ b/docs/.vitepress/theme/components/ThemeAwareArt.vue @@ -0,0 +1,17 @@ + + + diff --git a/docs/.vitepress/theme/components/WhitepaperSection.vue b/docs/.vitepress/theme/components/WhitepaperSection.vue new file mode 100644 index 0000000..5bafc9a --- /dev/null +++ b/docs/.vitepress/theme/components/WhitepaperSection.vue @@ -0,0 +1,24 @@ + + + diff --git a/docs/.vitepress/theme/index.ts b/docs/.vitepress/theme/index.ts index 347ed6e..4b6a940 100644 --- a/docs/.vitepress/theme/index.ts +++ b/docs/.vitepress/theme/index.ts @@ -1,6 +1,24 @@ import DefaultTheme from 'vitepress/theme' +import Layout from './Layout.vue' +import HeroEvidence from './components/HeroEvidence.vue' +import MetricStrip from './components/MetricStrip.vue' +import WhitepaperSection from './components/WhitepaperSection.vue' +import ArchitectureCanvas from './components/ArchitectureCanvas.vue' +import CitationGrid from './components/CitationGrid.vue' +import ThemeAwareArt from './components/ThemeAwareArt.vue' +import CalloutPanel from './components/CalloutPanel.vue' import './style.css' export default { - extends: DefaultTheme + extends: DefaultTheme, + Layout, + enhanceApp({ app }) { + app.component('HeroEvidence', HeroEvidence) + app.component('MetricStrip', MetricStrip) + app.component('WhitepaperSection', WhitepaperSection) + app.component('ArchitectureCanvas', ArchitectureCanvas) + app.component('CitationGrid', CitationGrid) + app.component('ThemeAwareArt', ThemeAwareArt) + app.component('CalloutPanel', CalloutPanel) + } } diff --git a/docs/.vitepress/theme/style.css b/docs/.vitepress/theme/style.css index 35caddf..b7a0e06 100644 --- a/docs/.vitepress/theme/style.css +++ b/docs/.vitepress/theme/style.css @@ -1,927 +1,6 @@ -/** - * GPU SpMV Documentation Theme - * Technical Whitepaper / Architecture Showcase - * - * Design system: 
- * - Brand: NVIDIA Green (#76B900) - * - Accent: CUDA Teal (#00D4AA) - * - Light mode: clean white - * - Dark mode: GitHub Dark (#0d1117) - */ - -/* === CSS Variables: Light Mode (Default) === */ -:root { - /* Brand: NVIDIA Green */ - --vp-c-brand-1: #5A8F00; - --vp-c-brand-2: #76B900; - --vp-c-brand-3: #8ED100; - --vp-c-brand-soft: rgba(118, 185, 0, 0.14); - - /* Accent: CUDA Teal */ - --spmv-accent: #00B894; - --spmv-accent-soft: rgba(0, 184, 148, 0.14); - - /* Background */ - --vp-c-bg: #ffffff; - --vp-c-bg-alt: #f6f8fa; - --vp-c-bg-soft: #f6f8fa; - --vp-c-bg-elv: #ffffff; - --vp-c-bg-mute: #f6f8fa; - - /* Text */ - --vp-c-text-1: #24292f; - --vp-c-text-2: #57606a; - --vp-c-text-3: #8b949e; - - /* Border */ - --vp-c-border: #d0d7de; - --vp-c-divider: #d0d7de; - --vp-c-gutter: #d0d7de; - - /* Hero */ - --vp-home-hero-name-color: #5A8F00; - --vp-home-hero-name-background: transparent; - --vp-home-hero-image-background-image: none; - --vp-home-hero-image-filter: none; - - /* Typography */ - --vp-font-family-base: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; - --vp-font-family-mono: 'JetBrains Mono', 'Fira Code', SFMono-Regular, Menlo, monospace; - - /* Layout */ - --vp-layout-max-width: 1440px; - --vp-sidebar-width: 280px; - - /* Code */ - --vp-code-block-bg: #f6f8fa; - --vp-code-block-border: #d0d7de; - - /* Homepage */ - --tag-bg: rgba(118, 185, 0, 0.12); - --tag-text: #4A7600; - - /* Spacing */ - --spacing-xs: 8px; - --spacing-sm: 12px; - --spacing-md: 16px; - --spacing-lg: 24px; - --spacing-xl: 32px; - --spacing-2xl: 40px; - - /* Radius */ - --radius-sm: 6px; - --radius-md: 8px; - --radius-lg: 12px; - --radius-xl: 16px; - - /* Transitions */ - --transition-fast: 0.15s ease; - --transition-normal: 0.2s ease; - - /* Whitepaper-specific */ - --spmv-card-shadow: 0 1px 3px rgba(0, 0, 0, 0.06); -} - -/* === Dark Mode Override === */ -.dark { - --vp-c-brand-1: #76B900; - --vp-c-brand-2: #5A8F00; - --vp-c-brand-3: #4A7600; - 
--vp-c-brand-soft: rgba(118, 185, 0, 0.16); - - --spmv-accent: #00D4AA; - --spmv-accent-soft: rgba(0, 212, 170, 0.16); - - --vp-c-bg: #0d1117; - --vp-c-bg-alt: #161b22; - --vp-c-bg-soft: #21262d; - --vp-c-bg-elv: #21262d; - --vp-c-bg-mute: #21262d; - - --vp-c-text-1: #c9d1d9; - --vp-c-text-2: #8b949e; - --vp-c-text-3: #6e7681; - - --vp-c-border: #30363d; - --vp-c-divider: #30363d; - --vp-c-gutter: #30363d; - - --vp-home-hero-name-color: #76B900; - - --vp-code-block-bg: #0d1117; - --vp-code-block-border: #30363d; - - --tag-bg: rgba(118, 185, 0, 0.18); - --tag-text: #76B900; - - --spmv-card-shadow: 0 1px 3px rgba(0, 0, 0, 0.3); -} - -/* === Hero Section === */ -.VPHero { - padding: calc(var(--vp-nav-height) + 48px) var(--spacing-lg) 48px; -} - -.VPHero .name { - font-size: 32px; - font-weight: 700; - line-height: 1.2; - letter-spacing: -0.5px; - color: var(--vp-c-brand-1); -} - -.VPHero .text { - font-size: 16px; - font-weight: 400; - line-height: 1.5; - color: var(--vp-c-text-2); - margin-top: var(--spacing-sm); -} - -.VPHero .tagline { - font-size: 14px; - line-height: 1.6; - color: var(--vp-c-text-3); - margin-top: var(--spacing-md); - max-width: 600px; -} - -.VPHero .image { - display: none; -} - -/* === Buttons === */ -.VPHero .actions { - margin-top: var(--spacing-xl); - gap: var(--spacing-sm); -} - -.VPHero .VPButton { - border-radius: var(--radius-sm); - padding: 8px 16px; - font-size: 14px; - font-weight: 500; - transition: all var(--transition-fast); -} - -.VPHero .VPButton.medium.brand { - background: var(--vp-c-brand-1); - border: none; - color: #fff; -} - -.VPHero .VPButton.medium.brand:hover { - background: var(--vp-c-brand-2); -} - -.VPHero .VPButton.medium.alt { - border: 1px solid var(--vp-c-border); - background: transparent; - color: var(--vp-c-text-1); -} - -.VPHero .VPButton.medium.alt:hover { - border-color: var(--vp-c-brand-1); - color: var(--vp-c-brand-1); -} - -/* === Features === */ -.VPFeatures { - padding: 48px var(--spacing-lg); -} - 
-.VPFeatures .container { - max-width: 1200px; -} - -.VPFeatures .VPFeature { - padding: var(--spacing-lg); - border-radius: var(--radius-md); - background: var(--vp-c-bg); - border: 1px solid var(--vp-c-border); - transition: border-color var(--transition-fast); -} - -.VPFeatures .VPFeature:hover { - border-color: var(--vp-c-brand-1); -} - -.VPFeature .icon { - display: flex; - align-items: center; - justify-content: center; - width: 40px; - height: 40px; - font-size: 20px; - margin-bottom: var(--spacing-sm); -} - -.VPFeature .title { - font-size: 16px; - font-weight: 600; - margin-bottom: var(--spacing-xs); - color: var(--vp-c-text-1); -} - -.VPFeature .details { - font-size: 14px; - line-height: 1.6; - color: var(--vp-c-text-2); -} - -/* === Navigation === */ -.VPNav { - background: var(--vp-c-bg); - border-bottom: 1px solid var(--vp-c-border); -} - -.VPNavBarMenuLink, -.VPNavBarMenuGroup .button { - font-weight: 500; - font-size: 14px; -} - -.VPNavBarMenuLink.active, -.VPNavBarMenuGroup.open .button { - color: var(--vp-c-brand-1); -} - -/* === Search === */ -.VPNavBarSearch { - display: flex; - justify-content: flex-end; -} - -.VPLocalSearchBox { - backdrop-filter: blur(20px); -} - -.VPLocalSearchBox .backdrop { - background: rgba(0, 0, 0, 0.6); - backdrop-filter: blur(4px); -} - -.VPLocalSearchBox .shell { - border-radius: var(--radius-xl); - box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.25); -} - -/* === Sidebar === */ -.VPSidebar { - padding: var(--spacing-lg) var(--spacing-md) 96px; -} - -.VPSidebarGroup { - margin-bottom: var(--spacing-md); -} - -.VPSidebarGroup .title { - font-size: 13px; - font-weight: 600; - text-transform: uppercase; - letter-spacing: 0.5px; - color: var(--vp-c-text-3); - padding: var(--spacing-sm) 14px; -} - -.VPSidebarItem .text { - font-size: 14px; - padding: var(--spacing-xs) 14px; - border-radius: var(--radius-md); - transition: all var(--transition-fast); -} - -.VPSidebarItem .text:hover { - background: var(--vp-c-bg-soft); - 
color: var(--vp-c-brand-1); -} - -.VPSidebarItem.is-active .text { - background: var(--vp-c-brand-soft); - color: var(--vp-c-brand-1); - font-weight: 500; -} - -/* === Content === */ -.VPDoc { - padding: var(--spacing-xl) var(--spacing-lg) 96px; -} - -.VPDoc .content { - max-width: 720px; -} - -.VPDoc h1 { - font-size: 40px; - font-weight: 700; - letter-spacing: -1px; - margin-bottom: var(--spacing-lg); -} - -.VPDoc h2 { - font-size: 28px; - font-weight: 600; - margin-top: 48px; - margin-bottom: 20px; - padding-bottom: var(--spacing-sm); - border-bottom: 1px solid var(--vp-c-divider); -} - -.VPDoc h3 { - font-size: 20px; - font-weight: 600; - margin-top: var(--spacing-xl); - margin-bottom: var(--spacing-md); -} - -.VPDoc p { - font-size: 16px; - line-height: 1.8; - margin-bottom: var(--spacing-md); -} - -.VPDoc a { - color: var(--vp-c-brand-1); - text-decoration: none; - border-bottom: 1px solid transparent; - transition: border-color var(--transition-fast); -} - -.VPDoc a:hover { - border-bottom-color: var(--vp-c-brand-1); -} - -.VPDoc code { - font-size: 14px; - padding: 2px var(--spacing-xs); - border-radius: var(--radius-sm); - background: var(--vp-c-bg-soft); - color: var(--vp-c-text-1); -} - -.VPDoc pre code { - font-size: 14px; - padding: 0; - background: transparent; -} - -/* Custom Blocks */ -.VPDoc .custom-block { - border-radius: var(--radius-lg); - padding: 20px var(--spacing-lg); - margin: var(--spacing-lg) 0; -} - -.VPDoc .custom-block.info { - background: var(--vp-c-bg-soft); - border-left: 4px solid var(--vp-c-brand-1); -} - -.VPDoc .custom-block.tip { - background: var(--spmv-accent-soft); - border-left: 4px solid var(--spmv-accent); -} - -.VPDoc .custom-block.warning { - background: rgba(234, 179, 8, 0.14); - border-left: 4px solid #EAB308; -} - -.VPDoc .custom-block.danger { - background: rgba(239, 68, 68, 0.14); - border-left: 4px solid #EF4444; -} - -/* === Tables === */ -.VPDoc table { - width: 100%; - border-collapse: separate; - 
border-spacing: 0; - margin: var(--spacing-lg) 0; - border-radius: var(--radius-lg); - overflow: hidden; - border: 1px solid var(--vp-c-border); -} - -.VPDoc th { - background: var(--vp-c-bg-soft); - font-weight: 600; - text-align: left; - padding: 14px 18px; - font-size: 14px; - border-bottom: 1px solid var(--vp-c-border); -} - -.VPDoc td { - padding: var(--spacing-sm) 18px; - font-size: 14px; - border-bottom: 1px solid var(--vp-c-border); -} - -.VPDoc tr:last-child td { - border-bottom: none; -} - -.VPDoc tr:hover td { - background: var(--vp-c-bg-soft); -} - -/* === Footer === */ -.VPFooter { - padding: var(--spacing-2xl) var(--spacing-lg); - background: var(--vp-c-bg-alt); -} - -.VPFooter .message, -.VPFooter .copyright { - font-size: 14px; - color: var(--vp-c-text-3); -} - -/* === Mermaid === */ -.mermaid { - background: var(--vp-c-bg-soft); - border-radius: var(--radius-lg); - padding: var(--spacing-lg); - margin: var(--spacing-lg) 0; - text-align: center; -} - -/* === Homepage: Header === */ -.home-header { - display: flex; - justify-content: space-between; - align-items: center; - padding: var(--spacing-md) 0; - margin-bottom: var(--spacing-xl); - border-bottom: 1px solid var(--vp-c-border); -} - -.home-header-left { - display: flex; - align-items: center; - gap: var(--spacing-sm); -} - -.home-logo { - width: 36px; - height: 36px; - background: var(--vp-c-brand-1); - border-radius: 6px; - display: flex; - align-items: center; - justify-content: center; - color: #fff; - font-weight: 700; - font-size: 11px; - letter-spacing: -0.5px; -} - -.home-title { - font-weight: 600; - font-size: 16px; - color: var(--vp-c-text-1); -} - -.home-subtitle { - color: var(--vp-c-text-2); - font-size: 13px; - margin-left: var(--spacing-xs); -} - -.home-nav { - display: flex; - gap: var(--spacing-md); - font-size: 13px; -} - -.home-nav a { - color: var(--vp-c-text-2); - text-decoration: none; - transition: color var(--transition-fast); -} - -.home-nav a:hover { - color: 
var(--vp-c-brand-1); -} - -/* === Homepage: Hero Tech === */ -.home-hero-tech { - padding: var(--spacing-2xl) 0; - margin-bottom: var(--spacing-xl); -} - -.home-hero-tech h1 { - font-size: 36px; - font-weight: 800; - letter-spacing: -1px; - color: var(--vp-c-text-1); - margin: 0 0 var(--spacing-md) 0; - line-height: 1.2; -} - -.home-hero-tech .hero-tagline { - font-size: 18px; - color: var(--vp-c-text-2); - line-height: 1.6; - margin: 0 0 var(--spacing-lg) 0; - max-width: 680px; -} - -.home-hero-tech .hero-actions { - display: flex; - gap: var(--spacing-sm); - flex-wrap: wrap; -} - -.home-hero-tech .hero-actions a { - display: inline-block; - padding: 10px 20px; - border-radius: var(--radius-sm); - font-size: 14px; - font-weight: 500; - text-decoration: none; - transition: all var(--transition-fast); -} - -.home-hero-tech .hero-actions a.primary { - background: var(--vp-c-brand-1); - color: #fff; -} - -.home-hero-tech .hero-actions a.primary:hover { - background: var(--vp-c-brand-2); -} - -.home-hero-tech .hero-actions a.secondary { - border: 1px solid var(--vp-c-border); - color: var(--vp-c-text-1); - background: transparent; -} - -.home-hero-tech .hero-actions a.secondary:hover { - border-color: var(--vp-c-brand-1); - color: var(--vp-c-brand-1); -} - -/* === Homepage: Metrics Bar === */ -.home-metrics { - display: flex; - gap: var(--spacing-lg); - padding: var(--spacing-lg); - background: var(--vp-c-bg-soft); - border: 1px solid var(--vp-c-border); - border-radius: var(--radius-lg); - margin-bottom: var(--spacing-xl); -} - -.home-metric { - flex: 1; - text-align: center; -} - -.home-metric-value { - font-family: 'JetBrains Mono', monospace; - font-size: 28px; - font-weight: 700; - color: var(--vp-c-brand-1); - line-height: 1.2; -} - -.home-metric-label { - font-size: 13px; - color: var(--vp-c-text-3); - margin-top: var(--spacing-xs); -} - -/* === Homepage: Architecture === */ -.home-architecture { - margin-bottom: var(--spacing-xl); -} - -.home-architecture 
.mermaid { - margin: 0; -} - -/* === Homepage: Feature Map === */ -.feature-map { - display: grid; - grid-template-columns: repeat(3, 1fr); - gap: var(--spacing-md); - margin-bottom: var(--spacing-xl); -} - -.feature-card { - background: var(--vp-c-bg); - border: 1px solid var(--vp-c-border); - border-radius: var(--radius-md); - padding: var(--spacing-md); - transition: border-color var(--transition-fast), box-shadow var(--transition-fast); -} - -.feature-card:hover { - border-color: var(--vp-c-brand-1); - box-shadow: var(--spmv-card-shadow), 0 4px 12px rgba(118, 185, 0, 0.08); -} - -.feature-card-title { - font-weight: 600; - font-size: 15px; - color: var(--vp-c-text-1); - margin-bottom: var(--spacing-sm); -} - -.feature-card-desc { - font-size: 13px; - color: var(--vp-c-text-2); - line-height: 1.6; - margin-bottom: var(--spacing-sm); -} - -.feature-tags { - display: flex; - flex-wrap: wrap; - gap: 6px; -} - -.feature-tag { - font-size: 12px; - color: var(--tag-text); - text-decoration: none; - padding: 3px 8px; - background: var(--tag-bg); - border-radius: 4px; - transition: background var(--transition-fast); -} - -.feature-tag:hover { - background: var(--vp-c-brand-soft); -} - -/* === Performance Bars (CSS-based charts) === */ -.perf-bars { - margin: var(--spacing-xl) 0; - display: flex; - flex-direction: column; - gap: var(--spacing-xl); -} - -.perf-bar-group { - padding: var(--spacing-lg); - background: var(--vp-c-bg-soft); - border: 1px solid var(--vp-c-border); - border-radius: var(--radius-lg); -} - -.perf-bar-title { - font-size: 14px; - font-weight: 600; - color: var(--vp-c-text-1); - margin-bottom: var(--spacing-md); -} - -.perf-row { - display: flex; - align-items: center; - gap: var(--spacing-md); - margin-bottom: var(--spacing-sm); -} - -.perf-label { - width: 120px; - font-size: 13px; - color: var(--vp-c-text-2); - flex-shrink: 0; -} - -.perf-bar { - flex: 1; - height: 20px; - background: var(--vp-c-bg); - border-radius: 4px; - position: relative; - 
border: 1px solid var(--vp-c-border); -} - -.perf-bar::after { - content: ''; - position: absolute; - left: 0; - top: 0; - height: 100%; - width: var(--width); - background: var(--vp-c-brand-1); - border-radius: 4px; - transition: width var(--transition-normal); -} - -.perf-value { - width: 50px; - text-align: right; - font-size: 13px; - font-family: 'JetBrains Mono', monospace; - color: var(--vp-c-text-1); - flex-shrink: 0; -} - -/* === Language Selector (root page fallback) === */ -.language-selector { - display: flex; - flex-direction: column; - align-items: center; - justify-content: center; - min-height: calc(100vh - var(--vp-nav-height)); - padding: var(--spacing-2xl); - text-align: center; -} - -.language-selector h1 { - font-size: 48px; - font-weight: 800; - margin-bottom: var(--spacing-md); - background: linear-gradient(135deg, var(--vp-c-brand-1) 0%, var(--spmv-accent) 100%); - -webkit-background-clip: text; - -webkit-text-fill-color: transparent; -} - -.language-selector p { - font-size: 18px; - color: var(--vp-c-text-2); - margin-bottom: var(--spacing-2xl); -} - -.language-cards { - display: grid; - grid-template-columns: repeat(2, 1fr); - gap: var(--spacing-lg); - max-width: 600px; -} - -.language-card { - display: block; - padding: var(--spacing-xl); - background: var(--vp-c-bg-soft); - border: 1px solid var(--vp-c-border); - border-radius: var(--radius-xl); - text-decoration: none; - transition: all var(--transition-normal); -} - -.language-card:hover { - border-color: var(--vp-c-brand-1); - transform: translateY(-4px); - box-shadow: 0 12px 24px -8px rgba(118, 185, 0, 0.2); -} - -.language-card h2 { - font-size: 24px; - font-weight: 700; - color: var(--vp-c-text-1); - margin: 0 0 var(--spacing-sm) 0; - padding: 0; - border: none; -} - -.language-card p { - font-size: 14px; - color: var(--vp-c-text-3); - margin: 0; -} - -/* === Error Page === */ -.error-page h1 { - color: var(--vp-c-text-3); -} - -.error-page .btn { - display: inline-block; - padding: 
8px 20px; - background: var(--vp-c-brand-1); - color: #fff; - border-radius: var(--radius-sm); - text-decoration: none; - font-size: 14px; - font-weight: 500; - transition: background var(--transition-fast); -} - -.error-page .btn:hover { - background: var(--vp-c-brand-2); -} - -/* === Responsive === */ -@media (max-width: 960px) { - .VPHero { - padding: calc(var(--vp-nav-height) + 60px) var(--spacing-lg) 60px; - } - - .VPHero .name { - font-size: 48px; - } - - .VPHero .text { - font-size: 18px; - } - - .VPDoc h1 { - font-size: 32px; - } - - .VPDoc h2 { - font-size: 24px; - } - - .home-metrics { - flex-wrap: wrap; - } - - .home-metric { - min-width: 120px; - } -} - -@media (max-width: 959px) { - .feature-map { - grid-template-columns: repeat(2, 1fr); - } - - .language-cards { - grid-template-columns: 1fr; - } -} - -@media (max-width: 640px) { - .VPHero .name { - font-size: 36px; - letter-spacing: -1px; - } - - .VPFeatures .VPFeature { - padding: var(--spacing-lg); - } - - .feature-map { - grid-template-columns: 1fr; - } - - .home-header { - flex-direction: column; - align-items: flex-start; - gap: var(--spacing-sm); - } - - .home-nav { - width: 100%; - justify-content: flex-start; - } - - .home-hero-tech h1 { - font-size: 28px; - } - - .home-metrics { - flex-direction: column; - gap: var(--spacing-md); - } - - .perf-row { - flex-wrap: wrap; - } - - .perf-label { - width: 100%; - margin-bottom: var(--spacing-xs); - } - - .perf-bar { - flex: 1; - } -} - -/* === Print === */ -@media print { - .VPNav, - .VPSidebar, - .VPFooter, - .home-header, - .home-nav { - display: none; - } - - .VPDoc { - padding: 0; - } - - .VPDoc .content { - max-width: 100%; - } - - .VPDoc a { - border-bottom: none; - } - - .VPDoc code { - background: transparent; - border: 1px solid #ddd; - } -} +@import './styles/tokens.css'; +@import './styles/base.css'; +@import './styles/home.css'; +@import './styles/paper.css'; +@import './styles/citation.css'; +@import './styles/diagram.css'; diff --git 
a/docs/.vitepress/theme/styles/base.css b/docs/.vitepress/theme/styles/base.css new file mode 100644 index 0000000..0fb1757 --- /dev/null +++ b/docs/.vitepress/theme/styles/base.css @@ -0,0 +1,67 @@ +.spmv-layout-top { + position: fixed; + inset: 0 0 auto 0; + height: 420px; + pointer-events: none; + background: + radial-gradient(circle at top left, rgba(118, 185, 0, 0.14), transparent 30%), + radial-gradient(circle at top right, rgba(0, 212, 170, 0.12), transparent 28%); + z-index: 0; +} + +.Layout, +.VPContent, +.VPDoc { + position: relative; + z-index: 1; +} + +.VPContent.is-home { + background: + linear-gradient(180deg, color-mix(in srgb, var(--spmv-surface-2) 88%, transparent) 0%, transparent 100%); +} + +.vp-doc h1, +.vp-doc h2, +.vp-doc h3 { + letter-spacing: -0.02em; +} + +.vp-doc p, +.vp-doc li { + color: var(--spmv-ink-2); +} + +.spmv-card-grid { + display: grid; + gap: var(--spmv-grid-gap); +} + +.spmv-card-grid.cols-2 { + grid-template-columns: repeat(auto-fit, minmax(260px, 1fr)); +} + +.spmv-card-grid.cols-3 { + grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); +} + +.spmv-surface-card { + background: color-mix(in srgb, var(--spmv-surface-1) 92%, transparent); + border: 1px solid var(--spmv-border); + border-radius: var(--spmv-radius-lg); + box-shadow: var(--spmv-shadow-md); +} + +.spmv-eyebrow { + display: inline-flex; + align-items: center; + gap: 8px; + padding: 8px 12px; + border-radius: 999px; + background: rgba(118, 185, 0, 0.12); + color: var(--vp-c-brand-1); + font-size: 12px; + font-weight: 700; + letter-spacing: 0.08em; + text-transform: uppercase; +} diff --git a/docs/.vitepress/theme/styles/citation.css b/docs/.vitepress/theme/styles/citation.css new file mode 100644 index 0000000..9a84fea --- /dev/null +++ b/docs/.vitepress/theme/styles/citation.css @@ -0,0 +1,25 @@ +.spmv-citation-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(260px, 1fr)); + gap: 18px; + margin: 24px 0; +} + +.spmv-citation-card { + 
padding: 22px; +} + +.spmv-citation-card h3 { + margin: 0 0 10px; + font-size: 1.05rem; + color: var(--spmv-ink-1); +} + +.spmv-citation-card p { + margin: 0 0 10px; + font-size: 0.95rem; +} + +.spmv-citation-card a { + font-weight: 700; +} diff --git a/docs/.vitepress/theme/styles/diagram.css b/docs/.vitepress/theme/styles/diagram.css new file mode 100644 index 0000000..ed3a894 --- /dev/null +++ b/docs/.vitepress/theme/styles/diagram.css @@ -0,0 +1,54 @@ +.spmv-architecture-canvas, +.theme-aware-art { + width: 100%; + display: block; +} + +.spmv-architecture-shell { + padding: 22px; +} + +.spmv-node { + fill: color-mix(in srgb, var(--spmv-surface-1) 94%, transparent); + stroke: var(--spmv-border); + stroke-width: 1.5; +} + +.spmv-node-text { + fill: var(--spmv-ink-1); + font: 700 14px Inter, sans-serif; +} + +.spmv-node-caption { + fill: var(--spmv-ink-3); + font: 500 12px Inter, sans-serif; +} + +.spmv-link { + stroke: color-mix(in srgb, var(--spmv-brand-1) 55%, var(--spmv-border)); + stroke-width: 2; + fill: none; +} + +.theme-aware-art .art-bg { + fill: color-mix(in srgb, var(--spmv-surface-2) 92%, transparent); + stroke: var(--spmv-border); +} + +.theme-aware-art .art-accent-1 { + fill: var(--spmv-brand-1); +} + +.theme-aware-art .art-accent-2 { + fill: var(--spmv-brand-2); +} + +.theme-aware-art .art-title { + fill: var(--spmv-ink-1); + font: 700 24px Inter, sans-serif; +} + +.theme-aware-art .art-caption { + fill: var(--spmv-ink-2); + font: 500 14px Inter, sans-serif; +} diff --git a/docs/.vitepress/theme/styles/home.css b/docs/.vitepress/theme/styles/home.css new file mode 100644 index 0000000..6810239 --- /dev/null +++ b/docs/.vitepress/theme/styles/home.css @@ -0,0 +1,89 @@ +.spmv-hero { + display: grid; + grid-template-columns: minmax(0, 1.2fr) minmax(320px, 0.8fr); + gap: 32px; + align-items: stretch; + margin: 24px 0 48px; +} + +.spmv-hero-copy, +.spmv-hero-art { + padding: 32px; +} + +.spmv-hero-copy h1 { + margin: 18px 0 16px; + font-size: 
clamp(2.4rem, 5vw, 4.3rem); + line-height: 1.02; + color: var(--spmv-ink-1); +} + +.spmv-hero-copy p { + margin: 0; + font-size: 1.08rem; + line-height: 1.75; +} + +.spmv-hero-actions { + display: flex; + flex-wrap: wrap; + gap: 14px; + margin-top: 28px; +} + +.spmv-hero-actions a { + display: inline-flex; + align-items: center; + justify-content: center; + min-width: 160px; + padding: 13px 18px; + border-radius: 999px; + border: 1px solid var(--spmv-border); + font-weight: 700; + text-decoration: none; + transition: transform 0.2s ease, border-color 0.2s ease; +} + +.spmv-hero-actions a.primary { + background: linear-gradient(135deg, var(--spmv-brand-1), var(--spmv-brand-2)); + color: #ffffff; + border-color: transparent; +} + +.spmv-hero-actions a.secondary { + color: var(--spmv-ink-1); +} + +.spmv-hero-actions a:hover { + transform: translateY(-1px); + border-color: color-mix(in srgb, var(--spmv-brand-1) 50%, var(--spmv-border)); +} + +.spmv-metric-strip { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(170px, 1fr)); + gap: 16px; + margin-top: 24px; +} + +.spmv-metric-card { + padding: 20px; +} + +.spmv-metric-value { + font-size: 1.8rem; + font-weight: 800; + color: var(--spmv-ink-1); +} + +.spmv-metric-label { + margin-top: 8px; + font-size: 0.94rem; + color: var(--spmv-ink-2); +} + +@media (max-width: 960px) { + .spmv-hero { + grid-template-columns: 1fr; + } +} diff --git a/docs/.vitepress/theme/styles/paper.css b/docs/.vitepress/theme/styles/paper.css new file mode 100644 index 0000000..76b313e --- /dev/null +++ b/docs/.vitepress/theme/styles/paper.css @@ -0,0 +1,44 @@ +.spmv-section { + margin: 36px 0; + padding: 28px; +} + +.spmv-section h2 { + margin: 14px 0 12px; + font-size: 1.85rem; + color: var(--spmv-ink-1); +} + +.spmv-section p { + margin: 0; +} + +.spmv-callout { + margin: 24px 0; + padding: 20px 22px; + border-radius: var(--spmv-radius-md); + border: 1px solid var(--spmv-border); + background: color-mix(in srgb, 
var(--spmv-surface-1) 92%, transparent); +} + +.spmv-callout[data-tone='success'] { + border-color: rgba(118, 185, 0, 0.35); + background: rgba(118, 185, 0, 0.08); +} + +.spmv-callout[data-tone='info'] { + border-color: rgba(0, 212, 170, 0.3); + background: rgba(0, 212, 170, 0.08); +} + +.spmv-callout[data-tone='warning'] { + border-color: rgba(251, 191, 36, 0.35); + background: rgba(251, 191, 36, 0.08); +} + +.spmv-callout-title { + margin: 0 0 8px; + font-size: 1rem; + font-weight: 800; + color: var(--spmv-ink-1); +} diff --git a/docs/.vitepress/theme/styles/tokens.css b/docs/.vitepress/theme/styles/tokens.css new file mode 100644 index 0000000..276554f --- /dev/null +++ b/docs/.vitepress/theme/styles/tokens.css @@ -0,0 +1,40 @@ +:root { + --vp-c-brand-1: #76b900; + --vp-c-brand-2: #5a8f00; + --vp-c-brand-3: #93d522; + --vp-c-brand-soft: rgba(118, 185, 0, 0.14); + + --spmv-brand-1: #76b900; + --spmv-brand-2: #00d4aa; + --spmv-ink-1: #0f172a; + --spmv-ink-2: #334155; + --spmv-ink-3: #64748b; + --spmv-surface-1: #ffffff; + --spmv-surface-2: #f8fafc; + --spmv-surface-3: #eef2f7; + --spmv-border: rgba(15, 23, 42, 0.12); + --spmv-shadow-lg: 0 24px 64px rgba(15, 23, 42, 0.12); + --spmv-shadow-md: 0 12px 30px rgba(15, 23, 42, 0.08); + --spmv-radius-xl: 24px; + --spmv-radius-lg: 20px; + --spmv-radius-md: 16px; + --spmv-radius-sm: 12px; + --spmv-max-width: 1240px; + --spmv-grid-gap: 24px; +} + +.dark { + --vp-c-brand-1: #93d522; + --vp-c-brand-2: #76b900; + --vp-c-brand-3: #5a8f00; + + --spmv-ink-1: #f8fafc; + --spmv-ink-2: #cbd5e1; + --spmv-ink-3: #94a3b8; + --spmv-surface-1: #020617; + --spmv-surface-2: #0f172a; + --spmv-surface-3: #162033; + --spmv-border: rgba(148, 163, 184, 0.2); + --spmv-shadow-lg: 0 24px 64px rgba(2, 6, 23, 0.5); + --spmv-shadow-md: 0 12px 30px rgba(2, 6, 23, 0.35); +} diff --git a/docs/en/architecture/execution-pipeline.md b/docs/en/architecture/execution-pipeline.md new file mode 100644 index 0000000..892f585 --- /dev/null +++ 
b/docs/en/architecture/execution-pipeline.md @@ -0,0 +1,28 @@ +# Execution Pipeline + +## Why this deserves its own page + +GPU SpMV is not just “launch a kernel.” The real engineering story is **how the matrix is analyzed, how kernel choice is made, how execution context is reused, and how the result is interpreted with confidence**. + +## Pipeline Breakdown + +1. **Input stage**: load CSR / ELL data structures and prepare the input vector. +2. **Analysis stage**: compute `avg_nnz_per_row`, skewness, and row distribution characteristics. +3. **Decision stage**: choose Scalar CSR, Vector CSR, Merge Path, or ELL. +4. **Execution stage**: launch the GPU kernel and record timing / bandwidth metrics. +5. **Validation stage**: compare against CPU reference behavior or established baselines. + +## Key Decisions + +| Observation | Decision | +|:------------|:---------| +| `avg_nnz_per_row < 4` | Scalar CSR to avoid wasting warp-scale resources | +| Rows are uniform and low-skew | Vector CSR for stronger warp collaboration | +| Row lengths are highly skewed | Merge Path to prioritize load balance | +| Row width is nearly fixed | ELL kernel to prioritize coalesced access | + +## Read this together with + +- [Kernel Selection](/en/architecture/kernel-selection) +- [Memory Layout](/en/architecture/memory-layout) +- [Performance Methodology](/en/performance/methodology) diff --git a/docs/en/architecture/overview.md b/docs/en/architecture/overview.md index d357561..9c1a962 100644 --- a/docs/en/architecture/overview.md +++ b/docs/en/architecture/overview.md @@ -1,6 +1,6 @@ # Architecture Overview -GPU SpMV uses a layered architecture design with clear separation of storage, computation, and application layers. +The architectural story of GPU SpMV is not just “what modules exist,” but **how matrix statistics, kernel choice, execution context, and validation fit together into an explainable engineering system**. 
## System Architecture @@ -84,8 +84,16 @@ Applications built on SpMV: - **Graph Neural Networks** — Sparse graph convolution - **Scientific Computing** — FEM, CFD +## The three most important ideas on this page + +1. **How data flows** from sparse input to validated output. +2. **Why automatic selection is justified** by `avg_nnz_per_row` and skewness rather than opaque tuning. +3. **Why the system is trustworthy** thanks to resource management, semantic errors, CPU reference paths, and property tests. + ## Related Documentation - [Kernel Selection](/en/architecture/kernel-selection) +- [Execution Pipeline](/en/architecture/execution-pipeline) - [Memory Layout](/en/architecture/memory-layout) +- [Reliability Constraints](/en/architecture/reliability) - [Spec-Driven Development](/en/architecture/spec-driven) diff --git a/docs/en/architecture/reliability.md b/docs/en/architecture/reliability.md new file mode 100644 index 0000000..4020d05 --- /dev/null +++ b/docs/en/architecture/reliability.md @@ -0,0 +1,25 @@ +# Reliability and Engineering Constraints + +## Where reliability comes from + +Reliability in this project is not “it seems to run.” It comes from three lines of evidence working together: + +1. **Explicit resource lifetime** through `CudaBuffer` and execution-context abstractions instead of raw `cudaMalloc` / `cudaFree`. +2. **Explicit error semantics** through `SpMVError` and CUDA checking macros. +3. **Spec and test closure** through OpenSpec requirements and property-test coverage. + +## Why this matters for a showcase project + +Interviewers and open-source readers trust a project more when it can answer: + +- How are resources released? +- How are failures surfaced? +- How should benchmark evidence be interpreted? +- How are design changes traced? 
+ +## Reliability Checklist + +- RAII wrappers for GPU resources +- CPU reference paths for cross-checking +- Property tests with at least 100 iterations +- GitHub Pages used to publish architecture and references, not only marketing copy diff --git a/docs/en/architecture/spec-driven.md b/docs/en/architecture/spec-driven.md index 8593ee9..0c14fc5 100644 --- a/docs/en/architecture/spec-driven.md +++ b/docs/en/architecture/spec-driven.md @@ -89,5 +89,5 @@ Demonstrating Spec-Driven Development in interviews: ## References -- [OpenSpec Specs](https://github.com/LessUp/gpu-spmv/tree/main/openspec) +- [OpenSpec Specs](https://github.com/AICL-Lab/gpu-spmv/tree/main/openspec) - [Architecture Overview](/en/architecture/overview) \ No newline at end of file diff --git a/docs/en/changelog.md b/docs/en/changelog.md index c8c2e74..35a0798 100644 --- a/docs/en/changelog.md +++ b/docs/en/changelog.md @@ -46,7 +46,7 @@ This is the first stable release of GPU SpMV, featuring complete CSR and ELL for - Doxygen-compatible documentation #### Documentation -- Full documentation site at https://lessup.github.io/gpu-spmv/ +- Full documentation site at https://aicl-lab.github.io/gpu-spmv/ - Bilingual README (English and Chinese) - API reference, performance guide, and code examples - Architecture documentation and design decision records @@ -144,5 +144,5 @@ No breaking changes from pre-release versions. The API is now stable. 
--- -[1.0.0]: https://github.com/LessUp/gpu-spmv/releases/tag/v1.0.0 -[0.1.0]: https://github.com/LessUp/gpu-spmv/tree/7d6dd0c +[1.0.0]: https://github.com/AICL-Lab/gpu-spmv/releases/tag/v1.0.0 +[0.1.0]: https://github.com/AICL-Lab/gpu-spmv/tree/7d6dd0c diff --git a/docs/en/citation.md b/docs/en/citation.md index b818426..f87ce71 100644 --- a/docs/en/citation.md +++ b/docs/en/citation.md @@ -6,11 +6,11 @@ If you use GPU SpMV in your research, please cite: ```bibtex @software{gpu_spmv_2026, - author = {LessUp}, + author = {AICL-Lab}, title = {GPU SpMV: High-Performance CUDA Sparse Matrix-Vector Multiplication}, year = {2026}, publisher = {GitHub}, - url = {https://github.com/LessUp/gpu-spmv}, + url = {https://github.com/AICL-Lab/gpu-spmv}, version = {1.0.0} } ``` @@ -18,61 +18,14 @@ If you use GPU SpMV in your research, please cite: ## Text Format ``` -LessUp. GPU SpMV: High-Performance CUDA Sparse Matrix-Vector Multiplication. -GitHub repository, 2026. https://github.com/LessUp/gpu-spmv +AICL-Lab. GPU SpMV: High-Performance CUDA Sparse Matrix-Vector Multiplication. +GitHub repository, 2026. https://github.com/AICL-Lab/gpu-spmv ``` --- -## Related Publications +## Usage Guidance -The algorithms implemented in this library are based on the following research: - -### Merge Path Algorithm - -1. **Merrill, D., & Garland, M. (2016)**. Merge-based parallel sparse matrix-vector multiplication. *Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '16)*. IEEE. - - ::: tip Key Contribution - The Merge Path algorithm enables perfect load balancing for irregular sparse matrices by partitioning work based on the merge operation between row pointers and work indices. - ::: - -### Vectorized CSR - -2. **Bell, N., & Garland, M. (2009)**. Implementing sparse matrix-vector multiplication on throughput-oriented processors. *Proceedings of SC '09*. IEEE. - -3. **Bell, N., Dalton, S., & Olson, L. N. (2012)**. 
Exposing fine-grained parallelism in algebraic multigrid methods. *SIAM Journal on Scientific Computing*, 34(4), C170-C194. - -### ELL Format - -4. **Vázquez, F., Fernández, J. J., & Garzón, E. M. (2011)**. Automatic tuning of the sparse matrix vector product on GPUs based on the ELL-R-T format. *Concurrency and Computation: Practice and Experience*, 24(1), 1-20. - ---- - -## Algorithm References - -| Algorithm | Reference | Key Idea | -|:----------|:----------|:---------| -| Scalar CSR | Bell & Garland (2009) | One thread per row | -| Vector CSR | Bell & Garland (2009) | One warp per row | -| Merge Path | Merrill & Garland (2016) | Merge-based partitioning | -| ELL Kernel | Vázquez et al. (2011) | Column-major coalesced access | - ---- - -## Benchmark Methodology - -Our benchmark methodology follows best practices from: - -- **SPAPT Benchmark Suite**: Standardized performance assessment for sparse computations -- **SuiteSparse Matrix Collection**: Real-world test matrices -- **GPU Performance Metrics**: Memory bandwidth utilization as primary metric - ---- - -## Acknowledgments - -This library builds upon the excellent work of the CUDA ecosystem: - -- NVIDIA cuSPARSE for reference implementations -- Thrust library for parallel primitives -- Google Test for testing infrastructure \ No newline at end of file +- Use the software citation above when citing the **repository itself**. +- Also cite the relevant papers from [References](/en/references) when discussing the **algorithms behind the implementation**. +- If you cite performance charts or benchmark claims, mention the hardware and link back to [Performance Methodology](/en/performance/methodology). diff --git a/docs/en/contributing.md b/docs/en/contributing.md index 3f7fb89..eec5015 100644 --- a/docs/en/contributing.md +++ b/docs/en/contributing.md @@ -14,7 +14,7 @@ Thank you for your interest in contributing to GPU SpMV! 
### Clone and Build ```bash -git clone https://github.com/LessUp/gpu-spmv.git +git clone https://github.com/AICL-Lab/gpu-spmv.git cd gpu-spmv cmake --preset default cmake --build --preset default @@ -79,7 +79,7 @@ npm run dev ## Getting Help -- Open an [Issue](https://github.com/LessUp/gpu-spmv/issues) +- Open an [Issue](https://github.com/AICL-Lab/gpu-spmv/issues) - Check existing documentation - Review OpenSpec specs diff --git a/docs/en/faq.md b/docs/en/faq.md index f4f199d..b53b81b 100644 --- a/docs/en/faq.md +++ b/docs/en/faq.md @@ -198,4 +198,4 @@ If the above doesn't answer your question: 1. Check [API Reference](/en/api/spmv) for detailed usage 2. Check [Optimization Guide](/en/performance/optimization-guide) for performance tips -3. Ask on [GitHub Issues](https://github.com/LessUp/gpu-spmv/issues) +3. Ask on [GitHub Issues](https://github.com/AICL-Lab/gpu-spmv/issues) diff --git a/docs/en/index.md b/docs/en/index.md index 0b68138..7e53728 100644 --- a/docs/en/index.md +++ b/docs/en/index.md @@ -1,134 +1,53 @@ --- layout: home +title: GPU SpMV Technical Whitepaper --- -
-
- -
- GPU SpMV - Technical Whitepaper -
-
- -
- -
-

Production-Grade CUDA Sparse Matrix-Vector Multiplication

-

- High-performance SpMV achieving 70%+ theoretical memory bandwidth on modern NVIDIA GPUs. - 4 adaptive kernels, intelligent selection algorithm, comprehensive API. -

- -
- -
-
-
70%+
-
Bandwidth Utilization
-
-
-
4
-
Adaptive Kernels
-
-
-
CSR+ELL
-
Sparse Formats
-
-
-
100+
-
Test Cases
-
-
- -## Architecture Overview - -
- -```mermaid -flowchart LR - Input[Sparse Matrix] --> Analysis[Matrix Analysis] - Analysis --> Decision{Auto Select} - Decision -->|avg_nnz < 4| Scalar[Scalar CSR] - Decision -->|uniform rows| Vector[Vector CSR] - Decision -->|high skew| Merge[Merge Path] - Decision -->|column-major| ELL[ELL Kernel] - Scalar --> GPU[GPU Execution] - Vector --> GPU - Merge --> GPU - ELL --> GPU - GPU --> Result[Result Vector] -``` - -
- -## Technical Features - -
-
-
Kernel Selection Strategy
-
- Automatic kernel selection based on matrix characteristics: avg_nnz, row length skewness. -
-
- Details -
-
- -
-
Merge Path Algorithm
-
- Perfect load balancing for irregular sparsity patterns. O(nnz + m) work decomposition. -
- -
- -
-
Production Quality
-
- RAII resource management, semantic error codes, CudaBuffer abstraction, cross-platform. -
-
- API -
-
- -
-
Spec-Driven Development
-
- OpenSpec specification-driven workflow. Design decisions traceable, documentation as code. -
-
- Workflow -
-
- -
-
Academic Rigor
-
- Complete academic citation support, BibTeX format, related paper references. -
-
- Citation -
-
- -
-
Quick Start
-
- git clone https://github.com/LessUp/gpu-spmv.git -
-
- Guide -
-
-
+ + + + + + + + + + + +
+
+

Performance-first

+

Kernel choice, irregular sparsity behavior, and bandwidth utilization are presented as explicit decisions.

+
+
+

Engineering clarity

+

The execution pipeline, memory layout, reliability story, and spec-driven workflow are all visible.

+
+
+

Interview-ready narrative

+

A reviewer can understand the value proposition, evidence chain, and reading path directly from the site.

+
+
+
diff --git a/docs/en/performance/benchmarks.md b/docs/en/performance/benchmarks.md index d22e756..8a5a022 100644 --- a/docs/en/performance/benchmarks.md +++ b/docs/en/performance/benchmarks.md @@ -1,6 +1,12 @@ # Benchmarks -GPU SpMV performance test results on NVIDIA RTX 3090. + + +This benchmark page is not only a table of numbers. Its purpose is to explain **what these results actually mean and how they should be interpreted**. + + ## Test Environment @@ -88,29 +94,15 @@ SpMV is memory bandwidth bound. Our implementation achieves 70%+ of theoretical - **Ampere (SM 8.6)**: Best performance - **Hopper (SM 9.0)**: Full support -## Benchmark Method - -```cpp -#include - -int main() { - CSRMatrix* csr = /* ... */; - csr_to_gpu(csr); - - // Multiple runs for average - BenchmarkResult result = benchmark_spmv(csr, 100); - - printf("Avg time: %.3f ms\n", result.avg_ms); - printf("Min time: %.3f ms\n", result.min_ms); - printf("Max time: %.3f ms\n", result.max_ms); - printf("Stddev: %.3f ms\n", result.stddev_ms); - printf("Bandwidth: %.1f GB/s\n", result.bandwidth_gb_s); +## How to read these results - return 0; -} -``` +- **70%+ utilization** means the implementation is approaching a sensible memory-bound ceiling. +- **ELL winning on regular patterns** does not mean it should be used universally; applicability and conversion cost still matter. +- **Merge Path staying ahead on skewed matrices** is evidence that load balancing is the dominant concern there. +- **The selector matters** because it turns those judgments into default behavior instead of a manual tuning burden. 
## References +- [Performance Methodology](/en/performance/methodology) - [Optimization Guide](/en/performance/optimization-guide) -- [Kernel Selection](/en/architecture/kernel-selection) \ No newline at end of file +- [Kernel Selection](/en/architecture/kernel-selection) diff --git a/docs/en/performance/methodology.md b/docs/en/performance/methodology.md new file mode 100644 index 0000000..2ad695c --- /dev/null +++ b/docs/en/performance/methodology.md @@ -0,0 +1,19 @@ +# Performance Methodology + +## Read the method before the numbers + +Benchmark numbers are not persuasive on their own. This page explains **under what conditions the measurements make sense, how they should be read, and which conclusions are safe to draw**. + +## Measurement Assumptions + +- GPU: NVIDIA RTX 3090 (Ampere) +- Peak bandwidth: 936 GB/s +- Primary metrics: time, bandwidth, utilization, variance +- Main comparison: kernel choice across different sparsity patterns + +## Recommended Reading Order + +1. **Look for trends, not only peaks**: does the implementation stay near 70%+ utilization consistently? +2. **Read matrix pattern together with kernel choice**: regular and highly skewed matrices should not be judged the same way. +3. **Check whether the selector is explainable**: does the chosen kernel match the matrix statistics? +4. **Look at variance**: a high average with unstable spread is weaker evidence. diff --git a/docs/en/quickstart.md b/docs/en/quickstart.md index 1a409da..7f9512d 100644 --- a/docs/en/quickstart.md +++ b/docs/en/quickstart.md @@ -24,7 +24,7 @@ nvidia-smi ### 1. Clone Repository ```bash -git clone https://github.com/LessUp/gpu-spmv.git +git clone https://github.com/AICL-Lab/gpu-spmv.git cd gpu-spmv ``` diff --git a/docs/en/references.md b/docs/en/references.md index 8b18a4e..1ed799a 100644 --- a/docs/en/references.md +++ b/docs/en/references.md @@ -1,78 +1,21 @@ -# Academic References +# References -GPU SpMV is built upon the following academic research. 
+ -## Core Algorithms +This page separates papers, comparable projects, and follow-up reading so readers can understand **what this project learned from and what ecosystem it belongs to**. -### Merge-based Parallel SpMV +## Core Papers -> Merrill, D., & Garland, M. (2016). **Merge-based parallel sparse matrix-vector multiplication**. *ACM SIGPLAN Notices*, 51(8), 12-21. + -- **Contribution**: Proposed the Merge Path algorithm for perfect load balancing -- **Applied to**: `MERGE_PATH` kernel -- [DOI: 10.1145/3016078.285114](https://doi.org/10.1145/3016078.285114) +## Representative Projects -### Bell & Garland Survey + -> Bell, N., & Garland, M. (2009). **Implementing sparse matrix-vector multiplication on throughput-oriented processors**. *SC'09: Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis*. +## How to read these references -- **Contribution**: CSR vs ELL format performance analysis, foundational GPU SpMV theory -- **Applied to**: `VECTOR_CSR`, `ELL_KERNEL` design -- [DOI: 10.1145/1654059.1654121](https://doi.org/10.1145/1654059.1654121) - -### CSR5 Format - -> Liu, Y., & Vuduc, R. (2018). **An adaptive algorithm for sparse matrix-vector multiplication on GPUs**. *IEEE Transactions on Parallel and Distributed Systems*. - -- **Contribution**: CSR5 format with adaptive load balancing -- **Reference**: Understanding load distribution in irregular sparse matrices - -## GPU Computing - -### CUDA Best Practices - -> NVIDIA. (2024). **CUDA C++ Best Practices Guide**. - -- **Reference**: Memory coalescing, texture cache, warp synchronization -- [Link](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/) - -### CUDA Programming Guide - -> NVIDIA. (2024). **CUDA C++ Programming Guide**. - -- **Reference**: CUDA execution model, memory hierarchy -- [Link](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) - -## PageRank - -> Page, L., Brin, S., Motwani, R., & Winograd, T. (1999). 
**The PageRank citation ranking: Bringing order to the web**. *Stanford InfoLab*. - -- **Contribution**: Original PageRank algorithm -- **Applied to**: `pagerank()` implementation - -## Related Projects - -| Project | Stars | Description | Key Takeaway | -|:--------|:-----:|:------------|:-------------| -| [Ginkgo](https://github.com/ginkgo-project/ginkgo) | 597 | High-performance linear algebra | Performance visualization | -| [cuSPARSE](https://docs.nvidia.com/cuda/cusparse/) | N/A | NVIDIA official library | Performance baseline | -| [SuiteSparse](https://github.com/DrTimothyAldenDavis/SuiteSparse) | 947 | Sparse matrix collection | Standard test data | -| [Kokkos Kernels](https://github.com/kokkos/kokkos-kernels) | 300+ | Multi-backend sparse BLAS | Performance portability | - -## Cite This Project - -```bibtex -@software{gpuspmv2024, - author = {LessUp}, - title = {GPU SpMV: High-Performance CUDA Sparse Matrix-Vector Multiplication}, - year = {2024}, - url = {https://github.com/LessUp/gpu-spmv} -} -``` - -## Further Reading - -1. **GPU Architecture**: Understanding GPU memory hierarchy and execution model -2. **Sparse Matrix Formats**: Trade-offs between different formats -3. **Load Balancing**: Techniques for parallel load balancing -4. **Memory Coalescing**: GPU memory access optimization +1. Start with **Bell & Garland** for the classic GPU SpMV framing. +2. Read **Merrill & Garland** to understand why Merge Path matters for irregular work distribution. +3. Compare against **cuSPARSE / Ginkgo / SuiteSparse** to place this project inside the real sparse-computing ecosystem. diff --git a/docs/en/whitepaper/index.md b/docs/en/whitepaper/index.md index 705135d..9906b3d 100644 --- a/docs/en/whitepaper/index.md +++ b/docs/en/whitepaper/index.md @@ -1,100 +1,28 @@ -# Technical Whitepaper +# GPU SpMV: Read the project as an engineering artifact -## Executive Summary + +This site is written for interviewers, open-source readers, and performance engineers. 
The whitepaper landing page leads with conclusions, then points to the design decisions and evidence chain behind them. + -GPU SpMV is a **production-grade CUDA library** implementing high-performance sparse matrix-vector multiplication (SpMV), achieving **70%+ of theoretical memory bandwidth** on modern NVIDIA GPUs. +## Why this project deserves a whitepaper -### Key Contributions +- SpMV is a classic **memory-bandwidth-bound** workload, so performance depends more on access patterns than on raw arithmetic throughput. +- The interesting part is not only which kernels exist, but **why one is chosen, when it is chosen, and how that choice is justified**. +- This project combines CUDA performance work with RAII resource management, explicit error handling, spec-driven development, and readable documentation. -| Contribution | Impact | -|:-------------|:-------| -| **4 Optimized Kernels** | Adaptive kernel selection based on matrix characteristics | -| **Merge Path Algorithm** | Perfect load balancing for irregular sparsity patterns | -| **ELL Column-Major Layout** | Fully coalesced memory access for uniform matrices | -| **Spec-Driven Development** | Complete design decision traceability | +## What this whitepaper is meant to answer -### Performance Highlights +1. Why the problem matters and where the real bottlenecks are. +2. What each optimized kernel and the selector are responsible for. +3. How performance, engineering discipline, and explainability are tied together. +4. Where to continue reading for architecture, API usage, performance interpretation, and references. 
-| Matrix Size | Non-zeros | Kernel | Bandwidth Utilization | -|:-----------:|:---------:|:-------|:---------------------:| -| 10K × 10K | 500K | Vector CSR | **70.2%** | -| 100K × 100K | 5M | Merge Path | **71.5%** | -| 1M × 1M | 50M | Merge Path | **70.8%** | +## Reading Path -::: info Benchmark Environment -NVIDIA RTX 3090 (Ampere architecture, theoretical bandwidth: 936 GB/s) -::: - -### Target Audience - -- **Systems Architects**: Designing GPU-accelerated sparse computations -- **HPC Engineers**: Optimizing memory-bound workloads -- **Researchers**: Requiring reproducible, well-documented baselines -- **Application Developers**: Building graph algorithms, iterative solvers - -### Document Structure - -| Section | Purpose | -|:--------|:--------| -| [Design Philosophy](/en/whitepaper/philosophy) | Architectural principles and trade-offs | -| [Performance Analysis](/en/whitepaper/performance) | Detailed benchmark methodology and results | -| [Architecture Overview](/en/architecture/overview) | System design documentation | -| [API Reference](/en/api/spmv) | Complete API documentation | - ---- - -## Why SpMV Matters - -Sparse matrix-vector multiplication (SpMV) is a fundamental operation in: - -- **Graph Analytics**: PageRank, community detection, shortest path -- **Scientific Computing**: Finite element analysis, CFD, iterative solvers -- **Machine Learning**: Sparse neural networks, recommendation systems - -SpMV is inherently **memory-bound** — each non-zero element requires reading matrix data, column indices, and vector values, with minimal computation. Achieving high bandwidth utilization is the primary optimization challenge. 
- ---- - -## Design Overview - -```mermaid -flowchart TB - subgraph Input["Input"] - Matrix[Sparse Matrix] - Vector[Dense Vector] - end - - subgraph Analysis["Matrix Analysis"] - NNZ[avg_nnz per row] - Skew[Skewness] - Pattern[Distribution Pattern] - end - - subgraph Selection["Kernel Selection"] - Decision{Auto Select} - Scalar[Scalar CSR
avg_nnz < 4] - Vector[Vector CSR
uniform rows] - Merge[Merge Path
high skewness] - ELL[ELL Kernel
column-major] - end - - subgraph Execution["GPU Execution"] - Compute[SpMV Computation] - Result[Result Vector] - end - - Matrix --> Analysis - Vector --> Execution - Analysis --> Decision - Decision --> Scalar - Decision --> Vector - Decision --> Merge - Decision --> ELL - Scalar --> Compute - Vector --> Compute - Merge --> Compute - ELL --> Compute - Compute --> Result -``` - -The library automatically selects the optimal kernel based on matrix characteristics, ensuring near-peak performance across diverse sparsity patterns. \ No newline at end of file +| Page | Role | +|:-----|:-----| +| [Design Philosophy](/en/whitepaper/philosophy) | See the architectural priorities and trade-offs | +| [Performance Analysis](/en/whitepaper/performance) | Learn how to interpret the benchmark evidence | +| [Architecture Overview](/en/architecture/overview) | Understand the execution pipeline and module boundaries | +| [API Reference](/en/api/spmv) | Inspect the external interface | +| [References](/en/references) | Review papers, projects, and further reading | diff --git a/docs/en/whitepaper/performance.md b/docs/en/whitepaper/performance.md index 30087b6..3a4c267 100644 --- a/docs/en/whitepaper/performance.md +++ b/docs/en/whitepaper/performance.md @@ -205,7 +205,7 @@ To reproduce these benchmarks: ```bash # Clone and build -git clone https://github.com/LessUp/gpu-spmv.git +git clone https://github.com/AICL-Lab/gpu-spmv.git cd gpu-spmv cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Release cmake --build build diff --git a/docs/public/images/brand/logo-mark-dark.svg b/docs/public/images/brand/logo-mark-dark.svg new file mode 100644 index 0000000..e81f001 --- /dev/null +++ b/docs/public/images/brand/logo-mark-dark.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/docs/public/images/brand/logo-mark-light.svg b/docs/public/images/brand/logo-mark-light.svg new file mode 100644 index 0000000..6c2ce6f --- /dev/null +++ b/docs/public/images/brand/logo-mark-light.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/docs/public/images/favicon.svg b/docs/public/images/favicon.svg index c07e335..c5890bc 100644 --- a/docs/public/images/favicon.svg +++ b/docs/public/images/favicon.svg @@ -6,5 +6,5 @@ - Sp + Sp diff --git a/docs/public/images/logo.svg b/docs/public/images/logo.svg index ce34797..38dcc5a 100644 --- a/docs/public/images/logo.svg +++ b/docs/public/images/logo.svg @@ -8,5 +8,5 @@ - Sp + Sp diff --git a/docs/public/images/og-image.svg b/docs/public/images/og-image.svg index 0fd61b2..0f324bf 100644 --- a/docs/public/images/og-image.svg +++ b/docs/public/images/og-image.svg @@ -81,6 +81,6 @@ - github.com/LessUp/gpu-spmv + github.com/AICL-Lab/gpu-spmv - \ No newline at end of file + diff --git a/docs/public/images/social/og-dark.svg b/docs/public/images/social/og-dark.svg new file mode 100644 index 0000000..4c5662f --- /dev/null +++ b/docs/public/images/social/og-dark.svg @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + Sp + GPU SpMV + Technical whitepaper, architecture, and evidence-driven documentation. 
+ + + + 70%+ + Bandwidth utilization + 4 + Adaptive kernels + Docs + Code + Interview-ready showcase + github.com/AICL-Lab/gpu-spmv + diff --git a/docs/public/images/social/og-light.svg b/docs/public/images/social/og-light.svg new file mode 100644 index 0000000..8e54e94 --- /dev/null +++ b/docs/public/images/social/og-light.svg @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + Sp + GPU SpMV + Technical whitepaper, architecture, and evidence-driven documentation. + + + + 70%+ + Bandwidth utilization + 4 + Adaptive kernels + Docs + Code + Interview-ready showcase + github.com/AICL-Lab/gpu-spmv + diff --git a/docs/scripts/verify-site.mjs b/docs/scripts/verify-site.mjs index 7106669..8134b16 100644 --- a/docs/scripts/verify-site.mjs +++ b/docs/scripts/verify-site.mjs @@ -1,14 +1,37 @@ -import { readFileSync } from 'node:fs' +import { existsSync, readdirSync, readFileSync } from 'node:fs' import { join } from 'node:path' const root = process.cwd() const canonicalRepo = 'AICL-Lab/gpu-spmv' +function collectTextFiles(dirPath) { + const entries = readdirSync(dirPath, { withFileTypes: true }) + const files = [] + + for (const entry of entries) { + const fullPath = join(dirPath, entry.name) + if (entry.isDirectory()) { + files.push(...collectTextFiles(fullPath)) + continue + } + if (/\.(md|ts|yml|svg)$/.test(entry.name)) { + files.push(fullPath) + } + } + + return files +} + const files = { readme: join(root, '..', 'README.md'), config: join(root, '.vitepress', 'config.ts'), pages: join(root, '..', '.github', 'workflows', 'pages.yml'), - index: join(root, 'index.md') + index: join(root, 'index.md'), + themeIndex: join(root, '.vitepress', 'theme', 'index.ts'), + zhHome: join(root, 'zh', 'index.md'), + enHome: join(root, 'en', 'index.md'), + zhWhitepaper: join(root, 'zh', 'whitepaper', 'index.md'), + enWhitepaper: join(root, 'en', 'whitepaper', 'index.md') } const contents = Object.fromEntries( @@ -33,6 +56,126 @@ if (/useRouter\(|router\.go\('\/(zh|en)\//.test(contents.index)) { 
failures.push('root docs index still auto-redirects by locale') } +const requiredThemeFiles = [ + join(root, '.vitepress', 'theme', 'Layout.vue'), + join(root, '.vitepress', 'theme', 'styles', 'tokens.css'), + join(root, '.vitepress', 'theme', 'styles', 'base.css'), + join(root, '.vitepress', 'theme', 'styles', 'home.css'), + join(root, '.vitepress', 'theme', 'styles', 'paper.css'), + join(root, '.vitepress', 'theme', 'styles', 'citation.css'), + join(root, '.vitepress', 'theme', 'styles', 'diagram.css'), + join(root, '.vitepress', 'theme', 'components', 'HeroEvidence.vue'), + join(root, '.vitepress', 'theme', 'components', 'MetricStrip.vue'), + join(root, '.vitepress', 'theme', 'components', 'WhitepaperSection.vue'), + join(root, '.vitepress', 'theme', 'components', 'ArchitectureCanvas.vue'), + join(root, '.vitepress', 'theme', 'components', 'CitationGrid.vue'), + join(root, '.vitepress', 'theme', 'components', 'ThemeAwareArt.vue'), + join(root, '.vitepress', 'theme', 'components', 'CalloutPanel.vue'), + join(root, '.vitepress', 'data', 'site.ts') +] + +const requiredAssetFiles = [ + join(root, 'public', 'images', 'brand', 'logo-mark-light.svg'), + join(root, 'public', 'images', 'brand', 'logo-mark-dark.svg'), + join(root, 'public', 'images', 'social', 'og-light.svg'), + join(root, 'public', 'images', 'social', 'og-dark.svg') +] + +const requiredContentFiles = [ + join(root, '.vitepress', 'data', 'references.ts'), + join(root, '.vitepress', 'data', 'benchmarks.ts'), + join(root, 'zh', 'architecture', 'execution-pipeline.md'), + join(root, 'en', 'architecture', 'execution-pipeline.md'), + join(root, 'zh', 'architecture', 'reliability.md'), + join(root, 'en', 'architecture', 'reliability.md'), + join(root, 'zh', 'performance', 'methodology.md'), + join(root, 'en', 'performance', 'methodology.md') +] + +for (const filePath of [...requiredThemeFiles, ...requiredAssetFiles, ...requiredContentFiles]) { + if (!existsSync(filePath)) { + failures.push(`missing theme file: 
${filePath.replace(`${root}/`, '')}`) + } +} + +const themeIndexChecks = [ + 'HeroEvidence', + 'MetricStrip', + 'WhitepaperSection', + 'ArchitectureCanvas', + 'CitationGrid', + 'ThemeAwareArt', + 'CalloutPanel' +] + +for (const token of themeIndexChecks) { + if (!contents.themeIndex.includes(token)) { + failures.push(`theme index missing component registration: ${token}`) + } +} + +if (!contents.zhHome.includes(' readFileSync(filePath, 'utf8')) + .join('\n') + +if (/LessUp\/gpu-spmv|github\.com\/LessUp|lessup\.github\.io\/gpu-spmv/.test(docsCorpus)) { + failures.push('legacy LessUp references still exist in docs corpus') +} + if (failures.length > 0) { console.error('verify-site failed:') for (const failure of failures) { diff --git a/docs/zh/architecture/execution-pipeline.md b/docs/zh/architecture/execution-pipeline.md new file mode 100644 index 0000000..61dc1c3 --- /dev/null +++ b/docs/zh/architecture/execution-pipeline.md @@ -0,0 +1,28 @@ +# 执行流水线 + +## 为什么要单独讲执行流水线 + +GPU SpMV 的难点不在于“调用一个 kernel”,而在于 **输入矩阵如何被分析、如何做 kernel 选择、如何复用执行上下文,以及如何解释结果是否可信**。 + +## Pipeline 分解 + +1. **输入阶段**:加载 CSR / ELL 数据结构,准备输入向量。 +2. **分析阶段**:统计 `avg_nnz_per_row`、偏斜度和行分布模式。 +3. **决策阶段**:基于统计结果选择 Scalar CSR、Vector CSR、Merge Path 或 ELL。 +4. **执行阶段**:调度 GPU kernel,记录时间和带宽指标。 +5. 
**验证阶段**:与 CPU 参考结果或既有基线做一致性检查。 + +## 关键判断 + +| 现象 | 决策 | +|:-----|:-----| +| `avg_nnz_per_row < 4` | Scalar CSR,避免 warp 级资源浪费 | +| 行长度均匀、偏斜度低 | Vector CSR,提升 warp 内协作效率 | +| 行长度高度不均 | Merge Path,优先负载均衡 | +| 行宽近似固定 | ELL kernel,优先合并访存 | + +## 这个页面应该和什么一起看 + +- [Kernel 选择策略](/zh/architecture/kernel-selection) +- [内存布局](/zh/architecture/memory-layout) +- [性能方法学](/zh/performance/methodology) diff --git a/docs/zh/architecture/overview.md b/docs/zh/architecture/overview.md index 55137c5..02cb39c 100644 --- a/docs/zh/architecture/overview.md +++ b/docs/zh/architecture/overview.md @@ -1,6 +1,6 @@ # 架构概览 -GPU SpMV 采用分层架构设计,清晰分离存储、计算和应用层。 +GPU SpMV 的架构重点不是“模块图长什么样”,而是 **如何把矩阵统计、kernel 选择、执行上下文和验证链路串成可解释的工程系统**。 ## 系统架构 @@ -84,40 +84,16 @@ graph TB - **图神经网络** — 稀疏图卷积 - **科学计算** — 有限元、CFD -## 设计亮点 +## 这份架构总览最重要的三件事 -### 1. RAII 资源管理 - -```cpp -// 自动生命周期管理,防止内存泄漏 -class CudaBuffer { -public: - explicit CudaBuffer(size_t n) { cudaMalloc(&ptr_, n * sizeof(T)); } - ~CudaBuffer() { cudaFree(ptr_); } - // 禁用拷贝,允许移动 -}; -``` - -### 2. 执行上下文 - -```cpp -// 缓存纹理对象,避免重复创建 -SpMVExecutionContext ctx; -for (int i = 0; i < n_iter; i++) { - spmv_csr(csr, d_x, d_y, &config, n, &ctx); - // 纹理对象被复用 -} -``` - -### 3. 自动 Kernel 选择 - -```cpp -// 基于矩阵特征自动选择最优 Kernel -SpMVConfig config = spmv_auto_config(csr); -``` +1. **数据怎么流动**:输入矩阵先被分析,再决定走哪条执行路径。 +2. **为什么自动选择成立**:不是玄学 heuristics,而是围绕 `avg_nnz_per_row` 与偏斜度展开。 +3. **为什么它可信**:资源管理、错误语义、CPU 参考路径和 property tests 共同形成约束。 ## 相关文档 - [Kernel 选择策略](/zh/architecture/kernel-selection) +- [执行流水线](/zh/architecture/execution-pipeline) - [内存布局](/zh/architecture/memory-layout) +- [可靠性约束](/zh/architecture/reliability) - [Spec-Driven 开发](/zh/architecture/spec-driven) diff --git a/docs/zh/architecture/reliability.md b/docs/zh/architecture/reliability.md new file mode 100644 index 0000000..890b668 --- /dev/null +++ b/docs/zh/architecture/reliability.md @@ -0,0 +1,25 @@ +# 可靠性与工程约束 + +## 可靠性来自哪里 + +这个项目的可靠性不是靠“看起来能跑”,而是来自三条线同时成立: + +1. 
**资源生命周期明确**:使用 `CudaBuffer` 和执行上下文抽象,避免裸 `cudaMalloc` / `cudaFree`。 +2. **错误语义明确**:通过 `SpMVError` 和 CUDA 检查宏把失败显式暴露出来。 +3. **规范与测试闭环**:OpenSpec 提供需求来源,property tests 提供回归保护。 + +## 为什么这对展示项目很重要 + +在面试或开源展示中,如果一个项目能回答下面这些问题,读者会更容易相信它: + +- 资源怎么释放? +- 失败怎么暴露? +- benchmark 数据怎么解释? +- 设计变化如何追溯? + +## 可靠性清单 + +- RAII 封装 GPU 资源 +- CPU 参考实现用于交叉验证 +- property tests ≥ 100 次迭代 +- GitHub Pages 把设计与引用一起公开 diff --git a/docs/zh/architecture/spec-driven.md b/docs/zh/architecture/spec-driven.md index 100ef55..e8c87b6 100644 --- a/docs/zh/architecture/spec-driven.md +++ b/docs/zh/architecture/spec-driven.md @@ -157,5 +157,5 @@ flowchart LR ## 参考 -- [OpenSpec 规范](https://github.com/LessUp/gpu-spmv/tree/main/openspec) +- [OpenSpec 规范](https://github.com/AICL-Lab/gpu-spmv/tree/main/openspec) - [架构概览](/zh/architecture/overview) \ No newline at end of file diff --git a/docs/zh/changelog.md b/docs/zh/changelog.md index 9e2c329..398571b 100644 --- a/docs/zh/changelog.md +++ b/docs/zh/changelog.md @@ -46,7 +46,7 @@ This is the first stable release of GPU SpMV, featuring complete CSR and ELL for - Doxygen-compatible documentation #### Documentation -- Full documentation site at https://lessup.github.io/gpu-spmv/ +- Full documentation site at https://aicl-lab.github.io/gpu-spmv/ - Bilingual README (English and Chinese) - API reference, performance guide, and code examples - Architecture documentation and design decision records @@ -144,5 +144,5 @@ No breaking changes from pre-release versions. The API is now stable. 
--- -[1.0.0]: https://github.com/LessUp/gpu-spmv/releases/tag/v1.0.0 -[0.1.0]: https://github.com/LessUp/gpu-spmv/tree/7d6dd0c +[1.0.0]: https://github.com/AICL-Lab/gpu-spmv/releases/tag/v1.0.0 +[0.1.0]: https://github.com/AICL-Lab/gpu-spmv/tree/7d6dd0c diff --git a/docs/zh/citation.md b/docs/zh/citation.md index 6f84cff..4011448 100644 --- a/docs/zh/citation.md +++ b/docs/zh/citation.md @@ -6,11 +6,11 @@ ```bibtex @software{gpu_spmv_2026, - author = {LessUp}, + author = {AICL-Lab}, title = {GPU SpMV: High-Performance CUDA Sparse Matrix-Vector Multiplication}, year = {2026}, publisher = {GitHub}, - url = {https://github.com/LessUp/gpu-spmv}, + url = {https://github.com/AICL-Lab/gpu-spmv}, version = {1.0.0} } ``` @@ -18,61 +18,14 @@ ## 文本格式 ``` -LessUp. GPU SpMV: High-Performance CUDA Sparse Matrix-Vector Multiplication. -GitHub repository, 2026. https://github.com/LessUp/gpu-spmv +AICL-Lab. GPU SpMV: High-Performance CUDA Sparse Matrix-Vector Multiplication. +GitHub repository, 2026. https://github.com/AICL-Lab/gpu-spmv ``` --- -## 相关论文 +## 使用建议 -本库实现的算法基于以下研究: - -### Merge Path 算法 - -1. **Merrill, D., & Garland, M. (2016)**. Merge-based parallel sparse matrix-vector multiplication. *Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '16)*. IEEE. - - ::: tip 核心贡献 - Merge Path 算法通过基于行指针和工作索引的合并操作进行工作分区,为不规则稀疏矩阵实现完美负载均衡。 - ::: - -### 向量化 CSR - -2. **Bell, N., & Garland, M. (2009)**. Implementing sparse matrix-vector multiplication on throughput-oriented processors. *Proceedings of SC '09*. IEEE. - -3. **Bell, N., Dalton, S., & Olson, L. N. (2012)**. Exposing fine-grained parallelism in algebraic multigrid methods. *SIAM Journal on Scientific Computing*, 34(4), C170-C194. - -### ELL 格式 - -4. **Vázquez, F., Fernández, J. J., & Garzón, E. M. (2011)**. Automatic tuning of the sparse matrix vector product on GPUs based on the ELL-R-T format. *Concurrency and Computation: Practice and Experience*, 24(1), 1-20. 
- ---- - -## 算法参考 - -| 算法 | 参考文献 | 核心思想 | -|:-----|:---------|:---------| -| Scalar CSR | Bell & Garland (2009) | 每行一线程 | -| Vector CSR | Bell & Garland (2009) | 每行一 warp | -| Merge Path | Merrill & Garland (2016) | 基于合并的分区 | -| ELL Kernel | Vázquez et al. (2011) | 列主序合并访存 | - ---- - -## 基准测试方法 - -我们的基准测试方法遵循以下最佳实践: - -- **SPAPT 基准测试套件**:稀疏计算标准化性能评估 -- **SuiteSparse 矩阵集**:真实世界测试矩阵 -- **GPU 性能指标**:内存带宽利用率作为主要指标 - ---- - -## 致谢 - -本库基于 CUDA 生态系统的优秀工作: - -- NVIDIA cuSPARSE 提供参考实现 -- Thrust 库提供并行原语 -- Google Test 提供测试基础设施 \ No newline at end of file +- 如果引用的是**代码仓库**,请使用上面的软件引用格式。 +- 如果引用的是**算法来源**,请同时引用 [学术参考](/zh/references) 中对应论文。 +- 如果引用的是**性能数据或图示**,请在正文中说明硬件与方法学条件,并链接到 [性能方法学](/zh/performance/methodology)。 diff --git a/docs/zh/contributing.md b/docs/zh/contributing.md index e945638..f11b037 100644 --- a/docs/zh/contributing.md +++ b/docs/zh/contributing.md @@ -14,7 +14,7 @@ ### 克隆和构建 ```bash -git clone https://github.com/LessUp/gpu-spmv.git +git clone https://github.com/AICL-Lab/gpu-spmv.git cd gpu-spmv cmake --preset default cmake --build --preset default @@ -79,7 +79,7 @@ npm run dev ## 获取帮助 -- 提交 [Issue](https://github.com/LessUp/gpu-spmv/issues) +- 提交 [Issue](https://github.com/AICL-Lab/gpu-spmv/issues) - 查看现有文档 - 阅读 OpenSpec 规范 diff --git a/docs/zh/faq.md b/docs/zh/faq.md index 8baaf3c..85a36f3 100644 --- a/docs/zh/faq.md +++ b/docs/zh/faq.md @@ -198,4 +198,4 @@ auto end = std::chrono::high_resolution_clock::now(); 1. 查看 [API 参考](/zh/api/spmv) 了解详细用法 2. 查看 [性能指南](/zh/performance/optimization-guide) 了解优化技巧 -3. 在 [GitHub Issues](https://github.com/LessUp/gpu-spmv/issues) 提问 +3. 在 [GitHub Issues](https://github.com/AICL-Lab/gpu-spmv/issues) 提问 diff --git a/docs/zh/index.md b/docs/zh/index.md index 2ea3faf..f99e284 100644 --- a/docs/zh/index.md +++ b/docs/zh/index.md @@ -1,134 +1,53 @@ --- layout: home +title: GPU SpMV 技术白皮书 --- -
-
- -
- GPU SpMV - 技术白皮书 -
-
- -
- -
-

生产级 CUDA 稀疏矩阵向量乘法

-

- 高性能稀疏矩阵向量乘法(SpMV),在现代 NVIDIA GPU 上实现 70%+ 理论内存带宽利用率。 - 4 个自适应内核、智能选择算法、完整 API。 -

- -
- -
-
-
70%+
-
带宽利用率
-
-
-
4
-
自适应内核
-
-
-
CSR+ELL
-
稀疏格式
-
-
-
100+
-
测试用例
-
-
- -## 架构概览 - -
- -```mermaid -flowchart LR - Input[稀疏矩阵] --> Analysis[矩阵分析] - Analysis --> Decision{自动选择} - Decision -->|avg_nnz < 4| Scalar[Scalar CSR] - Decision -->|均匀行| Vector[Vector CSR] - Decision -->|高偏斜| Merge[Merge Path] - Decision -->|列主序| ELL[ELL 内核] - Scalar --> GPU[GPU 执行] - Vector --> GPU - Merge --> GPU - ELL --> GPU - GPU --> Result[结果向量] -``` - -
- -## 技术特性 - -
-
-
内核选择策略
-
- 基于矩阵特征自动选择最优内核:平均非零元数、行长度偏斜度。 -
- -
- -
-
Merge Path 算法
-
- 针对不规则稀疏模式的完美负载均衡,O(nnz + m) 工作分解。 -
- -
- -
-
生产级质量
-
- RAII 资源管理、语义化错误码、CudaBuffer 抽象、跨平台支持。 -
-
- API -
-
- -
-
Spec-Driven 开发
-
- OpenSpec 规范驱动工作流,设计决策可追溯,文档即代码。 -
-
- 工作流 -
-
- -
-
学术严谨
-
- 完整的学术引用支持、BibTeX 格式、相关论文参考。 -
- -
- -
-
快速开始
-
- git clone https://github.com/LessUp/gpu-spmv.git -
- -
-
+ + + + + + + + + + + +
+
+

性能导向

+

围绕内存带宽利用率、矩阵分布与 kernel 选择给出明确论证。

+
+
+

工程可解释

+

把执行流水线、数据布局、错误处理与 spec-driven workflow 全部显式化。

+
+
+

适合面试与开源展示

+

首页就能看到项目定位、亮点、证据链与延伸阅读路径。

+
+
+
diff --git a/docs/zh/performance/benchmarks.md b/docs/zh/performance/benchmarks.md index 3ac3f10..49c63be 100644 --- a/docs/zh/performance/benchmarks.md +++ b/docs/zh/performance/benchmarks.md @@ -1,6 +1,12 @@ # 基准测试 -GPU SpMV 在 NVIDIA RTX 3090 上的性能测试结果。 + + +GPU SpMV 的 benchmark 页面不只罗列数字,而是帮助读者理解 **这些数字说明了什么,不说明什么**。 + + ## 测试环境 @@ -88,29 +94,15 @@ SpMV 是内存带宽受限的计算,我们的实现达到 70%+ 的理论带宽 - **Ampere (SM 8.6)**: 最佳性能 - **Hopper (SM 9.0)**: 完全支持 -## 基准测试方法 - -```cpp -#include - -int main() { - CSRMatrix* csr = /* ... */; - csr_to_gpu(csr); - - // 多次运行取平均 - BenchmarkResult result = benchmark_spmv(csr, 100); - - printf("Avg time: %.3f ms\n", result.avg_ms); - printf("Min time: %.3f ms\n", result.min_ms); - printf("Max time: %.3f ms\n", result.max_ms); - printf("Stddev: %.3f ms\n", result.stddev_ms); - printf("Bandwidth: %.1f GB/s\n", result.bandwidth_gb_s); +## 如何阅读这些结果 - return 0; -} -``` +- **70%+ 带宽利用率** 说明实现已经接近“受限于访存”的合理上界。 +- **ELL 在规则模式下更高**,并不意味着它适合所有矩阵;格式转换和适用范围必须一起考虑。 +- **Merge Path 在高偏斜分布下稳定领先**,说明负载均衡确实是这类矩阵的第一问题。 +- **自动选择器的价值** 在于把这些判断变成默认能力,而不是要求用户手工猜测。 ## 参考 +- [性能方法学](/zh/performance/methodology) - [优化指南](/zh/performance/optimization-guide) -- [Kernel 选择策略](/zh/architecture/kernel-selection) \ No newline at end of file +- [Kernel 选择策略](/zh/architecture/kernel-selection) diff --git a/docs/zh/performance/methodology.md b/docs/zh/performance/methodology.md new file mode 100644 index 0000000..2edb0a4 --- /dev/null +++ b/docs/zh/performance/methodology.md @@ -0,0 +1,19 @@ +# 性能方法学 + +## 先看方法,再看数字 + +如果不先交代测量方法,性能数字本身没有说服力。这个页面的目标是帮助读者理解:**这些 benchmark 结果在什么条件下成立,应该怎样阅读,哪些结论可以安全地得出。** + +## 测量前提 + +- GPU:NVIDIA RTX 3090(Ampere) +- 理论带宽:936 GB/s +- 关注指标:时间、带宽、利用率、方差 +- 对比对象:不同稀疏模式下的 kernel 选择结果 + +## 推荐阅读方式 + +1. **先看趋势,不只看峰值**:是否稳定接近 70%+ 带宽利用率。 +2. **把矩阵模式和 kernel 一起看**:均匀矩阵与高偏斜矩阵的最优策略不同。 +3. **看选择器是否解释得通**:自动选择结果是否与统计特征一致。 +4. 
**看方差**:平均值高但波动大,未必适合当成强结论。 diff --git a/docs/zh/quickstart.md b/docs/zh/quickstart.md index f1aa636..f5940f1 100644 --- a/docs/zh/quickstart.md +++ b/docs/zh/quickstart.md @@ -24,7 +24,7 @@ nvidia-smi ### 1. 克隆仓库 ```bash -git clone https://github.com/LessUp/gpu-spmv.git +git clone https://github.com/AICL-Lab/gpu-spmv.git cd gpu-spmv ``` diff --git a/docs/zh/references.md b/docs/zh/references.md index 557bb2a..dbac42e 100644 --- a/docs/zh/references.md +++ b/docs/zh/references.md @@ -1,78 +1,21 @@ # 学术参考 -GPU SpMV 的实现基于以下学术研究成果。 + -## 核心算法 +本页把论文、项目和延伸阅读分开整理,方便读者快速建立“**这个项目参考了谁、站在什么技术谱系上**”的理解。 -### Merge-based Parallel SpMV +## 核心论文 -> Merrill, D., & Garland, M. (2016). **Merge-based parallel sparse matrix-vector multiplication**. *ACM SIGPLAN Notices*, 51(8), 12-21. + -- **贡献**: 提出 Merge Path 算法,实现完美负载均衡 -- **应用于**: `MERGE_PATH` kernel -- [DOI: 10.1145/3016078.285114](https://doi.org/10.1145/3016078.285114) +## 代表性项目 -### Bell & Garland Survey + -> Bell, N., & Garland, M. (2009). **Implementing sparse matrix-vector multiplication on throughput-oriented processors**. *SC'09: Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis*. +## 如何使用这些参考 -- **贡献**: CSR vs ELL 格式性能分析,GPU SpMV 基础理论 -- **应用于**: `VECTOR_CSR`、`ELL_KERNEL` 设计 -- [DOI: 10.1145/1654059.1654121](https://doi.org/10.1145/1654059.1654121) - -### CSR5 Format - -> Liu, Y., & Vuduc, R. (2018). **An adaptive algorithm for sparse matrix-vector multiplication on GPUs**. *IEEE Transactions on Parallel and Distributed Systems*. - -- **贡献**: CSR5 格式,自适应负载均衡 -- **参考**: 理解不规则稀疏矩阵的负载分布 - -## GPU 计算基础 - -### CUDA Best Practices - -> NVIDIA. (2024). **CUDA C++ Best Practices Guide**. - -- **参考**: 内存合并、纹理缓存、Warp 同步 -- [Link](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/) - -### CUDA Programming Guide - -> NVIDIA. (2024). **CUDA C++ Programming Guide**. 
- -- **参考**: CUDA 执行模型、存储层次 -- [Link](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) - -## PageRank - -> Page, L., Brin, S., Motwani, R., & Winograd, T. (1999). **The PageRank citation ranking: Bringing order to the web**. *Stanford InfoLab*. - -- **贡献**: PageRank 算法原始论文 -- **应用于**: `pagerank()` 实现 - -## 相关项目 - -| Project | Stars | Description | Key Takeaway | -|:--------|:-----:|:------------|:-------------| -| [Ginkgo](https://github.com/ginkgo-project/ginkgo) | 597 | High-performance linear algebra | Performance visualization | -| [cuSPARSE](https://docs.nvidia.com/cuda/cusparse/) | N/A | NVIDIA official library | Performance baseline | -| [SuiteSparse](https://github.com/DrTimothyAldenDavis/SuiteSparse) | 947 | Sparse matrix collection | Standard test data | -| [Kokkos Kernels](https://github.com/kokkos/kokkos-kernels) | 300+ | Multi-backend sparse BLAS | Performance portability | - -## 引用本项目 - -```bibtex -@software{gpuspmv2024, - author = {LessUp}, - title = {GPU SpMV: High-Performance CUDA Sparse Matrix-Vector Multiplication}, - year = {2024}, - url = {https://github.com/LessUp/gpu-spmv} -} -``` - -## 推荐阅读 - -1. **GPU Architecture**: 了解 GPU 内存层次和执行模型 -2. **Sparse Matrix Formats**: 不同格式的优缺点 -3. **Load Balancing**: 并行计算中的负载均衡技术 -4. **Memory Coalescing**: GPU 内存访问优化 +1. **先读 Bell & Garland**,理解 GPU SpMV 的经典问题定义。 +2. **再看 Merrill & Garland**,理解 Merge Path 在不规则负载中的价值。 +3. 
**对照 cuSPARSE / Ginkgo / SuiteSparse**,把本项目放回真实工程生态里看。 diff --git a/docs/zh/whitepaper/index.md b/docs/zh/whitepaper/index.md index e85c512..f93e23c 100644 --- a/docs/zh/whitepaper/index.md +++ b/docs/zh/whitepaper/index.md @@ -1,100 +1,28 @@ -# 设计哲学 +# GPU SpMV:把项目当成工程作品来阅读 -## 执行摘要 + +这个站点服务于三类读者:面试官、开源读者、性能工程师。白皮书首页先给结论,随后给设计判断、证据链和深入阅读路径。 + -GPU SpMV 是一个 **生产级 CUDA 库**,实现了高性能稀疏矩阵向量乘法(SpMV),在现代 NVIDIA GPU 上达到 **70%+ 理论内存带宽利用率**。 +## 为什么这个项目值得单独写成白皮书 -### 核心贡献 +- SpMV 是典型的 **内存带宽受限** 问题,性能上限主要由访存效率决定。 +- 真正有展示价值的不只是 kernel 本身,而是 **为什么选它、什么时候选它、如何证明它值得选**。 +- 这个项目同时强调 CUDA 性能、RAII 资源管理、错误处理、Spec-Driven 开发和可读文档,这让它更像工程作品,而不只是 demo。 -| 贡献 | 影响 | -|:-----|:-----| -| **4 种优化内核** | 基于矩阵特征的自动内核选择 | -| **Merge Path 算法** | 不规则稀疏模式的完美负载均衡 | -| **ELL 列主序布局** | 均匀矩阵的完全合并访存 | -| **Spec-Driven 开发** | 完整的设计决策可追溯性 | - -### 性能亮点 - -| 矩阵规模 | 非零元素 | 内核 | 带宽利用率 | -|:--------:|:--------:|:-----|:----------:| -| 10K × 10K | 500K | Vector CSR | **70.2%** | -| 100K × 100K | 5M | Merge Path | **71.5%** | -| 1M × 1M | 50M | Merge Path | **70.8%** | - -::: info 测试环境 -NVIDIA RTX 3090(Ampere 架构,理论带宽:936 GB/s) -::: +## 这份白皮书会回答什么 -### 目标读者 +1. 这个问题为什么重要,以及 GPU SpMV 的瓶颈是什么。 +2. 四类 kernel 与自动选择策略分别解决了什么。 +3. 项目如何把性能、工程规范和可解释性结合起来。 +4. 
哪些页面提供架构、性能、API 与引用材料,方便继续深入。 -- **系统架构师**:设计 GPU 加速的稀疏计算 -- **HPC 工程师**:优化内存受限的工作负载 -- **研究人员**:需要可复现、文档完善的基准 -- **应用开发者**:构建图算法、迭代求解器 +## 阅读路径 -### 文档结构 - -| 章节 | 目的 | +| 页面 | 作用 | |:-----|:-----| -| [设计哲学](/zh/whitepaper/philosophy) | 架构原则和权衡 | -| [性能分析](/zh/whitepaper/performance) | 详细基准测试方法和结果 | -| [架构概览](/zh/architecture/overview) | 系统设计文档 | -| [API 参考](/zh/api/spmv) | 完整 API 文档 | - ---- - -## SpMV 的重要性 - -稀疏矩阵向量乘法(SpMV)是以下领域的基础操作: - -- **图分析**:PageRank、社区发现、最短路径 -- **科学计算**:有限元分析、CFD、迭代求解器 -- **机器学习**:稀疏神经网络、推荐系统 - -SpMV 本质上是 **内存受限** 的——每个非零元素需要读取矩阵数据、列索引和向量值,计算量极小。实现高带宽利用率是主要的优化挑战。 - ---- - -## 设计概览 - -```mermaid -flowchart TB - subgraph Input["输入"] - Matrix[稀疏矩阵] - Vector[稠密向量] - end - - subgraph Analysis["矩阵分析"] - NNZ[每行平均 nnz] - Skew[偏度] - Pattern[分布模式] - end - - subgraph Selection["内核选择"] - Decision{自动选择} - Scalar[Scalar CSR
avg_nnz < 4] - Vector[Vector CSR
均匀行] - Merge[Merge Path
高偏度] - ELL[ELL Kernel
列主序] - end - - subgraph Execution["GPU 执行"] - Compute[SpMV 计算] - Result[结果向量] - end - - Matrix --> Analysis - Vector --> Execution - Analysis --> Decision - Decision --> Scalar - Decision --> Vector - Decision --> Merge - Decision --> ELL - Scalar --> Compute - Vector --> Compute - Merge --> Compute - ELL --> Compute - Compute --> Result -``` - -库会根据矩阵特征自动选择最优内核,确保在各种稀疏模式下都获得接近峰值性能。 \ No newline at end of file +| [设计哲学](/zh/whitepaper/philosophy) | 看这个项目优先优化什么、舍弃什么 | +| [性能分析](/zh/whitepaper/performance) | 看 benchmark 应该怎么读,数据说明什么 | +| [架构概览](/zh/architecture/overview) | 看执行流水线、数据流和模块边界 | +| [API 参考](/zh/api/spmv) | 看对外接口与使用方式 | +| [学术参考](/zh/references) | 看论文、项目与延伸材料 | diff --git a/docs/zh/whitepaper/performance.md b/docs/zh/whitepaper/performance.md index 9c01be5..91e31ce 100644 --- a/docs/zh/whitepaper/performance.md +++ b/docs/zh/whitepaper/performance.md @@ -205,7 +205,7 @@ for (auto& x : inputs) { ```bash # 克隆并构建 -git clone https://github.com/LessUp/gpu-spmv.git +git clone https://github.com/AICL-Lab/gpu-spmv.git cd gpu-spmv cmake -S . -B build -DCMAKE_BUILD_TYPE=Release cmake --build build